{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9989658738366081, "eval_steps": 500, "global_step": 966, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 172.6666717529297, "completions/mean_terminated_length": 172.6666717529297, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.001034126163391934, "grad_norm": 5.071864592095285, "kl": 0.3359375, "learning_rate": 1e-06, "loss": 0.0134, "num_tokens": 78552.0, "reward": 0.9166666865348816, "reward_std": 0.40627965331077576, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.5036101341247559, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 165.5416717529297, "completions/mean_terminated_length": 165.5416717529297, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.002068252326783868, "grad_norm": 4.059461930492566, "kl": 0.0036773681640625, "learning_rate": 9.999973613218312e-07, "loss": 0.0001, "num_tokens": 161365.0, "reward": 0.8333333730697632, "reward_std": 0.4685417115688324, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 188.95834350585938, "completions/mean_terminated_length": 188.95834350585938, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.0031023784901758012, "grad_norm": 3.468953003174482, "kl": 0.0027923583984375, "learning_rate": 9.999894453151758e-07, "loss": 0.0001, "num_tokens": 239820.0, "reward": 0.6666666865348816, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 157.0, "completions/mean_terminated_length": 157.0, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.004136504653567736, "grad_norm": 3.413335444191301, "kl": 0.002685546875, "learning_rate": 9.999762520635849e-07, "loss": 0.0001, "num_tokens": 318868.0, "reward": 1.0833333730697632, "reward_std": 0.25634264945983887, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.371054083108902, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 209.45834350585938, "completions/mean_terminated_length": 209.45834350585938, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.005170630816959669, "grad_norm": 3.8321189551506687, "kl": 0.0026397705078125, "learning_rate": 9.99957781706309e-07, "loss": 0.0001, "num_tokens": 402655.0, "reward": 1.027777910232544, "reward_std": 0.3323635756969452, "rewards/reasoning_reward/mean": 1.0277777910232544, "rewards/reasoning_reward/std": 0.5443310141563416, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 181.6666717529297, "completions/mean_terminated_length": 181.6666717529297, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.0062047569803516025, "grad_norm": 4.016570280789986, "kl": 0.00323486328125, "learning_rate": 9.999340344382978e-07, "loss": 0.0001, "num_tokens": 484391.0, "reward": 1.3958333730697632, "reward_std": 0.3663109242916107, "rewards/reasoning_reward/mean": 1.3958333730697632, "rewards/reasoning_reward/std": 0.642332136631012, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 213.08334350585938, "completions/mean_terminated_length": 213.08334350585938, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.007238883143743537, "grad_norm": 2.9810718864115326, "kl": 0.0038604736328125, "learning_rate": 9.99905010510197e-07, "loss": 0.0002, "num_tokens": 572137.0, "reward": 0.7083333730697632, "reward_std": 0.3535533845424652, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.5089774131774902, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 168.58334350585938, "completions/mean_terminated_length": 168.58334350585938, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.008273009307135471, "grad_norm": 0.033223094376599045, "kl": 0.0020904541015625, "learning_rate": 9.998707102283457e-07, "loss": 0.0001, "num_tokens": 650255.0, "reward": 1.0, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.0, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 166.4166717529297, "completions/mean_terminated_length": 166.4166717529297, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.009307135470527405, "grad_norm": 2.0403595074712557, "kl": 0.0030975341796875, "learning_rate": 9.998311339547733e-07, "loss": 0.0001, "num_tokens": 734225.0, "reward": 1.2916667461395264, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 1.2916666269302368, "rewards/reasoning_reward/std": 0.550032913684845, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 185.1666717529297, "completions/mean_terminated_length": 185.1666717529297, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.010341261633919338, "grad_norm": 3.389792798216633, "kl": 0.0031890869140625, "learning_rate": 9.997862821071964e-07, "loss": 0.0001, "num_tokens": 819653.0, "reward": 0.875, "reward_std": 0.40812820196151733, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.5366967916488647, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 186.95834350585938, "completions/mean_terminated_length": 186.95834350585938, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.011375387797311272, "grad_norm": 2.331094314362588, "kl": 0.0029296875, "learning_rate": 9.997361551590132e-07, "loss": 0.0001, "num_tokens": 904148.0, "reward": 0.5, "reward_std": 0.17817416787147522, "rewards/reasoning_reward/mean": 0.5, "rewards/reasoning_reward/std": 0.5107539296150208, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 147.2916717529297, "completions/mean_terminated_length": 147.2916717529297, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.012409513960703205, "grad_norm": 3.413766004232087, "kl": 0.0036163330078125, "learning_rate": 9.996807536392989e-07, "loss": 0.0001, "num_tokens": 984211.0, "reward": 0.8541666865348816, "reward_std": 0.2587745785713196, "rewards/reasoning_reward/mean": 0.8541666865348816, "rewards/reasoning_reward/std": 0.47729232907295227, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 145.625, "completions/mean_terminated_length": 145.625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.01344364012409514, "grad_norm": 3.9659481687377354, "kl": 0.0033416748046875, "learning_rate": 9.996200781328011e-07, "loss": 0.0001, "num_tokens": 1067738.0, "reward": 0.9375, "reward_std": 0.41282182931900024, "rewards/reasoning_reward/mean": 0.9375, "rewards/reasoning_reward/std": 0.5954993963241577, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 172.9166717529297, "completions/mean_terminated_length": 172.9166717529297, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.014477766287487074, "grad_norm": 2.6924838670848303, "kl": 0.0023956298828125, "learning_rate": 9.99554129279932e-07, "loss": 0.0001, "num_tokens": 1146640.0, "reward": 0.8333333730697632, "reward_std": 0.30860671401023865, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 164.9166717529297, "completions/mean_terminated_length": 164.9166717529297, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.015511892450879007, "grad_norm": 4.1278369792222325, "kl": 0.0029144287109375, "learning_rate": 9.99482907776763e-07, "loss": 0.0001, "num_tokens": 1224398.0, "reward": 0.9583333730697632, "reward_std": 0.42645785212516785, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 170.6666717529297, "completions/mean_terminated_length": 170.6666717529297, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.016546018614270942, "grad_norm": 0.03303182322940017, "kl": 0.002471923828125, "learning_rate": 9.994064143750165e-07, "loss": 0.0001, "num_tokens": 1303678.0, "reward": 1.0, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 185.20834350585938, "completions/mean_terminated_length": 185.20834350585938, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.017580144777662874, "grad_norm": 4.190633270648795, "kl": 0.003997802734375, "learning_rate": 9.993246498820587e-07, "loss": 0.0002, "num_tokens": 1387411.0, "reward": 0.875, "reward_std": 0.47920867800712585, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.5943574905395508, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 138.75, "completions/mean_terminated_length": 138.75, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.01861427094105481, "grad_norm": 4.124744220407965, "kl": 0.0032958984375, "learning_rate": 9.992376151608897e-07, "loss": 0.0001, "num_tokens": 1465757.0, "reward": 0.8333333730697632, "reward_std": 0.4198887050151825, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.7469745874404907, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 122.29167175292969, "completions/mean_terminated_length": 122.29167175292969, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.01964839710444674, "grad_norm": 4.2126320398251105, "kl": 0.0028839111328125, "learning_rate": 9.991453111301365e-07, "loss": 0.0001, "num_tokens": 1546284.0, "reward": 0.3333333432674408, "reward_std": 0.3900056481361389, "rewards/reasoning_reward/mean": 0.3333333432674408, "rewards/reasoning_reward/std": 0.4815433919429779, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 169.5, "completions/mean_terminated_length": 169.5, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.020682523267838676, "grad_norm": 3.9035749186278075, "kl": 0.00433349609375, "learning_rate": 9.990477387640415e-07, "loss": 0.0002, "num_tokens": 1629112.0, "reward": 0.7083333730697632, "reward_std": 0.43810173869132996, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.4871538281440735, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 183.33334350585938, "completions/mean_terminated_length": 183.33334350585938, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.02171664943123061, "grad_norm": 2.7742036215600043, "kl": 0.0037078857421875, "learning_rate": 9.989448990924528e-07, "loss": 0.0001, "num_tokens": 1706600.0, "reward": 0.9166666865348816, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.28232985734939575, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 179.125, "completions/mean_terminated_length": 179.125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.022750775594622543, "grad_norm": 2.9495492352769728, "kl": 0.007080078125, "learning_rate": 9.988367932008138e-07, "loss": 0.0003, "num_tokens": 1791203.0, "reward": 0.9375, "reward_std": 0.30551642179489136, "rewards/reasoning_reward/mean": 0.9375, "rewards/reasoning_reward/std": 0.37044334411621094, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 208.70834350585938, "completions/mean_terminated_length": 208.70834350585938, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.023784901758014478, "grad_norm": 3.890766517596311, "kl": 0.00811767578125, "learning_rate": 9.98723422230151e-07, "loss": 0.0003, "num_tokens": 1871036.0, "reward": 1.1666667461395264, "reward_std": 0.40397143363952637, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.6197241544723511, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 161.375, "completions/mean_terminated_length": 161.375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.02481902792140641, "grad_norm": 3.3269487482285345, "kl": 0.0029144287109375, "learning_rate": 9.986047873770624e-07, "loss": 0.0001, "num_tokens": 1949213.0, "reward": 0.7916666865348816, "reward_std": 0.29602527618408203, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4148511290550232, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 146.1666717529297, "completions/mean_terminated_length": 146.1666717529297, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.025853154084798345, "grad_norm": 3.9477140851301993, "kl": 0.0059814453125, "learning_rate": 9.98480889893705e-07, "loss": 0.0002, "num_tokens": 2028025.0, "reward": 0.9791666865348816, "reward_std": 0.5605560541152954, "rewards/reasoning_reward/mean": 0.9791666865348816, "rewards/reasoning_reward/std": 0.6507381796836853, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 219.9166717529297, "completions/mean_terminated_length": 219.9166717529297, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.02688728024819028, "grad_norm": 4.080956404876102, "kl": 0.00860595703125, "learning_rate": 9.98351731087781e-07, "loss": 0.0003, "num_tokens": 2107687.0, "reward": 0.875, "reward_std": 0.46288391947746277, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.5757792592048645, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 163.5416717529297, "completions/mean_terminated_length": 163.5416717529297, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.027921406411582212, "grad_norm": 3.5156776593492745, "kl": 0.0048828125, "learning_rate": 9.982173123225243e-07, "loss": 0.0002, "num_tokens": 2186948.0, "reward": 0.8541666865348816, "reward_std": 0.40529346466064453, "rewards/reasoning_reward/mean": 0.8541666865348816, "rewards/reasoning_reward/std": 0.4995468854904175, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 121.75, "completions/mean_terminated_length": 121.75, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.028955532574974147, "grad_norm": 2.437538390860398, "kl": 0.0029449462890625, "learning_rate": 9.980776350166867e-07, "loss": 0.0001, "num_tokens": 2267470.0, "reward": 0.875, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.337831974029541, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 151.7916717529297, "completions/mean_terminated_length": 151.7916717529297, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.02998965873836608, "grad_norm": 3.9269223056078095, "kl": 0.004730224609375, "learning_rate": 9.979327006445216e-07, "loss": 0.0002, "num_tokens": 2344097.0, "reward": 0.4583333432674408, "reward_std": 0.5049939155578613, "rewards/reasoning_reward/mean": 0.4583333432674408, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 186.20834350585938, "completions/mean_terminated_length": 186.20834350585938, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.031023784901758014, "grad_norm": 3.386462451637892, "kl": 0.004547119140625, "learning_rate": 9.977825107357702e-07, "loss": 0.0002, "num_tokens": 2425350.0, "reward": 0.75, "reward_std": 0.33247750997543335, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.4423258602619171, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 178.6666717529297, "completions/mean_terminated_length": 178.6666717529297, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.03205791106514995, "grad_norm": 3.3979916126388687, "kl": 0.01055908203125, "learning_rate": 9.976270668756433e-07, "loss": 0.0004, "num_tokens": 2510054.0, "reward": 1.0277777910232544, "reward_std": 0.3074157238006592, "rewards/reasoning_reward/mean": 1.0277777910232544, "rewards/reasoning_reward/std": 0.4627858102321625, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 155.7916717529297, "completions/mean_terminated_length": 155.7916717529297, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.033092037228541885, "grad_norm": 2.8184150341115375, "kl": 0.004058837890625, "learning_rate": 9.974663707048065e-07, "loss": 0.0002, "num_tokens": 2588921.0, "reward": 0.6666666865348816, "reward_std": 0.35634833574295044, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 154.70834350585938, "completions/mean_terminated_length": 154.70834350585938, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.03412616339193381, "grad_norm": 2.828806479597615, "kl": 0.005157470703125, "learning_rate": 9.973004239193618e-07, "loss": 0.0002, "num_tokens": 2671218.0, "reward": 1.1319444179534912, "reward_std": 0.13749298453330994, "rewards/reasoning_reward/mean": 1.1319444179534912, "rewards/reasoning_reward/std": 0.3474515974521637, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 204.0, "completions/mean_terminated_length": 204.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.03516028955532575, "grad_norm": 3.0486725384758566, "kl": 0.005401611328125, "learning_rate": 9.971292282708296e-07, "loss": 0.0002, "num_tokens": 2755530.0, "reward": 0.8819444179534912, "reward_std": 0.3624235987663269, "rewards/reasoning_reward/mean": 0.8819444179534912, "rewards/reasoning_reward/std": 0.6226000189781189, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 190.70834350585938, "completions/mean_terminated_length": 190.70834350585938, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.03619441571871768, "grad_norm": 2.8675194738593675, "kl": 0.01416015625, "learning_rate": 9.969527855661307e-07, "loss": 0.0006, "num_tokens": 2845651.0, "reward": 1.3333333730697632, "reward_std": 0.2903675436973572, "rewards/reasoning_reward/mean": 1.3333333730697632, "rewards/reasoning_reward/std": 0.434057354927063, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 167.6666717529297, "completions/mean_terminated_length": 167.6666717529297, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.03722854188210962, "grad_norm": 2.8365389717558775, "kl": 0.0047607421875, "learning_rate": 9.967710976675674e-07, "loss": 0.0002, "num_tokens": 2924235.0, "reward": 0.4166666865348816, "reward_std": 0.2903675436973572, "rewards/reasoning_reward/mean": 0.4166666567325592, "rewards/reasoning_reward/std": 0.5036101937294006, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 160.20834350585938, "completions/mean_terminated_length": 160.20834350585938, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.038262668045501554, "grad_norm": 3.3596940088485066, "kl": 0.00372314453125, "learning_rate": 9.965841664928032e-07, "loss": 0.0001, "num_tokens": 3004232.0, "reward": 0.7916666865348816, "reward_std": 0.42645785212516785, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4148511290550232, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 1217.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 256.79168701171875, "completions/mean_terminated_length": 215.04348754882812, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.03929679420889348, "grad_norm": 3.713051017958645, "kl": 0.006988525390625, "learning_rate": 9.963919940148428e-07, "loss": 0.0003, "num_tokens": 3095475.0, "reward": 1.2569445371627808, "reward_std": 0.36115893721580505, "rewards/reasoning_reward/mean": 1.2569445371627808, "rewards/reasoning_reward/std": 0.5223950147628784, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 203.5416717529297, "completions/mean_terminated_length": 203.5416717529297, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.04033092037228542, "grad_norm": 4.058262730903875, "kl": 0.01019287109375, "learning_rate": 9.961945822620118e-07, "loss": 0.0004, "num_tokens": 3184992.0, "reward": 1.375, "reward_std": 0.6197125911712646, "rewards/reasoning_reward/mean": 1.375, "rewards/reasoning_reward/std": 0.6954823136329651, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 255.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 177.83334350585938, "completions/mean_terminated_length": 174.478271484375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.04136504653567735, "grad_norm": 3.0112482413882797, "kl": 0.0054931640625, "learning_rate": 9.959919333179344e-07, "loss": 0.0002, "num_tokens": 3266916.0, "reward": 0.8333333730697632, "reward_std": 0.2903675436973572, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 144.9166717529297, "completions/mean_terminated_length": 144.9166717529297, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.04239917269906929, "grad_norm": 3.571922005370386, "kl": 0.006378173828125, "learning_rate": 9.957840493215116e-07, "loss": 0.0003, "num_tokens": 3351314.0, "reward": 1.0625, "reward_std": 0.3972596824169159, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.5578004121780396, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 174.2916717529297, "completions/mean_terminated_length": 174.2916717529297, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.04343329886246122, "grad_norm": 4.032955623530826, "kl": 0.006561279296875, "learning_rate": 9.955709324668997e-07, "loss": 0.0003, "num_tokens": 3429881.0, "reward": 0.9166666865348816, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.8427009582519531, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 201.625, "completions/mean_terminated_length": 201.625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.04446742502585315, "grad_norm": 3.509751245460736, "kl": 0.0074462890625, "learning_rate": 9.953525850034856e-07, "loss": 0.0003, "num_tokens": 3520448.0, "reward": 0.8541666865348816, "reward_std": 0.3871031701564789, "rewards/reasoning_reward/mean": 0.8541666865348816, "rewards/reasoning_reward/std": 0.6672325730323792, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 160.5, "completions/mean_terminated_length": 160.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.045501551189245086, "grad_norm": 3.847890323513219, "kl": 0.006622314453125, "learning_rate": 9.951290092358645e-07, "loss": 0.0003, "num_tokens": 3603924.0, "reward": 0.9652777910232544, "reward_std": 0.46292293071746826, "rewards/reasoning_reward/mean": 0.9652777314186096, "rewards/reasoning_reward/std": 0.7121769785881042, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 146.45834350585938, "completions/mean_terminated_length": 146.45834350585938, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.04653567735263702, "grad_norm": 3.670364425188319, "kl": 0.0072021484375, "learning_rate": 9.949002075238139e-07, "loss": 0.0003, "num_tokens": 3684711.0, "reward": 0.875, "reward_std": 0.4082186222076416, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.39699962735176086, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 155.08334350585938, "completions/mean_terminated_length": 155.08334350585938, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.047569803516028956, "grad_norm": 3.726045501753792, "kl": 0.0076904296875, "learning_rate": 9.9466618228227e-07, "loss": 0.0003, "num_tokens": 3762713.0, "reward": 0.8333333730697632, "reward_std": 0.2903675436973572, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 232.375, "completions/mean_terminated_length": 232.375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.04860392967942089, "grad_norm": 3.592630193734655, "kl": 0.00811767578125, "learning_rate": 9.944269359813026e-07, "loss": 0.0003, "num_tokens": 3840002.0, "reward": 1.027777910232544, "reward_std": 0.40778279304504395, "rewards/reasoning_reward/mean": 1.0277777910232544, "rewards/reasoning_reward/std": 0.5972292423248291, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 180.1666717529297, "completions/mean_terminated_length": 180.1666717529297, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.04963805584281282, "grad_norm": 4.29924922955248, "kl": 0.00848388671875, "learning_rate": 9.941824711460871e-07, "loss": 0.0003, "num_tokens": 3922982.0, "reward": 0.8333333730697632, "reward_std": 0.48678088188171387, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.7019641399383545, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 148.5, "completions/mean_terminated_length": 148.5, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.050672182006204755, "grad_norm": 2.8310868538516525, "kl": 0.0054931640625, "learning_rate": 9.9393279035688e-07, "loss": 0.0002, "num_tokens": 4001698.0, "reward": 0.625, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 0.625, "rewards/reasoning_reward/std": 0.494535356760025, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 204.9166717529297, "completions/mean_terminated_length": 204.9166717529297, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.05170630816959669, "grad_norm": 3.525954172380883, "kl": 0.0079345703125, "learning_rate": 9.936778962489902e-07, "loss": 0.0003, "num_tokens": 4081344.0, "reward": 0.4652777910232544, "reward_std": 0.31936562061309814, "rewards/reasoning_reward/mean": 0.4652777910232544, "rewards/reasoning_reward/std": 0.442268967628479, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 159.5, "completions/mean_terminated_length": 159.5, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.052740434332988625, "grad_norm": 4.351659425272402, "kl": 0.01177978515625, "learning_rate": 9.934177915127515e-07, "loss": 0.0005, "num_tokens": 4169460.0, "reward": 1.125, "reward_std": 0.4082186818122864, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.8501917719841003, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 186.08334350585938, "completions/mean_terminated_length": 186.08334350585938, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.05377456049638056, "grad_norm": 3.5125629537492675, "kl": 0.00982666015625, "learning_rate": 9.931524788934949e-07, "loss": 0.0004, "num_tokens": 4252750.0, "reward": 0.5416666865348816, "reward_std": 0.5090917348861694, "rewards/reasoning_reward/mean": 0.5416666865348816, "rewards/reasoning_reward/std": 0.7210599780082703, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 218.75, "completions/mean_terminated_length": 218.75, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.05480868665977249, "grad_norm": 3.193225562666632, "kl": 0.01190185546875, "learning_rate": 9.928819611915188e-07, "loss": 0.0005, "num_tokens": 4330064.0, "reward": 1.0416667461395264, "reward_std": 0.20693820714950562, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.7058246731758118, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 200.625, "completions/mean_terminated_length": 200.625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.055842812823164424, "grad_norm": 2.9953108717900974, "kl": 0.01116943359375, "learning_rate": 9.9260624126206e-07, "loss": 0.0004, "num_tokens": 4415991.0, "reward": 1.1666667461395264, "reward_std": 0.30860671401023865, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.5646597146987915, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 176.9166717529297, "completions/mean_terminated_length": 176.9166717529297, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.05687693898655636, "grad_norm": 3.5740247241093708, "kl": 0.01312255859375, "learning_rate": 9.923253220152627e-07, "loss": 0.0005, "num_tokens": 4498133.0, "reward": 0.6458333730697632, "reward_std": 0.3310800790786743, "rewards/reasoning_reward/mean": 0.6458333134651184, "rewards/reasoning_reward/std": 0.5610387921333313, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 216.0416717529297, "completions/mean_terminated_length": 216.0416717529297, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.057911065149948295, "grad_norm": 4.117392785596258, "kl": 0.01153564453125, "learning_rate": 9.92039206416149e-07, "loss": 0.0005, "num_tokens": 4576190.0, "reward": 0.4583333432674408, "reward_std": 0.3574431836605072, "rewards/reasoning_reward/mean": 0.4583333432674408, "rewards/reasoning_reward/std": 0.4148510992527008, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 153.83334350585938, "completions/mean_terminated_length": 153.83334350585938, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.05894519131334023, "grad_norm": 4.837645018388818, "kl": 0.00958251953125, "learning_rate": 9.917478974845873e-07, "loss": 0.0004, "num_tokens": 4653658.0, "reward": 0.5416666865348816, "reward_std": 0.46288391947746277, "rewards/reasoning_reward/mean": 0.5416666865348816, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 221.6666717529297, "completions/mean_terminated_length": 221.6666717529297, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.05997931747673216, "grad_norm": 3.7641592955112637, "kl": 0.011962890625, "learning_rate": 9.914513982952592e-07, "loss": 0.0005, "num_tokens": 4733554.0, "reward": 0.5902777910232544, "reward_std": 0.5594554543495178, "rewards/reasoning_reward/mean": 0.5902777910232544, "rewards/reasoning_reward/std": 0.5646151304244995, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 196.375, "completions/mean_terminated_length": 196.375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.06101344364012409, "grad_norm": 3.2142277856073393, "kl": 0.01385498046875, "learning_rate": 9.911497119776286e-07, "loss": 0.0006, "num_tokens": 4819339.0, "reward": 1.1041667461395264, "reward_std": 0.34730279445648193, "rewards/reasoning_reward/mean": 1.1041666269302368, "rewards/reasoning_reward/std": 0.43353530764579773, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 223.70834350585938, "completions/mean_terminated_length": 223.70834350585938, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.06204756980351603, "grad_norm": 3.6323983394895354, "kl": 0.01446533203125, "learning_rate": 9.908428417159078e-07, "loss": 0.0006, "num_tokens": 4902092.0, "reward": 0.6666666865348816, "reward_std": 0.4685417115688324, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.7613869905471802, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 210.08334350585938, "completions/mean_terminated_length": 210.08334350585938, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.06308169596690796, "grad_norm": 4.256974764936185, "kl": 0.01409912109375, "learning_rate": 9.905307907490242e-07, "loss": 0.0006, "num_tokens": 4982646.0, "reward": 0.7986111044883728, "reward_std": 0.7822409868240356, "rewards/reasoning_reward/mean": 0.7986111044883728, "rewards/reasoning_reward/std": 0.800028920173645, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 125.83333587646484, "completions/mean_terminated_length": 125.83333587646484, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.0641158221302999, "grad_norm": 3.6447804056269857, "kl": 0.0086669921875, "learning_rate": 9.902135623705864e-07, "loss": 0.0003, "num_tokens": 5061682.0, "reward": 0.8333333730697632, "reward_std": 0.30860671401023865, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 194.0, "completions/mean_terminated_length": 194.0, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.06514994829369183, "grad_norm": 2.2300859017750354, "kl": 0.01177978515625, "learning_rate": 9.898911599288483e-07, "loss": 0.0005, "num_tokens": 5146090.0, "reward": 1.25, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 1.25, "rewards/reasoning_reward/std": 0.6079187393188477, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 195.25, "completions/mean_terminated_length": 195.25, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.06618407445708377, "grad_norm": 3.6675785111838213, "kl": 0.0224609375, "learning_rate": 9.895635868266754e-07, "loss": 0.0009, "num_tokens": 5224032.0, "reward": 1.0833333730697632, "reward_std": 0.3493061661720276, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.4340573847293854, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 168.0, "completions/mean_terminated_length": 168.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.0672182006204757, "grad_norm": 3.434155813310191, "kl": 0.00909423828125, "learning_rate": 9.892308465215079e-07, "loss": 0.0004, "num_tokens": 5303200.0, "reward": 0.6666666865348816, "reward_std": 0.34503278136253357, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 196.0, "completions/mean_terminated_length": 196.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.06825232678386763, "grad_norm": 2.512958656172137, "kl": 0.01190185546875, "learning_rate": 9.888929425253235e-07, "loss": 0.0005, "num_tokens": 5378760.0, "reward": 0.9166666865348816, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.28232985734939575, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 164.5416717529297, "completions/mean_terminated_length": 164.5416717529297, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.06928645294725956, "grad_norm": 3.4490407712658686, "kl": 0.0157470703125, "learning_rate": 9.885498784046023e-07, "loss": 0.0006, "num_tokens": 5468669.0, "reward": 1.2083333730697632, "reward_std": 0.31285393238067627, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.4402732849121094, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 166.45834350585938, "completions/mean_terminated_length": 166.45834350585938, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.0703205791106515, "grad_norm": 3.4607132042836923, "kl": 0.01129150390625, "learning_rate": 9.882016577802873e-07, "loss": 0.0005, "num_tokens": 5547952.0, "reward": 0.8472222089767456, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 0.8472221493721008, "rewards/reasoning_reward/std": 0.7221757173538208, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 151.7916717529297, "completions/mean_terminated_length": 151.7916717529297, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.07135470527404343, "grad_norm": 2.7650585026718284, "kl": 0.00799560546875, "learning_rate": 9.878482843277468e-07, "loss": 0.0003, "num_tokens": 5627683.0, "reward": 0.75, "reward_std": 0.2903675436973572, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.4423258602619171, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 210.75, "completions/mean_terminated_length": 210.75, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.07238883143743537, "grad_norm": 4.449011636323865, "kl": 0.01300048828125, "learning_rate": 9.874897617767367e-07, "loss": 0.0005, "num_tokens": 5707485.0, "reward": 0.9166666865348816, "reward_std": 0.3535533845424652, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 459.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 268.29168701171875, "completions/mean_terminated_length": 260.0, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.0734229576008273, "grad_norm": 3.5424047636885625, "kl": 0.014892578125, "learning_rate": 9.871260939113595e-07, "loss": 0.0006, "num_tokens": 5787164.0, "reward": 1.2708333730697632, "reward_std": 0.3915778398513794, "rewards/reasoning_reward/mean": 1.2708333730697632, "rewards/reasoning_reward/std": 0.48107290267944336, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 118.54167175292969, "completions/mean_terminated_length": 118.54167175292969, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.07445708376421924, "grad_norm": 3.340246386770097, "kl": 0.00933837890625, "learning_rate": 9.867572845700245e-07, "loss": 0.0004, "num_tokens": 5866153.0, "reward": 0.7083333730697632, "reward_std": 0.3268197476863861, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.4643056094646454, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 169.08334350585938, "completions/mean_terminated_length": 169.08334350585938, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.07549120992761117, "grad_norm": 4.82302140773541, "kl": 0.01165771484375, "learning_rate": 9.863833376454086e-07, "loss": 0.0005, "num_tokens": 5951243.0, "reward": 0.8402777910232544, "reward_std": 0.5060732960700989, "rewards/reasoning_reward/mean": 0.8402777314186096, "rewards/reasoning_reward/std": 0.5349594354629517, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 183.58334350585938, "completions/mean_terminated_length": 183.58334350585938, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.07652533609100311, "grad_norm": 2.6240055600050596, "kl": 0.017578125, "learning_rate": 9.86004257084414e-07, "loss": 0.0007, "num_tokens": 6027409.0, "reward": 0.8333333730697632, "reward_std": 0.24339044094085693, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.5247498154640198, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 150.0416717529297, "completions/mean_terminated_length": 150.0416717529297, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.07755946225439504, "grad_norm": 3.592297508316741, "kl": 0.01373291015625, "learning_rate": 9.856200468881274e-07, "loss": 0.0006, "num_tokens": 6110490.0, "reward": 1.0, "reward_std": 0.30860671401023865, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.7223151326179504, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 154.4166717529297, "completions/mean_terminated_length": 154.4166717529297, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.07859358841778696, "grad_norm": 9.783346222849755, "kl": 0.01300048828125, "learning_rate": 9.85230711111777e-07, "loss": 0.0005, "num_tokens": 6188476.0, "reward": 0.75, "reward_std": 0.5423438549041748, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.6079187393188477, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 180.83334350585938, "completions/mean_terminated_length": 180.83334350585938, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.0796277145811789, "grad_norm": 4.348565558470908, "kl": 0.0186767578125, "learning_rate": 9.848362538646898e-07, "loss": 0.0007, "num_tokens": 6272632.0, "reward": 1.2083333730697632, "reward_std": 0.33723291754722595, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.4871537983417511, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 213.70834350585938, "completions/mean_terminated_length": 213.70834350585938, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.08066184074457083, "grad_norm": 3.2258066953920532, "kl": 0.01544189453125, "learning_rate": 9.844366793102487e-07, "loss": 0.0006, "num_tokens": 6361513.0, "reward": 1.3958333730697632, "reward_std": 0.13607725501060486, "rewards/reasoning_reward/mean": 1.3958333730697632, "rewards/reasoning_reward/std": 0.32900264859199524, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 173.08334350585938, "completions/mean_terminated_length": 173.08334350585938, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.08169596690796277, "grad_norm": 3.863377237898613, "kl": 0.01416015625, "learning_rate": 9.840319916658487e-07, "loss": 0.0006, "num_tokens": 6446875.0, "reward": 0.868055522441864, "reward_std": 0.4124408960342407, "rewards/reasoning_reward/mean": 0.868055522441864, "rewards/reasoning_reward/std": 0.5793948769569397, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 166.0416717529297, "completions/mean_terminated_length": 166.0416717529297, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.0827300930713547, "grad_norm": 3.9380960605825535, "kl": 0.0107421875, "learning_rate": 9.836221952028512e-07, "loss": 0.0004, "num_tokens": 6526732.0, "reward": 0.4166666865348816, "reward_std": 0.46854168176651, "rewards/reasoning_reward/mean": 0.4166666567325592, "rewards/reasoning_reward/std": 0.5036101937294006, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 203.2916717529297, "completions/mean_terminated_length": 203.2916717529297, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.08376421923474664, "grad_norm": 2.968433728304001, "kl": 0.013427734375, "learning_rate": 9.832072942465403e-07, "loss": 0.0005, "num_tokens": 6613659.0, "reward": 1.0, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.8340576887130737, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 216.5, "completions/mean_terminated_length": 216.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.08479834539813857, "grad_norm": 4.153184905003268, "kl": 0.01348876953125, "learning_rate": 9.827872931760762e-07, "loss": 0.0005, "num_tokens": 6692039.0, "reward": 1.0416667461395264, "reward_std": 0.42645785212516785, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.7506036162376404, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 270.625, "completions/mean_terminated_length": 270.625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.08583247156153051, "grad_norm": 3.143774778506864, "kl": 0.01519775390625, "learning_rate": 9.823621964244499e-07, "loss": 0.0006, "num_tokens": 6776582.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.48900964856147766, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 163.75, "completions/mean_terminated_length": 163.75, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.08686659772492245, "grad_norm": 3.829881249796472, "kl": 0.01422119140625, "learning_rate": 9.81932008478435e-07, "loss": 0.0006, "num_tokens": 6853528.0, "reward": 0.9583333730697632, "reward_std": 0.4563409090042114, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.4871538281440735, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 198.33334350585938, "completions/mean_terminated_length": 198.33334350585938, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.08790072388831438, "grad_norm": 4.080367426621312, "kl": 0.01171875, "learning_rate": 9.814967338785423e-07, "loss": 0.0005, "num_tokens": 6934072.0, "reward": 0.8125, "reward_std": 0.5672780275344849, "rewards/reasoning_reward/mean": 0.8125, "rewards/reasoning_reward/std": 0.6280721426010132, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 152.08334350585938, "completions/mean_terminated_length": 152.08334350585938, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.0889348500517063, "grad_norm": 2.3347185584192713, "kl": 0.01202392578125, "learning_rate": 9.810563772189695e-07, "loss": 0.0005, "num_tokens": 7012290.0, "reward": 0.9583333730697632, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.20412415266036987, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 231.75, "completions/mean_terminated_length": 228.3478240966797, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.08996897621509824, "grad_norm": 2.48983908009315, "kl": 0.01373291015625, "learning_rate": 9.806109431475548e-07, "loss": 0.0006, "num_tokens": 7092732.0, "reward": 0.875, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.337831974029541, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 198.5416717529297, "completions/mean_terminated_length": 198.5416717529297, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.09100310237849017, "grad_norm": 3.8019105243494167, "kl": 0.01519775390625, "learning_rate": 9.80160436365727e-07, "loss": 0.0006, "num_tokens": 7170505.0, "reward": 0.4791666865348816, "reward_std": 0.5551774501800537, "rewards/reasoning_reward/mean": 0.4791666567325592, "rewards/reasoning_reward/std": 0.5800893306732178, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 198.6666717529297, "completions/mean_terminated_length": 198.6666717529297, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.09203722854188211, "grad_norm": 4.310499510741145, "kl": 0.0174560546875, "learning_rate": 9.797048616284557e-07, "loss": 0.0007, "num_tokens": 7256361.0, "reward": 0.7986111044883728, "reward_std": 0.5306869745254517, "rewards/reasoning_reward/mean": 0.7986111044883728, "rewards/reasoning_reward/std": 0.6984785795211792, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 169.45834350585938, "completions/mean_terminated_length": 168.86956787109375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.09307135470527404, "grad_norm": 3.256888913160928, "kl": 0.01373291015625, "learning_rate": 9.792442237442013e-07, "loss": 0.0006, "num_tokens": 7333220.0, "reward": 0.75, "reward_std": 0.33247750997543335, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.4423258602619171, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 246.45834350585938, "completions/mean_terminated_length": 246.45834350585938, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.09410548086866598, "grad_norm": 4.176429595136656, "kl": 0.0230712890625, "learning_rate": 9.787785275748643e-07, "loss": 0.0009, "num_tokens": 7413559.0, "reward": 1.2291667461395264, "reward_std": 0.6453183889389038, "rewards/reasoning_reward/mean": 1.2291666269302368, "rewards/reasoning_reward/std": 0.7220014929771423, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 186.70834350585938, "completions/mean_terminated_length": 182.6521759033203, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.09513960703205791, "grad_norm": 4.0765925380672225, "kl": 0.0146484375, "learning_rate": 9.783077780357338e-07, "loss": 0.0006, "num_tokens": 7494080.0, "reward": 0.680555522441864, "reward_std": 0.5991425514221191, "rewards/reasoning_reward/mean": 0.680555522441864, "rewards/reasoning_reward/std": 0.6077531576156616, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 210.75, "completions/mean_terminated_length": 210.75, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.09617373319544985, "grad_norm": 3.060802289470635, "kl": 0.016845703125, "learning_rate": 9.778319800954364e-07, "loss": 0.0007, "num_tokens": 7577778.0, "reward": 0.9236111044883728, "reward_std": 0.4386470317840576, "rewards/reasoning_reward/mean": 0.9236111044883728, "rewards/reasoning_reward/std": 0.533829391002655, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 172.45834350585938, "completions/mean_terminated_length": 172.45834350585938, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.09720785935884178, "grad_norm": 5.03422567888706, "kl": 0.01513671875, "learning_rate": 9.773511387758821e-07, "loss": 0.0006, "num_tokens": 7656581.0, "reward": 0.8541666865348816, "reward_std": 0.4434394836425781, "rewards/reasoning_reward/mean": 0.8541666865348816, "rewards/reasoning_reward/std": 0.878229022026062, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 179.9166717529297, "completions/mean_terminated_length": 179.9166717529297, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.09824198552223372, "grad_norm": 3.3926662846097773, "kl": 0.01263427734375, "learning_rate": 9.768652591522133e-07, "loss": 0.0005, "num_tokens": 7735547.0, "reward": 0.75, "reward_std": 0.33247750997543335, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.4423258602619171, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 217.5416717529297, "completions/mean_terminated_length": 217.5416717529297, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.09927611168562564, "grad_norm": 4.22083012756532, "kl": 0.0191650390625, "learning_rate": 9.763743463527496e-07, "loss": 0.0008, "num_tokens": 7819328.0, "reward": 0.7083333730697632, "reward_std": 0.45032867789268494, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.5500329732894897, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 165.375, "completions/mean_terminated_length": 165.375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.10031023784901758, "grad_norm": 3.7501772543377743, "kl": 0.0177001953125, "learning_rate": 9.758784055589346e-07, "loss": 0.0007, "num_tokens": 7897641.0, "reward": 0.625, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 0.625, "rewards/reasoning_reward/std": 0.494535356760025, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 172.25, "completions/mean_terminated_length": 172.25, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.10134436401240951, "grad_norm": 4.3294563512717215, "kl": 0.0191650390625, "learning_rate": 9.753774420052807e-07, "loss": 0.0008, "num_tokens": 7974623.0, "reward": 0.8958333730697632, "reward_std": 0.4929513931274414, "rewards/reasoning_reward/mean": 0.8958333134651184, "rewards/reasoning_reward/std": 0.5311834216117859, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 194.25, "completions/mean_terminated_length": 194.25, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.10237849017580145, "grad_norm": 4.1235008690948165, "kl": 0.0220947265625, "learning_rate": 9.748714609793147e-07, "loss": 0.0009, "num_tokens": 8054901.0, "reward": 0.4166666865348816, "reward_std": 0.5099153518676758, "rewards/reasoning_reward/mean": 0.4166666567325592, "rewards/reasoning_reward/std": 0.524749755859375, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 205.58334350585938, "completions/mean_terminated_length": 205.58334350585938, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.10341261633919338, "grad_norm": 3.113383684598118, "kl": 0.01806640625, "learning_rate": 9.743604678215205e-07, "loss": 0.0007, "num_tokens": 8136203.0, "reward": 0.7777777910232544, "reward_std": 0.3232055902481079, "rewards/reasoning_reward/mean": 0.7777777314186096, "rewards/reasoning_reward/std": 0.8145219683647156, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 202.125, "completions/mean_terminated_length": 202.125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.10444674250258532, "grad_norm": 3.7183447125419007, "kl": 0.0211181640625, "learning_rate": 9.738444679252843e-07, "loss": 0.0008, "num_tokens": 8215654.0, "reward": 0.4791666865348816, "reward_std": 0.4671442210674286, "rewards/reasoning_reward/mean": 0.4791666567325592, "rewards/reasoning_reward/std": 0.5413181781768799, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 176.625, "completions/mean_terminated_length": 176.625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.10548086866597725, "grad_norm": 3.1873320574218478, "kl": 0.02392578125, "learning_rate": 9.733234667368368e-07, "loss": 0.001, "num_tokens": 8304221.0, "reward": 1.1388888359069824, "reward_std": 0.24966806173324585, "rewards/reasoning_reward/mean": 1.1388888359069824, "rewards/reasoning_reward/std": 0.5353825092315674, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 253.0, "completions/mean_terminated_length": 253.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.10651499482936919, "grad_norm": 3.681506481032909, "kl": 0.016357421875, "learning_rate": 9.727974697551958e-07, "loss": 0.0007, "num_tokens": 8384285.0, "reward": 0.4236111044883728, "reward_std": 0.47574663162231445, "rewards/reasoning_reward/mean": 0.4236110746860504, "rewards/reasoning_reward/std": 0.49874040484428406, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 239.0416717529297, "completions/mean_terminated_length": 239.0416717529297, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.10754912099276112, "grad_norm": 2.7098677939554507, "kl": 0.0281982421875, "learning_rate": 9.722664825321082e-07, "loss": 0.0011, "num_tokens": 8481646.0, "reward": 1.2916667461395264, "reward_std": 0.1498909741640091, "rewards/reasoning_reward/mean": 1.2916666269302368, "rewards/reasoning_reward/std": 0.6004225611686707, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 191.625, "completions/mean_terminated_length": 191.625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.10858324715615306, "grad_norm": 3.7431786732254246, "kl": 0.0157470703125, "learning_rate": 9.717305106719916e-07, "loss": 0.0006, "num_tokens": 8560781.0, "reward": 0.7083333730697632, "reward_std": 0.42645785212516785, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.5500329732894897, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 186.20834350585938, "completions/mean_terminated_length": 186.20834350585938, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.10961737331954498, "grad_norm": 3.620350204008683, "kl": 0.033447265625, "learning_rate": 9.71189559831875e-07, "loss": 0.0013, "num_tokens": 8650034.0, "reward": 1.6875, "reward_std": 0.33768826723098755, "rewards/reasoning_reward/mean": 1.6875, "rewards/reasoning_reward/std": 0.4618605971336365, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 187.70834350585938, "completions/mean_terminated_length": 187.70834350585938, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.11065149948293691, "grad_norm": 2.325484346203483, "kl": 0.0216064453125, "learning_rate": 9.70643635721339e-07, "loss": 0.0009, "num_tokens": 8727995.0, "reward": 1.0625, "reward_std": 0.19795583188533783, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.3398369252681732, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 206.5416717529297, "completions/mean_terminated_length": 206.5416717529297, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.11168562564632885, "grad_norm": 3.929289600039794, "kl": 0.0341796875, "learning_rate": 9.70092744102456e-07, "loss": 0.0014, "num_tokens": 8817472.0, "reward": 1.4305555820465088, "reward_std": 0.32358893752098083, "rewards/reasoning_reward/mean": 1.4305554628372192, "rewards/reasoning_reward/std": 0.5222504734992981, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 221.375, "completions/mean_terminated_length": 221.375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.11271975180972078, "grad_norm": 3.692465779722894, "kl": 0.0284423828125, "learning_rate": 9.695368907897286e-07, "loss": 0.0011, "num_tokens": 8910545.0, "reward": 1.3958333730697632, "reward_std": 0.31444376707077026, "rewards/reasoning_reward/mean": 1.3958333730697632, "rewards/reasoning_reward/std": 0.5243180394172668, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 200.95834350585938, "completions/mean_terminated_length": 200.95834350585938, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.11375387797311272, "grad_norm": 3.73349102770315, "kl": 0.037109375, "learning_rate": 9.689760816500284e-07, "loss": 0.0015, "num_tokens": 9005792.0, "reward": 1.1666667461395264, "reward_std": 0.31443244218826294, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 228.95834350585938, "completions/mean_terminated_length": 228.95834350585938, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.11478800413650465, "grad_norm": 2.8820257320459106, "kl": 0.0201416015625, "learning_rate": 9.684103226025355e-07, "loss": 0.0008, "num_tokens": 9088655.0, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.5565811395645142, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 204.58334350585938, "completions/mean_terminated_length": 204.58334350585938, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.11582213029989659, "grad_norm": 1.9153457674664132, "kl": 0.0242919921875, "learning_rate": 9.678396196186738e-07, "loss": 0.001, "num_tokens": 9172229.0, "reward": 0.875, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.337831974029541, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 228.08334350585938, "completions/mean_terminated_length": 228.08334350585938, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.11685625646328852, "grad_norm": 4.091930192339665, "kl": 0.0234375, "learning_rate": 9.67263978722049e-07, "loss": 0.0009, "num_tokens": 9250847.0, "reward": 0.7361111044883728, "reward_std": 0.4770033359527588, "rewards/reasoning_reward/mean": 0.7361111044883728, "rewards/reasoning_reward/std": 0.581345796585083, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 174.7916717529297, "completions/mean_terminated_length": 174.7916717529297, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.11789038262668046, "grad_norm": 3.8333319654298315, "kl": 0.0223388671875, "learning_rate": 9.666834059883856e-07, "loss": 0.0009, "num_tokens": 9336570.0, "reward": 0.7708333730697632, "reward_std": 0.50337815284729, "rewards/reasoning_reward/mean": 0.7708333134651184, "rewards/reasoning_reward/std": 0.5706435441970825, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 215.0, "completions/mean_terminated_length": 215.0, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.1189245087900724, "grad_norm": 4.4406021802940545, "kl": 0.0303955078125, "learning_rate": 9.66097907545462e-07, "loss": 0.0012, "num_tokens": 9416634.0, "reward": 0.8472222089767456, "reward_std": 0.5322877764701843, "rewards/reasoning_reward/mean": 0.8472221493721008, "rewards/reasoning_reward/std": 0.5644814372062683, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 155.5416717529297, "completions/mean_terminated_length": 155.5416717529297, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.11995863495346432, "grad_norm": 2.161252175635768, "kl": 0.017822265625, "learning_rate": 9.655074895730462e-07, "loss": 0.0007, "num_tokens": 9501367.0, "reward": 1.2916667461395264, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 1.2916666269302368, "rewards/reasoning_reward/std": 0.550032913684845, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 185.33334350585938, "completions/mean_terminated_length": 185.33334350585938, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.12099276111685625, "grad_norm": 4.450081255781044, "kl": 0.0238037109375, "learning_rate": 9.649121583028299e-07, "loss": 0.0009, "num_tokens": 9580399.0, "reward": 0.8125, "reward_std": 0.5099677443504333, "rewards/reasoning_reward/mean": 0.8125, "rewards/reasoning_reward/std": 0.5067479610443115, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 211.25, "completions/mean_terminated_length": 211.25, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.12202688728024819, "grad_norm": 3.213840485912054, "kl": 0.025390625, "learning_rate": 9.643119200183637e-07, "loss": 0.001, "num_tokens": 9669941.0, "reward": 1.1458333730697632, "reward_std": 0.24056154489517212, "rewards/reasoning_reward/mean": 1.1458333730697632, "rewards/reasoning_reward/std": 0.7144345045089722, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.12306101344364012, "grad_norm": 3.122769680653565, "kl": 0.0184326171875, "learning_rate": 9.637067810549906e-07, "loss": 0.0007, "num_tokens": 9749513.0, "reward": 1.0555555820465088, "reward_std": 0.2238859385251999, "rewards/reasoning_reward/mean": 1.0555554628372192, "rewards/reasoning_reward/std": 0.31724458932876587, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 178.7916717529297, "completions/mean_terminated_length": 178.7916717529297, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.12409513960703206, "grad_norm": 4.639227790783391, "kl": 0.02978515625, "learning_rate": 9.63096747799778e-07, "loss": 0.0012, "num_tokens": 9838452.0, "reward": 1.4583333730697632, "reward_std": 0.5078567266464233, "rewards/reasoning_reward/mean": 1.4583333730697632, "rewards/reasoning_reward/std": 0.5882299542427063, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 131.83334350585938, "completions/mean_terminated_length": 131.83334350585938, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.125129265770424, "grad_norm": 2.4639138077233618, "kl": 0.0234375, "learning_rate": 9.624818266914519e-07, "loss": 0.0009, "num_tokens": 9916816.0, "reward": 0.9583333730697632, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.20412415266036987, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 232.9166717529297, "completions/mean_terminated_length": 232.9166717529297, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.12616339193381593, "grad_norm": 3.8580416088238216, "kl": 0.0281982421875, "learning_rate": 9.618620242203278e-07, "loss": 0.0011, "num_tokens": 10002838.0, "reward": 1.1388888359069824, "reward_std": 0.3499924838542938, "rewards/reasoning_reward/mean": 1.1388888359069824, "rewards/reasoning_reward/std": 0.5870310664176941, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 173.875, "completions/mean_terminated_length": 173.875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.12719751809720786, "grad_norm": 2.83715772505631, "kl": 0.016845703125, "learning_rate": 9.612373469282428e-07, "loss": 0.0007, "num_tokens": 10081507.0, "reward": 0.7916666865348816, "reward_std": 0.3535533845424652, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.6580052971839905, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 144.33334350585938, "completions/mean_terminated_length": 144.33334350585938, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.1282316442605998, "grad_norm": 1.8523241506048544, "kl": 0.0299072265625, "learning_rate": 9.606078014084863e-07, "loss": 0.0012, "num_tokens": 10165467.0, "reward": 0.8333333730697632, "reward_std": 0.19920477271080017, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.40824830532073975, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 153.875, "completions/mean_terminated_length": 153.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.12926577042399173, "grad_norm": 3.493963599295082, "kl": 0.03076171875, "learning_rate": 9.5997339430573e-07, "loss": 0.0012, "num_tokens": 10241968.0, "reward": 0.7083333730697632, "reward_std": 0.3506905436515808, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.4643056094646454, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 153.33334350585938, "completions/mean_terminated_length": 153.33334350585938, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.13029989658738367, "grad_norm": 3.815467992926747, "kl": 0.02294921875, "learning_rate": 9.59334132315959e-07, "loss": 0.0009, "num_tokens": 10321768.0, "reward": 0.7916666865348816, "reward_std": 0.4082186818122864, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4148511290550232, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 184.9166717529297, "completions/mean_terminated_length": 184.9166717529297, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.1313340227507756, "grad_norm": 3.1667726238296843, "kl": 0.020751953125, "learning_rate": 9.586900221863996e-07, "loss": 0.0008, "num_tokens": 10400854.0, "reward": 0.7083333730697632, "reward_std": 0.6100153923034668, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.6064269542694092, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 164.0416717529297, "completions/mean_terminated_length": 164.0416717529297, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.13236814891416754, "grad_norm": 3.1674735991804623, "kl": 0.0181884765625, "learning_rate": 9.580410707154494e-07, "loss": 0.0007, "num_tokens": 10481695.0, "reward": 0.5625, "reward_std": 0.28302299976348877, "rewards/reasoning_reward/mean": 0.5625, "rewards/reasoning_reward/std": 0.49590715765953064, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 159.0, "completions/mean_terminated_length": 159.0, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.13340227507755947, "grad_norm": 2.4728156183528083, "kl": 0.02294921875, "learning_rate": 9.573872847526048e-07, "loss": 0.0009, "num_tokens": 10561455.0, "reward": 1.0, "reward_std": 0.30860671401023865, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.5107539296150208, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 178.875, "completions/mean_terminated_length": 178.875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.1344364012409514, "grad_norm": 4.017961732422752, "kl": 0.0291748046875, "learning_rate": 9.567286711983885e-07, "loss": 0.0012, "num_tokens": 10641076.0, "reward": 0.6736111044883728, "reward_std": 0.45231467485427856, "rewards/reasoning_reward/mean": 0.6736111044883728, "rewards/reasoning_reward/std": 0.5948230028152466, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 200.9166717529297, "completions/mean_terminated_length": 198.9130401611328, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.13547052740434332, "grad_norm": 3.138708740315387, "kl": 0.0189208984375, "learning_rate": 9.560652370042771e-07, "loss": 0.0008, "num_tokens": 10719122.0, "reward": 1.0, "reward_std": 0.19500279426574707, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.2553769648075104, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 149.75, "completions/mean_terminated_length": 149.75, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.13650465356773525, "grad_norm": 3.840304727799911, "kl": 0.0203857421875, "learning_rate": 9.553969891726289e-07, "loss": 0.0008, "num_tokens": 10799388.0, "reward": 1.0833333730697632, "reward_std": 0.4198887050151825, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.5646597146987915, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 159.83334350585938, "completions/mean_terminated_length": 159.83334350585938, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.1375387797311272, "grad_norm": 1.7596178724801423, "kl": 0.01904296875, "learning_rate": 9.547239347566068e-07, "loss": 0.0008, "num_tokens": 10880672.0, "reward": 0.9583333730697632, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.20412415266036987, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 188.58334350585938, "completions/mean_terminated_length": 188.58334350585938, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.13857290589451912, "grad_norm": 4.414440886337846, "kl": 0.0269775390625, "learning_rate": 9.540460808601069e-07, "loss": 0.0011, "num_tokens": 10958862.0, "reward": 1.625, "reward_std": 0.19801273941993713, "rewards/reasoning_reward/mean": 1.625, "rewards/reasoning_reward/std": 0.3686048984527588, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 137.5, "completions/mean_terminated_length": 137.5, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.13960703205791106, "grad_norm": 0.18154064062767528, "kl": 0.0322265625, "learning_rate": 9.533634346376827e-07, "loss": 0.0013, "num_tokens": 11035034.0, "reward": 1.0, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 126.16667175292969, "completions/mean_terminated_length": 126.16667175292969, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.140641158221303, "grad_norm": 4.281841546042445, "kl": 0.01324462890625, "learning_rate": 9.526760032944687e-07, "loss": 0.0005, "num_tokens": 11114766.0, "reward": 0.5416666865348816, "reward_std": 0.4082186818122864, "rewards/reasoning_reward/mean": 0.5416666865348816, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 151.875, "completions/mean_terminated_length": 151.875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.14167528438469493, "grad_norm": 3.112436935614297, "kl": 0.0211181640625, "learning_rate": 9.519837940861051e-07, "loss": 0.0008, "num_tokens": 11198851.0, "reward": 0.75, "reward_std": 0.34503278136253357, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.4423258602619171, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 164.375, "completions/mean_terminated_length": 164.375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.14270941054808686, "grad_norm": 2.1893308302361865, "kl": 0.020751953125, "learning_rate": 9.512868143186614e-07, "loss": 0.0008, "num_tokens": 11274508.0, "reward": 0.375, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.375, "rewards/reasoning_reward/std": 0.494535356760025, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 148.4166717529297, "completions/mean_terminated_length": 148.4166717529297, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.1437435367114788, "grad_norm": 1.9943583574276684, "kl": 0.03076171875, "learning_rate": 9.505850713485588e-07, "loss": 0.0012, "num_tokens": 11355774.0, "reward": 0.625, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.625, "rewards/reasoning_reward/std": 0.494535356760025, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 177.58334350585938, "completions/mean_terminated_length": 177.58334350585938, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.14477766287487073, "grad_norm": 3.019726108756687, "kl": 0.0269775390625, "learning_rate": 9.498785725824927e-07, "loss": 0.0011, "num_tokens": 11432412.0, "reward": 0.7916666865348816, "reward_std": 0.3506905436515808, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4402732849121094, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 138.875, "completions/mean_terminated_length": 138.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.14581178903826267, "grad_norm": 3.4892636769097214, "kl": 0.0234375, "learning_rate": 9.491673254773544e-07, "loss": 0.0009, "num_tokens": 11508697.0, "reward": 0.4166666865348816, "reward_std": 0.40397143363952637, "rewards/reasoning_reward/mean": 0.4166666567325592, "rewards/reasoning_reward/std": 0.5646597146987915, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 186.9166717529297, "completions/mean_terminated_length": 186.9166717529297, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.1468459152016546, "grad_norm": 2.4711223309041235, "kl": 0.0299072265625, "learning_rate": 9.484513375401531e-07, "loss": 0.0012, "num_tokens": 11589767.0, "reward": 0.8472222685813904, "reward_std": 0.3876555860042572, "rewards/reasoning_reward/mean": 0.8472222685813904, "rewards/reasoning_reward/std": 0.6444048285484314, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 137.4166717529297, "completions/mean_terminated_length": 137.4166717529297, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.14788004136504654, "grad_norm": 3.9836312904992175, "kl": 0.0179443359375, "learning_rate": 9.477306163279353e-07, "loss": 0.0007, "num_tokens": 11670673.0, "reward": 1.0416667461395264, "reward_std": 0.43810173869132996, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.6064269542694092, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 216.125, "completions/mean_terminated_length": 216.125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.14891416752843847, "grad_norm": 3.084825540437868, "kl": 0.042724609375, "learning_rate": 9.470051694477066e-07, "loss": 0.0017, "num_tokens": 11757628.0, "reward": 0.868055522441864, "reward_std": 0.2675723433494568, "rewards/reasoning_reward/mean": 0.868055522441864, "rewards/reasoning_reward/std": 0.6331791877746582, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 136.83334350585938, "completions/mean_terminated_length": 136.83334350585938, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.1499482936918304, "grad_norm": 3.603155546437747, "kl": 0.0255126953125, "learning_rate": 9.462750045563502e-07, "loss": 0.001, "num_tokens": 11835120.0, "reward": 0.7291666865348816, "reward_std": 0.33108004927635193, "rewards/reasoning_reward/mean": 0.7291666865348816, "rewards/reasoning_reward/std": 0.48854634165763855, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 170.5, "completions/mean_terminated_length": 170.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.15098241985522234, "grad_norm": 3.781720407335475, "kl": 0.060302734375, "learning_rate": 9.45540129360547e-07, "loss": 0.0024, "num_tokens": 11919284.0, "reward": 1.3541667461395264, "reward_std": 0.25392836332321167, "rewards/reasoning_reward/mean": 1.3541666269302368, "rewards/reasoning_reward/std": 0.4293363690376282, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 159.2916717529297, "completions/mean_terminated_length": 159.2916717529297, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.15201654601861428, "grad_norm": 3.8293089763902644, "kl": 0.04296875, "learning_rate": 9.448005516166934e-07, "loss": 0.0017, "num_tokens": 12002627.0, "reward": 0.9583333730697632, "reward_std": 0.45032867789268494, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.6064269542694092, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 245.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 172.5, "completions/mean_terminated_length": 169.3478240966797, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.15305067218200621, "grad_norm": 2.4437113547282188, "kl": 0.0250244140625, "learning_rate": 9.4405627913082e-07, "loss": 0.001, "num_tokens": 12082679.0, "reward": 0.7916666865348816, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4148511290550232, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 141.375, "completions/mean_terminated_length": 141.375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.15408479834539815, "grad_norm": 3.1876345769180663, "kl": 0.036865234375, "learning_rate": 9.433073197585089e-07, "loss": 0.0015, "num_tokens": 12166192.0, "reward": 1.125, "reward_std": 0.3268197476863861, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.5366967916488647, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 186.875, "completions/mean_terminated_length": 185.478271484375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.15511892450879008, "grad_norm": 2.8956791515173244, "kl": 0.0299072265625, "learning_rate": 9.425536814048112e-07, "loss": 0.0012, "num_tokens": 12246877.0, "reward": 1.1875, "reward_std": 0.24052315950393677, "rewards/reasoning_reward/mean": 1.1875, "rewards/reasoning_reward/std": 0.4922405183315277, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 141.5, "completions/mean_terminated_length": 141.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.15615305067218202, "grad_norm": 3.578662637602958, "kl": 0.038330078125, "learning_rate": 9.417953720241633e-07, "loss": 0.0015, "num_tokens": 12331145.0, "reward": 0.875, "reward_std": 0.367926687002182, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.4484272003173828, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 155.83334350585938, "completions/mean_terminated_length": 155.83334350585938, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.15718717683557393, "grad_norm": 2.4928642366111045, "kl": 0.040283203125, "learning_rate": 9.410323996203026e-07, "loss": 0.0016, "num_tokens": 12413805.0, "reward": 0.9166666865348816, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.28232985734939575, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 163.20834350585938, "completions/mean_terminated_length": 163.20834350585938, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.15822130299896586, "grad_norm": 2.0203710118248086, "kl": 0.0213623046875, "learning_rate": 9.402647722461838e-07, "loss": 0.0009, "num_tokens": 12485866.0, "reward": 0.5, "reward_std": 0.17817416787147522, "rewards/reasoning_reward/mean": 0.5, "rewards/reasoning_reward/std": 0.5107539296150208, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 182.1666717529297, "completions/mean_terminated_length": 182.1666717529297, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.1592554291623578, "grad_norm": 2.745126246673903, "kl": 0.02880859375, "learning_rate": 9.394924980038931e-07, "loss": 0.0012, "num_tokens": 12564518.0, "reward": 1.0347223281860352, "reward_std": 0.2400643527507782, "rewards/reasoning_reward/mean": 1.0347222089767456, "rewards/reasoning_reward/std": 0.3184320330619812, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 167.95834350585938, "completions/mean_terminated_length": 167.95834350585938, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.16028955532574973, "grad_norm": 3.2611636461773656, "kl": 0.01806640625, "learning_rate": 9.387155850445634e-07, "loss": 0.0007, "num_tokens": 12646309.0, "reward": 0.7916666865348816, "reward_std": 0.29602527618408203, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4148510992527008, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 162.9166717529297, "completions/mean_terminated_length": 162.9166717529297, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.16132368148914167, "grad_norm": 3.5046780041435315, "kl": 0.049072265625, "learning_rate": 9.379340415682877e-07, "loss": 0.002, "num_tokens": 12728907.0, "reward": 0.8333333730697632, "reward_std": 0.3616904020309448, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.7755316495895386, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 158.0416717529297, "completions/mean_terminated_length": 158.0416717529297, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.1623578076525336, "grad_norm": 3.0934531885922616, "kl": 0.049072265625, "learning_rate": 9.371478758240327e-07, "loss": 0.002, "num_tokens": 12819820.0, "reward": 1.2708333730697632, "reward_std": 0.21322892606258392, "rewards/reasoning_reward/mean": 1.2708333730697632, "rewards/reasoning_reward/std": 0.6075461506843567, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 149.75, "completions/mean_terminated_length": 149.75, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.16339193381592554, "grad_norm": 0.2825124812867749, "kl": 0.03076171875, "learning_rate": 9.363570961095522e-07, "loss": 0.0012, "num_tokens": 12902054.0, "reward": 1.3333333730697632, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.3333333730697632, "rewards/reasoning_reward/std": 0.4815433919429779, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 188.375, "completions/mean_terminated_length": 188.375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.16442605997931747, "grad_norm": 3.4620590744937343, "kl": 0.029296875, "learning_rate": 9.355617107712988e-07, "loss": 0.0012, "num_tokens": 12981463.0, "reward": 0.875, "reward_std": 0.3506905436515808, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.4484272003173828, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 186.20834350585938, "completions/mean_terminated_length": 185.0, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.1654601861427094, "grad_norm": 3.67117533558172, "kl": 0.056396484375, "learning_rate": 9.347617282043361e-07, "loss": 0.0022, "num_tokens": 13061260.0, "reward": 1.0625, "reward_std": 0.294627845287323, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.39870715141296387, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 157.25, "completions/mean_terminated_length": 157.25, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.16649431230610134, "grad_norm": 3.8432953762387436, "kl": 0.033935546875, "learning_rate": 9.339571568522502e-07, "loss": 0.0014, "num_tokens": 13139362.0, "reward": 1.1041667461395264, "reward_std": 0.3766257166862488, "rewards/reasoning_reward/mean": 1.1041666269302368, "rewards/reasoning_reward/std": 0.642332136631012, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 161.625, "completions/mean_terminated_length": 161.625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.16752843846949328, "grad_norm": 2.827018325830448, "kl": 0.0224609375, "learning_rate": 9.331480052070606e-07, "loss": 0.0009, "num_tokens": 13221897.0, "reward": 1.0625, "reward_std": 0.24185511469841003, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.3398369252681732, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 151.375, "completions/mean_terminated_length": 151.375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.16856256463288521, "grad_norm": 3.849615825816937, "kl": 0.03857421875, "learning_rate": 9.323342818091307e-07, "loss": 0.0015, "num_tokens": 13309802.0, "reward": 0.8472222089767456, "reward_std": 0.38511237502098083, "rewards/reasoning_reward/mean": 0.8472221493721008, "rewards/reasoning_reward/std": 0.8074482083320618, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 214.6666717529297, "completions/mean_terminated_length": 214.6666717529297, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.16959669079627715, "grad_norm": 4.0101406070036925, "kl": 0.02783203125, "learning_rate": 9.315159952470765e-07, "loss": 0.0011, "num_tokens": 13391154.0, "reward": 0.7083333730697632, "reward_std": 0.31285393238067627, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.5299029350280762, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 175.1666717529297, "completions/mean_terminated_length": 175.1666717529297, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.17063081695966908, "grad_norm": 3.5535731972038187, "kl": 0.0289306640625, "learning_rate": 9.306931541576783e-07, "loss": 0.0012, "num_tokens": 13476590.0, "reward": 1.2291667461395264, "reward_std": 0.4883233904838562, "rewards/reasoning_reward/mean": 1.2291666269302368, "rewards/reasoning_reward/std": 0.5706435441970825, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 200.0, "completions/mean_terminated_length": 200.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.17166494312306102, "grad_norm": 2.8543563729941903, "kl": 0.041259765625, "learning_rate": 9.29865767225787e-07, "loss": 0.0017, "num_tokens": 13560134.0, "reward": 0.9305555820465088, "reward_std": 0.25030583143234253, "rewards/reasoning_reward/mean": 0.9305555820465088, "rewards/reasoning_reward/std": 0.7659995555877686, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 191.4166717529297, "completions/mean_terminated_length": 191.4166717529297, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.17269906928645296, "grad_norm": 4.219399972626256, "kl": 0.0240478515625, "learning_rate": 9.29033843184234e-07, "loss": 0.001, "num_tokens": 13637928.0, "reward": 0.6041666865348816, "reward_std": 0.5158624053001404, "rewards/reasoning_reward/mean": 0.6041666865348816, "rewards/reasoning_reward/std": 0.551266610622406, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 137.1666717529297, "completions/mean_terminated_length": 137.1666717529297, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.1737331954498449, "grad_norm": 4.058689025510401, "kl": 0.038330078125, "learning_rate": 9.281973908137385e-07, "loss": 0.0015, "num_tokens": 13723524.0, "reward": 0.875, "reward_std": 0.5222300291061401, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.5366967916488647, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 148.0, "completions/mean_terminated_length": 148.0, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.17476732161323683, "grad_norm": 2.82936642039495, "kl": 0.0220947265625, "learning_rate": 9.273564189428149e-07, "loss": 0.0009, "num_tokens": 13809044.0, "reward": 0.625, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.625, "rewards/reasoning_reward/std": 0.494535356760025, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 149.875, "completions/mean_terminated_length": 149.875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.17580144777662876, "grad_norm": 3.7043384803322907, "kl": 0.0245361328125, "learning_rate": 9.265109364476798e-07, "loss": 0.001, "num_tokens": 13892353.0, "reward": 0.75, "reward_std": 0.41387641429901123, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.4423258602619171, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 220.75, "completions/mean_terminated_length": 216.69566345214844, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.1768355739400207, "grad_norm": 3.0294206597876343, "kl": 0.032470703125, "learning_rate": 9.256609522521578e-07, "loss": 0.0013, "num_tokens": 13976627.0, "reward": 0.75, "reward_std": 0.3142080307006836, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.8135328888893127, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 165.625, "completions/mean_terminated_length": 165.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.1778697001034126, "grad_norm": 3.1395182912560826, "kl": 0.03466796875, "learning_rate": 9.248064753275881e-07, "loss": 0.0014, "num_tokens": 14057114.0, "reward": 1.2083333730697632, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.5882299542427063, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 139.2916717529297, "completions/mean_terminated_length": 139.2916717529297, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.17890382626680454, "grad_norm": 2.2716748016429715, "kl": 0.0263671875, "learning_rate": 9.239475146927289e-07, "loss": 0.0011, "num_tokens": 14135657.0, "reward": 0.875, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.337831974029541, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 224.5, "completions/mean_terminated_length": 224.5, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.17993795243019647, "grad_norm": 3.706550911084408, "kl": 0.0322265625, "learning_rate": 9.23084079413663e-07, "loss": 0.0013, "num_tokens": 14214229.0, "reward": 1.0833333730697632, "reward_std": 0.5276275873184204, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.5472813844680786, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 160.45834350585938, "completions/mean_terminated_length": 160.45834350585938, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.1809720785935884, "grad_norm": 3.8563106320766187, "kl": 0.0223388671875, "learning_rate": 9.222161786037017e-07, "loss": 0.0009, "num_tokens": 14292528.0, "reward": 0.7083333730697632, "reward_std": 0.36456555128097534, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.550032913684845, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 177.875, "completions/mean_terminated_length": 177.875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.18200620475698034, "grad_norm": 3.522749129891572, "kl": 0.0235595703125, "learning_rate": 9.213438214232887e-07, "loss": 0.0009, "num_tokens": 14369853.0, "reward": 0.513888955116272, "reward_std": 0.4159068465232849, "rewards/reasoning_reward/mean": 0.5138888955116272, "rewards/reasoning_reward/std": 0.44482171535491943, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 193.25, "completions/mean_terminated_length": 189.18182373046875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.18304033092037228, "grad_norm": 4.470234549353068, "kl": 0.0299072265625, "learning_rate": 9.204670170799034e-07, "loss": 0.0012, "num_tokens": 14446075.0, "reward": 0.7222222089767456, "reward_std": 0.4285862445831299, "rewards/reasoning_reward/mean": 0.7222221493721008, "rewards/reasoning_reward/std": 0.7234289646148682, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 218.83334350585938, "completions/mean_terminated_length": 218.83334350585938, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.18407445708376421, "grad_norm": 3.680761822106045, "kl": 0.0284423828125, "learning_rate": 9.195857748279636e-07, "loss": 0.0011, "num_tokens": 14524703.0, "reward": 0.1666666716337204, "reward_std": 0.3616904020309448, "rewards/reasoning_reward/mean": 0.1666666716337204, "rewards/reasoning_reward/std": 0.3806935250759125, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 144.33334350585938, "completions/mean_terminated_length": 144.33334350585938, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.18510858324715615, "grad_norm": 2.0002566843886043, "kl": 0.0277099609375, "learning_rate": 9.187001039687283e-07, "loss": 0.0011, "num_tokens": 14609903.0, "reward": 1.0833333730697632, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.28232985734939575, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 125.45833587646484, "completions/mean_terminated_length": 125.45833587646484, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.18614270941054809, "grad_norm": 3.220863102101657, "kl": 0.0291748046875, "learning_rate": 9.178100138501987e-07, "loss": 0.0012, "num_tokens": 14691482.0, "reward": 1.0, "reward_std": 0.24339044094085693, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.3611575663089752, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 169.375, "completions/mean_terminated_length": 169.375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.18717683557394002, "grad_norm": 1.9372953316829284, "kl": 0.0308837890625, "learning_rate": 9.169155138670202e-07, "loss": 0.0012, "num_tokens": 14773179.0, "reward": 0.8333333730697632, "reward_std": 0.17817416787147522, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 198.33334350585938, "completions/mean_terminated_length": 198.33334350585938, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.18821096173733196, "grad_norm": 4.409408065166967, "kl": 0.032958984375, "learning_rate": 9.160166134603833e-07, "loss": 0.0013, "num_tokens": 14857035.0, "reward": 1.0625, "reward_std": 0.41124165058135986, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.5379611253738403, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 168.20834350585938, "completions/mean_terminated_length": 168.20834350585938, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.1892450879007239, "grad_norm": 2.7356052191198317, "kl": 0.032470703125, "learning_rate": 9.151133221179236e-07, "loss": 0.0013, "num_tokens": 14934480.0, "reward": 1.2083333730697632, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.4148510992527008, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 230.5416717529297, "completions/mean_terminated_length": 230.5416717529297, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.19027921406411583, "grad_norm": 4.0459885315893445, "kl": 0.0576171875, "learning_rate": 9.142056493736214e-07, "loss": 0.0023, "num_tokens": 15024701.0, "reward": 1.0, "reward_std": 0.5209805369377136, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.7661308646202087, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 284.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 188.33334350585938, "completions/mean_terminated_length": 184.17391967773438, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.19131334022750776, "grad_norm": 3.393759487686452, "kl": 0.042724609375, "learning_rate": 9.13293604807702e-07, "loss": 0.0017, "num_tokens": 15119653.0, "reward": 1.5833333730697632, "reward_std": 0.34930619597435, "rewards/reasoning_reward/mean": 1.5833333730697632, "rewards/reasoning_reward/std": 0.4815433919429779, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 185.83334350585938, "completions/mean_terminated_length": 185.83334350585938, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.1923474663908997, "grad_norm": 1.8836374889884402, "kl": 0.025390625, "learning_rate": 9.123771980465336e-07, "loss": 0.001, "num_tokens": 15200161.0, "reward": 0.875, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.337831974029541, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 211.5, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.19338159255429163, "grad_norm": 3.7167415446965446, "kl": 0.03173828125, "learning_rate": 9.114564387625261e-07, "loss": 0.0013, "num_tokens": 15278277.0, "reward": 0.4652777910232544, "reward_std": 0.6122889518737793, "rewards/reasoning_reward/mean": 0.4652777910232544, "rewards/reasoning_reward/std": 0.6756266355514526, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 201.375, "completions/mean_terminated_length": 201.375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.19441571871768357, "grad_norm": 3.074846245023018, "kl": 0.02587890625, "learning_rate": 9.105313366740295e-07, "loss": 0.001, "num_tokens": 15357622.0, "reward": 1.1527777910232544, "reward_std": 0.2801976501941681, "rewards/reasoning_reward/mean": 1.1527777910232544, "rewards/reasoning_reward/std": 0.35412225127220154, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 161.625, "completions/mean_terminated_length": 161.625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.1954498448810755, "grad_norm": 0.15074496627824105, "kl": 0.033935546875, "learning_rate": 9.096019015452303e-07, "loss": 0.0014, "num_tokens": 15442109.0, "reward": 1.3333333730697632, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.3333333730697632, "rewards/reasoning_reward/std": 0.4815433919429779, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 113.41667175292969, "completions/mean_terminated_length": 113.41667175292969, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.19648397104446744, "grad_norm": 3.1874055691182623, "kl": 0.0322265625, "learning_rate": 9.086681431860492e-07, "loss": 0.0013, "num_tokens": 15525791.0, "reward": 1.0, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.7801894545555115, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 158.5, "completions/mean_terminated_length": 158.5, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.19751809720785937, "grad_norm": 3.24323335305027, "kl": 0.0301513671875, "learning_rate": 9.077300714520377e-07, "loss": 0.0012, "num_tokens": 15609755.0, "reward": 1.1041667461395264, "reward_std": 0.2965203523635864, "rewards/reasoning_reward/mean": 1.1041666269302368, "rewards/reasoning_reward/std": 0.5706435441970825, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 215.375, "completions/mean_terminated_length": 215.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.19855222337125128, "grad_norm": 3.5456355924138805, "kl": 0.0306396484375, "learning_rate": 9.067876962442732e-07, "loss": 0.0012, "num_tokens": 15696732.0, "reward": 1.0, "reward_std": 0.44887280464172363, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.7939992547035217, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 150.5416717529297, "completions/mean_terminated_length": 150.5416717529297, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.19958634953464321, "grad_norm": 2.245179535057254, "kl": 0.02734375, "learning_rate": 9.058410275092553e-07, "loss": 0.0011, "num_tokens": 15777353.0, "reward": 0.5833333730697632, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 0.5833333134651184, "rewards/reasoning_reward/std": 0.5036101341247559, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 180.625, "completions/mean_terminated_length": 180.625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.20062047569803515, "grad_norm": 2.2025459230007964, "kl": 0.033447265625, "learning_rate": 9.048900752388004e-07, "loss": 0.0013, "num_tokens": 15862848.0, "reward": 0.9166666865348816, "reward_std": 0.29546841979026794, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.5036101341247559, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 165.875, "completions/mean_terminated_length": 165.875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.20165460186142709, "grad_norm": 3.077504641125675, "kl": 0.04296875, "learning_rate": 9.039348494699366e-07, "loss": 0.0017, "num_tokens": 15945461.0, "reward": 1.0416667461395264, "reward_std": 0.3535533845424652, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.5089774131774902, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 176.45834350585938, "completions/mean_terminated_length": 176.45834350585938, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.20268872802481902, "grad_norm": 1.72449286161736, "kl": 0.0302734375, "learning_rate": 9.029753602847974e-07, "loss": 0.0012, "num_tokens": 16026360.0, "reward": 1.0208333730697632, "reward_std": 0.0589255653321743, "rewards/reasoning_reward/mean": 1.0208333730697632, "rewards/reasoning_reward/std": 0.10206207633018494, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 199.20834350585938, "completions/mean_terminated_length": 199.20834350585938, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.20372285418821096, "grad_norm": 3.9491196867976286, "kl": 0.055908203125, "learning_rate": 9.020116178105153e-07, "loss": 0.0022, "num_tokens": 16115637.0, "reward": 1.1041667461395264, "reward_std": 0.5151108503341675, "rewards/reasoning_reward/mean": 1.1041666269302368, "rewards/reasoning_reward/std": 0.7515081167221069, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 176.0, "completions/mean_terminated_length": 176.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.2047569803516029, "grad_norm": 2.8175351580004078, "kl": 0.0289306640625, "learning_rate": 9.010436322191155e-07, "loss": 0.0012, "num_tokens": 16199165.0, "reward": 0.8472222089767456, "reward_std": 0.3506905436515808, "rewards/reasoning_reward/mean": 0.8472221493721008, "rewards/reasoning_reward/std": 0.7221757769584656, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 204.58334350585938, "completions/mean_terminated_length": 204.58334350585938, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.20579110651499483, "grad_norm": 4.291506948811305, "kl": 0.039794921875, "learning_rate": 9.000714137274077e-07, "loss": 0.0016, "num_tokens": 16277363.0, "reward": 0.75, "reward_std": 0.3900056481361389, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.6079187393188477, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 200.9166717529297, "completions/mean_terminated_length": 200.9166717529297, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.20682523267838676, "grad_norm": 3.795582364631847, "kl": 0.033447265625, "learning_rate": 8.990949725968786e-07, "loss": 0.0013, "num_tokens": 16356617.0, "reward": 1.1388888359069824, "reward_std": 0.42590853571891785, "rewards/reasoning_reward/mean": 1.1388888359069824, "rewards/reasoning_reward/std": 0.5992480516433716, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 165.7916717529297, "completions/mean_terminated_length": 165.7916717529297, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.2078593588417787, "grad_norm": 4.0373942611533495, "kl": 0.0400390625, "learning_rate": 8.981143191335839e-07, "loss": 0.0016, "num_tokens": 16441756.0, "reward": 0.930555522441864, "reward_std": 0.42342984676361084, "rewards/reasoning_reward/mean": 0.930555522441864, "rewards/reasoning_reward/std": 0.6843958497047424, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 175.75, "completions/mean_terminated_length": 175.75, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.20889348500517063, "grad_norm": 4.339046231792284, "kl": 0.042236328125, "learning_rate": 8.971294636880391e-07, "loss": 0.0017, "num_tokens": 16530366.0, "reward": 1.4375, "reward_std": 0.4006626605987549, "rewards/reasoning_reward/mean": 1.4375, "rewards/reasoning_reward/std": 0.6753286719322205, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 155.375, "completions/mean_terminated_length": 155.375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.20992761116856257, "grad_norm": 2.8570580431999235, "kl": 0.08154296875, "learning_rate": 8.961404166551103e-07, "loss": 0.0033, "num_tokens": 16615319.0, "reward": 1.375, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 1.375, "rewards/reasoning_reward/std": 0.494535356760025, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 150.0, "completions/mean_terminated_length": 150.0, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.2109617373319545, "grad_norm": 2.980300192320368, "kl": 0.02880859375, "learning_rate": 8.951471884739051e-07, "loss": 0.0012, "num_tokens": 16696703.0, "reward": 1.1041667461395264, "reward_std": 0.22466278076171875, "rewards/reasoning_reward/mean": 1.1041666269302368, "rewards/reasoning_reward/std": 0.36052998900413513, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 191.58334350585938, "completions/mean_terminated_length": 191.58334350585938, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.21199586349534644, "grad_norm": 4.159029061904166, "kl": 0.0439453125, "learning_rate": 8.941497896276613e-07, "loss": 0.0018, "num_tokens": 16778829.0, "reward": 0.8194445371627808, "reward_std": 0.6607243418693542, "rewards/reasoning_reward/mean": 0.819444477558136, "rewards/reasoning_reward/std": 0.7892953753471375, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 219.9166717529297, "completions/mean_terminated_length": 219.9166717529297, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.21302998965873837, "grad_norm": 3.6827623626462063, "kl": 0.06298828125, "learning_rate": 8.931482306436373e-07, "loss": 0.0025, "num_tokens": 16863339.0, "reward": 1.3958333730697632, "reward_std": 0.3719491958618164, "rewards/reasoning_reward/mean": 1.3958333730697632, "rewards/reasoning_reward/std": 0.6015529036521912, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 166.625, "completions/mean_terminated_length": 166.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.2140641158221303, "grad_norm": 4.5771027363801, "kl": 0.07666015625, "learning_rate": 8.921425220930001e-07, "loss": 0.0031, "num_tokens": 16942642.0, "reward": 0.7569444179534912, "reward_std": 0.5365828275680542, "rewards/reasoning_reward/mean": 0.7569444179534912, "rewards/reasoning_reward/std": 0.5223950147628784, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 148.1666717529297, "completions/mean_terminated_length": 148.1666717529297, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.21509824198552224, "grad_norm": 4.6067336602799935, "kl": 0.072265625, "learning_rate": 8.91132674590715e-07, "loss": 0.0029, "num_tokens": 17028814.0, "reward": 0.9583333730697632, "reward_std": 0.42645785212516785, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.46430566906929016, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 161.0416717529297, "completions/mean_terminated_length": 161.0416717529297, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.21613236814891418, "grad_norm": 3.888741740906291, "kl": 0.07861328125, "learning_rate": 8.901186987954319e-07, "loss": 0.0031, "num_tokens": 17112839.0, "reward": 1.0833333730697632, "reward_std": 0.34503278136253357, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.5835920572280884, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 190.7916717529297, "completions/mean_terminated_length": 190.7916717529297, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.2171664943123061, "grad_norm": 3.5678811440994815, "kl": 0.041259765625, "learning_rate": 8.891006054093739e-07, "loss": 0.0016, "num_tokens": 17193130.0, "reward": 0.9583333730697632, "reward_std": 0.24800793826580048, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.6902530789375305, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 209.75, "completions/mean_terminated_length": 209.75, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.21820062047569805, "grad_norm": 3.3232953547887, "kl": 0.046142578125, "learning_rate": 8.880784051782243e-07, "loss": 0.0018, "num_tokens": 17271196.0, "reward": 0.9791666865348816, "reward_std": 0.1767766922712326, "rewards/reasoning_reward/mean": 0.9791666865348816, "rewards/reasoning_reward/std": 0.2321528047323227, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 200.7916717529297, "completions/mean_terminated_length": 200.7916717529297, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.21923474663908996, "grad_norm": 3.17766425227838, "kl": 0.033935546875, "learning_rate": 8.870521088910129e-07, "loss": 0.0014, "num_tokens": 17348999.0, "reward": 0.5208333730697632, "reward_std": 0.30699092149734497, "rewards/reasoning_reward/mean": 0.5208333134651184, "rewards/reasoning_reward/std": 0.49954691529273987, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 192.70834350585938, "completions/mean_terminated_length": 192.70834350585938, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.2202688728024819, "grad_norm": 3.8551099627848244, "kl": 0.04296875, "learning_rate": 8.860217273800021e-07, "loss": 0.0017, "num_tokens": 17428608.0, "reward": 0.9791666865348816, "reward_std": 0.5464199781417847, "rewards/reasoning_reward/mean": 0.9791666865348816, "rewards/reasoning_reward/std": 0.5985338091850281, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 154.33334350585938, "completions/mean_terminated_length": 154.33334350585938, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.22130299896587383, "grad_norm": 3.824252202511437, "kl": 0.07568359375, "learning_rate": 8.849872715205725e-07, "loss": 0.003, "num_tokens": 17507808.0, "reward": 1.3333333730697632, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 1.3333333730697632, "rewards/reasoning_reward/std": 0.434057354927063, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 193.70834350585938, "completions/mean_terminated_length": 193.70834350585938, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.22233712512926576, "grad_norm": 2.7022340614121916, "kl": 0.033447265625, "learning_rate": 8.839487522311086e-07, "loss": 0.0013, "num_tokens": 17598361.0, "reward": 1.5, "reward_std": 0.24966806173324585, "rewards/reasoning_reward/mean": 1.5, "rewards/reasoning_reward/std": 0.7071067690849304, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 186.70834350585938, "completions/mean_terminated_length": 186.70834350585938, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.2233712512926577, "grad_norm": 2.845013705079817, "kl": 0.043701171875, "learning_rate": 8.829061804728834e-07, "loss": 0.0017, "num_tokens": 17687378.0, "reward": 1.1666667461395264, "reward_std": 0.39000558853149414, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.4815434515476227, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 142.2916717529297, "completions/mean_terminated_length": 142.2916717529297, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.22440537745604963, "grad_norm": 3.704720807397464, "kl": 0.029296875, "learning_rate": 8.818595672499418e-07, "loss": 0.0012, "num_tokens": 17771825.0, "reward": 0.763888955116272, "reward_std": 0.32886335253715515, "rewards/reasoning_reward/mean": 0.7638888955116272, "rewards/reasoning_reward/std": 0.5152661204338074, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 204.58334350585938, "completions/mean_terminated_length": 204.58334350585938, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.22543950361944157, "grad_norm": 4.235645113018916, "kl": 0.048828125, "learning_rate": 8.808089236089857e-07, "loss": 0.002, "num_tokens": 17849271.0, "reward": 1.2291667461395264, "reward_std": 0.6813797950744629, "rewards/reasoning_reward/mean": 1.2291666269302368, "rewards/reasoning_reward/std": 0.7067864537239075, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 166.6666717529297, "completions/mean_terminated_length": 166.6666717529297, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.2264736297828335, "grad_norm": 3.7267555026025962, "kl": 0.055908203125, "learning_rate": 8.797542606392572e-07, "loss": 0.0022, "num_tokens": 17934255.0, "reward": 0.9791666865348816, "reward_std": 0.4042079448699951, "rewards/reasoning_reward/mean": 0.9791666865348816, "rewards/reasoning_reward/std": 0.6507381200790405, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 165.7916717529297, "completions/mean_terminated_length": 165.7916717529297, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.22750775594622544, "grad_norm": 3.259723649327136, "kl": 0.036865234375, "learning_rate": 8.786955894724206e-07, "loss": 0.0015, "num_tokens": 18011010.0, "reward": 0.5416666865348816, "reward_std": 0.3268197476863861, "rewards/reasoning_reward/mean": 0.5416666865348816, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 167.08334350585938, "completions/mean_terminated_length": 167.08334350585938, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.22854188210961737, "grad_norm": 3.2853923018575095, "kl": 0.040771484375, "learning_rate": 8.776329212824461e-07, "loss": 0.0016, "num_tokens": 18089092.0, "reward": 1.1041667461395264, "reward_std": 0.30551642179489136, "rewards/reasoning_reward/mean": 1.1041666269302368, "rewards/reasoning_reward/std": 0.7067864537239075, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 176.6666717529297, "completions/mean_terminated_length": 176.6666717529297, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.2295760082730093, "grad_norm": 3.3736718186548527, "kl": 0.03173828125, "learning_rate": 8.765662672854908e-07, "loss": 0.0013, "num_tokens": 18173180.0, "reward": 0.7916666865348816, "reward_std": 0.2314550280570984, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.9197904467582703, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 180.625, "completions/mean_terminated_length": 180.625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.23061013443640124, "grad_norm": 4.387150850841933, "kl": 0.041748046875, "learning_rate": 8.754956387397814e-07, "loss": 0.0017, "num_tokens": 18252827.0, "reward": 0.7708333730697632, "reward_std": 0.5432543754577637, "rewards/reasoning_reward/mean": 0.7708333134651184, "rewards/reasoning_reward/std": 0.5311833620071411, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 179.6666717529297, "completions/mean_terminated_length": 179.6666717529297, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.23164426059979318, "grad_norm": 3.7927764070085943, "kl": 0.035888671875, "learning_rate": 8.744210469454945e-07, "loss": 0.0014, "num_tokens": 18334267.0, "reward": 0.8819444179534912, "reward_std": 0.3051118850708008, "rewards/reasoning_reward/mean": 0.8819444179534912, "rewards/reasoning_reward/std": 0.5258513689041138, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 178.08334350585938, "completions/mean_terminated_length": 178.08334350585938, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.2326783867631851, "grad_norm": 2.0858422243920622, "kl": 0.035400390625, "learning_rate": 8.73342503244638e-07, "loss": 0.0014, "num_tokens": 18417093.0, "reward": 1.1666667461395264, "reward_std": 0.17817416787147522, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 150.5416717529297, "completions/mean_terminated_length": 150.5416717529297, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.23371251292657705, "grad_norm": 1.7365069787778553, "kl": 0.0274658203125, "learning_rate": 8.722600190209303e-07, "loss": 0.0011, "num_tokens": 18494866.0, "reward": 0.625, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.625, "rewards/reasoning_reward/std": 0.494535356760025, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 172.58334350585938, "completions/mean_terminated_length": 172.58334350585938, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.23474663908996898, "grad_norm": 4.708061391148886, "kl": 0.034912109375, "learning_rate": 8.711736056996817e-07, "loss": 0.0014, "num_tokens": 18573464.0, "reward": 0.8125, "reward_std": 0.6011933088302612, "rewards/reasoning_reward/mean": 0.8125, "rewards/reasoning_reward/std": 0.7775728702545166, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 181.7916717529297, "completions/mean_terminated_length": 181.7916717529297, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.23578076525336092, "grad_norm": 3.0293873866544394, "kl": 0.037353515625, "learning_rate": 8.700832747476725e-07, "loss": 0.0015, "num_tokens": 18653739.0, "reward": 0.5277777910232544, "reward_std": 0.24982166290283203, "rewards/reasoning_reward/mean": 0.5277777314186096, "rewards/reasoning_reward/std": 0.7350221872329712, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 161.75, "completions/mean_terminated_length": 161.75, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.23681489141675285, "grad_norm": 2.7748190749466977, "kl": 0.03173828125, "learning_rate": 8.689890376730327e-07, "loss": 0.0013, "num_tokens": 18733685.0, "reward": 0.875, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.337831974029541, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 187.25, "completions/mean_terminated_length": 187.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.2378490175801448, "grad_norm": 2.810129730379906, "kl": 0.036865234375, "learning_rate": 8.678909060251201e-07, "loss": 0.0015, "num_tokens": 18816811.0, "reward": 0.9791666865348816, "reward_std": 0.352710485458374, "rewards/reasoning_reward/mean": 0.9791666865348816, "rewards/reasoning_reward/std": 0.5413181781768799, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 140.125, "completions/mean_terminated_length": 140.125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.23888314374353672, "grad_norm": 2.4272615508776347, "kl": 0.038818359375, "learning_rate": 8.667888913943988e-07, "loss": 0.0016, "num_tokens": 18893230.0, "reward": 0.9583333730697632, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.20412415266036987, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 159.4166717529297, "completions/mean_terminated_length": 159.4166717529297, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.23991726990692863, "grad_norm": 4.0221042286356985, "kl": 0.0390625, "learning_rate": 8.656830054123168e-07, "loss": 0.0016, "num_tokens": 18983048.0, "reward": 1.125, "reward_std": 0.4563485085964203, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.8501917719841003, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 206.45834350585938, "completions/mean_terminated_length": 206.45834350585938, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.24095139607032057, "grad_norm": 4.213144645418847, "kl": 0.04052734375, "learning_rate": 8.645732597511825e-07, "loss": 0.0016, "num_tokens": 19060963.0, "reward": 0.875, "reward_std": 0.60628741979599, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.710939347743988, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 173.33334350585938, "completions/mean_terminated_length": 173.33334350585938, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.2419855222337125, "grad_norm": 3.05995072143482, "kl": 0.048583984375, "learning_rate": 8.634596661240428e-07, "loss": 0.0019, "num_tokens": 19137963.0, "reward": 0.6666666865348816, "reward_std": 0.33247750997543335, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.5036101341247559, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 217.0416717529297, "completions/mean_terminated_length": 217.0416717529297, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.24301964839710444, "grad_norm": 3.020436177796217, "kl": 0.0296630859375, "learning_rate": 8.623422362845582e-07, "loss": 0.0012, "num_tokens": 19215988.0, "reward": 0.625, "reward_std": 0.3506905436515808, "rewards/reasoning_reward/mean": 0.625, "rewards/reasoning_reward/std": 0.494535356760025, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 180.125, "completions/mean_terminated_length": 180.125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.24405377456049637, "grad_norm": 2.7937754256291765, "kl": 0.0296630859375, "learning_rate": 8.612209820268798e-07, "loss": 0.0012, "num_tokens": 19299191.0, "reward": 1.0833333730697632, "reward_std": 0.29546841979026794, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.5036101341247559, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 217.875, "completions/mean_terminated_length": 217.875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.2450879007238883, "grad_norm": 3.360745099795255, "kl": 0.052490234375, "learning_rate": 8.600959151855241e-07, "loss": 0.0021, "num_tokens": 19380468.0, "reward": 0.875, "reward_std": 0.2985045611858368, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.5160468220710754, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 167.1666717529297, "completions/mean_terminated_length": 167.1666717529297, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.24612202688728024, "grad_norm": 4.215940666816239, "kl": 0.032470703125, "learning_rate": 8.589670476352484e-07, "loss": 0.0013, "num_tokens": 19465384.0, "reward": 1.0833333730697632, "reward_std": 0.39000558853149414, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.6197241544723511, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 226.20834350585938, "completions/mean_terminated_length": 226.20834350585938, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.24715615305067218, "grad_norm": 1.8917075772209162, "kl": 0.041015625, "learning_rate": 8.578343912909252e-07, "loss": 0.0016, "num_tokens": 19545613.0, "reward": 1.0416667461395264, "reward_std": 0.08266931027173996, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.14947572350502014, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 181.1666717529297, "completions/mean_terminated_length": 181.1666717529297, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.2481902792140641, "grad_norm": 3.3661865153565125, "kl": 0.037353515625, "learning_rate": 8.566979581074168e-07, "loss": 0.0015, "num_tokens": 19631889.0, "reward": 1.1666667461395264, "reward_std": 0.24339044094085693, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.5247498154640198, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 230.625, "completions/mean_terminated_length": 230.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.24922440537745605, "grad_norm": 28.79446249181958, "kl": 0.67578125, "learning_rate": 8.555577600794488e-07, "loss": 0.0271, "num_tokens": 19711856.0, "reward": 1.0069445371627808, "reward_std": 0.6009811162948608, "rewards/reasoning_reward/mean": 1.0069445371627808, "rewards/reasoning_reward/std": 0.7010675072669983, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 175.75, "completions/mean_terminated_length": 175.75, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.250258531540848, "grad_norm": 2.003121090259284, "kl": 0.037841796875, "learning_rate": 8.54413809241484e-07, "loss": 0.0015, "num_tokens": 19791402.0, "reward": 1.0833333730697632, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.28232985734939575, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 143.375, "completions/mean_terminated_length": 143.375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.2512926577042399, "grad_norm": 0.15042965975061226, "kl": 0.032958984375, "learning_rate": 8.53266117667595e-07, "loss": 0.0013, "num_tokens": 19871107.0, "reward": 1.0, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.0, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 157.25, "completions/mean_terminated_length": 157.25, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.25232678386763185, "grad_norm": 0.13370684582648995, "kl": 0.042236328125, "learning_rate": 8.521146974713363e-07, "loss": 0.0017, "num_tokens": 19950505.0, "reward": 0.6666666865348816, "reward_std": 0.0, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 144.375, "completions/mean_terminated_length": 142.6521759033203, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.2533609100310238, "grad_norm": 2.465382515779942, "kl": 0.0281982421875, "learning_rate": 8.50959560805617e-07, "loss": 0.0011, "num_tokens": 20031090.0, "reward": 0.8333333730697632, "reward_std": 0.17817416787147522, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 161.33334350585938, "completions/mean_terminated_length": 161.33334350585938, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.2543950361944157, "grad_norm": 4.076548379222048, "kl": 0.038330078125, "learning_rate": 8.498007198625732e-07, "loss": 0.0015, "num_tokens": 20112338.0, "reward": 1.1458333730697632, "reward_std": 0.22466278076171875, "rewards/reasoning_reward/mean": 1.1458333730697632, "rewards/reasoning_reward/std": 0.40322521328926086, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 163.9166717529297, "completions/mean_terminated_length": 163.9166717529297, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.25542916235780766, "grad_norm": 3.9449748133141647, "kl": 0.051513671875, "learning_rate": 8.486381868734378e-07, "loss": 0.0021, "num_tokens": 20196792.0, "reward": 0.9791666865348816, "reward_std": 0.4564814567565918, "rewards/reasoning_reward/mean": 0.9791666865348816, "rewards/reasoning_reward/std": 0.6833288669586182, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 127.91667175292969, "completions/mean_terminated_length": 127.91667175292969, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.2564632885211996, "grad_norm": 4.846992572972704, "kl": 0.040283203125, "learning_rate": 8.474719741084126e-07, "loss": 0.0016, "num_tokens": 20279862.0, "reward": 1.1666667461395264, "reward_std": 0.3900056481361389, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.5646597146987915, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 180.70834350585938, "completions/mean_terminated_length": 180.70834350585938, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.25749741468459153, "grad_norm": 5.015013743019392, "kl": 0.06787109375, "learning_rate": 8.463020938765384e-07, "loss": 0.0027, "num_tokens": 20363271.0, "reward": 1.125, "reward_std": 0.5078567266464233, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.8501917719841003, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 158.0416717529297, "completions/mean_terminated_length": 158.0416717529297, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.25853154084798347, "grad_norm": 3.3147927169947553, "kl": 0.06982421875, "learning_rate": 8.451285585255646e-07, "loss": 0.0028, "num_tokens": 20442512.0, "reward": 0.9166666865348816, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.28232985734939575, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 133.83334350585938, "completions/mean_terminated_length": 133.83334350585938, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.2595656670113754, "grad_norm": 1.9961818932049205, "kl": 0.0390625, "learning_rate": 8.439513804418196e-07, "loss": 0.0016, "num_tokens": 20523788.0, "reward": 0.7916666865348816, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4148510992527008, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 148.6666717529297, "completions/mean_terminated_length": 148.6666717529297, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.26059979317476734, "grad_norm": 3.886022313764354, "kl": 0.0380859375, "learning_rate": 8.4277057205008e-07, "loss": 0.0015, "num_tokens": 20606012.0, "reward": 1.2083333730697632, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.5882299542427063, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 166.375, "completions/mean_terminated_length": 166.375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.26163391933815927, "grad_norm": 3.611166967490902, "kl": 0.07421875, "learning_rate": 8.415861458134392e-07, "loss": 0.003, "num_tokens": 20694661.0, "reward": 1.625, "reward_std": 0.3933655619621277, "rewards/reasoning_reward/mean": 1.625, "rewards/reasoning_reward/std": 0.5366967916488647, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 188.875, "completions/mean_terminated_length": 188.875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.2626680455015512, "grad_norm": 3.9865927693728582, "kl": 0.076171875, "learning_rate": 8.403981142331758e-07, "loss": 0.003, "num_tokens": 20774098.0, "reward": 0.875, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.337831974029541, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 133.125, "completions/mean_terminated_length": 133.125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.26370217166494314, "grad_norm": 2.6781574727908297, "kl": 0.0556640625, "learning_rate": 8.392064898486215e-07, "loss": 0.0022, "num_tokens": 20853589.0, "reward": 0.8333333730697632, "reward_std": 0.17817416787147522, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 149.70834350585938, "completions/mean_terminated_length": 149.70834350585938, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.2647362978283351, "grad_norm": 5.965736009719672, "kl": 0.07080078125, "learning_rate": 8.380112852370296e-07, "loss": 0.0028, "num_tokens": 20936110.0, "reward": 1.1805555820465088, "reward_std": 0.36751919984817505, "rewards/reasoning_reward/mean": 1.1805554628372192, "rewards/reasoning_reward/std": 0.6117146611213684, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 153.5416717529297, "completions/mean_terminated_length": 153.5416717529297, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.265770423991727, "grad_norm": 2.4421141476780917, "kl": 0.038818359375, "learning_rate": 8.368125130134414e-07, "loss": 0.0015, "num_tokens": 21024379.0, "reward": 0.9791666865348816, "reward_std": 0.18766528367996216, "rewards/reasoning_reward/mean": 0.9791666865348816, "rewards/reasoning_reward/std": 0.31204676628112793, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 160.5416717529297, "completions/mean_terminated_length": 160.5416717529297, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.26680455015511895, "grad_norm": 3.149878852517023, "kl": 0.0556640625, "learning_rate": 8.356101858305528e-07, "loss": 0.0022, "num_tokens": 21110432.0, "reward": 1.1875, "reward_std": 0.23709973692893982, "rewards/reasoning_reward/mean": 1.1875, "rewards/reasoning_reward/std": 0.7042186260223389, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 181.375, "completions/mean_terminated_length": 181.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.2678386763185109, "grad_norm": 4.173006267735779, "kl": 0.07080078125, "learning_rate": 8.344043163785823e-07, "loss": 0.0028, "num_tokens": 21193913.0, "reward": 0.9583333730697632, "reward_std": 0.5727972984313965, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.8329709768295288, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 161.7916717529297, "completions/mean_terminated_length": 161.7916717529297, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.2688728024819028, "grad_norm": 3.930560040382512, "kl": 0.07470703125, "learning_rate": 8.331949173851354e-07, "loss": 0.003, "num_tokens": 21270796.0, "reward": 0.75, "reward_std": 0.33247750997543335, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.4423258602619171, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 137.5, "completions/mean_terminated_length": 137.5, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.26990692864529475, "grad_norm": 4.470134236685902, "kl": 0.06298828125, "learning_rate": 8.319820016150706e-07, "loss": 0.0025, "num_tokens": 21353616.0, "reward": 0.9583333730697632, "reward_std": 0.39814266562461853, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.4402732849121094, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 184.08334350585938, "completions/mean_terminated_length": 184.08334350585938, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.27094105480868663, "grad_norm": 2.50371678079303, "kl": 0.06298828125, "learning_rate": 8.307655818703657e-07, "loss": 0.0025, "num_tokens": 21432634.0, "reward": 1.0833333730697632, "reward_std": 0.1259881556034088, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.24077169597148895, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 188.20834350585938, "completions/mean_terminated_length": 188.20834350585938, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.27197518097207857, "grad_norm": 2.890189899345422, "kl": 0.0284423828125, "learning_rate": 8.295456709899816e-07, "loss": 0.0011, "num_tokens": 21519879.0, "reward": 0.6666666865348816, "reward_std": 0.33247750997543335, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.746974527835846, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 153.33334350585938, "completions/mean_terminated_length": 153.33334350585938, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.2730093071354705, "grad_norm": 4.254336132840291, "kl": 0.0634765625, "learning_rate": 8.283222818497269e-07, "loss": 0.0025, "num_tokens": 21599551.0, "reward": 0.7083333730697632, "reward_std": 0.3506905436515808, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.4643056094646454, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 142.125, "completions/mean_terminated_length": 147.0, "completions/min_length": 30.0, "completions/min_terminated_length": 96.0, "epoch": 0.27404343329886244, "grad_norm": 4.319565951453572, "kl": 0.060302734375, "learning_rate": 8.270954273621228e-07, "loss": 0.0024, "num_tokens": 21682506.0, "reward": 0.75, "reward_std": 0.45846566557884216, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.7071067690849304, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 163.6666717529297, "completions/mean_terminated_length": 163.6666717529297, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.2750775594622544, "grad_norm": 3.4635605581504056, "kl": 0.05859375, "learning_rate": 8.258651204762657e-07, "loss": 0.0023, "num_tokens": 21771346.0, "reward": 1.5416667461395264, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 1.5416666269302368, "rewards/reasoning_reward/std": 0.6580052971839905, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 157.08334350585938, "completions/mean_terminated_length": 157.08334350585938, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.2761116856256463, "grad_norm": 4.0415573834840535, "kl": 0.052490234375, "learning_rate": 8.24631374177691e-07, "loss": 0.0021, "num_tokens": 21850356.0, "reward": 0.8472222089767456, "reward_std": 0.4722983241081238, "rewards/reasoning_reward/mean": 0.8472221493721008, "rewards/reasoning_reward/std": 0.7612547874450684, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 120.41667175292969, "completions/mean_terminated_length": 120.41667175292969, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.27714581178903824, "grad_norm": 3.246047157021264, "kl": 0.058349609375, "learning_rate": 8.233942014882369e-07, "loss": 0.0023, "num_tokens": 21930830.0, "reward": 0.6666666865348816, "reward_std": 0.17817416787147522, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.5646597146987915, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 160.625, "completions/mean_terminated_length": 160.625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.2781799379524302, "grad_norm": 4.562799738250913, "kl": 0.052490234375, "learning_rate": 8.221536154659054e-07, "loss": 0.0021, "num_tokens": 22015157.0, "reward": 1.125, "reward_std": 0.5863928198814392, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.7408866882324219, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 553.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 163.33334350585938, "completions/mean_terminated_length": 146.3913116455078, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.2792140641158221, "grad_norm": 4.181549595465215, "kl": 0.07568359375, "learning_rate": 8.209096292047257e-07, "loss": 0.003, "num_tokens": 22101757.0, "reward": 0.75, "reward_std": 0.4446708858013153, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.675663948059082, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 165.08334350585938, "completions/mean_terminated_length": 165.08334350585938, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.28024819027921405, "grad_norm": 7.933565453714079, "kl": 0.2265625, "learning_rate": 8.196622558346152e-07, "loss": 0.0091, "num_tokens": 22180183.0, "reward": 0.875, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.337831974029541, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 129.70834350585938, "completions/mean_terminated_length": 132.13043212890625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.281282316442606, "grad_norm": 3.1643095072157363, "kl": 0.07421875, "learning_rate": 8.184115085212413e-07, "loss": 0.003, "num_tokens": 22256120.0, "reward": 0.875, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.337831974029541, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 146.33334350585938, "completions/mean_terminated_length": 146.33334350585938, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.2823164426059979, "grad_norm": 3.819168262230962, "kl": 0.062255859375, "learning_rate": 8.171574004658828e-07, "loss": 0.0025, "num_tokens": 22344216.0, "reward": 1.375, "reward_std": 0.31285393238067627, "rewards/reasoning_reward/mean": 1.375, "rewards/reasoning_reward/std": 0.494535356760025, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 161.08334350585938, "completions/mean_terminated_length": 161.08334350585938, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.28335056876938985, "grad_norm": 3.2604091114453984, "kl": 0.058837890625, "learning_rate": 8.158999449052898e-07, "loss": 0.0024, "num_tokens": 22419034.0, "reward": 1.0625, "reward_std": 0.13607725501060486, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.1689159870147705, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 148.45834350585938, "completions/mean_terminated_length": 148.45834350585938, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.2843846949327818, "grad_norm": 5.802068488952265, "kl": 0.1806640625, "learning_rate": 8.146391551115442e-07, "loss": 0.0072, "num_tokens": 22497061.0, "reward": 1.2291667461395264, "reward_std": 0.23144195973873138, "rewards/reasoning_reward/mean": 1.2291666269302368, "rewards/reasoning_reward/std": 0.4164854884147644, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 135.4166717529297, "completions/mean_terminated_length": 135.4166717529297, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.2854188210961737, "grad_norm": 2.5403712266985683, "kl": 0.0615234375, "learning_rate": 8.133750443919205e-07, "loss": 0.0025, "num_tokens": 22575655.0, "reward": 1.375, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 1.375, "rewards/reasoning_reward/std": 0.494535356760025, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 164.0416717529297, "completions/mean_terminated_length": 164.0416717529297, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.28645294725956566, "grad_norm": 2.660324010030384, "kl": 0.056884765625, "learning_rate": 8.121076260887436e-07, "loss": 0.0023, "num_tokens": 22653648.0, "reward": 0.5, "reward_std": 0.2903675436973572, "rewards/reasoning_reward/mean": 0.5, "rewards/reasoning_reward/std": 0.5107539296150208, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 155.70834350585938, "completions/mean_terminated_length": 155.70834350585938, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.2874870734229576, "grad_norm": 3.979951013030403, "kl": 0.052978515625, "learning_rate": 8.108369135792498e-07, "loss": 0.0021, "num_tokens": 22739169.0, "reward": 0.8333333730697632, "reward_std": 0.39324939250946045, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.7755315899848938, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 163.125, "completions/mean_terminated_length": 163.125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.28852119958634953, "grad_norm": 2.664386460601889, "kl": 0.07568359375, "learning_rate": 8.095629202754447e-07, "loss": 0.003, "num_tokens": 22816052.0, "reward": 0.6458333730697632, "reward_std": 0.13908717036247253, "rewards/reasoning_reward/mean": 0.6458333134651184, "rewards/reasoning_reward/std": 0.5208514332771301, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 154.875, "completions/mean_terminated_length": 154.875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.28955532574974147, "grad_norm": 3.1543556872497907, "kl": 0.060791015625, "learning_rate": 8.082856596239613e-07, "loss": 0.0024, "num_tokens": 22893649.0, "reward": 0.5, "reward_std": 0.2903675436973572, "rewards/reasoning_reward/mean": 0.5, "rewards/reasoning_reward/std": 0.5107539296150208, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 147.75, "completions/mean_terminated_length": 147.75, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.2905894519131334, "grad_norm": 3.3080012627332933, "kl": 0.07275390625, "learning_rate": 8.070051451059188e-07, "loss": 0.0029, "num_tokens": 22971419.0, "reward": 0.7708333730697632, "reward_std": 0.1849137246608734, "rewards/reasoning_reward/mean": 0.7708333134651184, "rewards/reasoning_reward/std": 0.5706435441970825, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 143.83334350585938, "completions/mean_terminated_length": 143.83334350585938, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.29162357807652534, "grad_norm": 1.9572479604131818, "kl": 0.0556640625, "learning_rate": 8.057213902367801e-07, "loss": 0.0022, "num_tokens": 23048719.0, "reward": 0.75, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.4423258602619171, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 176.20834350585938, "completions/mean_terminated_length": 176.20834350585938, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.29265770423991727, "grad_norm": 2.3492148432005755, "kl": 0.0556640625, "learning_rate": 8.044344085662092e-07, "loss": 0.0022, "num_tokens": 23127580.0, "reward": 0.125, "reward_std": 0.24800793826580048, "rewards/reasoning_reward/mean": 0.125, "rewards/reasoning_reward/std": 0.4484272003173828, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 178.75, "completions/mean_terminated_length": 178.75, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2936918304033092, "grad_norm": 3.9640676648988027, "kl": 0.076171875, "learning_rate": 8.031442136779271e-07, "loss": 0.0031, "num_tokens": 23210798.0, "reward": 1.0625, "reward_std": 0.3596132695674896, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.7798991799354553, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 184.0416717529297, "completions/mean_terminated_length": 184.0416717529297, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.29472595656670114, "grad_norm": 4.300616429230677, "kl": 0.057861328125, "learning_rate": 8.018508191895712e-07, "loss": 0.0023, "num_tokens": 23294247.0, "reward": 0.9791666865348816, "reward_std": 0.40489405393600464, "rewards/reasoning_reward/mean": 0.9791666865348816, "rewards/reasoning_reward/std": 0.6507381200790405, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 190.375, "completions/mean_terminated_length": 190.375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.2957600827300931, "grad_norm": 2.685078105274284, "kl": 0.06884765625, "learning_rate": 8.005542387525479e-07, "loss": 0.0028, "num_tokens": 23383472.0, "reward": 1.3125, "reward_std": 0.23709973692893982, "rewards/reasoning_reward/mean": 1.3125, "rewards/reasoning_reward/std": 0.38483479619026184, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 161.375, "completions/mean_terminated_length": 161.375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.296794208893485, "grad_norm": 3.908207128618161, "kl": 0.056640625, "learning_rate": 7.992544860518915e-07, "loss": 0.0023, "num_tokens": 23464673.0, "reward": 0.8333333730697632, "reward_std": 0.3900056481361389, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 174.95834350585938, "completions/mean_terminated_length": 174.95834350585938, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.29782833505687695, "grad_norm": 3.2451345960277793, "kl": 0.051513671875, "learning_rate": 7.979515748061181e-07, "loss": 0.0021, "num_tokens": 23549896.0, "reward": 1.0625, "reward_std": 0.3194752335548401, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.47348156571388245, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 174.75, "completions/mean_terminated_length": 174.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.2988624612202689, "grad_norm": 4.7409903359224295, "kl": 0.060791015625, "learning_rate": 7.966455187670819e-07, "loss": 0.0024, "num_tokens": 23627058.0, "reward": 0.4791666865348816, "reward_std": 0.4373263716697693, "rewards/reasoning_reward/mean": 0.4791666567325592, "rewards/reasoning_reward/std": 0.5985338091850281, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 152.9166717529297, "completions/mean_terminated_length": 152.9166717529297, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.2998965873836608, "grad_norm": 1.7959395053181861, "kl": 0.038818359375, "learning_rate": 7.953363317198287e-07, "loss": 0.0016, "num_tokens": 23705888.0, "reward": 1.1597223281860352, "reward_std": 0.12751007080078125, "rewards/reasoning_reward/mean": 1.1597222089767456, "rewards/reasoning_reward/std": 0.3126911520957947, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 231.45834350585938, "completions/mean_terminated_length": 231.45834350585938, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.30093071354705275, "grad_norm": 2.7320267048254983, "kl": 0.046630859375, "learning_rate": 7.940240274824519e-07, "loss": 0.0019, "num_tokens": 23786971.0, "reward": 0.8125, "reward_std": 0.2836732268333435, "rewards/reasoning_reward/mean": 0.8125, "rewards/reasoning_reward/std": 0.44589513540267944, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 148.83334350585938, "completions/mean_terminated_length": 148.83334350585938, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.3019648397104447, "grad_norm": 3.1680818855775654, "kl": 0.0556640625, "learning_rate": 7.927086199059457e-07, "loss": 0.0022, "num_tokens": 23866479.0, "reward": 1.1875, "reward_std": 0.24185511469841003, "rewards/reasoning_reward/mean": 1.1875, "rewards/reasoning_reward/std": 0.4618605971336365, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 156.625, "completions/mean_terminated_length": 156.625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.3029989658738366, "grad_norm": 1.8750379438190574, "kl": 0.045166015625, "learning_rate": 7.913901228740589e-07, "loss": 0.0018, "num_tokens": 23952070.0, "reward": 1.2916667461395264, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 1.2916666269302368, "rewards/reasoning_reward/std": 0.550032913684845, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 138.58334350585938, "completions/mean_terminated_length": 138.58334350585938, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.30403309203722856, "grad_norm": 3.2244709897237893, "kl": 0.0419921875, "learning_rate": 7.90068550303149e-07, "loss": 0.0017, "num_tokens": 24034804.0, "reward": 0.7916666865348816, "reward_std": 0.29602527618408203, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4148511290550232, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 143.20834350585938, "completions/mean_terminated_length": 143.20834350585938, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.3050672182006205, "grad_norm": 3.2322747610220564, "kl": 0.06494140625, "learning_rate": 7.887439161420346e-07, "loss": 0.0026, "num_tokens": 24115961.0, "reward": 0.9583333730697632, "reward_std": 0.21026216447353363, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.6743220090866089, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 102.5, "completions/mean_terminated_length": 102.5, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.30610134436401243, "grad_norm": 4.065413966797424, "kl": 0.050048828125, "learning_rate": 7.874162343718489e-07, "loss": 0.002, "num_tokens": 24197397.0, "reward": 0.8333333730697632, "reward_std": 0.30860671401023865, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 156.58334350585938, "completions/mean_terminated_length": 156.58334350585938, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.30713547052740436, "grad_norm": 35.175482523866044, "kl": 0.20703125, "learning_rate": 7.860855190058913e-07, "loss": 0.0083, "num_tokens": 24281619.0, "reward": 1.0625, "reward_std": 0.4130779206752777, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.6806725859642029, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 143.125, "completions/mean_terminated_length": 143.125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.3081695966907963, "grad_norm": 3.422532515079038, "kl": 0.0654296875, "learning_rate": 7.847517840894803e-07, "loss": 0.0026, "num_tokens": 24361918.0, "reward": 0.8541666865348816, "reward_std": 0.23709973692893982, "rewards/reasoning_reward/mean": 0.8541666865348816, "rewards/reasoning_reward/std": 0.40322521328926086, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 171.08334350585938, "completions/mean_terminated_length": 171.08334350585938, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.30920372285418823, "grad_norm": 4.1270544852144475, "kl": 0.0498046875, "learning_rate": 7.834150436998046e-07, "loss": 0.002, "num_tokens": 24443768.0, "reward": 1.1041667461395264, "reward_std": 0.4082317352294922, "rewards/reasoning_reward/mean": 1.1041666269302368, "rewards/reasoning_reward/std": 0.7937139868736267, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 138.45834350585938, "completions/mean_terminated_length": 138.45834350585938, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.31023784901758017, "grad_norm": 3.611002627039542, "kl": 0.0537109375, "learning_rate": 7.820753119457751e-07, "loss": 0.0021, "num_tokens": 24521315.0, "reward": 0.6458333730697632, "reward_std": 0.1767766922712326, "rewards/reasoning_reward/mean": 0.6458333134651184, "rewards/reasoning_reward/std": 0.47729232907295227, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 149.20834350585938, "completions/mean_terminated_length": 149.20834350585938, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3112719751809721, "grad_norm": 3.152472722978425, "kl": 0.068359375, "learning_rate": 7.807326029678753e-07, "loss": 0.0027, "num_tokens": 24603760.0, "reward": 0.8333333730697632, "reward_std": 0.36585909128189087, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.8164966702461243, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 205.4166717529297, "completions/mean_terminated_length": 205.4166717529297, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.31230610134436404, "grad_norm": 2.786475190664922, "kl": 0.051025390625, "learning_rate": 7.793869309380128e-07, "loss": 0.002, "num_tokens": 24683322.0, "reward": 1.0833333730697632, "reward_std": 0.42052432894706726, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.6538625359535217, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 180.625, "completions/mean_terminated_length": 180.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.3133402275077559, "grad_norm": 4.57864897319015, "kl": 0.09521484375, "learning_rate": 7.780383100593692e-07, "loss": 0.0038, "num_tokens": 24766529.0, "reward": 1.4583333730697632, "reward_std": 0.35799640417099, "rewards/reasoning_reward/mean": 1.4583333730697632, "rewards/reasoning_reward/std": 0.4402732849121094, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 163.95834350585938, "completions/mean_terminated_length": 163.95834350585938, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.31437435367114785, "grad_norm": 3.623601369487776, "kl": 0.053955078125, "learning_rate": 7.766867545662506e-07, "loss": 0.0022, "num_tokens": 24850544.0, "reward": 1.0208333730697632, "reward_std": 0.31726133823394775, "rewards/reasoning_reward/mean": 1.0208333730697632, "rewards/reasoning_reward/std": 0.5800893306732178, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 138.9166717529297, "completions/mean_terminated_length": 138.9166717529297, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.3154084798345398, "grad_norm": 3.8825218639045667, "kl": 0.0732421875, "learning_rate": 7.753322787239365e-07, "loss": 0.0029, "num_tokens": 24931478.0, "reward": 0.9583333730697632, "reward_std": 0.31285393238067627, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.32693126797676086, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 164.4166717529297, "completions/mean_terminated_length": 164.4166717529297, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.3164426059979317, "grad_norm": 2.1908650833519956, "kl": 0.048828125, "learning_rate": 7.739748968285305e-07, "loss": 0.0019, "num_tokens": 25008608.0, "reward": 0.9166666865348816, "reward_std": 0.17817416787147522, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.318511039018631, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 145.4166717529297, "completions/mean_terminated_length": 145.4166717529297, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.31747673216132366, "grad_norm": 4.314449957580246, "kl": 0.07275390625, "learning_rate": 7.726146232068083e-07, "loss": 0.0029, "num_tokens": 25090962.0, "reward": 0.9166666865348816, "reward_std": 0.5232069492340088, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.7172814607620239, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 162.0416717529297, "completions/mean_terminated_length": 162.0416717529297, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.3185108583247156, "grad_norm": 3.710576665295931, "kl": 0.05859375, "learning_rate": 7.712514722160673e-07, "loss": 0.0023, "num_tokens": 25179115.0, "reward": 1.5416667461395264, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 1.5416666269302368, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 160.83334350585938, "completions/mean_terminated_length": 160.83334350585938, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.31954498448810753, "grad_norm": 4.256835002888248, "kl": 0.09521484375, "learning_rate": 7.698854582439744e-07, "loss": 0.0038, "num_tokens": 25267727.0, "reward": 1.0555555820465088, "reward_std": 0.6559478044509888, "rewards/reasoning_reward/mean": 1.0555554628372192, "rewards/reasoning_reward/std": 0.7656052112579346, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 188.5416717529297, "completions/mean_terminated_length": 188.5416717529297, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.32057911065149947, "grad_norm": 1.8170985697472053, "kl": 0.052001953125, "learning_rate": 7.685165957084147e-07, "loss": 0.0021, "num_tokens": 25358292.0, "reward": 0.625, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.625, "rewards/reasoning_reward/std": 0.5366967916488647, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 155.1666717529297, "completions/mean_terminated_length": 155.1666717529297, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.3216132368148914, "grad_norm": 3.812637593183448, "kl": 0.08935546875, "learning_rate": 7.671448990573391e-07, "loss": 0.0036, "num_tokens": 25446560.0, "reward": 1.2708333730697632, "reward_std": 0.5512506365776062, "rewards/reasoning_reward/mean": 1.2708333730697632, "rewards/reasoning_reward/std": 0.6753286719322205, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 170.20834350585938, "completions/mean_terminated_length": 170.20834350585938, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.32264736297828334, "grad_norm": 2.218366125481505, "kl": 0.06689453125, "learning_rate": 7.657703827686115e-07, "loss": 0.0027, "num_tokens": 25523845.0, "reward": 0.6875, "reward_std": 0.25877460837364197, "rewards/reasoning_reward/mean": 0.6875, "rewards/reasoning_reward/std": 0.6562823057174683, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 179.6666717529297, "completions/mean_terminated_length": 179.6666717529297, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.32368148914167527, "grad_norm": 3.4872202871188547, "kl": 0.06298828125, "learning_rate": 7.643930613498561e-07, "loss": 0.0025, "num_tokens": 25606917.0, "reward": 1.1736111640930176, "reward_std": 0.3726871907711029, "rewards/reasoning_reward/mean": 1.173611044883728, "rewards/reasoning_reward/std": 0.464901328086853, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 166.33334350585938, "completions/mean_terminated_length": 166.33334350585938, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.3247156153050672, "grad_norm": 3.037599272229024, "kl": 0.06982421875, "learning_rate": 7.630129493383052e-07, "loss": 0.0028, "num_tokens": 25685437.0, "reward": 1.1458333730697632, "reward_std": 0.23709973692893982, "rewards/reasoning_reward/mean": 1.1458333730697632, "rewards/reasoning_reward/std": 0.6833289265632629, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 146.95834350585938, "completions/mean_terminated_length": 146.95834350585938, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.32574974146845914, "grad_norm": 4.078205524302534, "kl": 0.047119140625, "learning_rate": 7.616300613006442e-07, "loss": 0.0019, "num_tokens": 25765932.0, "reward": 0.5416666865348816, "reward_std": 0.48112308979034424, "rewards/reasoning_reward/mean": 0.5416666865348816, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 133.7916717529297, "completions/mean_terminated_length": 133.7916717529297, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.3267838676318511, "grad_norm": 0.2821083532299412, "kl": 0.06787109375, "learning_rate": 7.602444118328592e-07, "loss": 0.0027, "num_tokens": 25840439.0, "reward": 1.0, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.0, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 134.875, "completions/mean_terminated_length": 134.875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.327817993795243, "grad_norm": 3.5599707379709162, "kl": 0.091796875, "learning_rate": 7.588560155600823e-07, "loss": 0.0037, "num_tokens": 25921308.0, "reward": 1.1458333730697632, "reward_std": 0.43601590394973755, "rewards/reasoning_reward/mean": 1.1458333730697632, "rewards/reasoning_reward/std": 0.7587054967880249, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 153.6666717529297, "completions/mean_terminated_length": 153.6666717529297, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.32885211995863495, "grad_norm": 3.1856291943344077, "kl": 0.07666015625, "learning_rate": 7.574648871364368e-07, "loss": 0.0031, "num_tokens": 26000556.0, "reward": 1.2013888359069824, "reward_std": 0.1516503393650055, "rewards/reasoning_reward/mean": 1.2013888359069824, "rewards/reasoning_reward/std": 0.3068428933620453, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 152.1666717529297, "completions/mean_terminated_length": 152.1666717529297, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.3298862461220269, "grad_norm": 3.283679581462454, "kl": 0.07080078125, "learning_rate": 7.560710412448838e-07, "loss": 0.0028, "num_tokens": 26080200.0, "reward": 0.7708333730697632, "reward_std": 0.32520395517349243, "rewards/reasoning_reward/mean": 0.7708333134651184, "rewards/reasoning_reward/std": 0.416485458612442, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 164.875, "completions/mean_terminated_length": 164.875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3309203722854188, "grad_norm": 4.160648948343019, "kl": 0.054443359375, "learning_rate": 7.546744925970664e-07, "loss": 0.0022, "num_tokens": 26157397.0, "reward": 0.9583333730697632, "reward_std": 0.3205421268939972, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.35864076018333435, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 214.25, "completions/mean_terminated_length": 209.0869598388672, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.33195449844881075, "grad_norm": 3.5547184484445915, "kl": 0.07373046875, "learning_rate": 7.532752559331539e-07, "loss": 0.003, "num_tokens": 26235427.0, "reward": 0.7916666865348816, "reward_std": 0.3268197476863861, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4148511290550232, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 175.25, "completions/mean_terminated_length": 175.25, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.3329886246122027, "grad_norm": 4.066280261564092, "kl": 0.0791015625, "learning_rate": 7.518733460216875e-07, "loss": 0.0032, "num_tokens": 26319289.0, "reward": 1.125, "reward_std": 0.499225378036499, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.6298723220825195, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 158.33334350585938, "completions/mean_terminated_length": 158.33334350585938, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.3340227507755946, "grad_norm": 2.980136617187653, "kl": 0.051025390625, "learning_rate": 7.504687776594234e-07, "loss": 0.002, "num_tokens": 26395665.0, "reward": 0.9375, "reward_std": 0.3255884051322937, "rewards/reasoning_reward/mean": 0.9375, "rewards/reasoning_reward/std": 0.39870715141296387, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 136.83334350585938, "completions/mean_terminated_length": 136.83334350585938, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.33505687693898656, "grad_norm": 3.0212039876952943, "kl": 0.0576171875, "learning_rate": 7.490615656711771e-07, "loss": 0.0023, "num_tokens": 26472917.0, "reward": 1.2083333730697632, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.5882299542427063, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 172.125, "completions/mean_terminated_length": 172.125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.3360910031023785, "grad_norm": 3.015813746260023, "kl": 0.07568359375, "learning_rate": 7.47651724909667e-07, "loss": 0.003, "num_tokens": 26557080.0, "reward": 0.6041666865348816, "reward_std": 0.3116035461425781, "rewards/reasoning_reward/mean": 0.6041666865348816, "rewards/reasoning_reward/std": 0.7220015525817871, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 188.75, "completions/mean_terminated_length": 188.75, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.33712512926577043, "grad_norm": 4.08859100969837, "kl": 0.06494140625, "learning_rate": 7.46239270255357e-07, "loss": 0.0026, "num_tokens": 26641714.0, "reward": 1.0833333730697632, "reward_std": 0.39000558853149414, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 181.6666717529297, "completions/mean_terminated_length": 181.6666717529297, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.33815925542916236, "grad_norm": 4.0866348173332545, "kl": 0.07080078125, "learning_rate": 7.448242166163003e-07, "loss": 0.0028, "num_tokens": 26724962.0, "reward": 1.3333333730697632, "reward_std": 0.32801350951194763, "rewards/reasoning_reward/mean": 1.3333333730697632, "rewards/reasoning_reward/std": 0.48900964856147766, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 152.625, "completions/mean_terminated_length": 152.625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.3391933815925543, "grad_norm": 2.1510475663169353, "kl": 0.057373046875, "learning_rate": 7.434065789279815e-07, "loss": 0.0023, "num_tokens": 26803649.0, "reward": 0.875, "reward_std": 0.14773420989513397, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.7260674238204956, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 159.9166717529297, "completions/mean_terminated_length": 159.9166717529297, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.34022750775594623, "grad_norm": 3.4038270866830955, "kl": 0.07275390625, "learning_rate": 7.41986372153159e-07, "loss": 0.0029, "num_tokens": 26895591.0, "reward": 1.375, "reward_std": 0.273722380399704, "rewards/reasoning_reward/mean": 1.375, "rewards/reasoning_reward/std": 0.4484272003173828, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 157.625, "completions/mean_terminated_length": 157.625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.34126163391933817, "grad_norm": 3.2332472627018083, "kl": 0.060791015625, "learning_rate": 7.405636112817071e-07, "loss": 0.0024, "num_tokens": 26974222.0, "reward": 0.9583333730697632, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.35864076018333435, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 164.45834350585938, "completions/mean_terminated_length": 164.45834350585938, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.3422957600827301, "grad_norm": 3.3744277472463686, "kl": 0.07275390625, "learning_rate": 7.391383113304583e-07, "loss": 0.0029, "num_tokens": 27058377.0, "reward": 1.0208333730697632, "reward_std": 0.3508317470550537, "rewards/reasoning_reward/mean": 1.0208333730697632, "rewards/reasoning_reward/std": 0.6833289265632629, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 134.7916717529297, "completions/mean_terminated_length": 134.7916717529297, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.34332988624612204, "grad_norm": 3.766622375322823, "kl": 0.08056640625, "learning_rate": 7.377104873430438e-07, "loss": 0.0032, "num_tokens": 27138204.0, "reward": 0.75, "reward_std": 0.34503278136253357, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.4423258602619171, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 134.4166717529297, "completions/mean_terminated_length": 134.4166717529297, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.344364012409514, "grad_norm": 3.8370132926239346, "kl": 0.0859375, "learning_rate": 7.362801543897357e-07, "loss": 0.0034, "num_tokens": 27219846.0, "reward": 1.1041667461395264, "reward_std": 0.23144195973873138, "rewards/reasoning_reward/mean": 1.1041666269302368, "rewards/reasoning_reward/std": 0.7220014929771423, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 259.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 144.625, "completions/mean_terminated_length": 139.6521759033203, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.3453981385729059, "grad_norm": 2.1634706010744544, "kl": 0.058349609375, "learning_rate": 7.348473275672873e-07, "loss": 0.0023, "num_tokens": 27300293.0, "reward": 1.0416667461395264, "reward_std": 0.18722420930862427, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.31565436720848083, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 166.625, "completions/mean_terminated_length": 166.625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.34643226473629785, "grad_norm": 3.494854866170849, "kl": 0.0751953125, "learning_rate": 7.334120219987741e-07, "loss": 0.003, "num_tokens": 27379068.0, "reward": 0.6666666865348816, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 162.75, "completions/mean_terminated_length": 162.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.3474663908996898, "grad_norm": 0.35319764403800336, "kl": 0.0810546875, "learning_rate": 7.319742528334339e-07, "loss": 0.0032, "num_tokens": 27455982.0, "reward": 0.6666666865348816, "reward_std": 0.0, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 126.70833587646484, "completions/mean_terminated_length": 126.70833587646484, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.3485005170630817, "grad_norm": 4.842479635487001, "kl": 0.053955078125, "learning_rate": 7.305340352465071e-07, "loss": 0.0022, "num_tokens": 27535311.0, "reward": 0.75, "reward_std": 0.41387641429901123, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.4423258602619171, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 144.6666717529297, "completions/mean_terminated_length": 144.6666717529297, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.34953464322647365, "grad_norm": 3.960259845440186, "kl": 0.0625, "learning_rate": 7.290913844390765e-07, "loss": 0.0025, "num_tokens": 27613047.0, "reward": 0.7916666865348816, "reward_std": 0.29602527618408203, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4148511290550232, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 127.20833587646484, "completions/mean_terminated_length": 127.20833587646484, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.3505687693898656, "grad_norm": 2.53370412691966, "kl": 0.052001953125, "learning_rate": 7.276463156379069e-07, "loss": 0.0021, "num_tokens": 27691236.0, "reward": 0.625, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.625, "rewards/reasoning_reward/std": 0.494535356760025, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 169.0, "completions/mean_terminated_length": 167.56521606445312, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.3516028955532575, "grad_norm": 13.18852870542549, "kl": 0.390625, "learning_rate": 7.261988440952844e-07, "loss": 0.0157, "num_tokens": 27775724.0, "reward": 1.4305557012557983, "reward_std": 0.40538716316223145, "rewards/reasoning_reward/mean": 1.4305557012557983, "rewards/reasoning_reward/std": 0.648142397403717, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 186.875, "completions/mean_terminated_length": 186.875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.35263702171664946, "grad_norm": 3.7251808322917257, "kl": 0.08154296875, "learning_rate": 7.247489850888551e-07, "loss": 0.0033, "num_tokens": 27857497.0, "reward": 1.0833333730697632, "reward_std": 0.3808860182762146, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.40824830532073975, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 146.125, "completions/mean_terminated_length": 146.125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3536711478800414, "grad_norm": 4.446040086211323, "kl": 0.06884765625, "learning_rate": 7.232967539214643e-07, "loss": 0.0028, "num_tokens": 27938476.0, "reward": 1.5416667461395264, "reward_std": 0.2314550280570984, "rewards/reasoning_reward/mean": 1.5416666269302368, "rewards/reasoning_reward/std": 0.4871537983417511, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 122.0, "completions/mean_terminated_length": 122.0, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.35470527404343327, "grad_norm": 4.159860302213038, "kl": 0.04296875, "learning_rate": 7.218421659209948e-07, "loss": 0.0017, "num_tokens": 28018204.0, "reward": 0.6666666865348816, "reward_std": 0.46854168176651, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 135.9166717529297, "completions/mean_terminated_length": 137.04348754882812, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.3557394002068252, "grad_norm": 3.9456854124043956, "kl": 0.087890625, "learning_rate": 7.203852364402048e-07, "loss": 0.0035, "num_tokens": 28103106.0, "reward": 1.2083333730697632, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 159.70834350585938, "completions/mean_terminated_length": 159.70834350585938, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.35677352637021714, "grad_norm": 3.8039878218613024, "kl": 0.07275390625, "learning_rate": 7.189259808565664e-07, "loss": 0.0029, "num_tokens": 28184387.0, "reward": 0.6666666865348816, "reward_std": 0.30860671401023865, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 154.95834350585938, "completions/mean_terminated_length": 154.95834350585938, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.3578076525336091, "grad_norm": 3.8927448717227846, "kl": 0.07373046875, "learning_rate": 7.174644145721031e-07, "loss": 0.003, "num_tokens": 28262346.0, "reward": 0.6666666865348816, "reward_std": 0.39000558853149414, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 174.33334350585938, "completions/mean_terminated_length": 174.33334350585938, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.358841778697001, "grad_norm": 3.9724913747791044, "kl": 0.07373046875, "learning_rate": 7.16000553013227e-07, "loss": 0.003, "num_tokens": 28338434.0, "reward": 0.6041666865348816, "reward_std": 0.4981047809123993, "rewards/reasoning_reward/mean": 0.6041666865348816, "rewards/reasoning_reward/std": 0.5311833620071411, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 154.9166717529297, "completions/mean_terminated_length": 154.9166717529297, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.35987590486039295, "grad_norm": 2.0828938523114524, "kl": 0.07763671875, "learning_rate": 7.145344116305762e-07, "loss": 0.0031, "num_tokens": 28415024.0, "reward": 1.125, "reward_std": 0.10603483021259308, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.25180506706237793, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 159.2916717529297, "completions/mean_terminated_length": 159.2916717529297, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.3609100310237849, "grad_norm": 4.458540222973645, "kl": 0.11328125, "learning_rate": 7.13066005898852e-07, "loss": 0.0045, "num_tokens": 28497783.0, "reward": 1.2083333730697632, "reward_std": 0.6395318508148193, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.6412736177444458, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 161.875, "completions/mean_terminated_length": 161.875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.3619441571871768, "grad_norm": 3.7766746412265, "kl": 0.125, "learning_rate": 7.115953513166549e-07, "loss": 0.005, "num_tokens": 28574772.0, "reward": 0.6666666865348816, "reward_std": 0.30860671401023865, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 114.16667175292969, "completions/mean_terminated_length": 114.16667175292969, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.36297828335056875, "grad_norm": 3.461708376714296, "kl": 0.0732421875, "learning_rate": 7.101224634063212e-07, "loss": 0.0029, "num_tokens": 28659984.0, "reward": 0.993055522441864, "reward_std": 0.18709687888622284, "rewards/reasoning_reward/mean": 0.993055522441864, "rewards/reasoning_reward/std": 0.7557815313339233, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 127.95833587646484, "completions/mean_terminated_length": 127.95833587646484, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.3640124095139607, "grad_norm": 2.183902034581425, "kl": 0.06396484375, "learning_rate": 7.086473577137598e-07, "loss": 0.0026, "num_tokens": 28750087.0, "reward": 1.5, "reward_std": 0.17817416787147522, "rewards/reasoning_reward/mean": 1.5, "rewards/reasoning_reward/std": 0.5107539296150208, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 137.7916717529297, "completions/mean_terminated_length": 137.7916717529297, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.3650465356773526, "grad_norm": 3.440182240952645, "kl": 0.07421875, "learning_rate": 7.071700498082873e-07, "loss": 0.003, "num_tokens": 28835346.0, "reward": 0.7916666865348816, "reward_std": 0.4082186222076416, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 159.70834350585938, "completions/mean_terminated_length": 159.70834350585938, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.36608066184074456, "grad_norm": 3.7647272928819335, "kl": 0.083984375, "learning_rate": 7.056905552824644e-07, "loss": 0.0033, "num_tokens": 28919371.0, "reward": 1.0625, "reward_std": 0.3944129943847656, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.5954993963241577, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 150.4166717529297, "completions/mean_terminated_length": 150.4166717529297, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.3671147880041365, "grad_norm": 3.3768970946788697, "kl": 0.059814453125, "learning_rate": 7.042088897519307e-07, "loss": 0.0024, "num_tokens": 28997701.0, "reward": 0.7916666865348816, "reward_std": 0.3535533845424652, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4871538281440735, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 163.95834350585938, "completions/mean_terminated_length": 163.95834350585938, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.36814891416752843, "grad_norm": 4.085471865989243, "kl": 0.10986328125, "learning_rate": 7.027250688552399e-07, "loss": 0.0044, "num_tokens": 29086372.0, "reward": 1.5416667461395264, "reward_std": 0.49276697635650635, "rewards/reasoning_reward/mean": 1.5416666269302368, "rewards/reasoning_reward/std": 0.4871537983417511, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 160.75, "completions/mean_terminated_length": 160.75, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.36918304033092036, "grad_norm": 3.1239460885673687, "kl": 0.055908203125, "learning_rate": 7.012391082536955e-07, "loss": 0.0022, "num_tokens": 29163254.0, "reward": 1.1388888359069824, "reward_std": 0.2903675436973572, "rewards/reasoning_reward/mean": 1.1388888359069824, "rewards/reasoning_reward/std": 0.6052640676498413, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 181.9166717529297, "completions/mean_terminated_length": 181.9166717529297, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.3702171664943123, "grad_norm": 3.5139169095587626, "kl": 0.091796875, "learning_rate": 6.997510236311846e-07, "loss": 0.0037, "num_tokens": 29245260.0, "reward": 0.7916666865348816, "reward_std": 0.49076026678085327, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.5694518089294434, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 127.41667175292969, "completions/mean_terminated_length": 127.41667175292969, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.37125129265770423, "grad_norm": 3.785316738310359, "kl": 0.06396484375, "learning_rate": 6.982608306940128e-07, "loss": 0.0025, "num_tokens": 29322550.0, "reward": 0.7916666865348816, "reward_std": 0.29602527618408203, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4148510992527008, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 177.58334350585938, "completions/mean_terminated_length": 177.58334350585938, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.37228541882109617, "grad_norm": 2.241778037699298, "kl": 0.09375, "learning_rate": 6.967685451707383e-07, "loss": 0.0037, "num_tokens": 29399980.0, "reward": 0.3333333432674408, "reward_std": 0.2182178944349289, "rewards/reasoning_reward/mean": 0.3333333432674408, "rewards/reasoning_reward/std": 0.601929247379303, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 178.95834350585938, "completions/mean_terminated_length": 178.95834350585938, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.3733195449844881, "grad_norm": 0.25717311965945855, "kl": 0.07666015625, "learning_rate": 6.952741828120062e-07, "loss": 0.0031, "num_tokens": 29484387.0, "reward": 0.8333333730697632, "reward_std": 0.0, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.637022078037262, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 159.875, "completions/mean_terminated_length": 159.875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.37435367114788004, "grad_norm": 4.722848836354921, "kl": 0.09765625, "learning_rate": 6.937777593903817e-07, "loss": 0.0039, "num_tokens": 29568552.0, "reward": 1.1875, "reward_std": 0.0589255653321743, "rewards/reasoning_reward/mean": 1.1875, "rewards/reasoning_reward/std": 0.28788962960243225, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 152.70834350585938, "completions/mean_terminated_length": 152.70834350585938, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.375387797311272, "grad_norm": 3.6743007364748133, "kl": 0.062255859375, "learning_rate": 6.922792907001842e-07, "loss": 0.0025, "num_tokens": 29646721.0, "reward": 0.4166666865348816, "reward_std": 0.33247750997543335, "rewards/reasoning_reward/mean": 0.4166666567325592, "rewards/reasoning_reward/std": 0.5036101937294006, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 181.08334350585938, "completions/mean_terminated_length": 181.08334350585938, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3764219234746639, "grad_norm": 3.384160520560316, "kl": 0.095703125, "learning_rate": 6.9077879255732e-07, "loss": 0.0038, "num_tokens": 29731443.0, "reward": 1.2222222089767456, "reward_std": 0.4460780620574951, "rewards/reasoning_reward/mean": 1.2222222089767456, "rewards/reasoning_reward/std": 0.5787431597709656, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 150.75, "completions/mean_terminated_length": 150.75, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.37745604963805585, "grad_norm": 3.8887377118282904, "kl": 0.087890625, "learning_rate": 6.892762807991159e-07, "loss": 0.0035, "num_tokens": 29814029.0, "reward": 1.0069444179534912, "reward_std": 0.44658660888671875, "rewards/reasoning_reward/mean": 1.0069444179534912, "rewards/reasoning_reward/std": 0.7573778629302979, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 120.66667175292969, "completions/mean_terminated_length": 120.66667175292969, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.3784901758014478, "grad_norm": 2.581428574067341, "kl": 0.08203125, "learning_rate": 6.87771771284152e-07, "loss": 0.0033, "num_tokens": 29897901.0, "reward": 0.9166666865348816, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.40824830532073975, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 165.1666717529297, "completions/mean_terminated_length": 165.1666717529297, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.3795243019648397, "grad_norm": 2.9198024039411745, "kl": 0.0634765625, "learning_rate": 6.862652798920938e-07, "loss": 0.0025, "num_tokens": 29976281.0, "reward": 0.8541666865348816, "reward_std": 0.27053868770599365, "rewards/reasoning_reward/mean": 0.8541666865348816, "rewards/reasoning_reward/std": 0.34512653946876526, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 193.375, "completions/mean_terminated_length": 193.375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.38055842812823165, "grad_norm": 2.9072434267769616, "kl": 0.09326171875, "learning_rate": 6.84756822523525e-07, "loss": 0.0037, "num_tokens": 30054058.0, "reward": 0.8125, "reward_std": 0.3433460593223572, "rewards/reasoning_reward/mean": 0.8125, "rewards/reasoning_reward/std": 0.4848240315914154, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 144.58334350585938, "completions/mean_terminated_length": 144.58334350585938, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.3815925542916236, "grad_norm": 3.5358384891912107, "kl": 0.08984375, "learning_rate": 6.832464150997798e-07, "loss": 0.0036, "num_tokens": 30137656.0, "reward": 1.2986111640930176, "reward_std": 0.20151451230049133, "rewards/reasoning_reward/mean": 1.298611044883728, "rewards/reasoning_reward/std": 0.41985490918159485, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 149.75, "completions/mean_terminated_length": 149.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.3826266804550155, "grad_norm": 0.19189388153538847, "kl": 0.06298828125, "learning_rate": 6.817340735627745e-07, "loss": 0.0025, "num_tokens": 30223114.0, "reward": 0.3333333432674408, "reward_std": 0.0, "rewards/reasoning_reward/mean": 0.3333333432674408, "rewards/reasoning_reward/std": 0.4815433919429779, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 138.375, "completions/mean_terminated_length": 138.375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.38366080661840746, "grad_norm": 2.5621136837530125, "kl": 0.056640625, "learning_rate": 6.802198138748397e-07, "loss": 0.0023, "num_tokens": 30305003.0, "reward": 0.9166666865348816, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.28232985734939575, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 168.875, "completions/mean_terminated_length": 168.875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.3846949327817994, "grad_norm": 3.730162920596462, "kl": 0.095703125, "learning_rate": 6.78703652018551e-07, "loss": 0.0038, "num_tokens": 30387952.0, "reward": 1.2777777910232544, "reward_std": 0.4832340478897095, "rewards/reasoning_reward/mean": 1.2777777910232544, "rewards/reasoning_reward/std": 0.6287527680397034, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 143.70834350585938, "completions/mean_terminated_length": 143.70834350585938, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.3857290589451913, "grad_norm": 2.227976386200522, "kl": 0.055908203125, "learning_rate": 6.771856039965615e-07, "loss": 0.0022, "num_tokens": 30474265.0, "reward": 0.9583333730697632, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.8064504265785217, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 154.625, "completions/mean_terminated_length": 154.625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.38676318510858326, "grad_norm": 3.974774217751018, "kl": 0.12451171875, "learning_rate": 6.756656858314318e-07, "loss": 0.005, "num_tokens": 30559688.0, "reward": 1.4513888359069824, "reward_std": 0.39070504903793335, "rewards/reasoning_reward/mean": 1.4513888359069824, "rewards/reasoning_reward/std": 0.5613973736763, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 175.58334350585938, "completions/mean_terminated_length": 175.58334350585938, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.3877973112719752, "grad_norm": 2.8084365803963354, "kl": 0.08203125, "learning_rate": 6.741439135654612e-07, "loss": 0.0033, "num_tokens": 30639974.0, "reward": 1.1666667461395264, "reward_std": 0.30860671401023865, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.7019641399383545, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 143.45834350585938, "completions/mean_terminated_length": 143.45834350585938, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.38883143743536713, "grad_norm": 3.1747783499460858, "kl": 0.07568359375, "learning_rate": 6.726203032605189e-07, "loss": 0.003, "num_tokens": 30721217.0, "reward": 0.75, "reward_std": 0.2903675436973572, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.4423258602619171, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 154.20834350585938, "completions/mean_terminated_length": 154.20834350585938, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.38986556359875907, "grad_norm": 2.466045662278419, "kl": 0.1279296875, "learning_rate": 6.710948709978741e-07, "loss": 0.0051, "num_tokens": 30804974.0, "reward": 0.8958333730697632, "reward_std": 0.15268757939338684, "rewards/reasoning_reward/mean": 0.8958333134651184, "rewards/reasoning_reward/std": 0.29411497712135315, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 149.58334350585938, "completions/mean_terminated_length": 149.58334350585938, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.390899689762151, "grad_norm": 4.338062118799447, "kl": 0.06640625, "learning_rate": 6.695676328780256e-07, "loss": 0.0027, "num_tokens": 30886508.0, "reward": 0.9861111044883728, "reward_std": 0.46136391162872314, "rewards/reasoning_reward/mean": 0.9861111044883728, "rewards/reasoning_reward/std": 0.6406455039978027, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 158.45834350585938, "completions/mean_terminated_length": 158.45834350585938, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.39193381592554294, "grad_norm": 3.275227479605371, "kl": 0.10693359375, "learning_rate": 6.680386050205332e-07, "loss": 0.0043, "num_tokens": 30969583.0, "reward": 1.3958333730697632, "reward_std": 0.24185511469841003, "rewards/reasoning_reward/mean": 1.3958333730697632, "rewards/reasoning_reward/std": 0.4657664895057678, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 160.6666717529297, "completions/mean_terminated_length": 160.6666717529297, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.3929679420889349, "grad_norm": 2.8767415849776694, "kl": 0.11279296875, "learning_rate": 6.665078035638465e-07, "loss": 0.0045, "num_tokens": 31046431.0, "reward": 0.9861111044883728, "reward_std": 0.2921907305717468, "rewards/reasoning_reward/mean": 0.9861111044883728, "rewards/reasoning_reward/std": 0.3867262005805969, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 143.08334350585938, "completions/mean_terminated_length": 143.08334350585938, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.3940020682523268, "grad_norm": 2.3645686686795626, "kl": 0.09375, "learning_rate": 6.649752446651352e-07, "loss": 0.0037, "num_tokens": 31125817.0, "reward": 0.8333333730697632, "reward_std": 0.17817416787147522, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 157.33334350585938, "completions/mean_terminated_length": 157.33334350585938, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.39503619441571874, "grad_norm": 3.7353394391876265, "kl": 0.056884765625, "learning_rate": 6.634409445001181e-07, "loss": 0.0023, "num_tokens": 31206177.0, "reward": 0.75, "reward_std": 0.2903675436973572, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.4423258602619171, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 173.08334350585938, "completions/mean_terminated_length": 173.08334350585938, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.3960703205791106, "grad_norm": 3.022792894461881, "kl": 0.08740234375, "learning_rate": 6.619049192628924e-07, "loss": 0.0035, "num_tokens": 31287907.0, "reward": 0.375, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 0.375, "rewards/reasoning_reward/std": 0.494535356760025, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 133.70834350585938, "completions/mean_terminated_length": 133.70834350585938, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.39710444674250256, "grad_norm": 2.7266764014944096, "kl": 0.0771484375, "learning_rate": 6.603671851657634e-07, "loss": 0.0031, "num_tokens": 31365276.0, "reward": 1.1041667461395264, "reward_std": 0.12400396913290024, "rewards/reasoning_reward/mean": 1.1041666269302368, "rewards/reasoning_reward/std": 0.25448867678642273, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 115.70833587646484, "completions/mean_terminated_length": 115.70833587646484, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.3981385729058945, "grad_norm": 3.4941613374593965, "kl": 0.09228515625, "learning_rate": 6.588277584390725e-07, "loss": 0.0037, "num_tokens": 31442029.0, "reward": 0.75, "reward_std": 0.2903675436973572, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.4423258602619171, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 135.83334350585938, "completions/mean_terminated_length": 135.83334350585938, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.39917269906928643, "grad_norm": 3.558030153703449, "kl": 0.09619140625, "learning_rate": 6.572866553310265e-07, "loss": 0.0039, "num_tokens": 31525481.0, "reward": 1.0416667461395264, "reward_std": 0.3268197476863861, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.4643056094646454, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 132.0, "completions/mean_terminated_length": 132.0, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.40020682523267836, "grad_norm": 3.432909868088115, "kl": 0.052734375, "learning_rate": 6.557438921075258e-07, "loss": 0.0021, "num_tokens": 31603905.0, "reward": 0.5, "reward_std": 0.2903675436973572, "rewards/reasoning_reward/mean": 0.5, "rewards/reasoning_reward/std": 0.5107539296150208, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 152.625, "completions/mean_terminated_length": 152.625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.4012409513960703, "grad_norm": 3.199320861926171, "kl": 0.08251953125, "learning_rate": 6.541994850519933e-07, "loss": 0.0033, "num_tokens": 31684208.0, "reward": 0.875, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.337831974029541, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 184.45834350585938, "completions/mean_terminated_length": 184.45834350585938, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.40227507755946224, "grad_norm": 2.917820029257383, "kl": 0.08935546875, "learning_rate": 6.526534504652013e-07, "loss": 0.0036, "num_tokens": 31768707.0, "reward": 0.9166666865348816, "reward_std": 0.30860671401023865, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.6370220184326172, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 147.6666717529297, "completions/mean_terminated_length": 147.6666717529297, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.40330920372285417, "grad_norm": 2.3537166439553805, "kl": 0.0615234375, "learning_rate": 6.511058046651011e-07, "loss": 0.0025, "num_tokens": 31847507.0, "reward": 0.7916666865348816, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4148511290550232, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 183.95834350585938, "completions/mean_terminated_length": 183.95834350585938, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.4043433298862461, "grad_norm": 2.9414248666748626, "kl": 0.11572265625, "learning_rate": 6.49556563986649e-07, "loss": 0.0046, "num_tokens": 31932354.0, "reward": 0.9583333730697632, "reward_std": 0.2832478880882263, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.5882299542427063, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 158.33334350585938, "completions/mean_terminated_length": 158.33334350585938, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.40537745604963804, "grad_norm": 4.303254887652743, "kl": 0.07666015625, "learning_rate": 6.480057447816355e-07, "loss": 0.0031, "num_tokens": 32017602.0, "reward": 0.7291666865348816, "reward_std": 0.48977774381637573, "rewards/reasoning_reward/mean": 0.7291666865348816, "rewards/reasoning_reward/std": 0.5311833620071411, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 190.20834350585938, "completions/mean_terminated_length": 190.20834350585938, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.40641158221303, "grad_norm": 3.84726477694061, "kl": 0.099609375, "learning_rate": 6.464533634185117e-07, "loss": 0.004, "num_tokens": 32101527.0, "reward": 0.7847222685813904, "reward_std": 0.3665553033351898, "rewards/reasoning_reward/mean": 0.7847222685813904, "rewards/reasoning_reward/std": 0.5394557118415833, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 138.125, "completions/mean_terminated_length": 138.125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.4074457083764219, "grad_norm": 3.5955314731898214, "kl": 0.087890625, "learning_rate": 6.448994362822167e-07, "loss": 0.0035, "num_tokens": 32181610.0, "reward": 0.7916666865348816, "reward_std": 0.3917974829673767, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 175.25, "completions/mean_terminated_length": 175.25, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.40847983453981385, "grad_norm": 2.047840627858506, "kl": 0.087890625, "learning_rate": 6.433439797740049e-07, "loss": 0.0035, "num_tokens": 32258896.0, "reward": 0.75, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.4423258602619171, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 155.4166717529297, "completions/mean_terminated_length": 155.4166717529297, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.4095139607032058, "grad_norm": 3.1714239189466853, "kl": 0.11279296875, "learning_rate": 6.417870103112731e-07, "loss": 0.0045, "num_tokens": 32341714.0, "reward": 1.5069444179534912, "reward_std": 0.2113002985715866, "rewards/reasoning_reward/mean": 1.5069442987442017, "rewards/reasoning_reward/std": 0.44363224506378174, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 162.5416717529297, "completions/mean_terminated_length": 165.60870361328125, "completions/min_length": 92.0, "completions/min_terminated_length": 117.0, "epoch": 0.4105480868665977, "grad_norm": 3.3579511466794707, "kl": 0.08154296875, "learning_rate": 6.402285443273865e-07, "loss": 0.0033, "num_tokens": 32426735.0, "reward": 1.25, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 1.25, "rewards/reasoning_reward/std": 0.5316095352172852, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 148.25, "completions/mean_terminated_length": 148.25, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.41158221302998965, "grad_norm": 3.718010993661941, "kl": 0.0771484375, "learning_rate": 6.386685982715056e-07, "loss": 0.0031, "num_tokens": 32511453.0, "reward": 1.25, "reward_std": 0.3120119273662567, "rewards/reasoning_reward/mean": 1.25, "rewards/reasoning_reward/std": 0.5897678136825562, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 129.125, "completions/mean_terminated_length": 129.52174377441406, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.4126163391933816, "grad_norm": 2.040886107836913, "kl": 0.0830078125, "learning_rate": 6.371071886084132e-07, "loss": 0.0033, "num_tokens": 32597216.0, "reward": 1.0416667461395264, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.20412413775920868, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 173.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4136504653567735, "grad_norm": 3.8617028180789013, "kl": 0.09326171875, "learning_rate": 6.355443318183394e-07, "loss": 0.0037, "num_tokens": 32682504.0, "reward": 0.7708333730697632, "reward_std": 0.21322892606258392, "rewards/reasoning_reward/mean": 0.7708333134651184, "rewards/reasoning_reward/std": 0.32900264859199524, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 155.20834350585938, "completions/mean_terminated_length": 155.20834350585938, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.41468459152016546, "grad_norm": 3.8146003140109523, "kl": 0.10205078125, "learning_rate": 6.339800443967884e-07, "loss": 0.0041, "num_tokens": 32763605.0, "reward": 0.9375, "reward_std": 0.43075722455978394, "rewards/reasoning_reward/mean": 0.9375, "rewards/reasoning_reward/std": 0.5379611253738403, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 123.0, "completions/mean_terminated_length": 123.0, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.4157187176835574, "grad_norm": 2.025558543566053, "kl": 0.07666015625, "learning_rate": 6.324143428543647e-07, "loss": 0.0031, "num_tokens": 32840709.0, "reward": 1.0625, "reward_std": 0.12400396913290024, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.2242136001586914, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 156.4166717529297, "completions/mean_terminated_length": 156.4166717529297, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.41675284384694933, "grad_norm": 3.726281910392908, "kl": 0.12158203125, "learning_rate": 6.308472437165982e-07, "loss": 0.0049, "num_tokens": 32934543.0, "reward": 1.6805555820465088, "reward_std": 0.28358834981918335, "rewards/reasoning_reward/mean": 1.6805554628372192, "rewards/reasoning_reward/std": 0.3143764138221741, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 146.6666717529297, "completions/mean_terminated_length": 146.6666717529297, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.41778697001034126, "grad_norm": 4.354273396992502, "kl": 0.078125, "learning_rate": 6.292787635237699e-07, "loss": 0.0031, "num_tokens": 33015679.0, "reward": 0.6666666865348816, "reward_std": 0.4993361234664917, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.6370220184326172, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 173.58334350585938, "completions/mean_terminated_length": 173.58334350585938, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4188210961737332, "grad_norm": 3.983196444253434, "kl": 0.11767578125, "learning_rate": 6.277089188307378e-07, "loss": 0.0047, "num_tokens": 33110301.0, "reward": 1.375, "reward_std": 0.483431339263916, "rewards/reasoning_reward/mean": 1.375, "rewards/reasoning_reward/std": 0.6796738505363464, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 138.7916717529297, "completions/mean_terminated_length": 138.7916717529297, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.41985522233712513, "grad_norm": 3.0815905327353788, "kl": 0.07763671875, "learning_rate": 6.261377262067615e-07, "loss": 0.0031, "num_tokens": 33186976.0, "reward": 0.625, "reward_std": 0.2553258538246155, "rewards/reasoning_reward/mean": 0.625, "rewards/reasoning_reward/std": 0.5757792592048645, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 142.625, "completions/mean_terminated_length": 142.625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.42088934850051707, "grad_norm": 3.2668731146325487, "kl": 0.111328125, "learning_rate": 6.245652022353276e-07, "loss": 0.0045, "num_tokens": 33269199.0, "reward": 0.7291666865348816, "reward_std": 0.38959476351737976, "rewards/reasoning_reward/mean": 0.7291666865348816, "rewards/reasoning_reward/std": 0.5311833620071411, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 140.875, "completions/mean_terminated_length": 136.95652770996094, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.421923474663909, "grad_norm": 2.7293326482277154, "kl": 0.138671875, "learning_rate": 6.229913635139748e-07, "loss": 0.0056, "num_tokens": 33357652.0, "reward": 1.3333333730697632, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 1.3333333730697632, "rewards/reasoning_reward/std": 0.9168313145637512, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 131.5, "completions/mean_terminated_length": 131.5, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.42295760082730094, "grad_norm": 3.6612477267420185, "kl": 0.08984375, "learning_rate": 6.214162266541187e-07, "loss": 0.0036, "num_tokens": 33437576.0, "reward": 0.7083333730697632, "reward_std": 0.3907342851161957, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 120.5, "completions/mean_terminated_length": 120.5, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.4239917269906929, "grad_norm": 0.3608860547487124, "kl": 0.06689453125, "learning_rate": 6.198398082808763e-07, "loss": 0.0027, "num_tokens": 33518076.0, "reward": 0.6666666865348816, "reward_std": 0.0, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 159.625, "completions/mean_terminated_length": 159.625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.4250258531540848, "grad_norm": 1.8961111171035863, "kl": 0.06787109375, "learning_rate": 6.182621250328905e-07, "loss": 0.0027, "num_tokens": 33603971.0, "reward": 1.25, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 1.25, "rewards/reasoning_reward/std": 0.6079187393188477, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 121.16667175292969, "completions/mean_terminated_length": 121.16667175292969, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.42605997931747674, "grad_norm": 3.9688045476328773, "kl": 0.08642578125, "learning_rate": 6.166831935621546e-07, "loss": 0.0035, "num_tokens": 33684095.0, "reward": 1.0, "reward_std": 0.34503278136253357, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.5107539296150208, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 166.70834350585938, "completions/mean_terminated_length": 166.70834350585938, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.4270941054808687, "grad_norm": 3.51307920136012, "kl": 0.072265625, "learning_rate": 6.151030305338367e-07, "loss": 0.0029, "num_tokens": 33763608.0, "reward": 0.9166666865348816, "reward_std": 0.355445921421051, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.5450701117515564, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 142.7916717529297, "completions/mean_terminated_length": 142.7916717529297, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4281282316442606, "grad_norm": 2.4788553923664116, "kl": 0.08642578125, "learning_rate": 6.135216526261036e-07, "loss": 0.0035, "num_tokens": 33848787.0, "reward": 0.9791666865348816, "reward_std": 0.0589255653321743, "rewards/reasoning_reward/mean": 0.9791666865348816, "rewards/reasoning_reward/std": 0.8139966726303101, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 150.125, "completions/mean_terminated_length": 150.125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.42916235780765255, "grad_norm": 2.8834248799500073, "kl": 0.10009765625, "learning_rate": 6.119390765299447e-07, "loss": 0.004, "num_tokens": 33931398.0, "reward": 0.5902777910232544, "reward_std": 0.19089612364768982, "rewards/reasoning_reward/mean": 0.5902777910232544, "rewards/reasoning_reward/std": 0.4661984443664551, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 157.1666717529297, "completions/mean_terminated_length": 157.1666717529297, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.4301964839710445, "grad_norm": 3.9268847081933598, "kl": 0.0859375, "learning_rate": 6.103553189489959e-07, "loss": 0.0034, "num_tokens": 34013978.0, "reward": 1.7430557012557983, "reward_std": 0.36321234703063965, "rewards/reasoning_reward/mean": 1.7430557012557983, "rewards/reasoning_reward/std": 0.3961748480796814, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 179.83334350585938, "completions/mean_terminated_length": 179.83334350585938, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.4312306101344364, "grad_norm": 2.896040987170424, "kl": 0.09619140625, "learning_rate": 6.087703965993636e-07, "loss": 0.0038, "num_tokens": 34098558.0, "reward": 1.3333333730697632, "reward_std": 0.26726123690605164, "rewards/reasoning_reward/mean": 1.3333333730697632, "rewards/reasoning_reward/std": 0.458415687084198, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 148.125, "completions/mean_terminated_length": 148.125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.43226473629782836, "grad_norm": 3.0726708113736834, "kl": 0.08642578125, "learning_rate": 6.071843262094476e-07, "loss": 0.0035, "num_tokens": 34176129.0, "reward": 0.7708333730697632, "reward_std": 0.4130779504776001, "rewards/reasoning_reward/mean": 0.7708333134651184, "rewards/reasoning_reward/std": 0.5311833620071411, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 158.2916717529297, "completions/mean_terminated_length": 158.2916717529297, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4332988624612203, "grad_norm": 3.651158491177238, "kl": 0.115234375, "learning_rate": 6.055971245197652e-07, "loss": 0.0046, "num_tokens": 34258992.0, "reward": 1.1319445371627808, "reward_std": 0.40136003494262695, "rewards/reasoning_reward/mean": 1.1319445371627808, "rewards/reasoning_reward/std": 0.6407633423805237, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 165.75, "completions/mean_terminated_length": 165.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.4343329886246122, "grad_norm": 3.011267910041691, "kl": 0.1337890625, "learning_rate": 6.040088082827744e-07, "loss": 0.0053, "num_tokens": 34342034.0, "reward": 0.9791666865348816, "reward_std": 0.43203312158584595, "rewards/reasoning_reward/mean": 0.9791666865348816, "rewards/reasoning_reward/std": 0.9609683156013489, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 142.125, "completions/mean_terminated_length": 142.125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.43536711478800416, "grad_norm": 3.934648597506697, "kl": 0.048828125, "learning_rate": 6.024193942626961e-07, "loss": 0.002, "num_tokens": 34423189.0, "reward": 0.4583333432674408, "reward_std": 0.4082186818122864, "rewards/reasoning_reward/mean": 0.4583333432674408, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 146.0, "completions/mean_terminated_length": 146.0, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.4364012409513961, "grad_norm": 2.895700821484859, "kl": 0.103515625, "learning_rate": 6.008288992353396e-07, "loss": 0.0041, "num_tokens": 34500773.0, "reward": 0.625, "reward_std": 0.19416078925132751, "rewards/reasoning_reward/mean": 0.625, "rewards/reasoning_reward/std": 0.5565811395645142, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 154.5416717529297, "completions/mean_terminated_length": 154.5416717529297, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.43743536711478803, "grad_norm": 3.7713820783469303, "kl": 0.07177734375, "learning_rate": 5.99237339987922e-07, "loss": 0.0029, "num_tokens": 34577722.0, "reward": 0.5625, "reward_std": 0.5429885983467102, "rewards/reasoning_reward/mean": 0.5625, "rewards/reasoning_reward/std": 0.5379611253738403, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 148.70834350585938, "completions/mean_terminated_length": 148.70834350585938, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.4384694932781799, "grad_norm": 3.373544653686179, "kl": 0.050537109375, "learning_rate": 5.976447333188944e-07, "loss": 0.002, "num_tokens": 34652923.0, "reward": 0.5, "reward_std": 0.30860671401023865, "rewards/reasoning_reward/mean": 0.5, "rewards/reasoning_reward/std": 0.5107539296150208, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 147.125, "completions/mean_terminated_length": 147.125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.43950361944157185, "grad_norm": 4.7102217326975415, "kl": 0.09033203125, "learning_rate": 5.960510960377626e-07, "loss": 0.0036, "num_tokens": 34736582.0, "reward": 0.9791666865348816, "reward_std": 0.354950875043869, "rewards/reasoning_reward/mean": 0.9791666865348816, "rewards/reasoning_reward/std": 0.3753018081188202, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 153.95834350585938, "completions/mean_terminated_length": 153.95834350585938, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.4405377456049638, "grad_norm": 4.16978047474189, "kl": 0.10693359375, "learning_rate": 5.944564449649099e-07, "loss": 0.0043, "num_tokens": 34813797.0, "reward": 0.9583333730697632, "reward_std": 0.5391935706138611, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.6743220090866089, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 128.875, "completions/mean_terminated_length": 128.875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.4415718717683557, "grad_norm": 4.140299837667284, "kl": 0.10791015625, "learning_rate": 5.928607969314201e-07, "loss": 0.0043, "num_tokens": 34894010.0, "reward": 0.75, "reward_std": 0.47364258766174316, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.6079187393188477, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 124.45833587646484, "completions/mean_terminated_length": 124.45833587646484, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.44260599793174765, "grad_norm": 3.8213113040181987, "kl": 0.064453125, "learning_rate": 5.912641687789002e-07, "loss": 0.0026, "num_tokens": 34968525.0, "reward": 0.625, "reward_std": 0.3506905436515808, "rewards/reasoning_reward/mean": 0.625, "rewards/reasoning_reward/std": 0.494535356760025, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 136.4166717529297, "completions/mean_terminated_length": 136.4166717529297, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.4436401240951396, "grad_norm": 2.733283075630004, "kl": 0.058837890625, "learning_rate": 5.896665773593012e-07, "loss": 0.0023, "num_tokens": 35046975.0, "reward": 1.0625, "reward_std": 0.08625819534063339, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.1689159870147705, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 151.58334350585938, "completions/mean_terminated_length": 151.58334350585938, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.4446742502585315, "grad_norm": 2.848539827633773, "kl": 0.07177734375, "learning_rate": 5.880680395347418e-07, "loss": 0.0029, "num_tokens": 35132493.0, "reward": 1.2638888359069824, "reward_std": 0.26332971453666687, "rewards/reasoning_reward/mean": 1.2638888359069824, "rewards/reasoning_reward/std": 0.8899827003479004, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 133.0, "completions/mean_terminated_length": 133.0, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.44570837642192346, "grad_norm": 2.221455031748688, "kl": 0.057861328125, "learning_rate": 5.864685721773293e-07, "loss": 0.0023, "num_tokens": 35212989.0, "reward": 1.2916667461395264, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 1.2916666269302368, "rewards/reasoning_reward/std": 0.550032913684845, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 157.0, "completions/mean_terminated_length": 157.0, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.4467425025853154, "grad_norm": 3.710718263121241, "kl": 0.07958984375, "learning_rate": 5.848681921689819e-07, "loss": 0.0032, "num_tokens": 35298725.0, "reward": 1.1319445371627808, "reward_std": 0.3816637396812439, "rewards/reasoning_reward/mean": 1.1319444179534912, "rewards/reasoning_reward/std": 0.6255030035972595, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 169.08334350585938, "completions/mean_terminated_length": 169.08334350585938, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.44777662874870733, "grad_norm": 3.6503587546386016, "kl": 0.0810546875, "learning_rate": 5.832669164012513e-07, "loss": 0.0032, "num_tokens": 35381551.0, "reward": 0.6666666865348816, "reward_std": 0.4745539426803589, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.5036101341247559, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 173.0416717529297, "completions/mean_terminated_length": 173.0416717529297, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.44881075491209926, "grad_norm": 3.4278772902345302, "kl": 0.07421875, "learning_rate": 5.816647617751424e-07, "loss": 0.003, "num_tokens": 35463120.0, "reward": 1.1875, "reward_std": 0.3177132308483124, "rewards/reasoning_reward/mean": 1.1875, "rewards/reasoning_reward/std": 0.5067479610443115, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 141.0, "completions/mean_terminated_length": 141.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.4498448810754912, "grad_norm": 2.8560665845210957, "kl": 0.07666015625, "learning_rate": 5.800617452009375e-07, "loss": 0.0031, "num_tokens": 35548048.0, "reward": 1.2916667461395264, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 1.2916666269302368, "rewards/reasoning_reward/std": 0.550032913684845, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 161.1666717529297, "completions/mean_terminated_length": 161.1666717529297, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.45087900723888313, "grad_norm": 4.509744320701168, "kl": 0.119140625, "learning_rate": 5.784578835980157e-07, "loss": 0.0048, "num_tokens": 35642260.0, "reward": 1.4791667461395264, "reward_std": 0.41317981481552124, "rewards/reasoning_reward/mean": 1.4791666269302368, "rewards/reasoning_reward/std": 0.4293363690376282, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 134.7916717529297, "completions/mean_terminated_length": 134.7916717529297, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.45191313340227507, "grad_norm": 2.5466178790890446, "kl": 0.07080078125, "learning_rate": 5.768531938946756e-07, "loss": 0.0028, "num_tokens": 35719927.0, "reward": 0.6458333730697632, "reward_std": 0.13908717036247253, "rewards/reasoning_reward/mean": 0.6458333134651184, "rewards/reasoning_reward/std": 0.5208514928817749, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 148.4166717529297, "completions/mean_terminated_length": 148.4166717529297, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.452947259565667, "grad_norm": 4.2489173473223145, "kl": 0.107421875, "learning_rate": 5.752476930279557e-07, "loss": 0.0043, "num_tokens": 35804233.0, "reward": 1.2083333730697632, "reward_std": 0.4671573042869568, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.5694518089294434, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 147.125, "completions/mean_terminated_length": 147.125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.45398138572905894, "grad_norm": 4.008875539956656, "kl": 0.08837890625, "learning_rate": 5.736413979434566e-07, "loss": 0.0035, "num_tokens": 35882052.0, "reward": 1.0277777910232544, "reward_std": 0.329608291387558, "rewards/reasoning_reward/mean": 1.0277777910232544, "rewards/reasoning_reward/std": 0.3859447240829468, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 155.25, "completions/mean_terminated_length": 155.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.4550155118924509, "grad_norm": 3.550772237674307, "kl": 0.091796875, "learning_rate": 5.720343255951611e-07, "loss": 0.0037, "num_tokens": 35961122.0, "reward": 0.5416666865348816, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 0.5416666865348816, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 178.1666717529297, "completions/mean_terminated_length": 178.1666717529297, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.4560496380558428, "grad_norm": 4.361928899053349, "kl": 0.11083984375, "learning_rate": 5.704264929452562e-07, "loss": 0.0044, "num_tokens": 36050486.0, "reward": 1.1597222089767456, "reward_std": 0.41859424114227295, "rewards/reasoning_reward/mean": 1.1597222089767456, "rewards/reasoning_reward/std": 0.7573778033256531, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 158.70834350585938, "completions/mean_terminated_length": 158.70834350585938, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.45708376421923474, "grad_norm": 4.446970005435705, "kl": 0.109375, "learning_rate": 5.688179169639537e-07, "loss": 0.0044, "num_tokens": 36130319.0, "reward": 0.7083333730697632, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.4643056094646454, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 165.5, "completions/mean_terminated_length": 165.5, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4581178903826267, "grad_norm": 4.394083775689496, "kl": 0.138671875, "learning_rate": 5.672086146293108e-07, "loss": 0.0056, "num_tokens": 36219107.0, "reward": 1.3819444179534912, "reward_std": 0.48597466945648193, "rewards/reasoning_reward/mean": 1.3819442987442017, "rewards/reasoning_reward/std": 0.7428876161575317, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 154.7916717529297, "completions/mean_terminated_length": 154.7916717529297, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.4591520165460186, "grad_norm": 3.2132018317716478, "kl": 0.1513671875, "learning_rate": 5.65598602927051e-07, "loss": 0.006, "num_tokens": 36301518.0, "reward": 0.9375, "reward_std": 0.26507532596588135, "rewards/reasoning_reward/mean": 0.9375, "rewards/reasoning_reward/std": 0.6309499144554138, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 171.95834350585938, "completions/mean_terminated_length": 171.95834350585938, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.46018614270941055, "grad_norm": 4.320892935715968, "kl": 0.10791015625, "learning_rate": 5.639878988503858e-07, "loss": 0.0043, "num_tokens": 36377789.0, "reward": 1.2916667461395264, "reward_std": 0.436039537191391, "rewards/reasoning_reward/mean": 1.2916666269302368, "rewards/reasoning_reward/std": 0.4871537983417511, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 146.0416717529297, "completions/mean_terminated_length": 146.0416717529297, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.4612202688728025, "grad_norm": 3.1035680233792466, "kl": 0.09765625, "learning_rate": 5.623765193998333e-07, "loss": 0.0039, "num_tokens": 36461686.0, "reward": 1.125, "reward_std": 0.2553258538246155, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.6634888052940369, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 133.45834350585938, "completions/mean_terminated_length": 133.45834350585938, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.4622543950361944, "grad_norm": 3.4348649831091187, "kl": 0.09130859375, "learning_rate": 5.607644815830412e-07, "loss": 0.0037, "num_tokens": 36539977.0, "reward": 0.930555522441864, "reward_std": 0.3268197476863861, "rewards/reasoning_reward/mean": 0.930555522441864, "rewards/reasoning_reward/std": 0.405030757188797, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 160.75, "completions/mean_terminated_length": 160.75, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.46328852119958636, "grad_norm": 3.0932846774731515, "kl": 0.050048828125, "learning_rate": 5.591518024146049e-07, "loss": 0.002, "num_tokens": 36621971.0, "reward": 0.7083333730697632, "reward_std": 0.243839293718338, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.6064269542694092, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 161.5, "completions/mean_terminated_length": 161.5, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.4643226473629783, "grad_norm": 1.5405981712007193, "kl": 0.058349609375, "learning_rate": 5.5753849891589e-07, "loss": 0.0023, "num_tokens": 36698823.0, "reward": 1.1666667461395264, "reward_std": 0.17817416787147522, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 139.0, "completions/mean_terminated_length": 139.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.4653567735263702, "grad_norm": 3.2989222982872035, "kl": 0.080078125, "learning_rate": 5.559245881148513e-07, "loss": 0.0032, "num_tokens": 36779479.0, "reward": 1.125, "reward_std": 0.29602527618408203, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.4484272003173828, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 170.2916717529297, "completions/mean_terminated_length": 170.2916717529297, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.46639089968976216, "grad_norm": 3.9459517839159957, "kl": 0.06591796875, "learning_rate": 5.543100870458537e-07, "loss": 0.0026, "num_tokens": 36855542.0, "reward": 0.7708333730697632, "reward_std": 0.3492930829524994, "rewards/reasoning_reward/mean": 0.7708333134651184, "rewards/reasoning_reward/std": 0.4657664895057678, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 151.0416717529297, "completions/mean_terminated_length": 151.0416717529297, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.4674250258531541, "grad_norm": 3.080169913436108, "kl": 0.09619140625, "learning_rate": 5.526950127494918e-07, "loss": 0.0039, "num_tokens": 36939599.0, "reward": 1.3263888359069824, "reward_std": 0.259171724319458, "rewards/reasoning_reward/mean": 1.3263888359069824, "rewards/reasoning_reward/std": 0.5549060106277466, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 144.95834350585938, "completions/mean_terminated_length": 144.95834350585938, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.46845915201654603, "grad_norm": 3.478946037062411, "kl": 0.09619140625, "learning_rate": 5.510793822724111e-07, "loss": 0.0039, "num_tokens": 37024790.0, "reward": 1.2083333730697632, "reward_std": 0.2616034746170044, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.35864076018333435, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 159.4166717529297, "completions/mean_terminated_length": 159.4166717529297, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.46949327817993797, "grad_norm": 4.69807004009168, "kl": 0.07421875, "learning_rate": 5.494632126671268e-07, "loss": 0.003, "num_tokens": 37105936.0, "reward": 0.75, "reward_std": 0.46631526947021484, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.5897678136825562, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 145.125, "completions/mean_terminated_length": 145.125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.4705274043433299, "grad_norm": 3.5314961615269183, "kl": 0.08642578125, "learning_rate": 5.478465209918449e-07, "loss": 0.0035, "num_tokens": 37190787.0, "reward": 1.2291667461395264, "reward_std": 0.37612998485565186, "rewards/reasoning_reward/mean": 1.2291666269302368, "rewards/reasoning_reward/std": 0.7658352255821228, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 169.2916717529297, "completions/mean_terminated_length": 169.2916717529297, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.47156153050672184, "grad_norm": 3.157082111223005, "kl": 0.06298828125, "learning_rate": 5.462293243102815e-07, "loss": 0.0025, "num_tokens": 37270178.0, "reward": 1.0625, "reward_std": 0.2644323706626892, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.5954993963241577, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 144.70834350585938, "completions/mean_terminated_length": 144.70834350585938, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.4725956566701138, "grad_norm": 2.2530317338339554, "kl": 0.09326171875, "learning_rate": 5.44611639691483e-07, "loss": 0.0037, "num_tokens": 37356739.0, "reward": 1.3333333730697632, "reward_std": 0.17817416787147522, "rewards/reasoning_reward/mean": 1.3333333730697632, "rewards/reasoning_reward/std": 0.5646597146987915, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 129.58334350585938, "completions/mean_terminated_length": 129.58334350585938, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.4736297828335057, "grad_norm": 3.530213579949582, "kl": 0.06884765625, "learning_rate": 5.429934842096453e-07, "loss": 0.0028, "num_tokens": 37444233.0, "reward": 1.3333333730697632, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 1.3333333730697632, "rewards/reasoning_reward/std": 0.4815433919429779, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 453.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 159.45834350585938, "completions/mean_terminated_length": 146.69564819335938, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.47466390899689764, "grad_norm": 10.937724197833212, "kl": 0.07373046875, "learning_rate": 5.41374874943935e-07, "loss": 0.0029, "num_tokens": 37530724.0, "reward": 0.8958333730697632, "reward_std": 0.15268757939338684, "rewards/reasoning_reward/mean": 0.8958333134651184, "rewards/reasoning_reward/std": 0.7515081763267517, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 126.41667175292969, "completions/mean_terminated_length": 126.41667175292969, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.4756980351602896, "grad_norm": 0.23424191596610372, "kl": 0.07568359375, "learning_rate": 5.397558289783079e-07, "loss": 0.003, "num_tokens": 37614622.0, "reward": 1.3333333730697632, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.3333333730697632, "rewards/reasoning_reward/std": 0.4815433919429779, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 144.2916717529297, "completions/mean_terminated_length": 144.2916717529297, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.4767321613236815, "grad_norm": 3.5303048844343343, "kl": 0.068359375, "learning_rate": 5.381363634013285e-07, "loss": 0.0027, "num_tokens": 37697165.0, "reward": 0.7916666865348816, "reward_std": 0.29602527618408203, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4148511290550232, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 127.625, "completions/mean_terminated_length": 127.625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.47776628748707345, "grad_norm": 2.4796497555216117, "kl": 0.042236328125, "learning_rate": 5.365164953059911e-07, "loss": 0.0017, "num_tokens": 37776012.0, "reward": 0.75, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.4423258602619171, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 145.0, "completions/mean_terminated_length": 145.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.4788004136504654, "grad_norm": 3.9203231363884554, "kl": 0.07177734375, "learning_rate": 5.348962417895378e-07, "loss": 0.0029, "num_tokens": 37852260.0, "reward": 1.1041667461395264, "reward_std": 0.3027648627758026, "rewards/reasoning_reward/mean": 1.1041666269302368, "rewards/reasoning_reward/std": 0.7798991799354553, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 128.33334350585938, "completions/mean_terminated_length": 128.33334350585938, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.47983453981385726, "grad_norm": 2.3870572364816045, "kl": 0.0498046875, "learning_rate": 5.332756199532791e-07, "loss": 0.002, "num_tokens": 37932140.0, "reward": 1.1666667461395264, "reward_std": 0.17817416787147522, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 159.45834350585938, "completions/mean_terminated_length": 159.45834350585938, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.4808686659772492, "grad_norm": 3.4551439032369498, "kl": 0.0703125, "learning_rate": 5.316546469024127e-07, "loss": 0.0028, "num_tokens": 38008975.0, "reward": 0.8333333730697632, "reward_std": 0.3493061661720276, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.5835920572280884, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 151.58334350585938, "completions/mean_terminated_length": 151.58334350585938, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.48190279214064113, "grad_norm": 2.2638186168786993, "kl": 0.050048828125, "learning_rate": 5.300333397458436e-07, "loss": 0.002, "num_tokens": 38086597.0, "reward": 0.7291666865348816, "reward_std": 0.12400396913290024, "rewards/reasoning_reward/mean": 0.7291666865348816, "rewards/reasoning_reward/std": 0.4418136179447174, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 135.6666717529297, "completions/mean_terminated_length": 135.6666717529297, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.48293691830403307, "grad_norm": 3.4589069312702914, "kl": 0.0751953125, "learning_rate": 5.284117155960025e-07, "loss": 0.003, "num_tokens": 38165749.0, "reward": 0.6666666865348816, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815433919429779, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 145.83334350585938, "completions/mean_terminated_length": 145.83334350585938, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.483971044467425, "grad_norm": 3.777619829419609, "kl": 0.056640625, "learning_rate": 5.267897915686668e-07, "loss": 0.0023, "num_tokens": 38247225.0, "reward": 1.0833333730697632, "reward_std": 0.28029152750968933, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.458415687084198, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 158.6666717529297, "completions/mean_terminated_length": 158.6666717529297, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.48500517063081694, "grad_norm": 3.5113858847079924, "kl": 0.08740234375, "learning_rate": 5.251675847827784e-07, "loss": 0.0035, "num_tokens": 38331265.0, "reward": 0.9375, "reward_std": 0.2725489139556885, "rewards/reasoning_reward/mean": 0.9375, "rewards/reasoning_reward/std": 0.8884831070899963, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 166.875, "completions/mean_terminated_length": 166.875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.4860392967942089, "grad_norm": 3.0917385670170185, "kl": 0.07177734375, "learning_rate": 5.235451123602641e-07, "loss": 0.0029, "num_tokens": 38408302.0, "reward": 0.9375, "reward_std": 0.29339051246643066, "rewards/reasoning_reward/mean": 0.9375, "rewards/reasoning_reward/std": 0.7705517411231995, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 127.5, "completions/mean_terminated_length": 127.5, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.4870734229576008, "grad_norm": 0.3175955227430425, "kl": 0.06494140625, "learning_rate": 5.219223914258538e-07, "loss": 0.0026, "num_tokens": 38491450.0, "reward": 1.3333333730697632, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.3333333730697632, "rewards/reasoning_reward/std": 0.4815433919429779, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 150.125, "completions/mean_terminated_length": 150.125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.48810754912099275, "grad_norm": 2.4136738947782126, "kl": 0.06884765625, "learning_rate": 5.202994391069008e-07, "loss": 0.0028, "num_tokens": 38570469.0, "reward": 0.6111111640930176, "reward_std": 0.23193079233169556, "rewards/reasoning_reward/mean": 0.6111111044883728, "rewards/reasoning_reward/std": 0.589084804058075, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 166.5416717529297, "completions/mean_terminated_length": 166.5416717529297, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.4891416752843847, "grad_norm": 3.1274290142900023, "kl": 0.08984375, "learning_rate": 5.186762725332008e-07, "loss": 0.0036, "num_tokens": 38655282.0, "reward": 0.9791666865348816, "reward_std": 0.2717381715774536, "rewards/reasoning_reward/mean": 0.9791666865348816, "rewards/reasoning_reward/std": 0.8272421956062317, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 175.125, "completions/mean_terminated_length": 175.125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.4901758014477766, "grad_norm": 4.7856676348279015, "kl": 0.07958984375, "learning_rate": 5.170529088368103e-07, "loss": 0.0032, "num_tokens": 38745685.0, "reward": 0.7083333730697632, "reward_std": 0.4939148426055908, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 165.9166717529297, "completions/mean_terminated_length": 165.9166717529297, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.49120992761116855, "grad_norm": 3.991471510599265, "kl": 0.07568359375, "learning_rate": 5.154293651518666e-07, "loss": 0.003, "num_tokens": 38822715.0, "reward": 0.7777777910232544, "reward_std": 0.44400954246520996, "rewards/reasoning_reward/mean": 0.7777777314186096, "rewards/reasoning_reward/std": 0.6344891786575317, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 146.375, "completions/mean_terminated_length": 146.375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.4922440537745605, "grad_norm": 3.6198040700819236, "kl": 0.091796875, "learning_rate": 5.138056586144071e-07, "loss": 0.0037, "num_tokens": 38905196.0, "reward": 1.0625, "reward_std": 0.33768826723098755, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.5954993963241577, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 159.58334350585938, "completions/mean_terminated_length": 159.58334350585938, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.4932781799379524, "grad_norm": 3.4082780232755794, "kl": 0.0556640625, "learning_rate": 5.121818063621877e-07, "loss": 0.0022, "num_tokens": 38995162.0, "reward": 1.2291667461395264, "reward_std": 0.36204060912132263, "rewards/reasoning_reward/mean": 1.2291666269302368, "rewards/reasoning_reward/std": 0.5706435441970825, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 135.375, "completions/mean_terminated_length": 135.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.49431230610134436, "grad_norm": 3.0224789729725345, "kl": 0.07373046875, "learning_rate": 5.105578255345021e-07, "loss": 0.003, "num_tokens": 39077227.0, "reward": 0.8125, "reward_std": 0.22466278076171875, "rewards/reasoning_reward/mean": 0.8125, "rewards/reasoning_reward/std": 0.7042186260223389, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 160.0416717529297, "completions/mean_terminated_length": 160.0416717529297, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.4953464322647363, "grad_norm": 3.053679665790895, "kl": 0.07861328125, "learning_rate": 5.089337332720016e-07, "loss": 0.0031, "num_tokens": 39165884.0, "reward": 1.3125, "reward_std": 0.2041093111038208, "rewards/reasoning_reward/mean": 1.3125, "rewards/reasoning_reward/std": 0.8945790529251099, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 162.08334350585938, "completions/mean_terminated_length": 162.08334350585938, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.4963805584281282, "grad_norm": 3.1381029630629733, "kl": 0.06640625, "learning_rate": 5.073095467165134e-07, "loss": 0.0027, "num_tokens": 39248414.0, "reward": 1.0555555820465088, "reward_std": 0.22960862517356873, "rewards/reasoning_reward/mean": 1.0555554628372192, "rewards/reasoning_reward/std": 0.5766525864601135, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 148.75, "completions/mean_terminated_length": 148.75, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.49741468459152016, "grad_norm": 2.6932656248524913, "kl": 0.09765625, "learning_rate": 5.056852830108598e-07, "loss": 0.0039, "num_tokens": 39330824.0, "reward": 1.5, "reward_std": 0.28029152750968933, "rewards/reasoning_reward/mean": 1.5, "rewards/reasoning_reward/std": 0.48900964856147766, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.4984488107549121, "grad_norm": 4.963868420618398, "kl": 0.08935546875, "learning_rate": 5.040609592986775e-07, "loss": 0.0036, "num_tokens": 39413486.0, "reward": 0.5208333730697632, "reward_std": 0.36753225326538086, "rewards/reasoning_reward/mean": 0.5208333134651184, "rewards/reasoning_reward/std": 0.6507381796836853, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 153.25, "completions/mean_terminated_length": 153.25, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.49948293691830403, "grad_norm": 0.19760209135973245, "kl": 0.049560546875, "learning_rate": 5.024365927242367e-07, "loss": 0.002, "num_tokens": 39493220.0, "reward": 1.0, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.0, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 145.2916717529297, "completions/mean_terminated_length": 145.2916717529297, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.500517063081696, "grad_norm": 4.265668637720173, "kl": 0.09423828125, "learning_rate": 5.008122004322597e-07, "loss": 0.0038, "num_tokens": 39575411.0, "reward": 1.0208333730697632, "reward_std": 0.45082372426986694, "rewards/reasoning_reward/mean": 1.0208333730697632, "rewards/reasoning_reward/std": 0.47729235887527466, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 155.875, "completions/mean_terminated_length": 155.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.5015511892450879, "grad_norm": 3.577115117621814, "kl": 0.0791015625, "learning_rate": 4.991877995677404e-07, "loss": 0.0032, "num_tokens": 39659448.0, "reward": 1.2083333730697632, "reward_std": 0.367926687002182, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.721060037612915, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 131.4166717529297, "completions/mean_terminated_length": 131.4166717529297, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.5025853154084798, "grad_norm": 3.025540625801558, "kl": 0.07470703125, "learning_rate": 4.975634072757634e-07, "loss": 0.003, "num_tokens": 39742018.0, "reward": 1.4166667461395264, "reward_std": 0.19500279426574707, "rewards/reasoning_reward/mean": 1.4166666269302368, "rewards/reasoning_reward/std": 0.524749755859375, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 167.08334350585938, "completions/mean_terminated_length": 167.08334350585938, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.5036194415718718, "grad_norm": 3.4418671262934426, "kl": 0.09375, "learning_rate": 4.959390407013226e-07, "loss": 0.0038, "num_tokens": 39819180.0, "reward": 1.0, "reward_std": 0.42724665999412537, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.5107539296150208, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 174.375, "completions/mean_terminated_length": 174.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.5046535677352637, "grad_norm": 3.319472739975236, "kl": 0.06884765625, "learning_rate": 4.943147169891402e-07, "loss": 0.0028, "num_tokens": 39896373.0, "reward": 1.0416667461395264, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.35864076018333435, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 162.9166717529297, "completions/mean_terminated_length": 162.9166717529297, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.5056876938986556, "grad_norm": 3.3304049158014823, "kl": 0.054931640625, "learning_rate": 4.926904532834866e-07, "loss": 0.0022, "num_tokens": 39984259.0, "reward": 0.375, "reward_std": 0.3506905436515808, "rewards/reasoning_reward/mean": 0.375, "rewards/reasoning_reward/std": 0.494535356760025, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 151.5416717529297, "completions/mean_terminated_length": 151.5416717529297, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.5067218200620476, "grad_norm": 0.18358736944603146, "kl": 0.0576171875, "learning_rate": 4.910662667279983e-07, "loss": 0.0023, "num_tokens": 40060984.0, "reward": 0.6666666865348816, "reward_std": 0.0, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 158.0, "completions/mean_terminated_length": 158.0, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.5077559462254395, "grad_norm": 3.9194876650899495, "kl": 0.061767578125, "learning_rate": 4.894421744654979e-07, "loss": 0.0025, "num_tokens": 40147480.0, "reward": 0.8125, "reward_std": 0.4737785756587982, "rewards/reasoning_reward/mean": 0.8125, "rewards/reasoning_reward/std": 0.6726408004760742, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 144.75, "completions/mean_terminated_length": 144.75, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.5087900723888314, "grad_norm": 3.096539621007116, "kl": 0.07666015625, "learning_rate": 4.878181936378124e-07, "loss": 0.0031, "num_tokens": 40224658.0, "reward": 1.2291667461395264, "reward_std": 0.30366337299346924, "rewards/reasoning_reward/mean": 1.2291666269302368, "rewards/reasoning_reward/std": 0.45792141556739807, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 170.95834350585938, "completions/mean_terminated_length": 170.95834350585938, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.5098241985522234, "grad_norm": 3.575630215884605, "kl": 0.06396484375, "learning_rate": 4.861943413855928e-07, "loss": 0.0026, "num_tokens": 40301689.0, "reward": 0.6666666865348816, "reward_std": 0.3493061661720276, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.5450701117515564, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 143.70834350585938, "completions/mean_terminated_length": 143.70834350585938, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.5108583247156153, "grad_norm": 3.7344523341009213, "kl": 0.044677734375, "learning_rate": 4.845706348481333e-07, "loss": 0.0018, "num_tokens": 40381394.0, "reward": 0.7083333730697632, "reward_std": 0.3506905436515808, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.4643056094646454, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 160.2916717529297, "completions/mean_terminated_length": 160.2916717529297, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.5118924508790073, "grad_norm": 1.866445752533792, "kl": 0.11328125, "learning_rate": 4.829470911631898e-07, "loss": 0.0045, "num_tokens": 40458361.0, "reward": 1.0, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.0, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 135.33334350585938, "completions/mean_terminated_length": 135.33334350585938, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.5129265770423992, "grad_norm": 4.4003611615962095, "kl": 0.080078125, "learning_rate": 4.813237274667993e-07, "loss": 0.0032, "num_tokens": 40548409.0, "reward": 1.2083333730697632, "reward_std": 0.48112308979034424, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.8329709768295288, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 152.7916717529297, "completions/mean_terminated_length": 152.7916717529297, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.5139607032057911, "grad_norm": 3.3720583603504815, "kl": 0.046142578125, "learning_rate": 4.797005608930991e-07, "loss": 0.0018, "num_tokens": 40628820.0, "reward": 0.7083333730697632, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.4643056094646454, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 142.58334350585938, "completions/mean_terminated_length": 142.58334350585938, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.5149948293691831, "grad_norm": 0.15329245374819234, "kl": 0.045166015625, "learning_rate": 4.780776085741462e-07, "loss": 0.0018, "num_tokens": 40704098.0, "reward": 1.1666667461395264, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.24077169597148895, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 165.33334350585938, "completions/mean_terminated_length": 165.33334350585938, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.516028955532575, "grad_norm": 3.4000622681936936, "kl": 0.08349609375, "learning_rate": 4.76454887639736e-07, "loss": 0.0033, "num_tokens": 40787242.0, "reward": 1.0833333730697632, "reward_std": 0.41387641429901123, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.5835920572280884, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 152.20834350585938, "completions/mean_terminated_length": 152.20834350585938, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.5170630816959669, "grad_norm": 0.1571488063532437, "kl": 0.048583984375, "learning_rate": 4.7483241521722154e-07, "loss": 0.0019, "num_tokens": 40865215.0, "reward": 0.6666666865348816, "reward_std": 0.0, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 138.70834350585938, "completions/mean_terminated_length": 138.70834350585938, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.5180972078593589, "grad_norm": 2.487583254626514, "kl": 0.06689453125, "learning_rate": 4.7321020843133326e-07, "loss": 0.0027, "num_tokens": 40950344.0, "reward": 1.3402776718139648, "reward_std": 0.09820928424596786, "rewards/reasoning_reward/mean": 1.3402776718139648, "rewards/reasoning_reward/std": 0.4622962176799774, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 145.0416717529297, "completions/mean_terminated_length": 145.0416717529297, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.5191313340227508, "grad_norm": 4.3586596392892565, "kl": 0.0693359375, "learning_rate": 4.7158828440399747e-07, "loss": 0.0028, "num_tokens": 41034001.0, "reward": 1.2361111640930176, "reward_std": 0.382678747177124, "rewards/reasoning_reward/mean": 1.236111044883728, "rewards/reasoning_reward/std": 0.7691463828086853, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 150.58334350585938, "completions/mean_terminated_length": 150.58334350585938, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.5201654601861427, "grad_norm": 3.884759725833334, "kl": 0.059326171875, "learning_rate": 4.699666602541565e-07, "loss": 0.0024, "num_tokens": 41110591.0, "reward": 0.75, "reward_std": 0.44819486141204834, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.7372097969055176, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 176.5, "completions/mean_terminated_length": 176.5, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.5211995863495347, "grad_norm": 3.6687504446873556, "kl": 0.06884765625, "learning_rate": 4.683453530975872e-07, "loss": 0.0028, "num_tokens": 41188139.0, "reward": 0.75, "reward_std": 0.45045679807662964, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.5897678136825562, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 145.375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.5222337125129266, "grad_norm": 4.270390001851114, "kl": 0.056396484375, "learning_rate": 4.6672438004672074e-07, "loss": 0.0023, "num_tokens": 41265292.0, "reward": 0.8958333730697632, "reward_std": 0.24185511469841003, "rewards/reasoning_reward/mean": 0.8958333134651184, "rewards/reasoning_reward/std": 0.29411497712135315, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 160.83334350585938, "completions/mean_terminated_length": 160.83334350585938, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.5232678386763185, "grad_norm": 3.860866061268796, "kl": 0.07177734375, "learning_rate": 4.6510375821046204e-07, "loss": 0.0029, "num_tokens": 41349392.0, "reward": 1.1666667461395264, "reward_std": 0.47301241755485535, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.6915640830993652, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 132.4166717529297, "completions/mean_terminated_length": 132.4166717529297, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.5243019648397105, "grad_norm": 3.5645240907272, "kl": 0.050048828125, "learning_rate": 4.6348350469400885e-07, "loss": 0.002, "num_tokens": 41428746.0, "reward": 0.7916666865348816, "reward_std": 0.3268197476863861, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4148511290550232, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 162.58334350585938, "completions/mean_terminated_length": 162.58334350585938, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.5253360910031024, "grad_norm": 4.091927437075089, "kl": 0.06396484375, "learning_rate": 4.618636365986714e-07, "loss": 0.0026, "num_tokens": 41509568.0, "reward": 0.7291666865348816, "reward_std": 0.4690367579460144, "rewards/reasoning_reward/mean": 0.7291666865348816, "rewards/reasoning_reward/std": 0.5706435441970825, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 151.70834350585938, "completions/mean_terminated_length": 151.70834350585938, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.5263702171664943, "grad_norm": 3.4807276011577883, "kl": 0.07177734375, "learning_rate": 4.602441710216922e-07, "loss": 0.0029, "num_tokens": 41588865.0, "reward": 1.25, "reward_std": 0.3314744830131531, "rewards/reasoning_reward/mean": 1.25, "rewards/reasoning_reward/std": 0.675663948059082, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 123.95833587646484, "completions/mean_terminated_length": 123.95833587646484, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.5274043433298863, "grad_norm": 3.5624810608659483, "kl": 0.0751953125, "learning_rate": 4.586251250560648e-07, "loss": 0.003, "num_tokens": 41672816.0, "reward": 0.9166666865348816, "reward_std": 0.3493061661720276, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.434057354927063, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 129.125, "completions/mean_terminated_length": 129.125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.5284384694932782, "grad_norm": 2.1052711972616454, "kl": 0.07275390625, "learning_rate": 4.5700651579035453e-07, "loss": 0.0029, "num_tokens": 41754563.0, "reward": 1.1666667461395264, "reward_std": 0.30860671401023865, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.8164966106414795, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 147.58334350585938, "completions/mean_terminated_length": 147.58334350585938, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.5294725956566702, "grad_norm": 3.6467154627194254, "kl": 0.06982421875, "learning_rate": 4.55388360308517e-07, "loss": 0.0028, "num_tokens": 41836945.0, "reward": 1.0833333730697632, "reward_std": 0.4629100561141968, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.7755315899848938, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 142.20834350585938, "completions/mean_terminated_length": 142.20834350585938, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.5305067218200621, "grad_norm": 2.9628853082893154, "kl": 0.07861328125, "learning_rate": 4.5377067568971837e-07, "loss": 0.0031, "num_tokens": 41925614.0, "reward": 1.1875, "reward_std": 0.183963343501091, "rewards/reasoning_reward/mean": 1.1875, "rewards/reasoning_reward/std": 0.28788962960243225, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 166.5416717529297, "completions/mean_terminated_length": 166.5416717529297, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.531540847983454, "grad_norm": 1.7751423102606667, "kl": 0.040771484375, "learning_rate": 4.521534790081549e-07, "loss": 0.0016, "num_tokens": 42009907.0, "reward": 1.0208333730697632, "reward_std": 0.0589255653321743, "rewards/reasoning_reward/mean": 1.0208333730697632, "rewards/reasoning_reward/std": 0.10206207633018494, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 157.70834350585938, "completions/mean_terminated_length": 157.70834350585938, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.532574974146846, "grad_norm": 2.6567778397813617, "kl": 0.0771484375, "learning_rate": 4.505367873328731e-07, "loss": 0.0031, "num_tokens": 42100620.0, "reward": 1.5208333730697632, "reward_std": 0.21309106051921844, "rewards/reasoning_reward/mean": 1.5208333730697632, "rewards/reasoning_reward/std": 0.453948050737381, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 138.9166717529297, "completions/mean_terminated_length": 138.9166717529297, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.5336091003102379, "grad_norm": 3.1909559071085325, "kl": 0.0458984375, "learning_rate": 4.489206177275889e-07, "loss": 0.0018, "num_tokens": 42185474.0, "reward": 1.125, "reward_std": 0.29602527618408203, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.4484272003173828, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 151.33334350585938, "completions/mean_terminated_length": 151.33334350585938, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.5346432264736298, "grad_norm": 2.715317039850287, "kl": 0.06982421875, "learning_rate": 4.473049872505081e-07, "loss": 0.0028, "num_tokens": 42269130.0, "reward": 1.0416667461395264, "reward_std": 0.20693820714950562, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.721060037612915, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 133.7916717529297, "completions/mean_terminated_length": 133.7916717529297, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.5356773526370218, "grad_norm": 4.433835698458295, "kl": 0.07763671875, "learning_rate": 4.4568991295414637e-07, "loss": 0.0031, "num_tokens": 42358405.0, "reward": 1.1666667461395264, "reward_std": 0.3900056481361389, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.5646597146987915, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 126.04167175292969, "completions/mean_terminated_length": 126.04167175292969, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.5367114788004137, "grad_norm": 2.490946137525958, "kl": 0.07861328125, "learning_rate": 4.440754118851486e-07, "loss": 0.0032, "num_tokens": 42437086.0, "reward": 0.875, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.337831974029541, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 147.125, "completions/mean_terminated_length": 147.125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.5377456049638056, "grad_norm": 4.116295710152532, "kl": 0.0703125, "learning_rate": 4.424615010841099e-07, "loss": 0.0028, "num_tokens": 42521801.0, "reward": 1.0138888359069824, "reward_std": 0.4999142587184906, "rewards/reasoning_reward/mean": 1.0138888359069824, "rewards/reasoning_reward/std": 0.547097384929657, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 169.375, "completions/mean_terminated_length": 169.375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.5387797311271976, "grad_norm": 3.031621149484474, "kl": 0.10595703125, "learning_rate": 4.4084819758539506e-07, "loss": 0.0043, "num_tokens": 42604818.0, "reward": 1.2986111640930176, "reward_std": 0.1608150601387024, "rewards/reasoning_reward/mean": 1.298611044883728, "rewards/reasoning_reward/std": 0.39311453700065613, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 187.875, "completions/mean_terminated_length": 187.875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.5398138572905895, "grad_norm": 1.8896709692670526, "kl": 0.0458984375, "learning_rate": 4.3923551841695885e-07, "loss": 0.0018, "num_tokens": 42684495.0, "reward": 1.0416667461395264, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.4643056094646454, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 141.25, "completions/mean_terminated_length": 141.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.5408479834539814, "grad_norm": 2.5348710252262485, "kl": 0.048583984375, "learning_rate": 4.376234806001665e-07, "loss": 0.0019, "num_tokens": 42762557.0, "reward": 1.4375, "reward_std": 0.08625819534063339, "rewards/reasoning_reward/mean": 1.4375, "rewards/reasoning_reward/std": 0.37044334411621094, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 152.0, "completions/mean_terminated_length": 152.0, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.5418821096173733, "grad_norm": 4.4258462156938965, "kl": 0.06298828125, "learning_rate": 4.360121011496142e-07, "loss": 0.0025, "num_tokens": 42841621.0, "reward": 0.6458333730697632, "reward_std": 0.41873571276664734, "rewards/reasoning_reward/mean": 0.6458333134651184, "rewards/reasoning_reward/std": 0.580089271068573, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 165.2916717529297, "completions/mean_terminated_length": 165.2916717529297, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.5429162357807652, "grad_norm": 2.973908534715797, "kl": 0.056640625, "learning_rate": 4.344013970729489e-07, "loss": 0.0023, "num_tokens": 42926292.0, "reward": 1.0208333730697632, "reward_std": 0.1767766922712326, "rewards/reasoning_reward/mean": 1.0208333730697632, "rewards/reasoning_reward/std": 0.7868369221687317, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 155.625, "completions/mean_terminated_length": 155.625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.5439503619441571, "grad_norm": 4.478416886750731, "kl": 0.0732421875, "learning_rate": 4.327913853706893e-07, "loss": 0.0029, "num_tokens": 43011243.0, "reward": 1.375, "reward_std": 0.39814266562461853, "rewards/reasoning_reward/mean": 1.375, "rewards/reasoning_reward/std": 0.5366967916488647, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 133.4166717529297, "completions/mean_terminated_length": 133.4166717529297, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.5449844881075491, "grad_norm": 0.24843813220604533, "kl": 0.07666015625, "learning_rate": 4.3118208303604635e-07, "loss": 0.0031, "num_tokens": 43096325.0, "reward": 1.0, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.8340576887130737, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 148.875, "completions/mean_terminated_length": 148.875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.546018614270941, "grad_norm": 3.677049891807094, "kl": 0.09423828125, "learning_rate": 4.295735070547438e-07, "loss": 0.0038, "num_tokens": 43184346.0, "reward": 0.9791666865348816, "reward_std": 0.3438849151134491, "rewards/reasoning_reward/mean": 0.9791666865348816, "rewards/reasoning_reward/std": 0.8905196785926819, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 159.4166717529297, "completions/mean_terminated_length": 159.4166717529297, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.5470527404343329, "grad_norm": 3.3666791301046612, "kl": 0.059326171875, "learning_rate": 4.2796567440483904e-07, "loss": 0.0024, "num_tokens": 43270244.0, "reward": 0.8125, "reward_std": 0.3438849151134491, "rewards/reasoning_reward/mean": 0.8125, "rewards/reasoning_reward/std": 0.5277618765830994, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 162.08334350585938, "completions/mean_terminated_length": 162.08334350585938, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.5480868665977249, "grad_norm": 4.015099024696647, "kl": 0.07568359375, "learning_rate": 4.263586020565436e-07, "loss": 0.003, "num_tokens": 43349494.0, "reward": 0.6458333730697632, "reward_std": 0.3310800790786743, "rewards/reasoning_reward/mean": 0.6458333134651184, "rewards/reasoning_reward/std": 0.5610387921333313, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 142.125, "completions/mean_terminated_length": 142.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.5491209927611168, "grad_norm": 1.6909730449484282, "kl": 0.07861328125, "learning_rate": 4.2475230697204446e-07, "loss": 0.0032, "num_tokens": 43439009.0, "reward": 1.3125, "reward_std": 0.0589255653321743, "rewards/reasoning_reward/mean": 1.3125, "rewards/reasoning_reward/std": 0.4618605971336365, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 156.375, "completions/mean_terminated_length": 156.375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.5501551189245087, "grad_norm": 3.64263703695767, "kl": 0.08056640625, "learning_rate": 4.2314680610532445e-07, "loss": 0.0032, "num_tokens": 43528666.0, "reward": 1.1666667461395264, "reward_std": 0.23894576728343964, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 156.9166717529297, "completions/mean_terminated_length": 156.9166717529297, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.5511892450879007, "grad_norm": 3.424907168401309, "kl": 0.05859375, "learning_rate": 4.2154211640198426e-07, "loss": 0.0023, "num_tokens": 43611552.0, "reward": 0.7083333730697632, "reward_std": 0.3268197476863861, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.4643056094646454, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 154.2916717529297, "completions/mean_terminated_length": 154.2916717529297, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.5522233712512926, "grad_norm": 3.695112863190559, "kl": 0.1298828125, "learning_rate": 4.199382547990625e-07, "loss": 0.0052, "num_tokens": 43690327.0, "reward": 0.8333333730697632, "reward_std": 0.38613972067832947, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.5646597146987915, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 133.75, "completions/mean_terminated_length": 133.75, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.5532574974146846, "grad_norm": 2.0790329349480148, "kl": 0.047119140625, "learning_rate": 4.1833523822485766e-07, "loss": 0.0019, "num_tokens": 43774529.0, "reward": 1.1666667461395264, "reward_std": 0.17817416787147522, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 170.58334350585938, "completions/mean_terminated_length": 170.58334350585938, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.5542916235780765, "grad_norm": 2.323046338652214, "kl": 0.0830078125, "learning_rate": 4.167330835987489e-07, "loss": 0.0033, "num_tokens": 43852103.0, "reward": 0.7291666865348816, "reward_std": 0.19795583188533783, "rewards/reasoning_reward/mean": 0.7291666865348816, "rewards/reasoning_reward/std": 0.6251811385154724, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 170.5, "completions/mean_terminated_length": 170.5, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.5553257497414684, "grad_norm": 4.195661413844518, "kl": 0.08544921875, "learning_rate": 4.1513180783101807e-07, "loss": 0.0034, "num_tokens": 43936347.0, "reward": 1.1458333730697632, "reward_std": 0.4177326560020447, "rewards/reasoning_reward/mean": 1.1458333730697632, "rewards/reasoning_reward/std": 0.580089271068573, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 149.9166717529297, "completions/mean_terminated_length": 149.9166717529297, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.5563598759048604, "grad_norm": 4.132578751644871, "kl": 0.07080078125, "learning_rate": 4.135314278226708e-07, "loss": 0.0028, "num_tokens": 44017537.0, "reward": 1.0833333730697632, "reward_std": 0.41387641429901123, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.8030737638473511, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 132.5416717529297, "completions/mean_terminated_length": 132.5416717529297, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.5573940020682523, "grad_norm": 6.445791655514141, "kl": 0.0498046875, "learning_rate": 4.119319604652583e-07, "loss": 0.002, "num_tokens": 44097166.0, "reward": 0.625, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.625, "rewards/reasoning_reward/std": 0.494535356760025, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 145.83334350585938, "completions/mean_terminated_length": 145.83334350585938, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.5584281282316442, "grad_norm": 3.455599464978524, "kl": 0.09423828125, "learning_rate": 4.1033342264069887e-07, "loss": 0.0038, "num_tokens": 44176954.0, "reward": 1.2222222089767456, "reward_std": 0.24338775873184204, "rewards/reasoning_reward/mean": 1.2222222089767456, "rewards/reasoning_reward/std": 0.47565528750419617, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 151.375, "completions/mean_terminated_length": 151.375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.5594622543950362, "grad_norm": 31.684587209903366, "kl": 0.333984375, "learning_rate": 4.0873583122109986e-07, "loss": 0.0134, "num_tokens": 44265163.0, "reward": 1.3333333730697632, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.3333333730697632, "rewards/reasoning_reward/std": 0.4815433919429779, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 135.0, "completions/mean_terminated_length": 135.0, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.5604963805584281, "grad_norm": 2.58155802773976, "kl": 0.061279296875, "learning_rate": 4.071392030685799e-07, "loss": 0.0025, "num_tokens": 44344491.0, "reward": 0.8958333730697632, "reward_std": 0.19795583188533783, "rewards/reasoning_reward/mean": 0.8958333134651184, "rewards/reasoning_reward/std": 0.3605300188064575, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 177.08334350585938, "completions/mean_terminated_length": 177.60870361328125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.56153050672182, "grad_norm": 3.9169957578271273, "kl": 0.057373046875, "learning_rate": 4.055435550350903e-07, "loss": 0.0023, "num_tokens": 44430293.0, "reward": 1.2986111640930176, "reward_std": 0.34180140495300293, "rewards/reasoning_reward/mean": 1.298611044883728, "rewards/reasoning_reward/std": 0.46878182888031006, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 125.625, "completions/mean_terminated_length": 125.625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.562564632885212, "grad_norm": 3.5379641241095476, "kl": 0.049560546875, "learning_rate": 4.039489039622376e-07, "loss": 0.002, "num_tokens": 44511572.0, "reward": 0.7916666865348816, "reward_std": 0.3268197476863861, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4148511290550232, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 136.7916717529297, "completions/mean_terminated_length": 136.7916717529297, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.5635987590486039, "grad_norm": 2.7760862641762913, "kl": 0.0576171875, "learning_rate": 4.023552666811056e-07, "loss": 0.0023, "num_tokens": 44591183.0, "reward": 0.6875, "reward_std": 0.0589255653321743, "rewards/reasoning_reward/mean": 0.6875, "rewards/reasoning_reward/std": 0.5067479610443115, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 156.70834350585938, "completions/mean_terminated_length": 156.70834350585938, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.5646328852119958, "grad_norm": 3.308306062539149, "kl": 0.0439453125, "learning_rate": 4.0076266001207796e-07, "loss": 0.0018, "num_tokens": 44672504.0, "reward": 0.9791666865348816, "reward_std": 0.3255884051322937, "rewards/reasoning_reward/mean": 0.9791666865348816, "rewards/reasoning_reward/std": 0.40322521328926086, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 109.91667175292969, "completions/mean_terminated_length": 109.91667175292969, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.5656670113753878, "grad_norm": 2.3476228247469417, "kl": 0.0478515625, "learning_rate": 3.9917110076466054e-07, "loss": 0.0019, "num_tokens": 44748886.0, "reward": 0.375, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.375, "rewards/reasoning_reward/std": 0.494535356760025, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 139.375, "completions/mean_terminated_length": 139.375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.5667011375387797, "grad_norm": 1.9607809752536416, "kl": 0.04296875, "learning_rate": 3.9758060573730376e-07, "loss": 0.0017, "num_tokens": 44826023.0, "reward": 0.7916666865348816, "reward_std": 0.2314550280570984, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4871538281440735, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 136.20834350585938, "completions/mean_terminated_length": 136.20834350585938, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.5677352637021716, "grad_norm": 4.07950906711803, "kl": 0.057861328125, "learning_rate": 3.9599119171722575e-07, "loss": 0.0023, "num_tokens": 44906724.0, "reward": 0.8333333730697632, "reward_std": 0.416355699300766, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.5450701713562012, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 157.875, "completions/mean_terminated_length": 157.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.5687693898655636, "grad_norm": 2.389180002667207, "kl": 0.0634765625, "learning_rate": 3.9440287548023484e-07, "loss": 0.0025, "num_tokens": 44984433.0, "reward": 0.7083333730697632, "reward_std": 0.07715167850255966, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.4402732849121094, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 143.75, "completions/mean_terminated_length": 143.75, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.5698035160289555, "grad_norm": 3.4273893764803365, "kl": 0.083984375, "learning_rate": 3.928156737905525e-07, "loss": 0.0034, "num_tokens": 45067011.0, "reward": 1.3333333730697632, "reward_std": 0.34503278136253357, "rewards/reasoning_reward/mean": 1.3333333730697632, "rewards/reasoning_reward/std": 0.4815433919429779, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 146.6666717529297, "completions/mean_terminated_length": 146.6666717529297, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.5708376421923474, "grad_norm": 3.5104041268905806, "kl": 0.0830078125, "learning_rate": 3.912296034006365e-07, "loss": 0.0033, "num_tokens": 45143987.0, "reward": 1.1458333730697632, "reward_std": 0.24999213218688965, "rewards/reasoning_reward/mean": 1.1458333730697632, "rewards/reasoning_reward/std": 0.3120467960834503, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 149.95834350585938, "completions/mean_terminated_length": 149.95834350585938, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.5718717683557394, "grad_norm": 0.29409247983573755, "kl": 0.059814453125, "learning_rate": 3.896446810510041e-07, "loss": 0.0024, "num_tokens": 45229122.0, "reward": 1.3333333730697632, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.3333333730697632, "rewards/reasoning_reward/std": 0.4815433919429779, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 177.75, "completions/mean_terminated_length": 177.75, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.5729058945191313, "grad_norm": 3.9211078693164234, "kl": 0.07275390625, "learning_rate": 3.880609234700554e-07, "loss": 0.0029, "num_tokens": 45306684.0, "reward": 0.25, "reward_std": 0.43459486961364746, "rewards/reasoning_reward/mean": 0.25, "rewards/reasoning_reward/std": 0.41702884435653687, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 144.0, "completions/mean_terminated_length": 144.0, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.5739400206825233, "grad_norm": 12.819817865471965, "kl": 0.080078125, "learning_rate": 3.8647834737389637e-07, "loss": 0.0032, "num_tokens": 45389860.0, "reward": 0.9166666865348816, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.28232985734939575, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 111.45833587646484, "completions/mean_terminated_length": 111.45833587646484, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.5749741468459152, "grad_norm": 2.5262171083580074, "kl": 0.048583984375, "learning_rate": 3.8489696946616334e-07, "loss": 0.0019, "num_tokens": 45469223.0, "reward": 0.7083333730697632, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.4643056094646454, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 158.625, "completions/mean_terminated_length": 158.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.5760082730093071, "grad_norm": 2.651554531736017, "kl": 0.0595703125, "learning_rate": 3.833168064378455e-07, "loss": 0.0024, "num_tokens": 45546182.0, "reward": 0.375, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.375, "rewards/reasoning_reward/std": 0.494535356760025, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 119.25, "completions/mean_terminated_length": 119.25, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.5770423991726991, "grad_norm": 3.386660184100593, "kl": 0.0625, "learning_rate": 3.817378749671095e-07, "loss": 0.0025, "num_tokens": 45630044.0, "reward": 1.0833333730697632, "reward_std": 0.34503278136253357, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.5835920572280884, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 171.08334350585938, "completions/mean_terminated_length": 171.08334350585938, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.578076525336091, "grad_norm": 3.757186136491232, "kl": 0.0712890625, "learning_rate": 3.801601917191237e-07, "loss": 0.0029, "num_tokens": 45714430.0, "reward": 1.1666667461395264, "reward_std": 0.34930619597435, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.6197241544723511, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 133.375, "completions/mean_terminated_length": 133.375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.5791106514994829, "grad_norm": 4.509762581052894, "kl": 0.060302734375, "learning_rate": 3.7858377334588127e-07, "loss": 0.0024, "num_tokens": 45800863.0, "reward": 1.1041667461395264, "reward_std": 0.30217814445495605, "rewards/reasoning_reward/mean": 1.1041666269302368, "rewards/reasoning_reward/std": 0.6590369343757629, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 169.9166717529297, "completions/mean_terminated_length": 169.9166717529297, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.5801447776628749, "grad_norm": 4.1190032329728385, "kl": 0.07421875, "learning_rate": 3.7700863648602516e-07, "loss": 0.003, "num_tokens": 45882437.0, "reward": 1.25, "reward_std": 0.48957228660583496, "rewards/reasoning_reward/mean": 1.25, "rewards/reasoning_reward/std": 0.7518094182014465, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 145.33334350585938, "completions/mean_terminated_length": 145.33334350585938, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.5811789038262668, "grad_norm": 3.870893194153702, "kl": 0.09912109375, "learning_rate": 3.7543479776467244e-07, "loss": 0.004, "num_tokens": 45962437.0, "reward": 0.6666666865348816, "reward_std": 0.3900056481361389, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.868114709854126, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 185.625, "completions/mean_terminated_length": 185.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.5822130299896587, "grad_norm": 3.3202411092809654, "kl": 0.09814453125, "learning_rate": 3.7386227379323855e-07, "loss": 0.0039, "num_tokens": 46040004.0, "reward": 0.1875, "reward_std": 0.33108004927635193, "rewards/reasoning_reward/mean": 0.1875, "rewards/reasoning_reward/std": 0.4121128022670746, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 139.0, "completions/mean_terminated_length": 139.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.5832471561530507, "grad_norm": 4.0347158309096764, "kl": 0.0654296875, "learning_rate": 3.7229108116926223e-07, "loss": 0.0026, "num_tokens": 46116284.0, "reward": 0.9375, "reward_std": 0.35970625281333923, "rewards/reasoning_reward/mean": 0.9375, "rewards/reasoning_reward/std": 0.7845311760902405, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 166.75, "completions/mean_terminated_length": 166.75, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.5842812823164426, "grad_norm": 4.136783811313354, "kl": 0.091796875, "learning_rate": 3.707212364762301e-07, "loss": 0.0037, "num_tokens": 46194998.0, "reward": 0.6875, "reward_std": 0.5491421222686768, "rewards/reasoning_reward/mean": 0.6875, "rewards/reasoning_reward/std": 0.7634660601615906, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 148.2916717529297, "completions/mean_terminated_length": 148.2916717529297, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5853154084798345, "grad_norm": 3.7478667802868526, "kl": 0.07421875, "learning_rate": 3.691527562834018e-07, "loss": 0.003, "num_tokens": 46278789.0, "reward": 1.0416667461395264, "reward_std": 0.3268197476863861, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.6902530789375305, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 167.75, "completions/mean_terminated_length": 167.75, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.5863495346432265, "grad_norm": 2.8655968955809286, "kl": 0.08056640625, "learning_rate": 3.6758565714563534e-07, "loss": 0.0032, "num_tokens": 46355567.0, "reward": 1.0416667461395264, "reward_std": 0.2985045611858368, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.8198179602622986, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 160.58334350585938, "completions/mean_terminated_length": 160.58334350585938, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.5873836608066184, "grad_norm": 4.048409611227816, "kl": 0.06640625, "learning_rate": 3.6601995560321164e-07, "loss": 0.0027, "num_tokens": 46435733.0, "reward": 0.7291666865348816, "reward_std": 0.47280198335647583, "rewards/reasoning_reward/mean": 0.7291666865348816, "rewards/reasoning_reward/std": 0.48854634165763855, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 159.9166717529297, "completions/mean_terminated_length": 159.9166717529297, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.5884177869700103, "grad_norm": 4.048728445965589, "kl": 0.078125, "learning_rate": 3.6445566818166075e-07, "loss": 0.0031, "num_tokens": 46518451.0, "reward": 1.2916667461395264, "reward_std": 0.3610706031322479, "rewards/reasoning_reward/mean": 1.2916666269302368, "rewards/reasoning_reward/std": 0.4148511290550232, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 131.6666717529297, "completions/mean_terminated_length": 131.6666717529297, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.5894519131334023, "grad_norm": 3.9194927568414175, "kl": 0.06298828125, "learning_rate": 3.6289281139158685e-07, "loss": 0.0025, "num_tokens": 46601875.0, "reward": 0.9583333730697632, "reward_std": 0.42645785212516785, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.8064504861831665, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 170.6666717529297, "completions/mean_terminated_length": 170.6666717529297, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.5904860392967942, "grad_norm": 3.375166233888406, "kl": 0.08935546875, "learning_rate": 3.613314017284943e-07, "loss": 0.0036, "num_tokens": 46677899.0, "reward": 0.8541666865348816, "reward_std": 0.23144195973873138, "rewards/reasoning_reward/mean": 0.8541666865348816, "rewards/reasoning_reward/std": 0.7442411184310913, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 139.95834350585938, "completions/mean_terminated_length": 139.95834350585938, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.5915201654601862, "grad_norm": 2.187391650107713, "kl": 0.0478515625, "learning_rate": 3.5977145567261355e-07, "loss": 0.0019, "num_tokens": 46755594.0, "reward": 0.5, "reward_std": 0.17817416787147522, "rewards/reasoning_reward/mean": 0.5, "rewards/reasoning_reward/std": 0.5107539296150208, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 144.9166717529297, "completions/mean_terminated_length": 144.9166717529297, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.5925542916235781, "grad_norm": 3.5297891429966186, "kl": 0.0615234375, "learning_rate": 3.5821298968872696e-07, "loss": 0.0025, "num_tokens": 46835112.0, "reward": 0.9583333730697632, "reward_std": 0.243839293718338, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.2917960286140442, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 145.0416717529297, "completions/mean_terminated_length": 145.0416717529297, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.59358841778697, "grad_norm": 3.362879841575184, "kl": 0.0732421875, "learning_rate": 3.566560202259951e-07, "loss": 0.0029, "num_tokens": 46911793.0, "reward": 0.9375, "reward_std": 0.3688412308692932, "rewards/reasoning_reward/mean": 0.9375, "rewards/reasoning_reward/std": 0.49590715765953064, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 116.29167175292969, "completions/mean_terminated_length": 116.29167175292969, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.594622543950362, "grad_norm": 2.429845277216557, "kl": 0.0625, "learning_rate": 3.5510056371778337e-07, "loss": 0.0025, "num_tokens": 46990464.0, "reward": 1.125, "reward_std": 0.14773420989513397, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.30395936965942383, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 162.83334350585938, "completions/mean_terminated_length": 162.83334350585938, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5956566701137539, "grad_norm": 2.922280213132429, "kl": 0.0654296875, "learning_rate": 3.5354663658148834e-07, "loss": 0.0026, "num_tokens": 47079588.0, "reward": 1.1875, "reward_std": 0.2041093111038208, "rewards/reasoning_reward/mean": 1.1875, "rewards/reasoning_reward/std": 0.8945790529251099, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 143.20834350585938, "completions/mean_terminated_length": 143.21739196777344, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.5966907962771458, "grad_norm": 4.143240683081162, "kl": 0.041259765625, "learning_rate": 3.5199425521836445e-07, "loss": 0.0017, "num_tokens": 47157417.0, "reward": 0.5833333730697632, "reward_std": 0.3900056481361389, "rewards/reasoning_reward/mean": 0.5833333134651184, "rewards/reasoning_reward/std": 0.5036101341247559, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 140.4166717529297, "completions/mean_terminated_length": 140.4166717529297, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.5977249224405378, "grad_norm": 1.9659319807373157, "kl": 0.0634765625, "learning_rate": 3.50443436013351e-07, "loss": 0.0025, "num_tokens": 47235395.0, "reward": 0.9583333730697632, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.20412415266036987, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 123.33333587646484, "completions/mean_terminated_length": 123.33333587646484, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.5987590486039297, "grad_norm": 2.2199998500371265, "kl": 0.06396484375, "learning_rate": 3.4889419533489895e-07, "loss": 0.0026, "num_tokens": 47321179.0, "reward": 1.2152777910232544, "reward_std": 0.07534459978342056, "rewards/reasoning_reward/mean": 1.2152777910232544, "rewards/reasoning_reward/std": 0.33506491780281067, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 179.20834350585938, "completions/mean_terminated_length": 179.20834350585938, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.5997931747673216, "grad_norm": 4.027929778275415, "kl": 0.056640625, "learning_rate": 3.4734654953479863e-07, "loss": 0.0023, "num_tokens": 47401472.0, "reward": 0.6666666865348816, "reward_std": 0.48678088188171387, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 145.2916717529297, "completions/mean_terminated_length": 145.2916717529297, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.6008273009307136, "grad_norm": 4.203672290672796, "kl": 0.08349609375, "learning_rate": 3.458005149480068e-07, "loss": 0.0033, "num_tokens": 47483511.0, "reward": 0.9861111044883728, "reward_std": 0.3927455544471741, "rewards/reasoning_reward/mean": 0.9861111044883728, "rewards/reasoning_reward/std": 0.6683251857757568, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 136.0, "completions/mean_terminated_length": 136.0, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.6018614270941055, "grad_norm": 0.1514238170358154, "kl": 0.042236328125, "learning_rate": 3.4425610789247415e-07, "loss": 0.0017, "num_tokens": 47564375.0, "reward": 1.0, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.0, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.6028955532574974, "grad_norm": 3.123715432165474, "kl": 0.06689453125, "learning_rate": 3.4271334466897353e-07, "loss": 0.0027, "num_tokens": 47646583.0, "reward": 1.1458333730697632, "reward_std": 0.23709973692893982, "rewards/reasoning_reward/mean": 1.1458333730697632, "rewards/reasoning_reward/std": 0.40322521328926086, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 156.70834350585938, "completions/mean_terminated_length": 156.70834350585938, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.6039296794208894, "grad_norm": 3.5821953064727694, "kl": 0.0439453125, "learning_rate": 3.411722415609275e-07, "loss": 0.0018, "num_tokens": 47731776.0, "reward": 1.125, "reward_std": 0.2314550280570984, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.47204458713531494, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 150.08334350585938, "completions/mean_terminated_length": 150.08334350585938, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.6049638055842813, "grad_norm": 2.9553822886142007, "kl": 0.046630859375, "learning_rate": 3.396328148342366e-07, "loss": 0.0019, "num_tokens": 47811218.0, "reward": 0.4791666865348816, "reward_std": 0.31580695509910583, "rewards/reasoning_reward/mean": 0.4791666567325592, "rewards/reasoning_reward/std": 0.5413181781768799, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 136.4166717529297, "completions/mean_terminated_length": 136.4166717529297, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.6059979317476732, "grad_norm": 3.452553119930219, "kl": 0.059326171875, "learning_rate": 3.3809508073710754e-07, "loss": 0.0024, "num_tokens": 47899740.0, "reward": 1.0833333730697632, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.40824830532073975, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 180.4166717529297, "completions/mean_terminated_length": 180.4166717529297, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.6070320579110652, "grad_norm": 2.0970296040353067, "kl": 0.052001953125, "learning_rate": 3.365590554998819e-07, "loss": 0.0021, "num_tokens": 47978398.0, "reward": 1.25, "reward_std": 0.1259881556034088, "rewards/reasoning_reward/mean": 1.25, "rewards/reasoning_reward/std": 0.41702884435653687, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 151.625, "completions/mean_terminated_length": 147.7391357421875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.6080661840744571, "grad_norm": 4.189419981041684, "kl": 0.0546875, "learning_rate": 3.350247553348647e-07, "loss": 0.0022, "num_tokens": 48060621.0, "reward": 0.875, "reward_std": 0.4082186818122864, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.4484272003173828, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 158.25, "completions/mean_terminated_length": 158.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.609100310237849, "grad_norm": 3.5703666580841418, "kl": 0.054443359375, "learning_rate": 3.3349219643615344e-07, "loss": 0.0022, "num_tokens": 48139971.0, "reward": 1.0833333730697632, "reward_std": 0.27392348647117615, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.35098204016685486, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 182.5, "completions/mean_terminated_length": 182.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.610134436401241, "grad_norm": 3.272644263091865, "kl": 0.060791015625, "learning_rate": 3.319613949794668e-07, "loss": 0.0024, "num_tokens": 48223079.0, "reward": 0.6111111640930176, "reward_std": 0.31463193893432617, "rewards/reasoning_reward/mean": 0.6111111044883728, "rewards/reasoning_reward/std": 0.48070666193962097, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 178.2916717529297, "completions/mean_terminated_length": 178.2916717529297, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.6111685625646329, "grad_norm": 3.8491229946452425, "kl": 0.06884765625, "learning_rate": 3.304323671219744e-07, "loss": 0.0028, "num_tokens": 48302822.0, "reward": 0.9166666865348816, "reward_std": 0.24602244794368744, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.7708455324172974, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 108.08333587646484, "completions/mean_terminated_length": 108.08333587646484, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.6122026887280249, "grad_norm": 3.2825922300107773, "kl": 0.052978515625, "learning_rate": 3.2890512900212585e-07, "loss": 0.0021, "num_tokens": 48380512.0, "reward": 0.5833333730697632, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 0.5833333134651184, "rewards/reasoning_reward/std": 0.5036101341247559, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 152.375, "completions/mean_terminated_length": 152.375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.6132368148914168, "grad_norm": 2.5916120299999803, "kl": 0.053955078125, "learning_rate": 3.273796967394809e-07, "loss": 0.0022, "num_tokens": 48465905.0, "reward": 1.2083333730697632, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.6580052971839905, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 161.0, "completions/mean_terminated_length": 161.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.6142709410548087, "grad_norm": 4.020983723037926, "kl": 0.050048828125, "learning_rate": 3.2585608643453867e-07, "loss": 0.002, "num_tokens": 48543945.0, "reward": 0.9791666865348816, "reward_std": 0.1767766922712326, "rewards/reasoning_reward/mean": 0.9791666865348816, "rewards/reasoning_reward/std": 0.8139966726303101, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 133.6666717529297, "completions/mean_terminated_length": 133.6666717529297, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.6153050672182007, "grad_norm": 2.05299852274566, "kl": 0.04052734375, "learning_rate": 3.2433431416856816e-07, "loss": 0.0016, "num_tokens": 48622017.0, "reward": 0.9583333730697632, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.20412415266036987, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 171.20834350585938, "completions/mean_terminated_length": 171.20834350585938, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.6163391933815926, "grad_norm": 3.1066601718856584, "kl": 0.087890625, "learning_rate": 3.2281439600343835e-07, "loss": 0.0035, "num_tokens": 48704886.0, "reward": 0.5833333730697632, "reward_std": 0.21161314845085144, "rewards/reasoning_reward/mean": 0.5833333134651184, "rewards/reasoning_reward/std": 0.8427009582519531, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 165.2916717529297, "completions/mean_terminated_length": 165.2916717529297, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.6173733195449845, "grad_norm": 4.305444073516952, "kl": 0.07763671875, "learning_rate": 3.2129634798144885e-07, "loss": 0.0031, "num_tokens": 48787885.0, "reward": 1.5625, "reward_std": 0.37917613983154297, "rewards/reasoning_reward/mean": 1.5625, "rewards/reasoning_reward/std": 0.517361581325531, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 157.1666717529297, "completions/mean_terminated_length": 157.1666717529297, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.6184074457083765, "grad_norm": 3.1348740582949395, "kl": 0.07470703125, "learning_rate": 3.1978018612516024e-07, "loss": 0.003, "num_tokens": 48870625.0, "reward": 0.7291666865348816, "reward_std": 0.12400396913290024, "rewards/reasoning_reward/mean": 0.7291666865348816, "rewards/reasoning_reward/std": 0.9438492059707642, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 137.45834350585938, "completions/mean_terminated_length": 137.45834350585938, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.6194415718717684, "grad_norm": 3.57892132145044, "kl": 0.0751953125, "learning_rate": 3.182659264372254e-07, "loss": 0.003, "num_tokens": 48953788.0, "reward": 1.1666667461395264, "reward_std": 0.2903675436973572, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.637022078037262, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 142.2916717529297, "completions/mean_terminated_length": 142.2916717529297, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.6204756980351603, "grad_norm": 4.301884451387064, "kl": 0.0673828125, "learning_rate": 3.1675358490022006e-07, "loss": 0.0027, "num_tokens": 49032971.0, "reward": 0.375, "reward_std": 0.48112308979034424, "rewards/reasoning_reward/mean": 0.375, "rewards/reasoning_reward/std": 0.494535356760025, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 155.4166717529297, "completions/mean_terminated_length": 155.4166717529297, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.6215098241985523, "grad_norm": 2.256674358684405, "kl": 0.056884765625, "learning_rate": 3.1524317747647487e-07, "loss": 0.0023, "num_tokens": 49112829.0, "reward": 1.1805555820465088, "reward_std": 0.12858611345291138, "rewards/reasoning_reward/mean": 1.1805554628372192, "rewards/reasoning_reward/std": 0.3366382122039795, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 154.95834350585938, "completions/mean_terminated_length": 154.95834350585938, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.6225439503619442, "grad_norm": 2.155514941763339, "kl": 0.0771484375, "learning_rate": 3.1373472010790613e-07, "loss": 0.0031, "num_tokens": 49189396.0, "reward": 0.8333333730697632, "reward_std": 0.267261266708374, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.5036101937294006, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 158.375, "completions/mean_terminated_length": 158.375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.6235780765253361, "grad_norm": 3.1569287636174064, "kl": 0.0810546875, "learning_rate": 3.122282287158479e-07, "loss": 0.0032, "num_tokens": 49267197.0, "reward": 0.5625, "reward_std": 0.2848889231681824, "rewards/reasoning_reward/mean": 0.5625, "rewards/reasoning_reward/std": 0.517361581325531, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 149.125, "completions/mean_terminated_length": 149.125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.6246122026887281, "grad_norm": 3.5123089746645637, "kl": 0.058349609375, "learning_rate": 3.1072371920088393e-07, "loss": 0.0023, "num_tokens": 49352248.0, "reward": 1.2083333730697632, "reward_std": 0.243839293718338, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.4871537983417511, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 164.95834350585938, "completions/mean_terminated_length": 164.95834350585938, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.6256463288521199, "grad_norm": 2.617225700730397, "kl": 0.080078125, "learning_rate": 3.092212074426799e-07, "loss": 0.0032, "num_tokens": 49430807.0, "reward": 1.0625, "reward_std": 0.08625819534063339, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.1689159870147705, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 159.70834350585938, "completions/mean_terminated_length": 159.70834350585938, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.6266804550155118, "grad_norm": 3.775214071699908, "kl": 0.07568359375, "learning_rate": 3.0772070929981587e-07, "loss": 0.003, "num_tokens": 49520960.0, "reward": 1.3125, "reward_std": 0.36753225326538086, "rewards/reasoning_reward/mean": 1.3125, "rewards/reasoning_reward/std": 0.6222766637802124, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 166.20834350585938, "completions/mean_terminated_length": 166.20834350585938, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.6277145811789038, "grad_norm": 3.7546362665727453, "kl": 0.0615234375, "learning_rate": 3.062222406096183e-07, "loss": 0.0025, "num_tokens": 49602421.0, "reward": 0.6041666865348816, "reward_std": 0.4294546842575073, "rewards/reasoning_reward/mean": 0.6041666865348816, "rewards/reasoning_reward/std": 0.5893837213516235, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 110.54167175292969, "completions/mean_terminated_length": 110.54167175292969, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.6287487073422957, "grad_norm": 0.1828600047194931, "kl": 0.039794921875, "learning_rate": 3.047258171879939e-07, "loss": 0.0016, "num_tokens": 49682514.0, "reward": 1.0, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.0, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 170.1666717529297, "completions/mean_terminated_length": 170.1666717529297, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.6297828335056876, "grad_norm": 1.781481839446534, "kl": 0.033447265625, "learning_rate": 3.032314548292618e-07, "loss": 0.0013, "num_tokens": 49762454.0, "reward": 0.875, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.337831974029541, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 145.1666717529297, "completions/mean_terminated_length": 145.1666717529297, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.6308169596690796, "grad_norm": 3.893664449459787, "kl": 0.057861328125, "learning_rate": 3.0173916930598743e-07, "loss": 0.0023, "num_tokens": 49848010.0, "reward": 1.0763888359069824, "reward_std": 0.3868226408958435, "rewards/reasoning_reward/mean": 1.0763888359069824, "rewards/reasoning_reward/std": 0.6078773736953735, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 154.375, "completions/mean_terminated_length": 154.375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.6318510858324715, "grad_norm": 2.91304378338177, "kl": 0.07275390625, "learning_rate": 3.0024897636881556e-07, "loss": 0.0029, "num_tokens": 49926275.0, "reward": 0.8333333730697632, "reward_std": 0.24966806173324585, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.458415687084198, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 160.33334350585938, "completions/mean_terminated_length": 160.33334350585938, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.6328852119958635, "grad_norm": 4.645327728337031, "kl": 0.08935546875, "learning_rate": 2.9876089174630465e-07, "loss": 0.0036, "num_tokens": 50008955.0, "reward": 1.0625, "reward_std": 0.3766257166862488, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.5954993963241577, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 114.58333587646484, "completions/mean_terminated_length": 114.58333587646484, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.6339193381592554, "grad_norm": 2.3500928948101976, "kl": 0.043701171875, "learning_rate": 2.972749311447602e-07, "loss": 0.0017, "num_tokens": 50086817.0, "reward": 0.9166666865348816, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.28232985734939575, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 144.33334350585938, "completions/mean_terminated_length": 144.33334350585938, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.6349534643226473, "grad_norm": 4.815665886134638, "kl": 0.06689453125, "learning_rate": 2.957911102480694e-07, "loss": 0.0027, "num_tokens": 50165777.0, "reward": 0.7708333730697632, "reward_std": 0.5201427936553955, "rewards/reasoning_reward/mean": 0.7708333134651184, "rewards/reasoning_reward/std": 0.5512666702270508, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 150.7916717529297, "completions/mean_terminated_length": 150.7916717529297, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.6359875904860393, "grad_norm": 0.5275728635268186, "kl": 0.04052734375, "learning_rate": 2.943094447175356e-07, "loss": 0.0016, "num_tokens": 50248660.0, "reward": 1.0, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.0, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 169.6666717529297, "completions/mean_terminated_length": 169.6666717529297, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.6370217166494312, "grad_norm": 2.0323032948060398, "kl": 0.058837890625, "learning_rate": 2.9282995019171276e-07, "loss": 0.0024, "num_tokens": 50334548.0, "reward": 0.9583333730697632, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.8064504265785217, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 154.20834350585938, "completions/mean_terminated_length": 154.20834350585938, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.6380558428128231, "grad_norm": 3.175018020932587, "kl": 0.068359375, "learning_rate": 2.9135264228624036e-07, "loss": 0.0027, "num_tokens": 50413561.0, "reward": 1.0416667461395264, "reward_std": 0.31925714015960693, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.417752206325531, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 153.6666717529297, "completions/mean_terminated_length": 153.6666717529297, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.6390899689762151, "grad_norm": 4.35022573120695, "kl": 0.07275390625, "learning_rate": 2.8987753659367884e-07, "loss": 0.0029, "num_tokens": 50488753.0, "reward": 0.9375, "reward_std": 0.24185511469841003, "rewards/reasoning_reward/mean": 0.9375, "rewards/reasoning_reward/std": 0.6645119190216064, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 137.25, "completions/mean_terminated_length": 137.25, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.640124095139607, "grad_norm": 3.9637606696357435, "kl": 0.064453125, "learning_rate": 2.884046486833453e-07, "loss": 0.0026, "num_tokens": 50569367.0, "reward": 1.2083333730697632, "reward_std": 0.3268197476863861, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.705824613571167, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 181.25, "completions/mean_terminated_length": 181.25, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.6411582213029989, "grad_norm": 2.8397850153757553, "kl": 0.060546875, "learning_rate": 2.8693399410114793e-07, "loss": 0.0024, "num_tokens": 50649053.0, "reward": 1.0694444179534912, "reward_std": 0.2472916543483734, "rewards/reasoning_reward/mean": 1.0694444179534912, "rewards/reasoning_reward/std": 0.3506951928138733, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 176.33334350585938, "completions/mean_terminated_length": 176.33334350585938, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.6421923474663909, "grad_norm": 3.382273478222385, "kl": 0.07080078125, "learning_rate": 2.854655883694238e-07, "loss": 0.0028, "num_tokens": 50733821.0, "reward": 0.7708333730697632, "reward_std": 0.28302299976348877, "rewards/reasoning_reward/mean": 0.7708333134651184, "rewards/reasoning_reward/std": 0.416485458612442, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 172.2916717529297, "completions/mean_terminated_length": 172.2916717529297, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.6432264736297828, "grad_norm": 2.8244945361941944, "kl": 0.06982421875, "learning_rate": 2.83999446986773e-07, "loss": 0.0028, "num_tokens": 50815580.0, "reward": 0.8958333730697632, "reward_std": 0.2041093111038208, "rewards/reasoning_reward/mean": 0.8958333134651184, "rewards/reasoning_reward/std": 0.7937139868736267, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 159.1666717529297, "completions/mean_terminated_length": 159.1666717529297, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.6442605997931747, "grad_norm": 3.773616346150942, "kl": 0.08056640625, "learning_rate": 2.82535585427897e-07, "loss": 0.0032, "num_tokens": 50897280.0, "reward": 0.8958333730697632, "reward_std": 0.33768826723098755, "rewards/reasoning_reward/mean": 0.8958333134651184, "rewards/reasoning_reward/std": 0.4418136179447174, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 159.5, "completions/mean_terminated_length": 159.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.6452947259565667, "grad_norm": 3.3186602952698903, "kl": 0.07470703125, "learning_rate": 2.8107401914343363e-07, "loss": 0.003, "num_tokens": 50980988.0, "reward": 1.2708333730697632, "reward_std": 0.33086174726486206, "rewards/reasoning_reward/mean": 1.2708333730697632, "rewards/reasoning_reward/std": 0.4418136179447174, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 148.4166717529297, "completions/mean_terminated_length": 148.4166717529297, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.6463288521199586, "grad_norm": 4.081124219760009, "kl": 0.06689453125, "learning_rate": 2.796147635597954e-07, "loss": 0.0027, "num_tokens": 51065942.0, "reward": 1.4583333730697632, "reward_std": 0.36124157905578613, "rewards/reasoning_reward/mean": 1.4583333730697632, "rewards/reasoning_reward/std": 0.550032913684845, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 155.5416717529297, "completions/mean_terminated_length": 155.5416717529297, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.6473629782833505, "grad_norm": 4.171951829209572, "kl": 0.07373046875, "learning_rate": 2.781578340790053e-07, "loss": 0.003, "num_tokens": 51148627.0, "reward": 1.0833333730697632, "reward_std": 0.39000558853149414, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.6863049864768982, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 175.4166717529297, "completions/mean_terminated_length": 175.4166717529297, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.6483971044467425, "grad_norm": 3.816589069723601, "kl": 0.07763671875, "learning_rate": 2.767032460785356e-07, "loss": 0.0031, "num_tokens": 51238597.0, "reward": 1.0208333730697632, "reward_std": 0.42099714279174805, "rewards/reasoning_reward/mean": 1.0208333730697632, "rewards/reasoning_reward/std": 0.6164266467094421, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 153.625, "completions/mean_terminated_length": 153.625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.6494312306101344, "grad_norm": 1.955866878147104, "kl": 0.05078125, "learning_rate": 2.752510149111449e-07, "loss": 0.002, "num_tokens": 51316820.0, "reward": 1.1875, "reward_std": 0.13908717036247253, "rewards/reasoning_reward/mean": 1.1875, "rewards/reasoning_reward/std": 0.355469673871994, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 150.58334350585938, "completions/mean_terminated_length": 150.58334350585938, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.6504653567735263, "grad_norm": 3.0788457748288915, "kl": 0.06396484375, "learning_rate": 2.738011559047155e-07, "loss": 0.0026, "num_tokens": 51402106.0, "reward": 0.8541666865348816, "reward_std": 0.1767766922712326, "rewards/reasoning_reward/mean": 0.8541666865348816, "rewards/reasoning_reward/std": 0.5985338687896729, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 162.20834350585938, "completions/mean_terminated_length": 162.20834350585938, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.6514994829369183, "grad_norm": 3.631670896990646, "kl": 0.064453125, "learning_rate": 2.723536843620931e-07, "loss": 0.0026, "num_tokens": 51486543.0, "reward": 1.0208333730697632, "reward_std": 0.4769924581050873, "rewards/reasoning_reward/mean": 1.0208333730697632, "rewards/reasoning_reward/std": 0.6833289265632629, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 168.5, "completions/mean_terminated_length": 168.5, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.6525336091003102, "grad_norm": 3.121087345839574, "kl": 0.07275390625, "learning_rate": 2.7090861556092347e-07, "loss": 0.0029, "num_tokens": 51570619.0, "reward": 0.9236111044883728, "reward_std": 0.18046043813228607, "rewards/reasoning_reward/mean": 0.9236111044883728, "rewards/reasoning_reward/std": 0.7239504456520081, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 173.33334350585938, "completions/mean_terminated_length": 173.33334350585938, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.6535677352637022, "grad_norm": 4.0132486156921, "kl": 0.07275390625, "learning_rate": 2.6946596475349305e-07, "loss": 0.0029, "num_tokens": 51655891.0, "reward": 1.125, "reward_std": 0.40037286281585693, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.5299029350280762, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 193.45834350585938, "completions/mean_terminated_length": 193.45834350585938, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.6546018614270941, "grad_norm": 3.686435293590089, "kl": 0.0712890625, "learning_rate": 2.680257471665661e-07, "loss": 0.0029, "num_tokens": 51745486.0, "reward": 1.25, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 1.25, "rewards/reasoning_reward/std": 0.4662523865699768, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 154.08334350585938, "completions/mean_terminated_length": 154.08334350585938, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.655635987590486, "grad_norm": 3.142110321096429, "kl": 0.058837890625, "learning_rate": 2.66587978001226e-07, "loss": 0.0024, "num_tokens": 51829440.0, "reward": 1.2083333730697632, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.656670113753878, "grad_norm": 4.105530331463831, "kl": 0.09521484375, "learning_rate": 2.651526724327127e-07, "loss": 0.0038, "num_tokens": 51906116.0, "reward": 1.0, "reward_std": 0.4981178641319275, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.5316095352172852, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 130.1666717529297, "completions/mean_terminated_length": 130.1666717529297, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.6577042399172699, "grad_norm": 3.203427593680448, "kl": 0.0537109375, "learning_rate": 2.6371984561026416e-07, "loss": 0.0021, "num_tokens": 51985168.0, "reward": 0.9861111044883728, "reward_std": 0.29100531339645386, "rewards/reasoning_reward/mean": 0.9861111044883728, "rewards/reasoning_reward/std": 0.35751646757125854, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 155.5, "completions/mean_terminated_length": 155.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.6587383660806618, "grad_norm": 4.913600177708958, "kl": 0.06396484375, "learning_rate": 2.622895126569562e-07, "loss": 0.0026, "num_tokens": 52063404.0, "reward": 0.7916666865348816, "reward_std": 0.4082186818122864, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4148511290550232, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 174.1666717529297, "completions/mean_terminated_length": 174.1666717529297, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.6597724922440538, "grad_norm": 2.4339757197483873, "kl": 0.058837890625, "learning_rate": 2.6086168866954175e-07, "loss": 0.0024, "num_tokens": 52148776.0, "reward": 1.1666667461395264, "reward_std": 0.08908708393573761, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.28232985734939575, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 128.25, "completions/mean_terminated_length": 128.25, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.6608066184074457, "grad_norm": 2.3656571936353594, "kl": 0.072265625, "learning_rate": 2.5943638871829296e-07, "loss": 0.0029, "num_tokens": 52232894.0, "reward": 1.0416667461395264, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.8064504265785217, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 154.0416717529297, "completions/mean_terminated_length": 154.0416717529297, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.6618407445708376, "grad_norm": 2.370538964682846, "kl": 0.060302734375, "learning_rate": 2.5801362784684104e-07, "loss": 0.0024, "num_tokens": 52311279.0, "reward": 1.0, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.39009472727775574, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 136.70834350585938, "completions/mean_terminated_length": 136.70834350585938, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.6628748707342296, "grad_norm": 4.307337721384269, "kl": 0.0693359375, "learning_rate": 2.5659342107201857e-07, "loss": 0.0028, "num_tokens": 52395696.0, "reward": 1.1041667461395264, "reward_std": 0.4240165948867798, "rewards/reasoning_reward/mean": 1.1041666269302368, "rewards/reasoning_reward/std": 0.642332136631012, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 126.04167175292969, "completions/mean_terminated_length": 126.04167175292969, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.6639089968976215, "grad_norm": 3.4267578238060703, "kl": 0.047119140625, "learning_rate": 2.551757833836996e-07, "loss": 0.0019, "num_tokens": 52477881.0, "reward": 0.5416666865348816, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 0.5416666865348816, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 150.125, "completions/mean_terminated_length": 150.125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.6649431230610134, "grad_norm": 3.0652745827085965, "kl": 0.06640625, "learning_rate": 2.537607297446428e-07, "loss": 0.0027, "num_tokens": 52567396.0, "reward": 1.4583333730697632, "reward_std": 0.1451837718486786, "rewards/reasoning_reward/mean": 1.4583333730697632, "rewards/reasoning_reward/std": 0.3877657949924469, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.6659772492244054, "grad_norm": 4.114200525159867, "kl": 0.0546875, "learning_rate": 2.5234827509033294e-07, "loss": 0.0022, "num_tokens": 52643658.0, "reward": 1.1666667461395264, "reward_std": 0.2903675436973572, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 162.95834350585938, "completions/mean_terminated_length": 162.95834350585938, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.6670113753877973, "grad_norm": 4.283824327880422, "kl": 0.061279296875, "learning_rate": 2.509384343288227e-07, "loss": 0.0024, "num_tokens": 52728177.0, "reward": 1.2291667461395264, "reward_std": 0.35970625281333923, "rewards/reasoning_reward/mean": 1.2291666269302368, "rewards/reasoning_reward/std": 0.8720186948776245, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 112.29167175292969, "completions/mean_terminated_length": 112.29167175292969, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.6680455015511892, "grad_norm": 2.5631822617842257, "kl": 0.0517578125, "learning_rate": 2.495312223405766e-07, "loss": 0.0021, "num_tokens": 52805880.0, "reward": 0.5416666865348816, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.5416666865348816, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 159.5, "completions/mean_terminated_length": 159.5, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.6690796277145812, "grad_norm": 3.1846394660646826, "kl": 0.0693359375, "learning_rate": 2.4812665397831243e-07, "loss": 0.0028, "num_tokens": 52888628.0, "reward": 1.1597222089767456, "reward_std": 0.2325192391872406, "rewards/reasoning_reward/mean": 1.1597222089767456, "rewards/reasoning_reward/std": 0.5118857026100159, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 157.95834350585938, "completions/mean_terminated_length": 157.95834350585938, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.6701137538779731, "grad_norm": 2.9464142757577507, "kl": 0.0771484375, "learning_rate": 2.467247440668462e-07, "loss": 0.0031, "num_tokens": 52966915.0, "reward": 1.3333333730697632, "reward_std": 0.28029152750968933, "rewards/reasoning_reward/mean": 1.3333333730697632, "rewards/reasoning_reward/std": 0.4584156572818756, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 161.625, "completions/mean_terminated_length": 161.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.671147880041365, "grad_norm": 3.492845045813914, "kl": 0.06689453125, "learning_rate": 2.453255074029336e-07, "loss": 0.0027, "num_tokens": 53052506.0, "reward": 1.0625, "reward_std": 0.2041093111038208, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.3061862289905548, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 165.58334350585938, "completions/mean_terminated_length": 165.58334350585938, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.672182006204757, "grad_norm": 3.471526846228643, "kl": 0.06494140625, "learning_rate": 2.4392895875511613e-07, "loss": 0.0026, "num_tokens": 53128112.0, "reward": 0.25, "reward_std": 0.30416232347488403, "rewards/reasoning_reward/mean": 0.25, "rewards/reasoning_reward/std": 0.41702884435653687, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 172.58334350585938, "completions/mean_terminated_length": 172.58334350585938, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.6732161323681489, "grad_norm": 3.4426169831993034, "kl": 0.06787109375, "learning_rate": 2.425351128635632e-07, "loss": 0.0027, "num_tokens": 53211558.0, "reward": 1.5, "reward_std": 0.30860671401023865, "rewards/reasoning_reward/mean": 1.5, "rewards/reasoning_reward/std": 0.6593804359436035, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 151.5, "completions/mean_terminated_length": 151.5, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.6742502585315409, "grad_norm": 2.6977128922942453, "kl": 0.06787109375, "learning_rate": 2.411439844399177e-07, "loss": 0.0027, "num_tokens": 53290618.0, "reward": 0.8541666865348816, "reward_std": 0.18766528367996216, "rewards/reasoning_reward/mean": 0.8541666865348816, "rewards/reasoning_reward/std": 0.3753018081188202, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 135.70834350585938, "completions/mean_terminated_length": 135.70834350585938, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.6752843846949328, "grad_norm": 4.273684646889255, "kl": 0.06396484375, "learning_rate": 2.3975558816714073e-07, "loss": 0.0026, "num_tokens": 53366675.0, "reward": 1.0, "reward_std": 0.28029152750968933, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.39009472727775574, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 175.58334350585938, "completions/mean_terminated_length": 178.56521606445312, "completions/min_length": 107.0, "completions/min_terminated_length": 121.0, "epoch": 0.6763185108583247, "grad_norm": 3.922489743480712, "kl": 0.08544921875, "learning_rate": 2.383699386993557e-07, "loss": 0.0034, "num_tokens": 53451353.0, "reward": 1.1111111640930176, "reward_std": 0.4623008966445923, "rewards/reasoning_reward/mean": 1.111111044883728, "rewards/reasoning_reward/std": 0.647676408290863, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 177.2916717529297, "completions/mean_terminated_length": 177.2916717529297, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.6773526370217167, "grad_norm": 3.720452216088832, "kl": 0.08984375, "learning_rate": 2.3698705066169483e-07, "loss": 0.0036, "num_tokens": 53539944.0, "reward": 0.6666666865348816, "reward_std": 0.6266384720802307, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.6538625359535217, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 160.70834350585938, "completions/mean_terminated_length": 160.70834350585938, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.6783867631851086, "grad_norm": 13.755429616291899, "kl": 0.310546875, "learning_rate": 2.356069386501438e-07, "loss": 0.0125, "num_tokens": 53618241.0, "reward": 1.2291667461395264, "reward_std": 0.21322892606258392, "rewards/reasoning_reward/mean": 1.2291666269302368, "rewards/reasoning_reward/std": 0.5893837213516235, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 147.58334350585938, "completions/mean_terminated_length": 147.58334350585938, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.6794208893485005, "grad_norm": 3.308998342120999, "kl": 0.0732421875, "learning_rate": 2.342296172313886e-07, "loss": 0.0029, "num_tokens": 53706335.0, "reward": 1.4722222089767456, "reward_std": 0.2105759084224701, "rewards/reasoning_reward/mean": 1.4722222089767456, "rewards/reasoning_reward/std": 0.4441424012184143, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 160.6666717529297, "completions/mean_terminated_length": 160.6666717529297, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.6804550155118925, "grad_norm": 2.0128009083781153, "kl": 0.07666015625, "learning_rate": 2.3285510094266087e-07, "loss": 0.0031, "num_tokens": 53784703.0, "reward": 1.1875, "reward_std": 0.10681166499853134, "rewards/reasoning_reward/mean": 1.1875, "rewards/reasoning_reward/std": 0.3234494626522064, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 144.875, "completions/mean_terminated_length": 144.875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.6814891416752844, "grad_norm": 2.9811727952229545, "kl": 0.06591796875, "learning_rate": 2.3148340429158526e-07, "loss": 0.0027, "num_tokens": 53870556.0, "reward": 1.0416667461395264, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.35864076018333435, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 154.2916717529297, "completions/mean_terminated_length": 154.2916717529297, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.6825232678386763, "grad_norm": 3.6867423359764477, "kl": 0.040283203125, "learning_rate": 2.3011454175602558e-07, "loss": 0.0016, "num_tokens": 53949691.0, "reward": 0.875, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.337831974029541, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 221.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 156.875, "completions/mean_terminated_length": 154.0869598388672, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.6835573940020683, "grad_norm": 3.4323592527418394, "kl": 0.111328125, "learning_rate": 2.2874852778393266e-07, "loss": 0.0045, "num_tokens": 54037808.0, "reward": 0.9583333730697632, "reward_std": 0.2985045611858368, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.5882299542427063, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 143.4166717529297, "completions/mean_terminated_length": 139.95652770996094, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.6845915201654602, "grad_norm": 3.3556977958017895, "kl": 0.07958984375, "learning_rate": 2.273853767931918e-07, "loss": 0.0032, "num_tokens": 54116794.0, "reward": 1.0416667461395264, "reward_std": 0.3506905436515808, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.5500329732894897, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 160.25, "completions/mean_terminated_length": 160.25, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.6856256463288521, "grad_norm": 2.337364555066446, "kl": 0.049072265625, "learning_rate": 2.2602510317146956e-07, "loss": 0.002, "num_tokens": 54201880.0, "reward": 1.125, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.7408866882324219, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 154.83334350585938, "completions/mean_terminated_length": 154.83334350585938, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.6866597724922441, "grad_norm": 3.4959986502490086, "kl": 0.09521484375, "learning_rate": 2.246677212760636e-07, "loss": 0.0038, "num_tokens": 54290276.0, "reward": 1.4930555820465088, "reward_std": 0.35425281524658203, "rewards/reasoning_reward/mean": 1.4930554628372192, "rewards/reasoning_reward/std": 0.7010673880577087, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 184.875, "completions/mean_terminated_length": 184.875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.687693898655636, "grad_norm": 3.6836605606951633, "kl": 0.07763671875, "learning_rate": 2.233132454337494e-07, "loss": 0.0031, "num_tokens": 54368369.0, "reward": 0.9166666865348816, "reward_std": 0.41387641429901123, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.524749755859375, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 141.70834350585938, "completions/mean_terminated_length": 141.70834350585938, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.688728024819028, "grad_norm": 3.8448609470179624, "kl": 0.07275390625, "learning_rate": 2.2196168994063075e-07, "loss": 0.0029, "num_tokens": 54446450.0, "reward": 0.8541666865348816, "reward_std": 0.23709973692893982, "rewards/reasoning_reward/mean": 0.8541666865348816, "rewards/reasoning_reward/std": 0.40322521328926086, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 181.58334350585938, "completions/mean_terminated_length": 181.58334350585938, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.6897621509824199, "grad_norm": 3.2290937591662097, "kl": 0.08544921875, "learning_rate": 2.2061306906198707e-07, "loss": 0.0034, "num_tokens": 54530368.0, "reward": 1.3541667461395264, "reward_std": 0.27830731868743896, "rewards/reasoning_reward/mean": 1.3541666269302368, "rewards/reasoning_reward/std": 0.453948050737381, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 184.33334350585938, "completions/mean_terminated_length": 184.33334350585938, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.6907962771458118, "grad_norm": 3.9502785850753477, "kl": 0.08154296875, "learning_rate": 2.1926739703212472e-07, "loss": 0.0033, "num_tokens": 54613976.0, "reward": 0.9375, "reward_std": 0.34602540731430054, "rewards/reasoning_reward/mean": 0.9375, "rewards/reasoning_reward/std": 0.6806725859642029, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 171.9166717529297, "completions/mean_terminated_length": 171.9166717529297, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.6918304033092038, "grad_norm": 2.466267818183153, "kl": 0.056884765625, "learning_rate": 2.1792468805422487e-07, "loss": 0.0023, "num_tokens": 54695686.0, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.3061862289905548, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 149.1666717529297, "completions/mean_terminated_length": 149.1666717529297, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.6928645294725957, "grad_norm": 3.9637637220123643, "kl": 0.08447265625, "learning_rate": 2.1658495630019518e-07, "loss": 0.0034, "num_tokens": 54782322.0, "reward": 1.3680555820465088, "reward_std": 0.36238986253738403, "rewards/reasoning_reward/mean": 1.3680554628372192, "rewards/reasoning_reward/std": 0.5059528946876526, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 176.25, "completions/mean_terminated_length": 176.25, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.6938986556359876, "grad_norm": 3.8385551187680393, "kl": 0.0771484375, "learning_rate": 2.152482159105196e-07, "loss": 0.0031, "num_tokens": 54865144.0, "reward": 0.930555522441864, "reward_std": 0.4847185015678406, "rewards/reasoning_reward/mean": 0.930555522441864, "rewards/reasoning_reward/std": 0.8411469459533691, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 173.33334350585938, "completions/mean_terminated_length": 173.33334350585938, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.6949327817993796, "grad_norm": 3.164139243922661, "kl": 0.0791015625, "learning_rate": 2.1391448099410853e-07, "loss": 0.0032, "num_tokens": 54948368.0, "reward": 1.1597222089767456, "reward_std": 0.33324792981147766, "rewards/reasoning_reward/mean": 1.1597222089767456, "rewards/reasoning_reward/std": 0.5118856430053711, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 147.875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.6959669079627715, "grad_norm": 3.4067201589070613, "kl": 0.054443359375, "learning_rate": 2.1258376562815112e-07, "loss": 0.0022, "num_tokens": 55029237.0, "reward": 1.0833333730697632, "reward_std": 0.32025060057640076, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.5646597146987915, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 162.58334350585938, "completions/mean_terminated_length": 162.58334350585938, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.6970010341261634, "grad_norm": 2.5831577803082353, "kl": 0.07275390625, "learning_rate": 2.112560838579653e-07, "loss": 0.0029, "num_tokens": 55120027.0, "reward": 1.5763888359069824, "reward_std": 0.1376926600933075, "rewards/reasoning_reward/mean": 1.5763888359069824, "rewards/reasoning_reward/std": 0.45038676261901855, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 172.4166717529297, "completions/mean_terminated_length": 172.4166717529297, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.6980351602895554, "grad_norm": 3.9629234668552953, "kl": 0.06640625, "learning_rate": 2.0993144969685106e-07, "loss": 0.0027, "num_tokens": 55195797.0, "reward": 0.8333333730697632, "reward_std": 0.39000558853149414, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.6863049864768982, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 166.375, "completions/mean_terminated_length": 166.375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.6990692864529473, "grad_norm": 2.8177613643424504, "kl": 0.09228515625, "learning_rate": 2.0860987712594103e-07, "loss": 0.0037, "num_tokens": 55284382.0, "reward": 1.2708333730697632, "reward_std": 0.20249390602111816, "rewards/reasoning_reward/mean": 1.2708333730697632, "rewards/reasoning_reward/std": 0.8509904742240906, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 151.70834350585938, "completions/mean_terminated_length": 151.70834350585938, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.7001034126163392, "grad_norm": 0.17490723954585877, "kl": 0.0546875, "learning_rate": 2.0729138009405417e-07, "loss": 0.0022, "num_tokens": 55363807.0, "reward": 0.6666666865348816, "reward_std": 0.0, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 151.70834350585938, "completions/mean_terminated_length": 151.70834350585938, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.7011375387797312, "grad_norm": 2.1768886150741418, "kl": 0.07861328125, "learning_rate": 2.05975972517548e-07, "loss": 0.0031, "num_tokens": 55452096.0, "reward": 1.6458333730697632, "reward_std": 0.0589255653321743, "rewards/reasoning_reward/mean": 1.6458333730697632, "rewards/reasoning_reward/std": 0.47729232907295227, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 125.66667175292969, "completions/mean_terminated_length": 125.66667175292969, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.7021716649431231, "grad_norm": 3.6068735599469863, "kl": 0.044189453125, "learning_rate": 2.0466366828017113e-07, "loss": 0.0018, "num_tokens": 55538384.0, "reward": 1.2083333730697632, "reward_std": 0.20693820714950562, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.4643056094646454, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 156.375, "completions/mean_terminated_length": 156.375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.703205791106515, "grad_norm": 3.5043504652823727, "kl": 0.08251953125, "learning_rate": 2.033544812329181e-07, "loss": 0.0033, "num_tokens": 55626745.0, "reward": 1.3541667461395264, "reward_std": 0.32196044921875, "rewards/reasoning_reward/mean": 1.3541666269302368, "rewards/reasoning_reward/std": 0.5800893306732178, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 143.875, "completions/mean_terminated_length": 143.875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.704239917269907, "grad_norm": 2.9250034907846185, "kl": 0.076171875, "learning_rate": 2.020484251938817e-07, "loss": 0.0031, "num_tokens": 55712742.0, "reward": 0.875, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.8998792171478271, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 139.5416717529297, "completions/mean_terminated_length": 139.5416717529297, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.7052740434332989, "grad_norm": 2.899108027116337, "kl": 0.0595703125, "learning_rate": 2.007455139481085e-07, "loss": 0.0024, "num_tokens": 55792819.0, "reward": 0.75, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.4423258602619171, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 157.70834350585938, "completions/mean_terminated_length": 157.70834350585938, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.7063081695966908, "grad_norm": 0.19921601907261913, "kl": 0.055908203125, "learning_rate": 1.9944576124745205e-07, "loss": 0.0022, "num_tokens": 55869540.0, "reward": 1.1666667461395264, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.24077169597148895, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 183.33334350585938, "completions/mean_terminated_length": 183.33334350585938, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.7073422957600828, "grad_norm": 3.8943822086746436, "kl": 0.0791015625, "learning_rate": 1.9814918081042887e-07, "loss": 0.0032, "num_tokens": 55948236.0, "reward": 1.0625, "reward_std": 0.3584126830101013, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.42509588599205017, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 161.4166717529297, "completions/mean_terminated_length": 161.4166717529297, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.7083764219234746, "grad_norm": 2.3459301471771914, "kl": 0.06591796875, "learning_rate": 1.9685578632207268e-07, "loss": 0.0026, "num_tokens": 56033366.0, "reward": 0.7916666865348816, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.9315329194068909, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 142.20834350585938, "completions/mean_terminated_length": 142.20834350585938, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.7094105480868665, "grad_norm": 3.786217018380023, "kl": 0.060302734375, "learning_rate": 1.9556559143379097e-07, "loss": 0.0024, "num_tokens": 56110987.0, "reward": 0.875, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.337831974029541, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 174.875, "completions/mean_terminated_length": 174.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.7104446742502585, "grad_norm": 3.277700175814856, "kl": 0.0634765625, "learning_rate": 1.9427860976321996e-07, "loss": 0.0025, "num_tokens": 56191280.0, "reward": 0.7916666865348816, "reward_std": 0.29602527618408203, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4148511290550232, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 139.70834350585938, "completions/mean_terminated_length": 139.70834350585938, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.7114788004136504, "grad_norm": 2.2482590984095934, "kl": 0.0830078125, "learning_rate": 1.9299485489408125e-07, "loss": 0.0033, "num_tokens": 56275553.0, "reward": 0.9722222089767456, "reward_std": 0.0514344647526741, "rewards/reasoning_reward/mean": 0.9722221493721008, "rewards/reasoning_reward/std": 0.8040757775306702, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 168.4166717529297, "completions/mean_terminated_length": 168.4166717529297, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.7125129265770423, "grad_norm": 1.952148524225847, "kl": 0.06982421875, "learning_rate": 1.9171434037603883e-07, "loss": 0.0028, "num_tokens": 56354091.0, "reward": 1.2013888359069824, "reward_std": 0.058925554156303406, "rewards/reasoning_reward/mean": 1.2013888359069824, "rewards/reasoning_reward/std": 0.3068428933620453, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 151.1666717529297, "completions/mean_terminated_length": 151.1666717529297, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.7135470527404343, "grad_norm": 3.0579537299955377, "kl": 0.06591796875, "learning_rate": 1.9043707972455537e-07, "loss": 0.0026, "num_tokens": 56437463.0, "reward": 1.0, "reward_std": 0.31100785732269287, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.6370220184326172, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 154.875, "completions/mean_terminated_length": 154.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.7145811789038262, "grad_norm": 2.1711645432342626, "kl": 0.12158203125, "learning_rate": 1.8916308642075007e-07, "loss": 0.0049, "num_tokens": 56525892.0, "reward": 1.8125, "reward_std": 0.0589255653321743, "rewards/reasoning_reward/mean": 1.8125, "rewards/reasoning_reward/std": 0.28788962960243225, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 162.75, "completions/mean_terminated_length": 162.75, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.7156153050672182, "grad_norm": 2.18124506294311, "kl": 0.10888671875, "learning_rate": 1.8789237391125644e-07, "loss": 0.0044, "num_tokens": 56620214.0, "reward": 1.375, "reward_std": 0.21362332999706268, "rewards/reasoning_reward/mean": 1.375, "rewards/reasoning_reward/std": 0.5757792592048645, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 141.5416717529297, "completions/mean_terminated_length": 141.5416717529297, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.7166494312306101, "grad_norm": 2.1892743478043566, "kl": 0.06689453125, "learning_rate": 1.8662495560807957e-07, "loss": 0.0027, "num_tokens": 56702419.0, "reward": 1.1875, "reward_std": 0.0589255653321743, "rewards/reasoning_reward/mean": 1.1875, "rewards/reasoning_reward/std": 0.28788962960243225, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 165.2916717529297, "completions/mean_terminated_length": 165.2916717529297, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.717683557394002, "grad_norm": 4.212537614597707, "kl": 0.09326171875, "learning_rate": 1.8536084488845583e-07, "loss": 0.0037, "num_tokens": 56779346.0, "reward": 0.5625, "reward_std": 0.37478944659233093, "rewards/reasoning_reward/mean": 0.5625, "rewards/reasoning_reward/std": 0.7270025014877319, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 177.7916717529297, "completions/mean_terminated_length": 177.7916717529297, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.718717683557394, "grad_norm": 2.0102770916091472, "kl": 0.056396484375, "learning_rate": 1.8410005509471028e-07, "loss": 0.0023, "num_tokens": 56860157.0, "reward": 1.0625, "reward_std": 0.12400396913290024, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.2242136001586914, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 130.375, "completions/mean_terminated_length": 130.375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.7197518097207859, "grad_norm": 2.329478391630926, "kl": 0.0439453125, "learning_rate": 1.828425995341173e-07, "loss": 0.0018, "num_tokens": 56939902.0, "reward": 0.375, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.375, "rewards/reasoning_reward/std": 0.494535356760025, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 151.875, "completions/mean_terminated_length": 151.875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.7207859358841778, "grad_norm": 3.1877025746249577, "kl": 0.055908203125, "learning_rate": 1.815884914787587e-07, "loss": 0.0022, "num_tokens": 57024539.0, "reward": 1.0208333730697632, "reward_std": 0.24056154489517212, "rewards/reasoning_reward/mean": 1.0208333730697632, "rewards/reasoning_reward/std": 0.3753018081188202, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 159.7916717529297, "completions/mean_terminated_length": 159.0, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.7218200620475698, "grad_norm": 3.3274373464580114, "kl": 0.055908203125, "learning_rate": 1.80337744165385e-07, "loss": 0.0022, "num_tokens": 57108438.0, "reward": 0.6041666865348816, "reward_std": 0.3803371787071228, "rewards/reasoning_reward/mean": 0.6041666865348816, "rewards/reasoning_reward/std": 0.642332136631012, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 144.2916717529297, "completions/mean_terminated_length": 144.2916717529297, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.7228541882109617, "grad_norm": 0.20006012207095064, "kl": 0.07568359375, "learning_rate": 1.7909037079527433e-07, "loss": 0.003, "num_tokens": 57192853.0, "reward": 0.6666666865348816, "reward_std": 0.0, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.9630867838859558, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 146.83334350585938, "completions/mean_terminated_length": 146.83334350585938, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.7238883143743536, "grad_norm": 3.3580145045189655, "kl": 0.07666015625, "learning_rate": 1.7784638453409451e-07, "loss": 0.0031, "num_tokens": 57271785.0, "reward": 1.0555555820465088, "reward_std": 0.3299504518508911, "rewards/reasoning_reward/mean": 1.0555555820465088, "rewards/reasoning_reward/std": 0.4189550578594208, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 140.95834350585938, "completions/mean_terminated_length": 140.95834350585938, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.7249224405377456, "grad_norm": 0.22405126131157563, "kl": 0.059814453125, "learning_rate": 1.7660579851176317e-07, "loss": 0.0024, "num_tokens": 57347536.0, "reward": 1.0, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.0, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 315.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 180.5, "completions/mean_terminated_length": 174.6521759033203, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.7259565667011375, "grad_norm": 2.9389018302570014, "kl": 0.055419921875, "learning_rate": 1.7536862582230893e-07, "loss": 0.0022, "num_tokens": 57430948.0, "reward": 1.5833333730697632, "reward_std": 0.27392348647117615, "rewards/reasoning_reward/mean": 1.5833333730697632, "rewards/reasoning_reward/std": 0.4584156572818756, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 178.0416717529297, "completions/mean_terminated_length": 174.60870361328125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.7269906928645294, "grad_norm": 2.282134417830866, "kl": 0.0576171875, "learning_rate": 1.7413487952373455e-07, "loss": 0.0023, "num_tokens": 57508453.0, "reward": 1.0833333730697632, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.40824830532073975, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 184.58334350585938, "completions/mean_terminated_length": 181.69564819335938, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.7280248190279214, "grad_norm": 4.1836131721858765, "kl": 0.08935546875, "learning_rate": 1.7290457263787728e-07, "loss": 0.0036, "num_tokens": 57593995.0, "reward": 1.3125, "reward_std": 0.3492930829524994, "rewards/reasoning_reward/mean": 1.3125, "rewards/reasoning_reward/std": 0.7775728702545166, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 166.70834350585938, "completions/mean_terminated_length": 166.70834350585938, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.7290589451913133, "grad_norm": 3.165264879865031, "kl": 0.058837890625, "learning_rate": 1.7167771815027317e-07, "loss": 0.0024, "num_tokens": 57672700.0, "reward": 0.6041666865348816, "reward_std": 0.30217814445495605, "rewards/reasoning_reward/mean": 0.6041666865348816, "rewards/reasoning_reward/std": 0.6590369343757629, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 156.625, "completions/mean_terminated_length": 156.625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.7300930713547052, "grad_norm": 2.672806554030225, "kl": 0.054931640625, "learning_rate": 1.7045432901001844e-07, "loss": 0.0022, "num_tokens": 57751843.0, "reward": 0.875, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.337831974029541, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 136.70834350585938, "completions/mean_terminated_length": 136.70834350585938, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.7311271975180972, "grad_norm": 3.035864140075777, "kl": 0.07080078125, "learning_rate": 1.6923441812963434e-07, "loss": 0.0028, "num_tokens": 57839420.0, "reward": 1.3333333730697632, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 1.3333333730697632, "rewards/reasoning_reward/std": 0.4815433919429779, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 162.95834350585938, "completions/mean_terminated_length": 162.95834350585938, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.7321613236814891, "grad_norm": 2.6352705999114425, "kl": 0.07861328125, "learning_rate": 1.6801799838492942e-07, "loss": 0.0032, "num_tokens": 57917803.0, "reward": 0.7083333730697632, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.4643056094646454, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 177.25, "completions/mean_terminated_length": 177.25, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.733195449844881, "grad_norm": 2.839232223213205, "kl": 0.07666015625, "learning_rate": 1.6680508261486465e-07, "loss": 0.0031, "num_tokens": 58001793.0, "reward": 0.7083333730697632, "reward_std": 0.33034375309944153, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.7649476528167725, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 166.20834350585938, "completions/mean_terminated_length": 166.20834350585938, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.734229576008273, "grad_norm": 3.7820274298450687, "kl": 0.07177734375, "learning_rate": 1.6559568362141769e-07, "loss": 0.0029, "num_tokens": 58086726.0, "reward": 0.8125, "reward_std": 0.4476938843727112, "rewards/reasoning_reward/mean": 0.8125, "rewards/reasoning_reward/std": 0.6562823057174683, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 154.5416717529297, "completions/mean_terminated_length": 154.5416717529297, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.7352637021716649, "grad_norm": 3.1124009121892198, "kl": 0.09228515625, "learning_rate": 1.6438981416944708e-07, "loss": 0.0037, "num_tokens": 58173611.0, "reward": 0.8333333730697632, "reward_std": 0.30860671401023865, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.8164966106414795, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 196.0416717529297, "completions/mean_terminated_length": 196.0416717529297, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.7362978283350569, "grad_norm": 3.1443322611380684, "kl": 0.08544921875, "learning_rate": 1.631874869865587e-07, "loss": 0.0034, "num_tokens": 58251924.0, "reward": 1.3958333730697632, "reward_std": 0.33768826723098755, "rewards/reasoning_reward/mean": 1.3958333730697632, "rewards/reasoning_reward/std": 0.48854637145996094, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 124.125, "completions/mean_terminated_length": 124.125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.7373319544984488, "grad_norm": 2.3752422112829867, "kl": 0.054443359375, "learning_rate": 1.6198871476297033e-07, "loss": 0.0022, "num_tokens": 58332575.0, "reward": 0.8958333730697632, "reward_std": 0.19795583188533783, "rewards/reasoning_reward/mean": 0.8958333134651184, "rewards/reasoning_reward/std": 0.3605300188064575, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 150.4166717529297, "completions/mean_terminated_length": 150.4166717529297, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.7383660806618407, "grad_norm": 2.0212470615420246, "kl": 0.051025390625, "learning_rate": 1.607935101513785e-07, "loss": 0.002, "num_tokens": 58418201.0, "reward": 0.9166666865348816, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.28232985734939575, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 171.58334350585938, "completions/mean_terminated_length": 171.58334350585938, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.7394002068252327, "grad_norm": 3.251685083157416, "kl": 0.061767578125, "learning_rate": 1.596018857668242e-07, "loss": 0.0025, "num_tokens": 58495583.0, "reward": 0.6875, "reward_std": 0.3709374666213989, "rewards/reasoning_reward/mean": 0.6875, "rewards/reasoning_reward/std": 0.7194880843162537, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 212.08334350585938, "completions/mean_terminated_length": 212.08334350585938, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.7404343329886246, "grad_norm": 3.242133442856897, "kl": 0.07177734375, "learning_rate": 1.5841385418656068e-07, "loss": 0.0029, "num_tokens": 58581425.0, "reward": 1.4583333730697632, "reward_std": 0.29602527618408203, "rewards/reasoning_reward/mean": 1.4583333730697632, "rewards/reasoning_reward/std": 0.5882299542427063, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 168.33334350585938, "completions/mean_terminated_length": 168.33334350585938, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.7414684591520165, "grad_norm": 3.0656410513536554, "kl": 0.07373046875, "learning_rate": 1.5722942794991995e-07, "loss": 0.003, "num_tokens": 58665161.0, "reward": 1.2291667461395264, "reward_std": 0.39486488699913025, "rewards/reasoning_reward/mean": 1.2291666269302368, "rewards/reasoning_reward/std": 0.6251811385154724, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 144.2916717529297, "completions/mean_terminated_length": 144.2916717529297, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.7425025853154085, "grad_norm": 3.1287480808791184, "kl": 0.07177734375, "learning_rate": 1.5604861955818038e-07, "loss": 0.0029, "num_tokens": 58752048.0, "reward": 0.9166666865348816, "reward_std": 0.2903675436973572, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.7172814607620239, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 161.9166717529297, "completions/mean_terminated_length": 161.9166717529297, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.7435367114788004, "grad_norm": 4.0449765747453545, "kl": 0.08740234375, "learning_rate": 1.548714414744356e-07, "loss": 0.0035, "num_tokens": 58834862.0, "reward": 0.9583333730697632, "reward_std": 0.39814266562461853, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.7359800934791565, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 172.08334350585938, "completions/mean_terminated_length": 172.08334350585938, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.7445708376421923, "grad_norm": 3.011337708200572, "kl": 0.06298828125, "learning_rate": 1.5369790612346168e-07, "loss": 0.0025, "num_tokens": 58914176.0, "reward": 1.0694444179534912, "reward_std": 0.2472916543483734, "rewards/reasoning_reward/mean": 1.0694444179534912, "rewards/reasoning_reward/std": 0.3506951928138733, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 170.5, "completions/mean_terminated_length": 170.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.7456049638055843, "grad_norm": 4.6480620380414965, "kl": 0.0625, "learning_rate": 1.5252802589158737e-07, "loss": 0.0025, "num_tokens": 58996844.0, "reward": 1.0694445371627808, "reward_std": 0.30431777238845825, "rewards/reasoning_reward/mean": 1.0694445371627808, "rewards/reasoning_reward/std": 0.7659994959831238, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 148.83334350585938, "completions/mean_terminated_length": 148.83334350585938, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.7466390899689762, "grad_norm": 3.1996652428167045, "kl": 0.056884765625, "learning_rate": 1.513618131265621e-07, "loss": 0.0023, "num_tokens": 59076408.0, "reward": 0.9791666865348816, "reward_std": 0.24056154489517212, "rewards/reasoning_reward/mean": 0.9791666865348816, "rewards/reasoning_reward/std": 0.6672325730323792, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 172.625, "completions/mean_terminated_length": 172.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.7476732161323681, "grad_norm": 3.280704854907078, "kl": 0.06982421875, "learning_rate": 1.5019928013742682e-07, "loss": 0.0028, "num_tokens": 59163431.0, "reward": 1.2083333730697632, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.9315329790115356, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 134.70834350585938, "completions/mean_terminated_length": 134.70834350585938, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.7487073422957601, "grad_norm": 2.185541011879563, "kl": 0.06640625, "learning_rate": 1.490404391943829e-07, "loss": 0.0026, "num_tokens": 59246096.0, "reward": 1.25, "reward_std": 0.07273930311203003, "rewards/reasoning_reward/mean": 1.25, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 165.0416717529297, "completions/mean_terminated_length": 165.0416717529297, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.749741468459152, "grad_norm": 2.51252824919629, "kl": 0.09521484375, "learning_rate": 1.4788530252866372e-07, "loss": 0.0038, "num_tokens": 59325185.0, "reward": 0.625, "reward_std": 0.21362332999706268, "rewards/reasoning_reward/mean": 0.625, "rewards/reasoning_reward/std": 0.5757792592048645, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 162.6666717529297, "completions/mean_terminated_length": 162.6666717529297, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.750775594622544, "grad_norm": 2.8053048835881818, "kl": 0.060546875, "learning_rate": 1.4673388233240502e-07, "loss": 0.0024, "num_tokens": 59416513.0, "reward": 1.4375, "reward_std": 0.12400396913290024, "rewards/reasoning_reward/mean": 1.4375, "rewards/reasoning_reward/std": 0.47348156571388245, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 181.45834350585938, "completions/mean_terminated_length": 178.3913116455078, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.7518097207859359, "grad_norm": 4.163295102377276, "kl": 0.060546875, "learning_rate": 1.455861907585158e-07, "loss": 0.0024, "num_tokens": 59494236.0, "reward": 0.9375, "reward_std": 0.5304333567619324, "rewards/reasoning_reward/mean": 0.9375, "rewards/reasoning_reward/std": 0.613480806350708, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 169.83334350585938, "completions/mean_terminated_length": 169.83334350585938, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.7528438469493278, "grad_norm": 3.420329539869673, "kl": 0.08251953125, "learning_rate": 1.4444223992055116e-07, "loss": 0.0033, "num_tokens": 59578752.0, "reward": 1.0, "reward_std": 0.36585909128189087, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.4662523865699768, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 147.9166717529297, "completions/mean_terminated_length": 147.9166717529297, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.7538779731127198, "grad_norm": 4.210005348002778, "kl": 0.06298828125, "learning_rate": 1.4330204189258327e-07, "loss": 0.0025, "num_tokens": 59655494.0, "reward": 0.6180555820465088, "reward_std": 0.47668325901031494, "rewards/reasoning_reward/mean": 0.618055522441864, "rewards/reasoning_reward/std": 0.5948230028152466, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 151.7916717529297, "completions/mean_terminated_length": 151.7916717529297, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.7549120992761117, "grad_norm": 3.359678282929136, "kl": 0.0771484375, "learning_rate": 1.4216560870907496e-07, "loss": 0.0031, "num_tokens": 59733049.0, "reward": 1.1111111640930176, "reward_std": 0.3074157238006592, "rewards/reasoning_reward/mean": 1.1111111640930176, "rewards/reasoning_reward/std": 0.3796345293521881, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 169.6666717529297, "completions/mean_terminated_length": 169.43478393554688, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.7559462254395036, "grad_norm": 4.109241986433982, "kl": 0.072265625, "learning_rate": 1.4103295236475166e-07, "loss": 0.0029, "num_tokens": 59811105.0, "reward": 1.125, "reward_std": 0.42645785212516785, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.6123724579811096, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 164.0416717529297, "completions/mean_terminated_length": 164.0416717529297, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.7569803516028956, "grad_norm": 3.609914878076435, "kl": 0.07958984375, "learning_rate": 1.3990408481447596e-07, "loss": 0.0032, "num_tokens": 59895338.0, "reward": 0.7916666865348816, "reward_std": 0.4563485085964203, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.832970917224884, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 170.9166717529297, "completions/mean_terminated_length": 170.9166717529297, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.7580144777662875, "grad_norm": 3.4849706899707638, "kl": 0.051513671875, "learning_rate": 1.387790179731202e-07, "loss": 0.0021, "num_tokens": 59974200.0, "reward": 1.0625, "reward_std": 0.29339051246643066, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.4499396085739136, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 151.625, "completions/mean_terminated_length": 151.625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.7590486039296794, "grad_norm": 4.775586282531071, "kl": 0.06298828125, "learning_rate": 1.3765776371544173e-07, "loss": 0.0025, "num_tokens": 60050775.0, "reward": 0.8472222685813904, "reward_std": 0.3458244502544403, "rewards/reasoning_reward/mean": 0.8472222685813904, "rewards/reasoning_reward/std": 0.38359054923057556, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 176.20834350585938, "completions/mean_terminated_length": 176.20834350585938, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.7600827300930714, "grad_norm": 3.511611967896255, "kl": 0.083984375, "learning_rate": 1.3654033387595732e-07, "loss": 0.0034, "num_tokens": 60134140.0, "reward": 1.2916667461395264, "reward_std": 0.30078065395355225, "rewards/reasoning_reward/mean": 1.2916666269302368, "rewards/reasoning_reward/std": 0.4148510992527008, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 125.5, "completions/mean_terminated_length": 125.5, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.7611168562564633, "grad_norm": 4.1099857019944395, "kl": 0.10205078125, "learning_rate": 1.3542674024881746e-07, "loss": 0.0041, "num_tokens": 60211512.0, "reward": 1.0416667461395264, "reward_std": 0.29602527618408203, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.3877657949924469, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 123.95833587646484, "completions/mean_terminated_length": 123.95833587646484, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.7621509824198552, "grad_norm": 2.5865983458888597, "kl": 0.056396484375, "learning_rate": 1.3431699458768332e-07, "loss": 0.0023, "num_tokens": 60289567.0, "reward": 0.9583333730697632, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.20412415266036987, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 148.33334350585938, "completions/mean_terminated_length": 148.33334350585938, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.7631851085832472, "grad_norm": 4.027481526615197, "kl": 0.07470703125, "learning_rate": 1.332111086056011e-07, "loss": 0.003, "num_tokens": 60370247.0, "reward": 1.3125, "reward_std": 0.39615845680236816, "rewards/reasoning_reward/mean": 1.3125, "rewards/reasoning_reward/std": 0.6045569777488708, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 134.70834350585938, "completions/mean_terminated_length": 134.70834350585938, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.7642192347466391, "grad_norm": 3.3303426892145067, "kl": 0.0419921875, "learning_rate": 1.3210909397487995e-07, "loss": 0.0017, "num_tokens": 60450768.0, "reward": 0.8333333730697632, "reward_std": 0.2903675436973572, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 137.2916717529297, "completions/mean_terminated_length": 137.2916717529297, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.765253360910031, "grad_norm": 4.295523478132368, "kl": 0.087890625, "learning_rate": 1.3101096232696735e-07, "loss": 0.0035, "num_tokens": 60540887.0, "reward": 0.993055522441864, "reward_std": 0.25392836332321167, "rewards/reasoning_reward/mean": 0.993055522441864, "rewards/reasoning_reward/std": 0.6245368719100952, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 159.7916717529297, "completions/mean_terminated_length": 159.7916717529297, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.766287487073423, "grad_norm": 3.7021465221363217, "kl": 0.08740234375, "learning_rate": 1.2991672525232756e-07, "loss": 0.0035, "num_tokens": 60629106.0, "reward": 1.5416667461395264, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 1.5416666269302368, "rewards/reasoning_reward/std": 0.6580053567886353, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 172.6666717529297, "completions/mean_terminated_length": 172.6666717529297, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.7673216132368149, "grad_norm": 3.9994888009560037, "kl": 0.07763671875, "learning_rate": 1.2882639430031833e-07, "loss": 0.0031, "num_tokens": 60718066.0, "reward": 1.5625, "reward_std": 0.25392839312553406, "rewards/reasoning_reward/mean": 1.5625, "rewards/reasoning_reward/std": 0.5578004121780396, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 165.375, "completions/mean_terminated_length": 165.375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.7683557394002068, "grad_norm": 0.3657715562784098, "kl": 0.0693359375, "learning_rate": 1.2773998097906962e-07, "loss": 0.0028, "num_tokens": 60800979.0, "reward": 1.0, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.8340576887130737, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 159.125, "completions/mean_terminated_length": 159.125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.7693898655635988, "grad_norm": 2.1996165323485397, "kl": 0.052978515625, "learning_rate": 1.2665749675536209e-07, "loss": 0.0021, "num_tokens": 60879894.0, "reward": 0.9166666865348816, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.28232985734939575, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 159.125, "completions/mean_terminated_length": 159.125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.7704239917269907, "grad_norm": 3.291864531573179, "kl": 0.056640625, "learning_rate": 1.2557895305450533e-07, "loss": 0.0023, "num_tokens": 60962449.0, "reward": 1.125, "reward_std": 0.3268197476863861, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.6123724579811096, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 159.875, "completions/mean_terminated_length": 159.875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.7714581178903827, "grad_norm": 4.09716328217157, "kl": 0.06396484375, "learning_rate": 1.2450436126021863e-07, "loss": 0.0026, "num_tokens": 61045190.0, "reward": 1.2569444179534912, "reward_std": 0.40088510513305664, "rewards/reasoning_reward/mean": 1.2569444179534912, "rewards/reasoning_reward/std": 0.6738367080688477, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 156.33334350585938, "completions/mean_terminated_length": 156.33334350585938, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.7724922440537746, "grad_norm": 3.237616260351, "kl": 0.05859375, "learning_rate": 1.234337327145092e-07, "loss": 0.0023, "num_tokens": 61130822.0, "reward": 1.2777777910232544, "reward_std": 0.15713486075401306, "rewards/reasoning_reward/mean": 1.2777777910232544, "rewards/reasoning_reward/std": 0.5353825092315674, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 128.625, "completions/mean_terminated_length": 128.625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.7735263702171665, "grad_norm": 2.731762557253584, "kl": 0.05859375, "learning_rate": 1.2236707871755403e-07, "loss": 0.0023, "num_tokens": 61209893.0, "reward": 1.0208333730697632, "reward_std": 0.16517187654972076, "rewards/reasoning_reward/mean": 1.0208333730697632, "rewards/reasoning_reward/std": 0.2750164568424225, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 183.75, "completions/mean_terminated_length": 183.75, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.7745604963805585, "grad_norm": 3.5907800559210017, "kl": 0.07275390625, "learning_rate": 1.2130441052757939e-07, "loss": 0.0029, "num_tokens": 61289695.0, "reward": 0.75, "reward_std": 0.3247893452644348, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.48900964856147766, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 149.375, "completions/mean_terminated_length": 149.375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.7755946225439504, "grad_norm": 2.491098383777402, "kl": 0.095703125, "learning_rate": 1.2024573936074274e-07, "loss": 0.0038, "num_tokens": 61373152.0, "reward": 0.8541666865348816, "reward_std": 0.16517187654972076, "rewards/reasoning_reward/mean": 0.8541666865348816, "rewards/reasoning_reward/std": 0.7144344449043274, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 188.75, "completions/mean_terminated_length": 188.75, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.7766287487073423, "grad_norm": 2.7802792084915904, "kl": 0.062255859375, "learning_rate": 1.1919107639101423e-07, "loss": 0.0025, "num_tokens": 61452002.0, "reward": 0.75, "reward_std": 0.17817416787147522, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.5897678136825562, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 150.2916717529297, "completions/mean_terminated_length": 150.2916717529297, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.7776628748707343, "grad_norm": 0.23470415565994152, "kl": 0.07177734375, "learning_rate": 1.181404327500582e-07, "loss": 0.0029, "num_tokens": 61528641.0, "reward": 0.6666666865348816, "reward_std": 0.0, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 122.625, "completions/mean_terminated_length": 122.625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.7786970010341262, "grad_norm": 3.327920369451495, "kl": 0.07470703125, "learning_rate": 1.1709381952711667e-07, "loss": 0.003, "num_tokens": 61609248.0, "reward": 1.0, "reward_std": 0.19500279426574707, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.2553769648075104, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 144.08334350585938, "completions/mean_terminated_length": 144.08334350585938, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.7797311271975181, "grad_norm": 3.335800938480737, "kl": 0.057373046875, "learning_rate": 1.1605124776889125e-07, "loss": 0.0023, "num_tokens": 61685530.0, "reward": 1.1666667461395264, "reward_std": 0.2903675436973572, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.637022078037262, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 175.08334350585938, "completions/mean_terminated_length": 175.08334350585938, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.7807652533609101, "grad_norm": 3.294221512666946, "kl": 0.06884765625, "learning_rate": 1.150127284794275e-07, "loss": 0.0027, "num_tokens": 61768532.0, "reward": 1.3888888359069824, "reward_std": 0.11878277361392975, "rewards/reasoning_reward/mean": 1.3888888359069824, "rewards/reasoning_reward/std": 0.3134145140647888, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 173.125, "completions/mean_terminated_length": 173.125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.781799379524302, "grad_norm": 3.396721463922415, "kl": 0.06298828125, "learning_rate": 1.1397827261999793e-07, "loss": 0.0025, "num_tokens": 61853439.0, "reward": 1.1875, "reward_std": 0.4671441912651062, "rewards/reasoning_reward/mean": 1.1875, "rewards/reasoning_reward/std": 0.6395055651664734, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.7828335056876939, "grad_norm": 4.325100386537247, "kl": 0.08447265625, "learning_rate": 1.1294789110898711e-07, "loss": 0.0034, "num_tokens": 61932395.0, "reward": 0.75, "reward_std": 0.5782498121261597, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.6255432367324829, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 171.7916717529297, "completions/mean_terminated_length": 171.7916717529297, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.7838676318510859, "grad_norm": 4.370174208909052, "kl": 0.07275390625, "learning_rate": 1.119215948217756e-07, "loss": 0.0029, "num_tokens": 62009526.0, "reward": 0.9166666865348816, "reward_std": 0.3493061661720276, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 173.08334350585938, "completions/mean_terminated_length": 173.08334350585938, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.7849017580144778, "grad_norm": 2.9640064799173294, "kl": 0.07568359375, "learning_rate": 1.1089939459062602e-07, "loss": 0.003, "num_tokens": 62092712.0, "reward": 1.0416667461395264, "reward_std": 0.3268197476863861, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.6902530789375305, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 181.25, "completions/mean_terminated_length": 181.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.7859358841778697, "grad_norm": 3.2949381510622544, "kl": 0.08251953125, "learning_rate": 1.0988130120456813e-07, "loss": 0.0033, "num_tokens": 62169958.0, "reward": 0.7083333730697632, "reward_std": 0.29602527618408203, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.7790277004241943, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 154.75, "completions/mean_terminated_length": 154.75, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.7869700103412617, "grad_norm": 2.3992415178790885, "kl": 0.07373046875, "learning_rate": 1.088673254092849e-07, "loss": 0.003, "num_tokens": 62252120.0, "reward": 1.25, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 1.25, "rewards/reasoning_reward/std": 0.6079187393188477, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 177.875, "completions/mean_terminated_length": 177.875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.7880041365046536, "grad_norm": 3.3520352974736984, "kl": 0.042236328125, "learning_rate": 1.0785747790699978e-07, "loss": 0.0017, "num_tokens": 62333205.0, "reward": 0.7916666865348816, "reward_std": 0.3268197476863861, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4148511290550232, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 161.83334350585938, "completions/mean_terminated_length": 161.83334350585938, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.7890382626680456, "grad_norm": 3.9881962321706217, "kl": 0.08740234375, "learning_rate": 1.0685176935636265e-07, "loss": 0.0035, "num_tokens": 62416633.0, "reward": 1.375, "reward_std": 0.31285393238067627, "rewards/reasoning_reward/mean": 1.375, "rewards/reasoning_reward/std": 0.5160468220710754, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 125.29167175292969, "completions/mean_terminated_length": 125.29167175292969, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.7900723888314375, "grad_norm": 4.570037175008805, "kl": 0.07177734375, "learning_rate": 1.0585021037233871e-07, "loss": 0.0029, "num_tokens": 62492632.0, "reward": 1.0833333730697632, "reward_std": 0.43459486961364746, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.637022078037262, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 170.83334350585938, "completions/mean_terminated_length": 170.83334350585938, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.7911065149948294, "grad_norm": 4.0290312319505395, "kl": 0.07763671875, "learning_rate": 1.0485281152609482e-07, "loss": 0.0031, "num_tokens": 62575660.0, "reward": 1.1666667461395264, "reward_std": 0.36444199085235596, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.40824830532073975, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 157.4166717529297, "completions/mean_terminated_length": 157.4166717529297, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.7921406411582212, "grad_norm": 1.9636414153768984, "kl": 0.07568359375, "learning_rate": 1.0385958334488965e-07, "loss": 0.003, "num_tokens": 62654838.0, "reward": 0.375, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.375, "rewards/reasoning_reward/std": 0.494535356760025, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 156.4166717529297, "completions/mean_terminated_length": 156.4166717529297, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.7931747673216132, "grad_norm": 3.236809976550798, "kl": 0.09765625, "learning_rate": 1.0287053631196108e-07, "loss": 0.0039, "num_tokens": 62731304.0, "reward": 0.4583333432674408, "reward_std": 0.2842140197753906, "rewards/reasoning_reward/mean": 0.4583333432674408, "rewards/reasoning_reward/std": 0.6412736177444458, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 151.33334350585938, "completions/mean_terminated_length": 151.33334350585938, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.7942088934850051, "grad_norm": 4.220315176240241, "kl": 0.08447265625, "learning_rate": 1.0188568086641614e-07, "loss": 0.0034, "num_tokens": 62813816.0, "reward": 1.1458333730697632, "reward_std": 0.4039583206176758, "rewards/reasoning_reward/mean": 1.1458333730697632, "rewards/reasoning_reward/std": 0.5610387921333313, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 175.9166717529297, "completions/mean_terminated_length": 175.9166717529297, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.795243019648397, "grad_norm": 3.8313849909839535, "kl": 0.05859375, "learning_rate": 1.0090502740312152e-07, "loss": 0.0023, "num_tokens": 62893494.0, "reward": 0.6875, "reward_std": 0.47950729727745056, "rewards/reasoning_reward/mean": 0.6875, "rewards/reasoning_reward/std": 0.4618605971336365, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 144.4166717529297, "completions/mean_terminated_length": 144.4166717529297, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.796277145811789, "grad_norm": 4.064061665509299, "kl": 0.09033203125, "learning_rate": 9.992858627259237e-08, "loss": 0.0036, "num_tokens": 62975872.0, "reward": 1.125, "reward_std": 0.5049939155578613, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.6796738505363464, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 169.45834350585938, "completions/mean_terminated_length": 169.45834350585938, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.7973112719751809, "grad_norm": 3.87391780953322, "kl": 0.07666015625, "learning_rate": 9.895636778088457e-08, "loss": 0.0031, "num_tokens": 63058859.0, "reward": 0.6875, "reward_std": 0.4851650893688202, "rewards/reasoning_reward/mean": 0.6875, "rewards/reasoning_reward/std": 0.7490936517715454, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 142.5416717529297, "completions/mean_terminated_length": 142.5416717529297, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.7983453981385729, "grad_norm": 4.501546300705031, "kl": 0.0859375, "learning_rate": 9.798838218948468e-08, "loss": 0.0034, "num_tokens": 63135176.0, "reward": 1.0833333730697632, "reward_std": 0.42156457901000977, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.7322785258293152, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 156.08334350585938, "completions/mean_terminated_length": 156.08334350585938, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.7993795243019648, "grad_norm": 2.414414647480725, "kl": 0.055908203125, "learning_rate": 9.702463971520264e-08, "loss": 0.0022, "num_tokens": 63217306.0, "reward": 0.75, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.4423258602619171, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 148.2916717529297, "completions/mean_terminated_length": 148.2916717529297, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.8004136504653567, "grad_norm": 4.229101545256886, "kl": 0.09716796875, "learning_rate": 9.606515053006347e-08, "loss": 0.0039, "num_tokens": 63305329.0, "reward": 1.375, "reward_std": 0.42645785212516785, "rewards/reasoning_reward/mean": 1.375, "rewards/reasoning_reward/std": 0.494535356760025, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 115.79167175292969, "completions/mean_terminated_length": 115.79167175292969, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.8014477766287487, "grad_norm": 4.537579562049641, "kl": 0.050537109375, "learning_rate": 9.510992476119962e-08, "loss": 0.002, "num_tokens": 63385604.0, "reward": 0.6666666865348816, "reward_std": 0.4446708858013153, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 152.0, "completions/mean_terminated_length": 152.0, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.8024819027921406, "grad_norm": 0.2165151880931561, "kl": 0.06982421875, "learning_rate": 9.415897249074478e-08, "loss": 0.0028, "num_tokens": 63474276.0, "reward": 1.6666667461395264, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.6666666269302368, "rewards/reasoning_reward/std": 0.4815433919429779, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 145.20834350585938, "completions/mean_terminated_length": 145.20834350585938, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.8035160289555325, "grad_norm": 2.756431222536208, "kl": 0.044189453125, "learning_rate": 9.321230375572681e-08, "loss": 0.0018, "num_tokens": 63553017.0, "reward": 0.9166666865348816, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.28232985734939575, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 166.83334350585938, "completions/mean_terminated_length": 166.83334350585938, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.8045501551189245, "grad_norm": 3.3926466918323492, "kl": 0.048095703125, "learning_rate": 9.226992854796234e-08, "loss": 0.0019, "num_tokens": 63633093.0, "reward": 1.1041667461395264, "reward_std": 0.28302299976348877, "rewards/reasoning_reward/mean": 1.1041666269302368, "rewards/reasoning_reward/std": 0.416485458612442, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 164.375, "completions/mean_terminated_length": 164.375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.8055842812823164, "grad_norm": 3.0968300481941142, "kl": 0.068359375, "learning_rate": 9.133185681395072e-08, "loss": 0.0027, "num_tokens": 63716078.0, "reward": 1.0416667461395264, "reward_std": 0.3268197476863861, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.4148511290550232, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 153.625, "completions/mean_terminated_length": 153.625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.8066184074457083, "grad_norm": 3.196506746978936, "kl": 0.0810546875, "learning_rate": 9.03980984547697e-08, "loss": 0.0032, "num_tokens": 63798485.0, "reward": 0.7708333730697632, "reward_std": 0.2644323706626892, "rewards/reasoning_reward/mean": 0.7708333134651184, "rewards/reasoning_reward/std": 0.3895137906074524, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 164.25, "completions/mean_terminated_length": 164.25, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.8076525336091003, "grad_norm": 3.6585350966268906, "kl": 0.048095703125, "learning_rate": 8.946866332597064e-08, "loss": 0.0019, "num_tokens": 63885723.0, "reward": 1.2083333730697632, "reward_std": 0.31285393238067627, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.5694518685340881, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 158.45834350585938, "completions/mean_terminated_length": 158.45834350585938, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.8086866597724922, "grad_norm": 3.5060979679162885, "kl": 0.049560546875, "learning_rate": 8.854356123747392e-08, "loss": 0.002, "num_tokens": 63964702.0, "reward": 0.625, "reward_std": 0.3506905436515808, "rewards/reasoning_reward/mean": 0.625, "rewards/reasoning_reward/std": 0.494535356760025, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 174.6666717529297, "completions/mean_terminated_length": 174.6666717529297, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.8097207859358841, "grad_norm": 3.1782626992547836, "kl": 0.076171875, "learning_rate": 8.762280195346655e-08, "loss": 0.003, "num_tokens": 64047942.0, "reward": 1.1666667461395264, "reward_std": 0.30860671401023865, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.5646597146987915, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 146.375, "completions/mean_terminated_length": 146.375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.8107549120992761, "grad_norm": 3.2114912520386825, "kl": 0.08154296875, "learning_rate": 8.6706395192298e-08, "loss": 0.0033, "num_tokens": 64128223.0, "reward": 0.8333333730697632, "reward_std": 0.2903675436973572, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 150.375, "completions/mean_terminated_length": 150.375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.811789038262668, "grad_norm": 2.4667511559934585, "kl": 0.0576171875, "learning_rate": 8.579435062637863e-08, "loss": 0.0023, "num_tokens": 64212144.0, "reward": 0.875, "reward_std": 0.07715167850255966, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.6954823136329651, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 140.375, "completions/mean_terminated_length": 140.375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.81282316442606, "grad_norm": 2.527486671838808, "kl": 0.08642578125, "learning_rate": 8.488667788207642e-08, "loss": 0.0035, "num_tokens": 64296945.0, "reward": 1.3125, "reward_std": 0.13908717036247253, "rewards/reasoning_reward/mean": 1.3125, "rewards/reasoning_reward/std": 0.5479705333709717, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 147.25, "completions/mean_terminated_length": 147.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.8138572905894519, "grad_norm": 4.580987567602128, "kl": 0.09521484375, "learning_rate": 8.398338653961673e-08, "loss": 0.0038, "num_tokens": 64384935.0, "reward": 1.2083333730697632, "reward_std": 0.3478729724884033, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.8670706748962402, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 190.0, "completions/mean_terminated_length": 190.0, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.8148914167528438, "grad_norm": 3.6226405946498326, "kl": 0.08935546875, "learning_rate": 8.30844861329798e-08, "loss": 0.0036, "num_tokens": 64462951.0, "reward": 1.152777910232544, "reward_std": 0.237970232963562, "rewards/reasoning_reward/mean": 1.1527777910232544, "rewards/reasoning_reward/std": 0.30263206362724304, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 142.0, "completions/mean_terminated_length": 142.0, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.8159255429162358, "grad_norm": 4.446754103716307, "kl": 0.08447265625, "learning_rate": 8.218998614980132e-08, "loss": 0.0034, "num_tokens": 64539215.0, "reward": 0.375, "reward_std": 0.47419947385787964, "rewards/reasoning_reward/mean": 0.375, "rewards/reasoning_reward/std": 0.494535356760025, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 179.0416717529297, "completions/mean_terminated_length": 179.0416717529297, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.8169596690796277, "grad_norm": 3.921864271005609, "kl": 0.061279296875, "learning_rate": 8.12998960312718e-08, "loss": 0.0024, "num_tokens": 64622584.0, "reward": 1.3402776718139648, "reward_std": 0.38187703490257263, "rewards/reasoning_reward/mean": 1.3402776718139648, "rewards/reasoning_reward/std": 0.5804362297058105, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 164.95834350585938, "completions/mean_terminated_length": 164.95834350585938, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.8179937952430196, "grad_norm": 4.340848550091104, "kl": 0.076171875, "learning_rate": 8.041422517203627e-08, "loss": 0.003, "num_tokens": 64711807.0, "reward": 1.1666667461395264, "reward_std": 0.4710209369659424, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.6197240948677063, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 135.375, "completions/mean_terminated_length": 135.375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.8190279214064116, "grad_norm": 2.649981760760686, "kl": 0.0537109375, "learning_rate": 7.953298292009658e-08, "loss": 0.0021, "num_tokens": 64792344.0, "reward": 0.9583333730697632, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.20412415266036987, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 153.6666717529297, "completions/mean_terminated_length": 153.6666717529297, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.8200620475698035, "grad_norm": 3.2958211549645253, "kl": 0.06689453125, "learning_rate": 7.86561785767112e-08, "loss": 0.0027, "num_tokens": 64868632.0, "reward": 0.7916666865348816, "reward_std": 0.3020375669002533, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.3877657949924469, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 144.875, "completions/mean_terminated_length": 144.875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.8210961737331954, "grad_norm": 3.09060522662954, "kl": 0.0693359375, "learning_rate": 7.77838213962983e-08, "loss": 0.0028, "num_tokens": 64950501.0, "reward": 1.1666667461395264, "reward_std": 0.3666771650314331, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.5450701713562012, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 144.4166717529297, "completions/mean_terminated_length": 144.4166717529297, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.8221302998965874, "grad_norm": 3.8226975362183455, "kl": 0.06787109375, "learning_rate": 7.691592058633694e-08, "loss": 0.0027, "num_tokens": 65028095.0, "reward": 0.9375, "reward_std": 0.35495084524154663, "rewards/reasoning_reward/mean": 0.9375, "rewards/reasoning_reward/std": 0.5379611253738403, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 127.91667175292969, "completions/mean_terminated_length": 127.91667175292969, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.8231644260599793, "grad_norm": 2.3041894307690525, "kl": 0.061767578125, "learning_rate": 7.605248530727115e-08, "loss": 0.0025, "num_tokens": 65112941.0, "reward": 1.2916667461395264, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 1.2916666269302368, "rewards/reasoning_reward/std": 0.4643056094646454, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 167.83334350585938, "completions/mean_terminated_length": 167.83334350585938, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.8241985522233712, "grad_norm": 4.091470007656279, "kl": 0.10986328125, "learning_rate": 7.519352467241197e-08, "loss": 0.0044, "num_tokens": 65195697.0, "reward": 1.0833333730697632, "reward_std": 0.3794546127319336, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.6197241544723511, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 159.0, "completions/mean_terminated_length": 159.0, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.8252326783867632, "grad_norm": 3.2064801888792744, "kl": 0.07958984375, "learning_rate": 7.433904774784216e-08, "loss": 0.0032, "num_tokens": 65275673.0, "reward": 1.076388955116272, "reward_std": 0.12554192543029785, "rewards/reasoning_reward/mean": 1.076388955116272, "rewards/reasoning_reward/std": 0.17706114053726196, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 149.0416717529297, "completions/mean_terminated_length": 149.0416717529297, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.8262668045501551, "grad_norm": 2.331881060813706, "kl": 0.06396484375, "learning_rate": 7.348906355232027e-08, "loss": 0.0026, "num_tokens": 65353674.0, "reward": 0.7291666865348816, "reward_std": 0.08625819534063339, "rewards/reasoning_reward/mean": 0.7291666865348816, "rewards/reasoning_reward/std": 0.551266610622406, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 149.0, "completions/mean_terminated_length": 149.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.827300930713547, "grad_norm": 3.5190409760967327, "kl": 0.09228515625, "learning_rate": 7.264358105718505e-08, "loss": 0.0037, "num_tokens": 65443242.0, "reward": 1.125, "reward_std": 0.367926687002182, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.4484272003173828, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 135.83334350585938, "completions/mean_terminated_length": 135.83334350585938, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.828335056876939, "grad_norm": 2.332152031715114, "kl": 0.0703125, "learning_rate": 7.180260918626152e-08, "loss": 0.0028, "num_tokens": 65530702.0, "reward": 1.3125, "reward_std": 0.0589255653321743, "rewards/reasoning_reward/mean": 1.3125, "rewards/reasoning_reward/std": 0.4618605971336365, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 172.625, "completions/mean_terminated_length": 172.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.8293691830403309, "grad_norm": 3.0269063067385193, "kl": 0.057373046875, "learning_rate": 7.096615681576596e-08, "loss": 0.0023, "num_tokens": 65607861.0, "reward": 0.5625, "reward_std": 0.33768826723098755, "rewards/reasoning_reward/mean": 0.5625, "rewards/reasoning_reward/std": 0.5954993963241577, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 168.6666717529297, "completions/mean_terminated_length": 168.6666717529297, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.8304033092037229, "grad_norm": 3.273127666339797, "kl": 0.055419921875, "learning_rate": 7.013423277421299e-08, "loss": 0.0022, "num_tokens": 65686197.0, "reward": 0.8541666865348816, "reward_std": 0.1767766922712326, "rewards/reasoning_reward/mean": 0.8541666865348816, "rewards/reasoning_reward/std": 0.5985338091850281, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 198.95834350585938, "completions/mean_terminated_length": 198.95834350585938, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.8314374353671148, "grad_norm": 3.550378331709515, "kl": 0.07373046875, "learning_rate": 6.93068458423216e-08, "loss": 0.0029, "num_tokens": 65774468.0, "reward": 0.9722222685813904, "reward_std": 0.3949992060661316, "rewards/reasoning_reward/mean": 0.9722222685813904, "rewards/reasoning_reward/std": 0.5660837888717651, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 157.5, "completions/mean_terminated_length": 157.5, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.8324715615305067, "grad_norm": 2.947936195797767, "kl": 0.06689453125, "learning_rate": 6.848400475292343e-08, "loss": 0.0027, "num_tokens": 65854024.0, "reward": 1.0486111640930176, "reward_std": 0.06924575567245483, "rewards/reasoning_reward/mean": 1.0486111640930176, "rewards/reasoning_reward/std": 0.13440841436386108, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 183.58334350585938, "completions/mean_terminated_length": 183.58334350585938, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.8335056876938987, "grad_norm": 3.8760059999402343, "kl": 0.08544921875, "learning_rate": 6.766571819086941e-08, "loss": 0.0034, "num_tokens": 65936142.0, "reward": 1.0625, "reward_std": 0.44589531421661377, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.7658352255821228, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 183.4166717529297, "completions/mean_terminated_length": 183.4166717529297, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.8345398138572906, "grad_norm": 3.429218077443575, "kl": 0.06787109375, "learning_rate": 6.685199479293929e-08, "loss": 0.0027, "num_tokens": 66027528.0, "reward": 1.2291667461395264, "reward_std": 0.34140023589134216, "rewards/reasoning_reward/mean": 1.2291666269302368, "rewards/reasoning_reward/std": 0.8678538203239441, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 203.125, "completions/mean_terminated_length": 203.125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.8355739400206825, "grad_norm": 3.4009884082930597, "kl": 0.049072265625, "learning_rate": 6.604284314774983e-08, "loss": 0.002, "num_tokens": 66111411.0, "reward": 1.3819444179534912, "reward_std": 0.36571210622787476, "rewards/reasoning_reward/mean": 1.3819442987442017, "rewards/reasoning_reward/std": 0.5189155340194702, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 129.45834350585938, "completions/mean_terminated_length": 129.45834350585938, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.8366080661840745, "grad_norm": 6.175187456794133, "kl": 0.1650390625, "learning_rate": 6.523827179566394e-08, "loss": 0.0066, "num_tokens": 66195886.0, "reward": 0.8333333730697632, "reward_std": 0.48678088188171387, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.7019641399383545, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 134.375, "completions/mean_terminated_length": 134.375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.8376421923474664, "grad_norm": 3.893758481978179, "kl": 0.058837890625, "learning_rate": 6.443828922870127e-08, "loss": 0.0024, "num_tokens": 66277543.0, "reward": 1.0416667461395264, "reward_std": 0.3506905436515808, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.6240935921669006, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 152.08334350585938, "completions/mean_terminated_length": 152.08334350585938, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.8386763185108583, "grad_norm": 3.882217378171403, "kl": 0.08642578125, "learning_rate": 6.364290389044769e-08, "loss": 0.0035, "num_tokens": 66365673.0, "reward": 1.4444444179534912, "reward_std": 0.3385341763496399, "rewards/reasoning_reward/mean": 1.4444442987442017, "rewards/reasoning_reward/std": 0.5787431597709656, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 137.5416717529297, "completions/mean_terminated_length": 138.3913116455078, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.8397104446742503, "grad_norm": 3.974211152800969, "kl": 0.06787109375, "learning_rate": 6.285212417596719e-08, "loss": 0.0027, "num_tokens": 66448998.0, "reward": 0.9166666865348816, "reward_std": 0.4446708858013153, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.7172814607620239, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 159.83334350585938, "completions/mean_terminated_length": 159.83334350585938, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.8407445708376422, "grad_norm": 3.374272533210629, "kl": 0.0537109375, "learning_rate": 6.206595843171225e-08, "loss": 0.0021, "num_tokens": 66525802.0, "reward": 0.8541666865348816, "reward_std": 0.4042079448699951, "rewards/reasoning_reward/mean": 0.8541666865348816, "rewards/reasoning_reward/std": 0.5413181781768799, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 188.7916717529297, "completions/mean_terminated_length": 188.7916717529297, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.8417786970010341, "grad_norm": 2.707109875272163, "kl": 0.07421875, "learning_rate": 6.128441495543646e-08, "loss": 0.003, "num_tokens": 66605133.0, "reward": 0.9166666865348816, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.28232985734939575, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 159.0, "completions/mean_terminated_length": 159.0, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.8428128231644261, "grad_norm": 11.687611255036002, "kl": 0.46875, "learning_rate": 6.050750199610682e-08, "loss": 0.0189, "num_tokens": 66684349.0, "reward": 0.6666666865348816, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815433919429779, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 179.625, "completions/mean_terminated_length": 179.625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.843846949327818, "grad_norm": 3.5189829393185375, "kl": 0.06298828125, "learning_rate": 5.973522775381618e-08, "loss": 0.0025, "num_tokens": 66762020.0, "reward": 1.2083333730697632, "reward_std": 0.4261821210384369, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.5882299542427063, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 163.0, "completions/mean_terminated_length": 163.0, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.8448810754912099, "grad_norm": 3.111695745233412, "kl": 0.0693359375, "learning_rate": 5.896760037969739e-08, "loss": 0.0028, "num_tokens": 66846116.0, "reward": 0.7777777910232544, "reward_std": 0.2375655621290207, "rewards/reasoning_reward/mean": 0.7777777314186096, "rewards/reasoning_reward/std": 0.8493627309799194, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 144.9166717529297, "completions/mean_terminated_length": 144.9166717529297, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.8459152016546019, "grad_norm": 2.7736990622484976, "kl": 0.054931640625, "learning_rate": 5.8204627975836696e-08, "loss": 0.0022, "num_tokens": 66924266.0, "reward": 0.625, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.625, "rewards/reasoning_reward/std": 0.494535356760025, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 170.625, "completions/mean_terminated_length": 170.625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.8469493278179938, "grad_norm": 4.60558678901362, "kl": 0.068359375, "learning_rate": 5.744631859518878e-08, "loss": 0.0027, "num_tokens": 67002865.0, "reward": 1.1666667461395264, "reward_std": 0.2840898931026459, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.35098204016685486, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 151.7916717529297, "completions/mean_terminated_length": 151.7916717529297, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.8479834539813857, "grad_norm": 3.9470723321035543, "kl": 0.0556640625, "learning_rate": 5.66926802414911e-08, "loss": 0.0022, "num_tokens": 67087588.0, "reward": 0.8541666865348816, "reward_std": 0.4386448860168457, "rewards/reasoning_reward/mean": 0.8541666865348816, "rewards/reasoning_reward/std": 0.6780058741569519, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 160.6666717529297, "completions/mean_terminated_length": 160.6666717529297, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.8490175801447777, "grad_norm": 3.2433337512879303, "kl": 0.052978515625, "learning_rate": 5.594372086918009e-08, "loss": 0.0021, "num_tokens": 67166756.0, "reward": 0.9791666865348816, "reward_std": 0.32520395517349243, "rewards/reasoning_reward/mean": 0.9791666865348816, "rewards/reasoning_reward/std": 0.47729232907295227, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 155.95834350585938, "completions/mean_terminated_length": 155.95834350585938, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.8500517063081696, "grad_norm": 2.8260741217314123, "kl": 0.07080078125, "learning_rate": 5.519944838330659e-08, "loss": 0.0028, "num_tokens": 67249443.0, "reward": 1.1875, "reward_std": 0.27053868770599365, "rewards/reasoning_reward/mean": 1.1875, "rewards/reasoning_reward/std": 0.4848240315914154, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 164.58334350585938, "completions/mean_terminated_length": 164.58334350585938, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.8510858324715616, "grad_norm": 4.350268666915288, "kl": 0.06494140625, "learning_rate": 5.4459870639452897e-08, "loss": 0.0026, "num_tokens": 67327881.0, "reward": 1.0208333730697632, "reward_std": 0.494476854801178, "rewards/reasoning_reward/mean": 1.0208333730697632, "rewards/reasoning_reward/std": 0.580089271068573, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 151.20834350585938, "completions/mean_terminated_length": 151.20834350585938, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.8521199586349535, "grad_norm": 4.367925297416209, "kl": 0.09228515625, "learning_rate": 5.372499544364972e-08, "loss": 0.0037, "num_tokens": 67416214.0, "reward": 1.625, "reward_std": 0.4151468276977539, "rewards/reasoning_reward/mean": 1.625, "rewards/reasoning_reward/std": 0.42633703351020813, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 166.875, "completions/mean_terminated_length": 166.875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.8531540847983454, "grad_norm": 3.804426970311705, "kl": 0.083984375, "learning_rate": 5.2994830552293365e-08, "loss": 0.0033, "num_tokens": 67502027.0, "reward": 1.1666667461395264, "reward_std": 0.4114243686199188, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.7172815203666687, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 164.6666717529297, "completions/mean_terminated_length": 164.6666717529297, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.8541882109617374, "grad_norm": 2.9609678472482113, "kl": 0.046875, "learning_rate": 5.2269383672064736e-08, "loss": 0.0019, "num_tokens": 67585763.0, "reward": 1.0833333730697632, "reward_std": 0.26726123690605164, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.6197240948677063, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 147.70834350585938, "completions/mean_terminated_length": 147.70834350585938, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.8552223371251293, "grad_norm": 3.152704440105035, "kl": 0.08837890625, "learning_rate": 5.154866245984696e-08, "loss": 0.0035, "num_tokens": 67663468.0, "reward": 1.1041667461395264, "reward_std": 0.43070170283317566, "rewards/reasoning_reward/mean": 1.1041666269302368, "rewards/reasoning_reward/std": 0.5103103518486023, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 154.33334350585938, "completions/mean_terminated_length": 154.33334350585938, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.8562564632885212, "grad_norm": 0.17170167898547498, "kl": 0.0634765625, "learning_rate": 5.083267452264556e-08, "loss": 0.0025, "num_tokens": 67747188.0, "reward": 1.3333333730697632, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.3333333730697632, "rewards/reasoning_reward/std": 0.4815433919429779, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 131.2916717529297, "completions/mean_terminated_length": 131.2916717529297, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.8572905894519132, "grad_norm": 0.18506989130036433, "kl": 0.053955078125, "learning_rate": 5.012142741750725e-08, "loss": 0.0022, "num_tokens": 67824483.0, "reward": 1.0, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.0, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 141.25, "completions/mean_terminated_length": 141.25, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.8583247156153051, "grad_norm": 4.548140603940961, "kl": 0.130859375, "learning_rate": 4.941492865144115e-08, "loss": 0.0052, "num_tokens": 67906433.0, "reward": 1.0208333730697632, "reward_std": 0.41873571276664734, "rewards/reasoning_reward/mean": 1.0208333730697632, "rewards/reasoning_reward/std": 0.7144345045089722, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 165.0, "completions/mean_terminated_length": 165.0, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.859358841778697, "grad_norm": 3.507638044504915, "kl": 0.080078125, "learning_rate": 4.8713185681338477e-08, "loss": 0.0032, "num_tokens": 67994881.0, "reward": 1.2083333730697632, "reward_std": 0.3268197476863861, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.6240935325622559, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 179.625, "completions/mean_terminated_length": 179.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.860392967942089, "grad_norm": 2.847799628996861, "kl": 0.06591796875, "learning_rate": 4.801620591389477e-08, "loss": 0.0026, "num_tokens": 68074736.0, "reward": 1.1666667461395264, "reward_std": 0.08908708393573761, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.28232985734939575, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 156.5416717529297, "completions/mean_terminated_length": 156.5416717529297, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.8614270941054809, "grad_norm": 3.2176346783915846, "kl": 0.0439453125, "learning_rate": 4.7323996705531335e-08, "loss": 0.0018, "num_tokens": 68152653.0, "reward": 1.1875, "reward_std": 0.2041093111038208, "rewards/reasoning_reward/mean": 1.1875, "rewards/reasoning_reward/std": 0.4376940429210663, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 122.91667175292969, "completions/mean_terminated_length": 122.91667175292969, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.8624612202688728, "grad_norm": 2.119171203323999, "kl": 0.05859375, "learning_rate": 4.6636565362317304e-08, "loss": 0.0023, "num_tokens": 68230691.0, "reward": 0.75, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.4423258602619171, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 151.4166717529297, "completions/mean_terminated_length": 151.4166717529297, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.8634953464322648, "grad_norm": 2.9688355068863923, "kl": 0.06591796875, "learning_rate": 4.59539191398931e-08, "loss": 0.0026, "num_tokens": 68307461.0, "reward": 1.1458333730697632, "reward_std": 0.10681166499853134, "rewards/reasoning_reward/mean": 1.1458333730697632, "rewards/reasoning_reward/std": 0.2750164568424225, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 183.70834350585938, "completions/mean_terminated_length": 183.70834350585938, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.8645294725956567, "grad_norm": 4.374431997930627, "kl": 0.10595703125, "learning_rate": 4.527606524339328e-08, "loss": 0.0043, "num_tokens": 68391278.0, "reward": 1.3958333730697632, "reward_std": 0.4973698854446411, "rewards/reasoning_reward/mean": 1.3958333730697632, "rewards/reasoning_reward/std": 0.5512666702270508, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 117.33333587646484, "completions/mean_terminated_length": 117.33333587646484, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.8655635987590486, "grad_norm": 3.368684441355164, "kl": 0.12890625, "learning_rate": 4.4603010827371224e-08, "loss": 0.0052, "num_tokens": 68475710.0, "reward": 0.9583333730697632, "reward_std": 0.21362332999706268, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.35864076018333435, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 110.04167175292969, "completions/mean_terminated_length": 110.04167175292969, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.8665977249224406, "grad_norm": 0.4287350953005526, "kl": 0.060546875, "learning_rate": 4.393476299572263e-08, "loss": 0.0024, "num_tokens": 68552959.0, "reward": 1.0, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.0, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 172.4166717529297, "completions/mean_terminated_length": 172.4166717529297, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.8676318510858325, "grad_norm": 3.1249910497191165, "kl": 0.0546875, "learning_rate": 4.327132880161161e-08, "loss": 0.0022, "num_tokens": 68630249.0, "reward": 0.4791666865348816, "reward_std": 0.2041093111038208, "rewards/reasoning_reward/mean": 0.4791666567325592, "rewards/reasoning_reward/std": 0.6507381796836853, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 168.70834350585938, "completions/mean_terminated_length": 168.70834350585938, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.8686659772492245, "grad_norm": 3.2822755480432844, "kl": 0.052490234375, "learning_rate": 4.261271524739524e-08, "loss": 0.0021, "num_tokens": 68714258.0, "reward": 1.4027776718139648, "reward_std": 0.2818480134010315, "rewards/reasoning_reward/mean": 1.4027776718139648, "rewards/reasoning_reward/std": 0.44482171535491943, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 139.0416717529297, "completions/mean_terminated_length": 139.0416717529297, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.8697001034126164, "grad_norm": 3.768686239797151, "kl": 0.052978515625, "learning_rate": 4.195892928455047e-08, "loss": 0.0021, "num_tokens": 68792851.0, "reward": 1.0416667461395264, "reward_std": 0.2314550280570984, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.3877657949924469, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 149.45834350585938, "completions/mean_terminated_length": 149.45834350585938, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.8707342295760083, "grad_norm": 2.9671230713429453, "kl": 0.043212890625, "learning_rate": 4.130997781360035e-08, "loss": 0.0017, "num_tokens": 68872318.0, "reward": 0.7916666865348816, "reward_std": 0.29602527618408203, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4148511290550232, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 150.1666717529297, "completions/mean_terminated_length": 150.1666717529297, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.8717683557394003, "grad_norm": 3.480205852106029, "kl": 0.068359375, "learning_rate": 4.0665867684041013e-08, "loss": 0.0027, "num_tokens": 68947754.0, "reward": 0.7708333730697632, "reward_std": 0.28302299976348877, "rewards/reasoning_reward/mean": 0.7708333134651184, "rewards/reasoning_reward/std": 0.416485458612442, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 161.9166717529297, "completions/mean_terminated_length": 161.9166717529297, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.8728024819027922, "grad_norm": 2.661968506889756, "kl": 0.07373046875, "learning_rate": 4.002660569426997e-08, "loss": 0.003, "num_tokens": 69031840.0, "reward": 1.3541667461395264, "reward_std": 0.0589255653321743, "rewards/reasoning_reward/mean": 1.3541666269302368, "rewards/reasoning_reward/std": 0.47729232907295227, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 189.875, "completions/mean_terminated_length": 189.26087951660156, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.8738366080661841, "grad_norm": 3.489151843177023, "kl": 0.06689453125, "learning_rate": 3.939219859151377e-08, "loss": 0.0027, "num_tokens": 69110189.0, "reward": 0.7847222685813904, "reward_std": 0.5546908974647522, "rewards/reasoning_reward/mean": 0.7847222685813904, "rewards/reasoning_reward/std": 0.699342668056488, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 150.2916717529297, "completions/mean_terminated_length": 150.2916717529297, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.8748707342295761, "grad_norm": 3.1194409662404183, "kl": 0.0859375, "learning_rate": 3.876265307175714e-08, "loss": 0.0034, "num_tokens": 69188180.0, "reward": 0.5416666865348816, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 0.5416666865348816, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 160.95834350585938, "completions/mean_terminated_length": 160.95834350585938, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.8759048603929679, "grad_norm": 4.060590754998399, "kl": 0.06396484375, "learning_rate": 3.813797577967209e-08, "loss": 0.0026, "num_tokens": 69265083.0, "reward": 1.2291667461395264, "reward_std": 0.33108004927635193, "rewards/reasoning_reward/mean": 1.2291666269302368, "rewards/reasoning_reward/std": 0.5103103518486023, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 197.4166717529297, "completions/mean_terminated_length": 197.4166717529297, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.8769389865563598, "grad_norm": 1.7203510455755477, "kl": 0.07275390625, "learning_rate": 3.751817330854806e-08, "loss": 0.0029, "num_tokens": 69346101.0, "reward": 0.7083333730697632, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.5500329732894897, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 162.25, "completions/mean_terminated_length": 162.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.8779731127197518, "grad_norm": 3.1548182837694405, "kl": 0.0751953125, "learning_rate": 3.6903252200222e-08, "loss": 0.003, "num_tokens": 69430283.0, "reward": 0.375, "reward_std": 0.2314550280570984, "rewards/reasoning_reward/mean": 0.375, "rewards/reasoning_reward/std": 0.47204458713531494, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 153.5, "completions/mean_terminated_length": 153.5, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.8790072388831437, "grad_norm": 3.229700554376665, "kl": 0.0751953125, "learning_rate": 3.6293218945009364e-08, "loss": 0.003, "num_tokens": 69508583.0, "reward": 0.7708333730697632, "reward_std": 0.2041093111038208, "rewards/reasoning_reward/mean": 0.7708333134651184, "rewards/reasoning_reward/std": 0.5311833620071411, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 140.0416717529297, "completions/mean_terminated_length": 140.0416717529297, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.8800413650465356, "grad_norm": 3.6374367800290743, "kl": 0.08837890625, "learning_rate": 3.56880799816362e-08, "loss": 0.0035, "num_tokens": 69597664.0, "reward": 1.2916667461395264, "reward_std": 0.252508282661438, "rewards/reasoning_reward/mean": 1.2916666269302368, "rewards/reasoning_reward/std": 0.8795173168182373, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 151.58334350585938, "completions/mean_terminated_length": 151.58334350585938, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.8810754912099276, "grad_norm": 3.8095535645459964, "kl": 0.07373046875, "learning_rate": 3.50878416971701e-08, "loss": 0.0029, "num_tokens": 69679846.0, "reward": 0.8541666865348816, "reward_std": 0.42002925276756287, "rewards/reasoning_reward/mean": 0.8541666865348816, "rewards/reasoning_reward/std": 0.8531165719032288, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 130.58334350585938, "completions/mean_terminated_length": 130.58334350585938, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.8821096173733195, "grad_norm": 2.5914771322358585, "kl": 0.06494140625, "learning_rate": 3.449251042695378e-08, "loss": 0.0026, "num_tokens": 69766932.0, "reward": 1.2916667461395264, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 1.2916666269302368, "rewards/reasoning_reward/std": 0.4643056094646454, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 139.625, "completions/mean_terminated_length": 139.625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.8831437435367114, "grad_norm": 2.3610444312496774, "kl": 0.07373046875, "learning_rate": 3.39020924545379e-08, "loss": 0.0029, "num_tokens": 69848611.0, "reward": 1.25, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 1.25, "rewards/reasoning_reward/std": 0.6079187393188477, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 160.0, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.8841778697001034, "grad_norm": 3.907652167180534, "kl": 0.07861328125, "learning_rate": 3.331659401161435e-08, "loss": 0.0032, "num_tokens": 69936947.0, "reward": 1.7083333730697632, "reward_std": 0.3897872865200043, "rewards/reasoning_reward/mean": 1.7083333730697632, "rewards/reasoning_reward/std": 0.4402732849121094, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 163.5416717529297, "completions/mean_terminated_length": 163.5416717529297, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.8852119958634953, "grad_norm": 3.4271002980194485, "kl": 0.072265625, "learning_rate": 3.2736021277951055e-08, "loss": 0.0029, "num_tokens": 70016560.0, "reward": 0.6666666865348816, "reward_std": 0.39000558853149414, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.5646597146987915, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 309.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 168.875, "completions/mean_terminated_length": 162.78260803222656, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.8862461220268872, "grad_norm": 3.242036202397786, "kl": 0.10400390625, "learning_rate": 3.216038038132623e-08, "loss": 0.0042, "num_tokens": 70099381.0, "reward": 0.75, "reward_std": 0.3900056481361389, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.6079187393188477, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 124.16667175292969, "completions/mean_terminated_length": 124.16667175292969, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.8872802481902792, "grad_norm": 2.4008627320484064, "kl": 0.039306640625, "learning_rate": 3.1589677397464433e-08, "loss": 0.0016, "num_tokens": 70178001.0, "reward": 0.875, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.337831974029541, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 144.9166717529297, "completions/mean_terminated_length": 144.9166717529297, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.8883143743536711, "grad_norm": 2.3968224744059543, "kl": 0.06298828125, "learning_rate": 3.102391834997142e-08, "loss": 0.0025, "num_tokens": 70260223.0, "reward": 1.0416667461395264, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.20412413775920868, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 124.16667175292969, "completions/mean_terminated_length": 124.16667175292969, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.889348500517063, "grad_norm": 3.3305458427955363, "kl": 0.08056640625, "learning_rate": 3.0463109210271566e-08, "loss": 0.0032, "num_tokens": 70343299.0, "reward": 1.2708333730697632, "reward_std": 0.30318546295166016, "rewards/reasoning_reward/mean": 1.2708333730697632, "rewards/reasoning_reward/std": 0.5893837809562683, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 163.33334350585938, "completions/mean_terminated_length": 163.33334350585938, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.890382626680455, "grad_norm": 4.099541212176896, "kl": 0.06494140625, "learning_rate": 2.990725589754406e-08, "loss": 0.0026, "num_tokens": 70434139.0, "reward": 1.3333333730697632, "reward_std": 0.46631526947021484, "rewards/reasoning_reward/mean": 1.3333333730697632, "rewards/reasoning_reward/std": 0.6197241544723511, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 149.95834350585938, "completions/mean_terminated_length": 149.95834350585938, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.8914167528438469, "grad_norm": 2.2220524006143076, "kl": 0.0732421875, "learning_rate": 2.935636427866095e-08, "loss": 0.0029, "num_tokens": 70517314.0, "reward": 0.9513888359069824, "reward_std": 0.06924576312303543, "rewards/reasoning_reward/mean": 0.9513888359069824, "rewards/reasoning_reward/std": 0.7824759483337402, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 161.58334350585938, "completions/mean_terminated_length": 161.58334350585938, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.8924508790072389, "grad_norm": 3.2564376835733837, "kl": 0.059814453125, "learning_rate": 2.881044016812506e-08, "loss": 0.0024, "num_tokens": 70593264.0, "reward": 0.4791666865348816, "reward_std": 0.3704721927642822, "rewards/reasoning_reward/mean": 0.4791666567325592, "rewards/reasoning_reward/std": 0.5610387921333313, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 157.0416717529297, "completions/mean_terminated_length": 157.0416717529297, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.8934850051706308, "grad_norm": 3.445585242026353, "kl": 0.07763671875, "learning_rate": 2.8269489328008433e-08, "loss": 0.0031, "num_tokens": 70677257.0, "reward": 1.2083333730697632, "reward_std": 0.20693820714950562, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.550032913684845, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 124.54167175292969, "completions/mean_terminated_length": 124.54167175292969, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.8945191313340227, "grad_norm": 3.26944024523484, "kl": 0.07421875, "learning_rate": 2.7733517467891822e-08, "loss": 0.003, "num_tokens": 70761230.0, "reward": 1.2291667461395264, "reward_std": 0.21322892606258392, "rewards/reasoning_reward/mean": 1.2291666269302368, "rewards/reasoning_reward/std": 0.5893837213516235, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 143.08334350585938, "completions/mean_terminated_length": 143.08334350585938, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.8955532574974147, "grad_norm": 3.6817862651467532, "kl": 0.08154296875, "learning_rate": 2.720253024480418e-08, "loss": 0.0033, "num_tokens": 70855008.0, "reward": 1.6458333730697632, "reward_std": 0.35878798365592957, "rewards/reasoning_reward/mean": 1.6458333730697632, "rewards/reasoning_reward/std": 0.5208514332771301, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 147.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.8965873836608066, "grad_norm": 3.9516108836506296, "kl": 0.06982421875, "learning_rate": 2.6676533263163103e-08, "loss": 0.0028, "num_tokens": 70931629.0, "reward": 0.6041666865348816, "reward_std": 0.4130779206752777, "rewards/reasoning_reward/mean": 0.6041666865348816, "rewards/reasoning_reward/std": 0.5893837213516235, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 302.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 150.0416717529297, "completions/mean_terminated_length": 143.43478393554688, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.8976215098241985, "grad_norm": 5.483549708220734, "kl": 0.21484375, "learning_rate": 2.6155532074715548e-08, "loss": 0.0086, "num_tokens": 71011766.0, "reward": 0.7083333730697632, "reward_std": 0.46288391947746277, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.4643056094646454, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 138.08334350585938, "completions/mean_terminated_length": 138.08334350585938, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.8986556359875905, "grad_norm": 3.8988737222080285, "kl": 0.07373046875, "learning_rate": 2.5639532178479417e-08, "loss": 0.0029, "num_tokens": 71096920.0, "reward": 1.25, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 1.25, "rewards/reasoning_reward/std": 0.6079187393188477, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 154.70834350585938, "completions/mean_terminated_length": 154.70834350585938, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.8996897621509824, "grad_norm": 3.83680323474309, "kl": 0.044921875, "learning_rate": 2.512853902068529e-08, "loss": 0.0018, "num_tokens": 71174081.0, "reward": 0.875, "reward_std": 0.46288391947746277, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.494535356760025, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 133.33334350585938, "completions/mean_terminated_length": 133.33334350585938, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.9007238883143743, "grad_norm": 4.143284467335127, "kl": 0.08740234375, "learning_rate": 2.462255799471913e-08, "loss": 0.0035, "num_tokens": 71258481.0, "reward": 1.4583333730697632, "reward_std": 0.42645785212516785, "rewards/reasoning_reward/mean": 1.4583333730697632, "rewards/reasoning_reward/std": 0.5882299542427063, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 141.9166717529297, "completions/mean_terminated_length": 141.9166717529297, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.9017580144777663, "grad_norm": 2.618618934323824, "kl": 0.115234375, "learning_rate": 2.412159444106543e-08, "loss": 0.0046, "num_tokens": 71340479.0, "reward": 0.7083333730697632, "reward_std": 0.21362332999706268, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.6240935921669006, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 113.25, "completions/mean_terminated_length": 113.25, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.9027921406411582, "grad_norm": 2.942245207754384, "kl": 0.06396484375, "learning_rate": 2.3625653647250388e-08, "loss": 0.0026, "num_tokens": 71425653.0, "reward": 1.125, "reward_std": 0.3268197476863861, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.5366967916488647, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 149.58334350585938, "completions/mean_terminated_length": 149.58334350585938, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.9038262668045501, "grad_norm": 3.0454811913733706, "kl": 0.099609375, "learning_rate": 2.3134740847786715e-08, "loss": 0.004, "num_tokens": 71513515.0, "reward": 1.5416667461395264, "reward_std": 0.20693820714950562, "rewards/reasoning_reward/mean": 1.5416666269302368, "rewards/reasoning_reward/std": 0.550032913684845, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 179.33334350585938, "completions/mean_terminated_length": 179.33334350585938, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.9048603929679421, "grad_norm": 2.930325510601697, "kl": 0.054931640625, "learning_rate": 2.2648861224117856e-08, "loss": 0.0022, "num_tokens": 71591131.0, "reward": 1.3541667461395264, "reward_std": 0.3433460593223572, "rewards/reasoning_reward/mean": 1.3541666269302368, "rewards/reasoning_reward/std": 0.47729232907295227, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 179.20834350585938, "completions/mean_terminated_length": 179.20834350585938, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.905894519131334, "grad_norm": 4.432380095813641, "kl": 0.0673828125, "learning_rate": 2.2168019904563683e-08, "loss": 0.0027, "num_tokens": 71668936.0, "reward": 0.5833333730697632, "reward_std": 0.3900056481361389, "rewards/reasoning_reward/mean": 0.5833333134651184, "rewards/reasoning_reward/std": 0.5036101341247559, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 136.125, "completions/mean_terminated_length": 136.125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.9069286452947259, "grad_norm": 3.138554136523989, "kl": 0.072265625, "learning_rate": 2.1692221964266123e-08, "loss": 0.0029, "num_tokens": 71752219.0, "reward": 1.2083333730697632, "reward_std": 0.3535533845424652, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.5882299542427063, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 146.875, "completions/mean_terminated_length": 146.875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.9079627714581179, "grad_norm": 2.9483481427069043, "kl": 0.044677734375, "learning_rate": 2.122147242513578e-08, "loss": 0.0018, "num_tokens": 71832360.0, "reward": 0.7083333730697632, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 0.7083333134651184, "rewards/reasoning_reward/std": 0.4643056094646454, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 150.08334350585938, "completions/mean_terminated_length": 150.08334350585938, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.9089968976215098, "grad_norm": 3.1073607500565905, "kl": 0.048828125, "learning_rate": 2.0755776255798718e-08, "loss": 0.002, "num_tokens": 71916330.0, "reward": 0.8333333730697632, "reward_std": 0.39000558853149414, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.637022078037262, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 175.5416717529297, "completions/mean_terminated_length": 175.5416717529297, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.9100310237849017, "grad_norm": 3.8150186805318964, "kl": 0.06884765625, "learning_rate": 2.0295138371544228e-08, "loss": 0.0028, "num_tokens": 71994855.0, "reward": 0.9583333730697632, "reward_std": 0.46288391947746277, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.7506036162376404, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 107.95833587646484, "completions/mean_terminated_length": 107.95833587646484, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.9110651499482937, "grad_norm": 2.4581793568003634, "kl": 0.05078125, "learning_rate": 1.9839563634272972e-08, "loss": 0.002, "num_tokens": 72073422.0, "reward": 0.9583333730697632, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.20412415266036987, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 156.0416717529297, "completions/mean_terminated_length": 156.0416717529297, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.9120992761116856, "grad_norm": 2.6154409593707686, "kl": 0.0732421875, "learning_rate": 1.938905685244513e-08, "loss": 0.0029, "num_tokens": 72157559.0, "reward": 0.5416666865348816, "reward_std": 0.24800793826580048, "rewards/reasoning_reward/mean": 0.5416666865348816, "rewards/reasoning_reward/std": 0.5882299542427063, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 136.95834350585938, "completions/mean_terminated_length": 136.95834350585938, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.9131334022750776, "grad_norm": 3.5680063997463867, "kl": 0.07763671875, "learning_rate": 1.8943622781030564e-08, "loss": 0.0031, "num_tokens": 72236238.0, "reward": 0.2916666865348816, "reward_std": 0.3268197476863861, "rewards/reasoning_reward/mean": 0.2916666567325592, "rewards/reasoning_reward/std": 0.4643056094646454, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 149.45834350585938, "completions/mean_terminated_length": 149.45834350585938, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.9141675284384695, "grad_norm": 3.2270543081927117, "kl": 0.072265625, "learning_rate": 1.850326612145775e-08, "loss": 0.0029, "num_tokens": 72320209.0, "reward": 1.1041667461395264, "reward_std": 0.323208749294281, "rewards/reasoning_reward/mean": 1.1041666269302368, "rewards/reasoning_reward/std": 0.6753286719322205, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 169.83334350585938, "completions/mean_terminated_length": 169.83334350585938, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.9152016546018614, "grad_norm": 4.260735191605596, "kl": 0.0556640625, "learning_rate": 1.8067991521564852e-08, "loss": 0.0022, "num_tokens": 72399093.0, "reward": 0.8958333730697632, "reward_std": 0.3960779905319214, "rewards/reasoning_reward/mean": 0.8958333134651184, "rewards/reasoning_reward/std": 0.7220015525817871, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 178.625, "completions/mean_terminated_length": 178.625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.9162357807652534, "grad_norm": 3.3552268374877303, "kl": 0.076171875, "learning_rate": 1.7637803575550115e-08, "loss": 0.003, "num_tokens": 72476500.0, "reward": 0.8333333730697632, "reward_std": 0.5009793043136597, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.601929247379303, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 147.33334350585938, "completions/mean_terminated_length": 147.33334350585938, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.9172699069286453, "grad_norm": 2.054383529313265, "kl": 0.0517578125, "learning_rate": 1.7212706823923674e-08, "loss": 0.0021, "num_tokens": 72554484.0, "reward": 0.5416666865348816, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.5416666865348816, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 156.125, "completions/mean_terminated_length": 156.125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.9183040330920372, "grad_norm": 4.1619103374208235, "kl": 0.08837890625, "learning_rate": 1.6792705753459757e-08, "loss": 0.0035, "num_tokens": 72632119.0, "reward": 0.875, "reward_std": 0.5383754968643188, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.5757792592048645, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 152.95834350585938, "completions/mean_terminated_length": 152.95834350585938, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.9193381592554292, "grad_norm": 2.5645639037488097, "kl": 0.06494140625, "learning_rate": 1.6377804797148788e-08, "loss": 0.0026, "num_tokens": 72718998.0, "reward": 1.2291667461395264, "reward_std": 0.2041093111038208, "rewards/reasoning_reward/mean": 1.2291666269302368, "rewards/reasoning_reward/std": 0.48854637145996094, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 152.5416717529297, "completions/mean_terminated_length": 152.5416717529297, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.9203722854188211, "grad_norm": 3.725569433437225, "kl": 0.07177734375, "learning_rate": 1.596800833415135e-08, "loss": 0.0029, "num_tokens": 72796523.0, "reward": 1.0416667461395264, "reward_std": 0.2616034746170044, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.4871538281440735, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 191.5416717529297, "completions/mean_terminated_length": 191.5416717529297, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.921406411582213, "grad_norm": 2.795098631521861, "kl": 0.0556640625, "learning_rate": 1.5563320689751192e-08, "loss": 0.0022, "num_tokens": 72874368.0, "reward": 0.8402777910232544, "reward_std": 0.38484740257263184, "rewards/reasoning_reward/mean": 0.8402777314186096, "rewards/reasoning_reward/std": 0.7824759483337402, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 165.58334350585938, "completions/mean_terminated_length": 165.58334350585938, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.922440537745605, "grad_norm": 2.3803367727934823, "kl": 0.0634765625, "learning_rate": 1.5163746135310186e-08, "loss": 0.0025, "num_tokens": 72953078.0, "reward": 0.875, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.337831974029541, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 355.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 193.33334350585938, "completions/mean_terminated_length": 186.30435180664062, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.9234746639089969, "grad_norm": 3.5618288413220407, "kl": 0.057861328125, "learning_rate": 1.4769288888222985e-08, "loss": 0.0023, "num_tokens": 73031438.0, "reward": 0.7916666865348816, "reward_std": 0.3917974829673767, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.5089774131774902, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 154.0416717529297, "completions/mean_terminated_length": 154.0416717529297, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.9245087900723888, "grad_norm": 4.201487410221962, "kl": 0.11279296875, "learning_rate": 1.4379953111872456e-08, "loss": 0.0045, "num_tokens": 73114015.0, "reward": 0.625, "reward_std": 0.5280382037162781, "rewards/reasoning_reward/mean": 0.625, "rewards/reasoning_reward/std": 0.5943574905395508, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 190.75, "completions/mean_terminated_length": 190.75, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.9255429162357808, "grad_norm": 1.774495070242477, "kl": 0.083984375, "learning_rate": 1.3995742915585806e-08, "loss": 0.0034, "num_tokens": 73203993.0, "reward": 1.5208333730697632, "reward_std": 0.13908717036247253, "rewards/reasoning_reward/mean": 1.5208333730697632, "rewards/reasoning_reward/std": 0.47729232907295227, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 174.7916717529297, "completions/mean_terminated_length": 174.7916717529297, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.9265770423991727, "grad_norm": 1.9576728453570886, "kl": 0.0517578125, "learning_rate": 1.3616662354591356e-08, "loss": 0.0021, "num_tokens": 73284876.0, "reward": 0.875, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.337831974029541, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 186.625, "completions/mean_terminated_length": 186.625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.9276111685625646, "grad_norm": 3.2466683150458957, "kl": 0.07373046875, "learning_rate": 1.3242715429975515e-08, "loss": 0.003, "num_tokens": 73365827.0, "reward": 0.8333333730697632, "reward_std": 0.17817416787147522, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.7019641399383545, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 160.9166717529297, "completions/mean_terminated_length": 160.9166717529297, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.9286452947259566, "grad_norm": 3.1708164450347276, "kl": 0.09814453125, "learning_rate": 1.2873906088640474e-08, "loss": 0.0039, "num_tokens": 73443857.0, "reward": 0.7708333730697632, "reward_std": 0.3116035461425781, "rewards/reasoning_reward/mean": 0.7708333134651184, "rewards/reasoning_reward/std": 0.46576645970344543, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 172.25, "completions/mean_terminated_length": 172.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.9296794208893485, "grad_norm": 2.0259653414483583, "kl": 0.052001953125, "learning_rate": 1.251023822326308e-08, "loss": 0.0021, "num_tokens": 73524719.0, "reward": 0.375, "reward_std": 0.07715167850255966, "rewards/reasoning_reward/mean": 0.375, "rewards/reasoning_reward/std": 0.5565811395645142, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 161.5416717529297, "completions/mean_terminated_length": 161.5416717529297, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.9307135470527405, "grad_norm": 3.3236866100891342, "kl": 0.051025390625, "learning_rate": 1.2151715672252983e-08, "loss": 0.002, "num_tokens": 73601612.0, "reward": 0.9166666865348816, "reward_std": 0.30416232347488403, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.4815433919429779, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 147.6666717529297, "completions/mean_terminated_length": 147.6666717529297, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.9317476732161324, "grad_norm": 2.423969131690449, "kl": 0.07470703125, "learning_rate": 1.179834221971282e-08, "loss": 0.003, "num_tokens": 73686812.0, "reward": 1.2916667461395264, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 1.2916666269302368, "rewards/reasoning_reward/std": 0.550032913684845, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 140.1666717529297, "completions/mean_terminated_length": 140.1666717529297, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.9327817993795243, "grad_norm": 3.842346880234536, "kl": 0.0615234375, "learning_rate": 1.145012159539771e-08, "loss": 0.0025, "num_tokens": 73762864.0, "reward": 0.6041666865348816, "reward_std": 0.4130779504776001, "rewards/reasoning_reward/mean": 0.6041666865348816, "rewards/reasoning_reward/std": 0.6753286719322205, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 142.08334350585938, "completions/mean_terminated_length": 142.08334350585938, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.9338159255429163, "grad_norm": 3.166928125853971, "kl": 0.10595703125, "learning_rate": 1.110705747467644e-08, "loss": 0.0043, "num_tokens": 73844906.0, "reward": 0.9652777910232544, "reward_std": 0.2400643527507782, "rewards/reasoning_reward/mean": 0.9652777314186096, "rewards/reasoning_reward/std": 0.7420743107795715, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 163.1666717529297, "completions/mean_terminated_length": 163.1666717529297, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.9348500517063082, "grad_norm": 4.500549858418397, "kl": 0.1015625, "learning_rate": 1.076915347849211e-08, "loss": 0.0041, "num_tokens": 73922094.0, "reward": 1.0, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.2948839068412781, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 142.6666717529297, "completions/mean_terminated_length": 142.6666717529297, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.9358841778697001, "grad_norm": 4.169667991174683, "kl": 0.10302734375, "learning_rate": 1.0436413173324387e-08, "loss": 0.0041, "num_tokens": 74015702.0, "reward": 1.5416667461395264, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 1.5416666269302368, "rewards/reasoning_reward/std": 0.6412736773490906, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 170.58334350585938, "completions/mean_terminated_length": 170.58334350585938, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.9369183040330921, "grad_norm": 4.512863571654002, "kl": 0.07861328125, "learning_rate": 1.0108840071151648e-08, "loss": 0.0031, "num_tokens": 74091884.0, "reward": 1.0625, "reward_std": 0.6556500792503357, "rewards/reasoning_reward/mean": 1.0625, "rewards/reasoning_reward/std": 0.6964584589004517, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 161.0, "completions/mean_terminated_length": 161.0, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.937952430196484, "grad_norm": 2.964181511090769, "kl": 0.09423828125, "learning_rate": 9.786437629413669e-09, "loss": 0.0038, "num_tokens": 74174724.0, "reward": 1.4166667461395264, "reward_std": 0.1346571445465088, "rewards/reasoning_reward/mean": 1.4166666269302368, "rewards/reasoning_reward/std": 0.39927470684051514, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 159.5, "completions/mean_terminated_length": 159.5, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.9389865563598759, "grad_norm": 0.17194578664988994, "kl": 0.06396484375, "learning_rate": 9.469209250975774e-09, "loss": 0.0026, "num_tokens": 74255520.0, "reward": 0.6666666865348816, "reward_std": 0.0, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 164.5, "completions/mean_terminated_length": 164.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.9400206825232679, "grad_norm": 3.278300391654624, "kl": 0.055419921875, "learning_rate": 9.157158284092248e-09, "loss": 0.0022, "num_tokens": 74340668.0, "reward": 0.7916666865348816, "reward_std": 0.4082186222076416, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 153.5, "completions/mean_terminated_length": 153.5, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.9410548086866598, "grad_norm": 3.3566487108649325, "kl": 0.055908203125, "learning_rate": 8.850288022371478e-09, "loss": 0.0022, "num_tokens": 74419544.0, "reward": 0.7916666865348816, "reward_std": 0.3268197476863861, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4148510992527008, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 123.66667175292969, "completions/mean_terminated_length": 123.66667175292969, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.9420889348500517, "grad_norm": 2.6002648351979984, "kl": 0.0478515625, "learning_rate": 8.548601704740754e-09, "loss": 0.0019, "num_tokens": 74500136.0, "reward": 0.7916666865348816, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.4148511290550232, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 149.875, "completions/mean_terminated_length": 149.875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.9431230610134437, "grad_norm": 3.0400992365987864, "kl": 0.044189453125, "learning_rate": 8.25210251541264e-09, "loss": 0.0018, "num_tokens": 74576941.0, "reward": 0.9583333730697632, "reward_std": 0.29602527618408203, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 159.25, "completions/mean_terminated_length": 159.25, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.9441571871768356, "grad_norm": 3.3459982295899047, "kl": 0.0830078125, "learning_rate": 7.960793583850767e-09, "loss": 0.0033, "num_tokens": 74667907.0, "reward": 1.3194445371627808, "reward_std": 0.38191652297973633, "rewards/reasoning_reward/mean": 1.3194445371627808, "rewards/reasoning_reward/std": 0.5580258965492249, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 147.20834350585938, "completions/mean_terminated_length": 147.20834350585938, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.9451913133402275, "grad_norm": 3.428054229956691, "kl": 0.08642578125, "learning_rate": 7.674677984737255e-09, "loss": 0.0035, "num_tokens": 74750248.0, "reward": 1.1666667461395264, "reward_std": 0.3535081148147583, "rewards/reasoning_reward/mean": 1.1666666269302368, "rewards/reasoning_reward/std": 0.4340573847293854, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 154.20834350585938, "completions/mean_terminated_length": 154.20834350585938, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.9462254395036195, "grad_norm": 1.9872121372484721, "kl": 0.042724609375, "learning_rate": 7.393758737940126e-09, "loss": 0.0017, "num_tokens": 74829717.0, "reward": 0.9583333730697632, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 0.9583333134651184, "rewards/reasoning_reward/std": 0.20412415266036987, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 132.0, "completions/mean_terminated_length": 132.0, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.9472595656670114, "grad_norm": 0.19738482494940646, "kl": 0.0498046875, "learning_rate": 7.1180388084811635e-09, "loss": 0.002, "num_tokens": 74909541.0, "reward": 0.6666666865348816, "reward_std": 0.0, "rewards/reasoning_reward/mean": 0.6666666865348816, "rewards/reasoning_reward/std": 0.4815434217453003, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 156.08334350585938, "completions/mean_terminated_length": 156.08334350585938, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.9482936918304034, "grad_norm": 3.2992755635594575, "kl": 0.07373046875, "learning_rate": 6.847521106505105e-09, "loss": 0.0029, "num_tokens": 74987655.0, "reward": 1.277777910232544, "reward_std": 0.33984312415122986, "rewards/reasoning_reward/mean": 1.2777777910232544, "rewards/reasoning_reward/std": 0.478187620639801, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 158.75, "completions/mean_terminated_length": 158.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.9493278179937953, "grad_norm": 4.365967227862626, "kl": 0.07666015625, "learning_rate": 6.582208487248497e-09, "loss": 0.0031, "num_tokens": 75070441.0, "reward": 1.3333333730697632, "reward_std": 0.24043500423431396, "rewards/reasoning_reward/mean": 1.3333333730697632, "rewards/reasoning_reward/std": 0.5582062602043152, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 170.4166717529297, "completions/mean_terminated_length": 170.4166717529297, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.9503619441571872, "grad_norm": 2.937333755573114, "kl": 0.07470703125, "learning_rate": 6.322103751009833e-09, "loss": 0.003, "num_tokens": 75154867.0, "reward": 1.1041667461395264, "reward_std": 0.2644323706626892, "rewards/reasoning_reward/mean": 1.1041666269302368, "rewards/reasoning_reward/std": 0.642332136631012, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 165.875, "completions/mean_terminated_length": 165.875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.9513960703205792, "grad_norm": 2.975341812717674, "kl": 0.06591796875, "learning_rate": 6.067209643119908e-09, "loss": 0.0026, "num_tokens": 75245144.0, "reward": 1.2569444179534912, "reward_std": 0.3012464940547943, "rewards/reasoning_reward/mean": 1.2569444179534912, "rewards/reasoning_reward/std": 0.8708637952804565, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 160.20834350585938, "completions/mean_terminated_length": 160.20834350585938, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.9524301964839711, "grad_norm": 4.142950589572342, "kl": 0.06640625, "learning_rate": 5.817528853912735e-09, "loss": 0.0027, "num_tokens": 75324685.0, "reward": 0.5416666865348816, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 0.5416666865348816, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 170.58334350585938, "completions/mean_terminated_length": 170.58334350585938, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.953464322647363, "grad_norm": 3.416223841212196, "kl": 0.0537109375, "learning_rate": 5.573064018697393e-09, "loss": 0.0021, "num_tokens": 75404139.0, "reward": 0.5416666865348816, "reward_std": 0.3268197476863861, "rewards/reasoning_reward/mean": 0.5416666865348816, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 187.6666717529297, "completions/mean_terminated_length": 187.6666717529297, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.954498448810755, "grad_norm": 2.3838014741617886, "kl": 0.07275390625, "learning_rate": 5.333817717729894e-09, "loss": 0.0029, "num_tokens": 75481619.0, "reward": 1.625, "reward_std": 0.07715167850255966, "rewards/reasoning_reward/mean": 1.625, "rewards/reasoning_reward/std": 0.47204458713531494, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 132.7916717529297, "completions/mean_terminated_length": 132.7916717529297, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.9555325749741469, "grad_norm": 4.457264325946904, "kl": 0.08251953125, "learning_rate": 5.099792476186249e-09, "loss": 0.0033, "num_tokens": 75563334.0, "reward": 1.125, "reward_std": 0.47920867800712585, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.5160468220710754, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 170.20834350585938, "completions/mean_terminated_length": 170.20834350585938, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.9565667011375388, "grad_norm": 5.647062715557137, "kl": 0.09619140625, "learning_rate": 4.870990764135552e-09, "loss": 0.0039, "num_tokens": 75647859.0, "reward": 0.75, "reward_std": 0.510651707649231, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.5316095352172852, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 126.04167175292969, "completions/mean_terminated_length": 126.04167175292969, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.9576008273009308, "grad_norm": 3.442181606982195, "kl": 0.055908203125, "learning_rate": 4.647414996514276e-09, "loss": 0.0022, "num_tokens": 75723900.0, "reward": 1.3958333730697632, "reward_std": 0.23144195973873138, "rewards/reasoning_reward/mean": 1.3958333730697632, "rewards/reasoning_reward/std": 0.4164854884147644, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 137.83334350585938, "completions/mean_terminated_length": 137.83334350585938, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.9586349534643226, "grad_norm": 2.5733431583919484, "kl": 0.0498046875, "learning_rate": 4.429067533100294e-09, "loss": 0.002, "num_tokens": 75802360.0, "reward": 0.7916666865348816, "reward_std": 0.3053751587867737, "rewards/reasoning_reward/mean": 0.7916666865348816, "rewards/reasoning_reward/std": 0.7790276408195496, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 166.9166717529297, "completions/mean_terminated_length": 166.9166717529297, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.9596690796277145, "grad_norm": 3.220240618399756, "kl": 0.08544921875, "learning_rate": 4.2159506784884e-09, "loss": 0.0034, "num_tokens": 75885278.0, "reward": 1.3125, "reward_std": 0.22466278076171875, "rewards/reasoning_reward/mean": 1.3125, "rewards/reasoning_reward/std": 0.4848240315914154, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 174.08334350585938, "completions/mean_terminated_length": 174.08334350585938, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.9607032057911065, "grad_norm": 3.6058446603983922, "kl": 0.061279296875, "learning_rate": 4.0080666820657135e-09, "loss": 0.0024, "num_tokens": 75970624.0, "reward": 0.9548611044883728, "reward_std": 0.18729600310325623, "rewards/reasoning_reward/mean": 0.9548611044883728, "rewards/reasoning_reward/std": 0.6269795298576355, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 173.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.9617373319544984, "grad_norm": 2.476675970867201, "kl": 0.041748046875, "learning_rate": 3.805417737988148e-09, "loss": 0.0017, "num_tokens": 76049588.0, "reward": 0.5416666865348816, "reward_std": 0.19416078925132751, "rewards/reasoning_reward/mean": 0.5416666865348816, "rewards/reasoning_reward/std": 0.5299029350280762, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 146.7916717529297, "completions/mean_terminated_length": 146.7916717529297, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.9627714581178903, "grad_norm": 3.988164632708771, "kl": 0.12158203125, "learning_rate": 3.6080059851570366e-09, "loss": 0.0049, "num_tokens": 76137775.0, "reward": 1.375, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 1.375, "rewards/reasoning_reward/std": 0.494535356760025, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 159.5416717529297, "completions/mean_terminated_length": 159.5416717529297, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.9638055842812823, "grad_norm": 1.9405645993516027, "kl": 0.04052734375, "learning_rate": 3.415833507196764e-09, "loss": 0.0016, "num_tokens": 76215860.0, "reward": 0.5833333730697632, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 0.5833333134651184, "rewards/reasoning_reward/std": 0.5036101937294006, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 177.08334350585938, "completions/mean_terminated_length": 168.68182373046875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.9648397104446742, "grad_norm": 1.732646593438274, "kl": 0.045654296875, "learning_rate": 3.2289023324325592e-09, "loss": 0.0018, "num_tokens": 76291502.0, "reward": 0.75, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.4423258602619171, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 150.2916717529297, "completions/mean_terminated_length": 150.2916717529297, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.9658738366080661, "grad_norm": 2.1551686750168, "kl": 0.045166015625, "learning_rate": 3.0472144338692386e-09, "loss": 0.0018, "num_tokens": 76368853.0, "reward": 0.875, "reward_std": 0.17251639068126678, "rewards/reasoning_reward/mean": 0.875, "rewards/reasoning_reward/std": 0.337831974029541, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 161.0416717529297, "completions/mean_terminated_length": 161.0416717529297, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.9669079627714581, "grad_norm": 4.651808954793325, "kl": 0.064453125, "learning_rate": 2.8707717291704405e-09, "loss": 0.0026, "num_tokens": 76447278.0, "reward": 0.5416666865348816, "reward_std": 0.42201346158981323, "rewards/reasoning_reward/mean": 0.5416666865348816, "rewards/reasoning_reward/std": 0.4871537983417511, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 151.70834350585938, "completions/mean_terminated_length": 151.70834350585938, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.96794208893485, "grad_norm": 0.17710222219951927, "kl": 0.05078125, "learning_rate": 2.6995760806381994e-09, "loss": 0.002, "num_tokens": 76526415.0, "reward": 1.0, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.0, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 163.2916717529297, "completions/mean_terminated_length": 163.2916717529297, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.9689762150982419, "grad_norm": 4.476052750730717, "kl": 0.10498046875, "learning_rate": 2.5336292951933513e-09, "loss": 0.0042, "num_tokens": 76603662.0, "reward": 1.1458333730697632, "reward_std": 0.4981882572174072, "rewards/reasoning_reward/mean": 1.1458333730697632, "rewards/reasoning_reward/std": 0.5800893306732178, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 178.0416717529297, "completions/mean_terminated_length": 178.0416717529297, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.9700103412616339, "grad_norm": 3.9771878211904466, "kl": 0.0654296875, "learning_rate": 2.372933124356602e-09, "loss": 0.0026, "num_tokens": 76681175.0, "reward": 0.8541666865348816, "reward_std": 0.5118739604949951, "rewards/reasoning_reward/mean": 0.8541666865348816, "rewards/reasoning_reward/std": 0.6507381796836853, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 125.5, "completions/mean_terminated_length": 125.5, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.9710444674250258, "grad_norm": 0.8056357466789215, "kl": 0.08740234375, "learning_rate": 2.2174892642298215e-09, "loss": 0.0035, "num_tokens": 76766011.0, "reward": 1.3333333730697632, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.3333333730697632, "rewards/reasoning_reward/std": 0.4815433919429779, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 132.83334350585938, "completions/mean_terminated_length": 132.83334350585938, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.9720785935884177, "grad_norm": 2.873333533007421, "kl": 0.076171875, "learning_rate": 2.0672993554783356e-09, "loss": 0.003, "num_tokens": 76851615.0, "reward": 1.0416667461395264, "reward_std": 0.1178511306643486, "rewards/reasoning_reward/mean": 1.0416666269302368, "rewards/reasoning_reward/std": 0.20412413775920868, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 167.20834350585938, "completions/mean_terminated_length": 167.20834350585938, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.9731127197518097, "grad_norm": 2.6371055405899173, "kl": 0.060546875, "learning_rate": 1.9223649833133847e-09, "loss": 0.0024, "num_tokens": 76931660.0, "reward": 0.6527777910232544, "reward_std": 0.35072192549705505, "rewards/reasoning_reward/mean": 0.6527777314186096, "rewards/reasoning_reward/std": 0.533687949180603, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 171.375, "completions/mean_terminated_length": 171.375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.9741468459152016, "grad_norm": 4.0770969322540385, "kl": 0.10791015625, "learning_rate": 1.782687677475747e-09, "loss": 0.0043, "num_tokens": 77014581.0, "reward": 1.0347223281860352, "reward_std": 0.5191335678100586, "rewards/reasoning_reward/mean": 1.0347222089767456, "rewards/reasoning_reward/std": 0.6862682700157166, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 136.20834350585938, "completions/mean_terminated_length": 136.20834350585938, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.9751809720785936, "grad_norm": 3.5475163041115847, "kl": 0.091796875, "learning_rate": 1.6482689122191418e-09, "loss": 0.0037, "num_tokens": 77096658.0, "reward": 1.0208333730697632, "reward_std": 0.24056154489517212, "rewards/reasoning_reward/mean": 1.0208333730697632, "rewards/reasoning_reward/std": 0.6672325730323792, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 164.625, "completions/mean_terminated_length": 164.625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.9762150982419855, "grad_norm": 2.9003959757657087, "kl": 0.05078125, "learning_rate": 1.5191101062950186e-09, "loss": 0.002, "num_tokens": 77177521.0, "reward": 0.9791666865348816, "reward_std": 0.2041093111038208, "rewards/reasoning_reward/mean": 0.9791666865348816, "rewards/reasoning_reward/std": 0.7442411780357361, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 177.125, "completions/mean_terminated_length": 177.125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.9772492244053774, "grad_norm": 2.6967137886335943, "kl": 0.052490234375, "learning_rate": 1.3952126229375693e-09, "loss": 0.0021, "num_tokens": 77257004.0, "reward": 0.75, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 0.75, "rewards/reasoning_reward/std": 0.5316095352172852, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 144.4166717529297, "completions/mean_terminated_length": 144.4166717529297, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.9782833505687694, "grad_norm": 3.0293411969549453, "kl": 0.055908203125, "learning_rate": 1.2765777698490188e-09, "loss": 0.0022, "num_tokens": 77339542.0, "reward": 0.9166666865348816, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 0.9166666865348816, "rewards/reasoning_reward/std": 0.28232985734939575, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 161.9166717529297, "completions/mean_terminated_length": 161.9166717529297, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.9793174767321613, "grad_norm": 3.148361986897516, "kl": 0.044677734375, "learning_rate": 1.163206799186245e-09, "loss": 0.0018, "num_tokens": 77423660.0, "reward": 1.2083333730697632, "reward_std": 0.2721545100212097, "rewards/reasoning_reward/mean": 1.2083333730697632, "rewards/reasoning_reward/std": 0.5089773535728455, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 134.20834350585938, "completions/mean_terminated_length": 134.20834350585938, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.9803516028955532, "grad_norm": 2.1261463422153004, "kl": 0.056396484375, "learning_rate": 1.0551009075471795e-09, "loss": 0.0023, "num_tokens": 77509713.0, "reward": 1.25, "reward_std": 0.15430335700511932, "rewards/reasoning_reward/mean": 1.25, "rewards/reasoning_reward/std": 0.4423258602619171, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 122.29167175292969, "completions/mean_terminated_length": 122.29167175292969, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.9813857290589452, "grad_norm": 2.9707407010412505, "kl": 0.0703125, "learning_rate": 9.522612359585402e-10, "loss": 0.0028, "num_tokens": 77594632.0, "reward": 1.1458333730697632, "reward_std": 0.16517187654972076, "rewards/reasoning_reward/mean": 1.1458333730697632, "rewards/reasoning_reward/std": 0.7144345045089722, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 134.45834350585938, "completions/mean_terminated_length": 134.45834350585938, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.9824198552223371, "grad_norm": 3.7149348047540585, "kl": 0.060302734375, "learning_rate": 8.546888698634513e-10, "loss": 0.0024, "num_tokens": 77671787.0, "reward": 1.4375, "reward_std": 0.24232356250286102, "rewards/reasoning_reward/mean": 1.4375, "rewards/reasoning_reward/std": 0.47348156571388245, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 175.25, "completions/mean_terminated_length": 175.25, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.983453981385729, "grad_norm": 3.0064934874302542, "kl": 0.06640625, "learning_rate": 7.623848391102305e-10, "loss": 0.0027, "num_tokens": 77750801.0, "reward": 0.8333333730697632, "reward_std": 0.30860671401023865, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 147.4166717529297, "completions/mean_terminated_length": 147.4166717529297, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.984488107549121, "grad_norm": 4.044111150534412, "kl": 0.0732421875, "learning_rate": 6.753501179413423e-10, "loss": 0.0029, "num_tokens": 77836395.0, "reward": 1.125, "reward_std": 0.36751919984817505, "rewards/reasoning_reward/mean": 1.125, "rewards/reasoning_reward/std": 0.6634888052940369, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 146.0, "completions/mean_terminated_length": 146.0, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.9855222337125129, "grad_norm": 3.045502458871555, "kl": 0.0859375, "learning_rate": 5.935856249833504e-10, "loss": 0.0034, "num_tokens": 77925307.0, "reward": 1.3541667461395264, "reward_std": 0.33324795961380005, "rewards/reasoning_reward/mean": 1.3541666269302368, "rewards/reasoning_reward/std": 0.47729232907295227, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 154.70834350585938, "completions/mean_terminated_length": 154.70834350585938, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.9865563598759048, "grad_norm": 0.2945822889932008, "kl": 0.052978515625, "learning_rate": 5.170922232369257e-10, "loss": 0.0021, "num_tokens": 78002764.0, "reward": 1.0, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.0, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 130.75, "completions/mean_terminated_length": 130.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.9875904860392968, "grad_norm": 0.1833079894818197, "kl": 0.046630859375, "learning_rate": 4.4587072006796455e-10, "loss": 0.0019, "num_tokens": 78081614.0, "reward": 1.0, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.0, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 152.75, "completions/mean_terminated_length": 152.75, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.9886246122026887, "grad_norm": 3.4542538672410337, "kl": 0.051513671875, "learning_rate": 3.7992186719892907e-10, "loss": 0.0021, "num_tokens": 78160656.0, "reward": 0.3333333432674408, "reward_std": 0.2357022613286972, "rewards/reasoning_reward/mean": 0.3333333432674408, "rewards/reasoning_reward/std": 0.4815433919429779, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 182.625, "completions/mean_terminated_length": 182.625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.9896587383660806, "grad_norm": 3.6021539388001673, "kl": 0.10205078125, "learning_rate": 3.1924636070107535e-10, "loss": 0.0041, "num_tokens": 78238399.0, "reward": 0.7708333730697632, "reward_std": 0.31726133823394775, "rewards/reasoning_reward/mean": 0.7708333134651184, "rewards/reasoning_reward/std": 0.416485458612442, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 159.9166717529297, "completions/mean_terminated_length": 159.9166717529297, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.9906928645294726, "grad_norm": 3.4288036998953846, "kl": 0.0771484375, "learning_rate": 2.6384484098690427e-10, "loss": 0.0031, "num_tokens": 78317653.0, "reward": 1.3958333730697632, "reward_std": 0.29116004705429077, "rewards/reasoning_reward/mean": 1.3958333730697632, "rewards/reasoning_reward/std": 0.4657664895057678, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 157.6666717529297, "completions/mean_terminated_length": 157.6666717529297, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.9917269906928645, "grad_norm": 3.8235718441583564, "kl": 0.09716796875, "learning_rate": 2.1371789280355547e-10, "loss": 0.0039, "num_tokens": 78400205.0, "reward": 1.1041667461395264, "reward_std": 0.3190067708492279, "rewards/reasoning_reward/mean": 1.1041666269302368, "rewards/reasoning_reward/std": 0.8072924017906189, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 145.7916717529297, "completions/mean_terminated_length": 145.7916717529297, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.9927611168562565, "grad_norm": 3.9856327424940687, "kl": 0.083984375, "learning_rate": 1.6886604522659e-10, "loss": 0.0034, "num_tokens": 78481128.0, "reward": 1.0, "reward_std": 0.5582748055458069, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.7939992547035217, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 163.9166717529297, "completions/mean_terminated_length": 163.9166717529297, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.9937952430196484, "grad_norm": 0.2081115188693481, "kl": 0.06591796875, "learning_rate": 1.292897716542729e-10, "loss": 0.0026, "num_tokens": 78560702.0, "reward": 1.0, "reward_std": 0.0, "rewards/reasoning_reward/mean": 1.0, "rewards/reasoning_reward/std": 0.0, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 168.70834350585938, "completions/mean_terminated_length": 168.70834350585938, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.9948293691830403, "grad_norm": 2.780255312640307, "kl": 0.062255859375, "learning_rate": 9.498948980291021e-11, "loss": 0.0025, "num_tokens": 78643983.0, "reward": 1.3125, "reward_std": 0.20665977895259857, "rewards/reasoning_reward/mean": 1.3125, "rewards/reasoning_reward/std": 0.355469673871994, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 149.75, "completions/mean_terminated_length": 149.75, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.9958634953464323, "grad_norm": 3.0502524409622622, "kl": 0.046142578125, "learning_rate": 6.59655617020749e-11, "loss": 0.0018, "num_tokens": 78724921.0, "reward": 1.0208333730697632, "reward_std": 0.2965203523635864, "rewards/reasoning_reward/mean": 1.0208333730697632, "rewards/reasoning_reward/std": 0.6507381200790405, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 132.08334350585938, "completions/mean_terminated_length": 132.08334350585938, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.9968976215098242, "grad_norm": 2.2883893243678246, "kl": 0.10888671875, "learning_rate": 4.221829369094321e-11, "loss": 0.0043, "num_tokens": 78809019.0, "reward": 0.8333333730697632, "reward_std": 0.17817416787147522, "rewards/reasoning_reward/mean": 0.8333333134651184, "rewards/reasoning_reward/std": 0.3806934952735901, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 140.95834350585938, "completions/mean_terminated_length": 140.95834350585938, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.9979317476732161, "grad_norm": 3.6433341964307564, "kl": 0.08447265625, "learning_rate": 2.374793641518602e-11, "loss": 0.0034, "num_tokens": 78883898.0, "reward": 0.7013888359069824, "reward_std": 0.3162800669670105, "rewards/reasoning_reward/mean": 0.7013888359069824, "rewards/reasoning_reward/std": 0.7222802639007568, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 190.4499969482422, "completions/mean_terminated_length": 190.4499969482422, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.9989658738366081, "grad_norm": 2.642107476748316, "kl": 0.05419921875, "learning_rate": 1.0554684824137794e-11, "loss": 0.0024, "num_tokens": 78951363.0, "reward": 1.0833333730697632, "reward_std": 0.2903675436973572, "rewards/reasoning_reward/mean": 1.0833333730697632, "rewards/reasoning_reward/std": 0.7172815203666687, "step": 966 }, { "epoch": 0.9989658738366081, "step": 966, "total_flos": 0.0, "train_loss": 0.002488073633344592, "train_runtime": 159825.8329, "train_samples_per_second": 0.018, "train_steps_per_second": 0.006 } ], "logging_steps": 1.0, "max_steps": 967, "num_input_tokens_seen": 78951363, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }