diff --git "a/dapo_lorafa_20251202_173337/checkpoint-576/trainer_state.json" "b/dapo_lorafa_20251202_173337/checkpoint-576/trainer_state.json" new file mode 100644--- /dev/null +++ "b/dapo_lorafa_20251202_173337/checkpoint-576/trainer_state.json" @@ -0,0 +1,17890 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5298988040478381, + "eval_steps": 500, + "global_step": 576, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15689.0, + "completions/max_terminated_length": 15689.0, + "completions/mean_length": 6039.171875, + "completions/mean_terminated_length": 6039.171875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 1.19118632376194, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0025743793230503798, + "learning_rate": 1e-05, + "loss": 0.0591, + "num_tokens": 792270.0, + "reward": 0.25, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940395355225, + "sampling/importance_sampling_ratio/min": 0.0002457273658365011, + "sampling/sampling_logp_difference/max": 8.311287879943848, + "sampling/sampling_logp_difference/mean": 0.021642697975039482, + "step": 1 + }, + { + "clip_ratio/high_max": 4.920872470393078e-06, + "clip_ratio/high_mean": 1.2302181175982696e-06, + "clip_ratio/low_mean": 2.9912232776041492e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1142450779952924e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14377.0, + "completions/max_terminated_length": 14377.0, + "completions/mean_length": 4861.1796875, + "completions/mean_terminated_length": 4861.1796875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 1.0784558206796646, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023554943036288023, + "learning_rate": 1e-05, + "loss": 0.0948, + "num_tokens": 1437829.0, + "reward": 0.3515625, + "reward_std": 0.2869499623775482, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99991774559021, + "sampling/importance_sampling_ratio/min": 0.00045694064465351403, + "sampling/sampling_logp_difference/max": 7.690957069396973, + "sampling/sampling_logp_difference/mean": 0.018809247761964798, + "step": 2 + }, + { + "clip_ratio/high_max": 1.673043971095467e-05, + "clip_ratio/high_mean": 4.8752071961644106e-06, + "clip_ratio/low_mean": 2.1540331545111258e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6415538741275668e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15479.0, + "completions/mean_length": 6167.5078125, + "completions/mean_terminated_length": 5922.3125, + "completions/min_length": 788.0, + "completions/min_terminated_length": 788.0, + "entropy": 1.1373522356152534, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002558506093919277, + "learning_rate": 1e-05, + "loss": 0.0288, + "num_tokens": 2245838.0, + "reward": 0.296875, + "reward_std": 0.2669745087623596, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000486373901367, + "sampling/importance_sampling_ratio/min": 2.8637201467063278e-05, + "sampling/sampling_logp_difference/max": 10.460803985595703, + "sampling/sampling_logp_difference/mean": 0.02123238891363144, + "step": 3 + }, + { + "clip_ratio/high_max": 4.3118818666698644e-05, + "clip_ratio/high_mean": 1.0779704666674661e-05, + "clip_ratio/low_mean": 3.257358957853285e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.335329458626802e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15900.0, + "completions/mean_length": 5691.9296875, + "completions/mean_terminated_length": 5435.3203125, + "completions/min_length": 535.0, + "completions/min_terminated_length": 535.0, + "entropy": 1.1964457035064697, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001936351996846497, + "learning_rate": 1e-05, + "loss": 0.0366, + "num_tokens": 2998805.0, + "reward": 0.3046875, + "reward_std": 0.2727435827255249, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999518990516663, + "sampling/importance_sampling_ratio/min": 9.316575415141415e-06, + "sampling/sampling_logp_difference/max": 11.583715438842773, + "sampling/sampling_logp_difference/mean": 0.021076630800962448, + "step": 4 + }, + { + "clip_ratio/high_max": 1.666655725784949e-05, + "clip_ratio/high_mean": 4.1666393144623726e-06, + "clip_ratio/low_mean": 2.0471738594096678e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4638378022245888e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15148.0, + "completions/max_terminated_length": 15148.0, + "completions/mean_length": 5535.828125, + "completions/mean_terminated_length": 5535.828125, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "entropy": 1.0935996025800705, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003226158209145069, + "learning_rate": 1e-05, + "loss": 0.0451, + "num_tokens": 3727959.0, + "reward": 0.3046875, + "reward_std": 0.24671241641044617, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000033378601074, + "sampling/importance_sampling_ratio/min": 5.9354013501433656e-05, + "sampling/sampling_logp_difference/max": 9.731990814208984, + "sampling/sampling_logp_difference/mean": 0.019589610397815704, + "step": 5 + }, + { + "clip_ratio/high_max": 1.9090986825176515e-05, + "clip_ratio/high_mean": 4.772746706294129e-06, + "clip_ratio/low_mean": 1.995503203033877e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4727778054511873e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14820.0, + "completions/mean_length": 4552.9296875, + "completions/mean_terminated_length": 4459.771484375, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "entropy": 0.9019740223884583, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002911025658249855, + "learning_rate": 1e-05, + "loss": 0.0742, + "num_tokens": 4329342.0, + "reward": 0.4375, + "reward_std": 0.3448186218738556, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999092817306519, + "sampling/importance_sampling_ratio/min": 0.0010333366226404905, + "sampling/sampling_logp_difference/max": 6.874962329864502, + "sampling/sampling_logp_difference/mean": 0.01768551766872406, + "step": 6 + }, + { + "clip_ratio/high_max": 9.186584293274791e-06, + "clip_ratio/high_mean": 2.2966460733186977e-06, + "clip_ratio/low_mean": 1.9561108047128073e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.185775372254284e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14197.0, + "completions/mean_length": 5849.4921875, + "completions/mean_terminated_length": 5682.2783203125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.1362405940890312, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018774238415062428, + "learning_rate": 1e-05, + "loss": 0.0106, + "num_tokens": 5097245.0, + "reward": 0.1953125, + "reward_std": 0.2369818240404129, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999154210090637, + "sampling/importance_sampling_ratio/min": 0.00020401047368068248, + "sampling/sampling_logp_difference/max": 8.497339248657227, + "sampling/sampling_logp_difference/mean": 0.020379718393087387, + "step": 7 + }, + { + "clip_ratio/high_max": 7.997417014848907e-06, + "clip_ratio/high_mean": 1.9993542537122266e-06, + "clip_ratio/low_mean": 4.003535150332027e-05, + "clip_ratio/low_min": 4.32017714047106e-06, + "clip_ratio/region_mean": 4.203470598440617e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16045.0, + "completions/mean_length": 5744.6796875, + "completions/mean_terminated_length": 5575.8017578125, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "entropy": 0.989105150103569, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0025437718722969294, + "learning_rate": 1e-05, + "loss": 0.0641, + "num_tokens": 5851844.0, + "reward": 0.375, + "reward_std": 0.35901516675949097, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999915957450867, + "sampling/importance_sampling_ratio/min": 4.312803503125906e-05, + "sampling/sampling_logp_difference/max": 10.051337242126465, + "sampling/sampling_logp_difference/mean": 0.020163267850875854, + "step": 8 + }, + { + "clip_ratio/high_max": 5.422758022177732e-06, + "clip_ratio/high_mean": 1.355689505544433e-06, + "clip_ratio/low_mean": 3.697482691222831e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.833051641777274e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15874.0, + "completions/mean_length": 4075.9609375, + "completions/mean_terminated_length": 3979.047119140625, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "entropy": 0.8887222409248352, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024127138312906027, + "learning_rate": 1e-05, + "loss": 0.0733, + "num_tokens": 6392287.0, + "reward": 0.4140625, + "reward_std": 0.32825323939323425, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999527335166931, + "sampling/importance_sampling_ratio/min": 4.007668394478969e-05, + "sampling/sampling_logp_difference/max": 10.124715805053711, + "sampling/sampling_logp_difference/mean": 0.017202626913785934, + "step": 9 + }, + { + "clip_ratio/high_max": 1.9414138932916103e-05, + "clip_ratio/high_mean": 5.8681449672803865e-06, + "clip_ratio/low_mean": 4.918625745631289e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.5054402309906436e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15691.0, + "completions/mean_length": 5248.3984375, + "completions/mean_terminated_length": 4981.14404296875, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "entropy": 0.7111036106944084, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0028383845929056406, + "learning_rate": 1e-05, + "loss": 0.1027, + "num_tokens": 7081234.0, + "reward": 0.5625, + "reward_std": 0.4150439500808716, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999589323997498, + "sampling/importance_sampling_ratio/min": 0.00037057927693240345, + "sampling/sampling_logp_difference/max": 7.900443077087402, + "sampling/sampling_logp_difference/mean": 0.01570993661880493, + "step": 10 + }, + { + "clip_ratio/high_max": 7.0035857788752764e-06, + "clip_ratio/high_mean": 1.7508964447188191e-06, + "clip_ratio/low_mean": 1.4078211620471848e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.5829108065190667e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16172.0, + "completions/max_terminated_length": 16172.0, + "completions/mean_length": 4956.6015625, + "completions/mean_terminated_length": 4956.6015625, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "entropy": 1.026921771466732, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001392067177221179, + "learning_rate": 1e-05, + "loss": 0.0589, + "num_tokens": 7735511.0, + "reward": 0.328125, + "reward_std": 0.24777325987815857, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99997878074646, + "sampling/importance_sampling_ratio/min": 0.00033587991492822766, + "sampling/sampling_logp_difference/max": 7.9987568855285645, + "sampling/sampling_logp_difference/mean": 0.019166938960552216, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.9272594929352636e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9272594929352636e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16229.0, + "completions/mean_length": 5858.953125, + "completions/mean_terminated_length": 5691.88916015625, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "entropy": 1.1407905519008636, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0018726681591942906, + "learning_rate": 1e-05, + "loss": 0.092, + "num_tokens": 8506089.0, + "reward": 0.25, + "reward_std": 0.2829982340335846, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998714327812195, + "sampling/importance_sampling_ratio/min": 2.4313605536008254e-05, + "sampling/sampling_logp_difference/max": 10.62447452545166, + "sampling/sampling_logp_difference/mean": 0.020790230482816696, + "step": 12 + }, + { + "clip_ratio/high_max": 4.318236733524827e-06, + "clip_ratio/high_mean": 1.0795591833812068e-06, + "clip_ratio/low_mean": 3.3191785689723474e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.427134498679152e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15040.0, + "completions/mean_length": 6801.09375, + "completions/mean_terminated_length": 6571.1044921875, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "entropy": 1.185454584658146, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0031675526406615973, + "learning_rate": 1e-05, + "loss": 0.0244, + "num_tokens": 9398597.0, + "reward": 0.21875, + "reward_std": 0.248829185962677, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000109672546387, + "sampling/importance_sampling_ratio/min": 0.0010334982071071863, + "sampling/sampling_logp_difference/max": 6.874805927276611, + "sampling/sampling_logp_difference/mean": 0.021565770730376244, + "step": 13 + }, + { + "clip_ratio/high_max": 1.3892819879401941e-05, + "clip_ratio/high_mean": 3.4732049698504852e-06, + "clip_ratio/low_mean": 2.9275798283379117e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2749003707976954e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15496.0, + "completions/mean_length": 4673.578125, + "completions/mean_terminated_length": 4581.3701171875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9907316789031029, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0024632434360682964, + "learning_rate": 1e-05, + "loss": 0.0147, + "num_tokens": 10016559.0, + "reward": 0.3046875, + "reward_std": 0.2477683573961258, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000066757202148, + "sampling/importance_sampling_ratio/min": 0.001339821144938469, + "sampling/sampling_logp_difference/max": 6.6152191162109375, + "sampling/sampling_logp_difference/mean": 0.019262395799160004, + "step": 14 + }, + { + "clip_ratio/high_max": 1.6510958175786072e-05, + "clip_ratio/high_mean": 4.127739543946518e-06, + "clip_ratio/low_mean": 1.770910688492222e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1836846656242415e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14983.0, + "completions/mean_length": 4617.4140625, + "completions/mean_terminated_length": 4524.763671875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 1.100720427930355, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0032584660220891237, + "learning_rate": 1e-05, + "loss": 0.0047, + "num_tokens": 10628084.0, + "reward": 0.375, + "reward_std": 0.2522490322589874, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999375343322754, + "sampling/importance_sampling_ratio/min": 4.245261607138673e-06, + "sampling/sampling_logp_difference/max": 12.369707107543945, + "sampling/sampling_logp_difference/mean": 0.019928477704524994, + "step": 15 + }, + { + "clip_ratio/high_max": 9.921910532284528e-06, + "clip_ratio/high_mean": 3.5021869280171813e-06, + "clip_ratio/low_mean": 1.4621458831243217e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.81236457592604e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13925.0, + "completions/mean_length": 5611.5625, + "completions/mean_terminated_length": 5353.0244140625, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "entropy": 1.0112926587462425, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.001977710286155343, + "learning_rate": 1e-05, + "loss": -0.0229, + "num_tokens": 11364332.0, + "reward": 0.2109375, + "reward_std": 0.21146979928016663, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999548196792603, + "sampling/importance_sampling_ratio/min": 4.5400451199384406e-05, + "sampling/sampling_logp_difference/max": 9.999988555908203, + "sampling/sampling_logp_difference/mean": 0.019674532115459442, + "step": 16 + }, + { + "clip_ratio/high_max": 8.318262189277448e-06, + "clip_ratio/high_mean": 2.079565547319362e-06, + "clip_ratio/low_mean": 3.345101845297904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5530583886611566e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14463.0, + "completions/mean_length": 5321.7578125, + "completions/mean_terminated_length": 5234.6533203125, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "entropy": 0.9611762389540672, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002321678213775158, + "learning_rate": 1e-05, + "loss": 0.0089, + "num_tokens": 12067365.0, + "reward": 0.2734375, + "reward_std": 0.22225630283355713, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 5.329983650881331e-06, + "sampling/sampling_logp_difference/max": 12.142162322998047, + "sampling/sampling_logp_difference/mean": 0.019090529531240463, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.1286541861372825e-05, + "clip_ratio/low_min": 4.589008312905207e-06, + "clip_ratio/region_mean": 5.1286541861372825e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15906.0, + "completions/mean_length": 6747.8125, + "completions/mean_terminated_length": 6516.54443359375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.8531035929918289, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003335036803036928, + "learning_rate": 1e-05, + "loss": 0.0494, + "num_tokens": 12950989.0, + "reward": 0.3515625, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999262690544128, + "sampling/importance_sampling_ratio/min": 0.0024787711445242167, + "sampling/sampling_logp_difference/max": 5.999992370605469, + "sampling/sampling_logp_difference/mean": 0.017946189269423485, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.059201583255344e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.059201583255344e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14653.0, + "completions/mean_length": 5237.5390625, + "completions/mean_terminated_length": 5060.611328125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.9604798555374146, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028048555832356215, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 13641594.0, + "reward": 0.3359375, + "reward_std": 0.27851757407188416, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999921977519989, + "sampling/importance_sampling_ratio/min": 0.0003354719083290547, + "sampling/sampling_logp_difference/max": 7.999972343444824, + "sampling/sampling_logp_difference/mean": 0.01799672283232212, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.7391609592086752e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7391609592086752e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14949.0, + "completions/mean_length": 5088.71875, + "completions/mean_terminated_length": 4999.779296875, + "completions/min_length": 500.0, + "completions/min_terminated_length": 500.0, + "entropy": 0.9381079524755478, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0015588597161695361, + "learning_rate": 1e-05, + "loss": 0.0593, + "num_tokens": 14310022.0, + "reward": 0.3515625, + "reward_std": 0.24723157286643982, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999968945980072, + "sampling/importance_sampling_ratio/min": 0.0008060967666096985, + "sampling/sampling_logp_difference/max": 7.123306751251221, + "sampling/sampling_logp_difference/mean": 0.018512990325689316, + "step": 20 + }, + { + "clip_ratio/high_max": 1.4323140021588188e-05, + "clip_ratio/high_mean": 3.580785005397047e-06, + "clip_ratio/low_mean": 2.3172296550910687e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6753081669994572e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15268.0, + "completions/max_terminated_length": 15268.0, + "completions/mean_length": 5374.375, + "completions/mean_terminated_length": 5374.375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 1.198778212070465, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023761435877531767, + "learning_rate": 1e-05, + "loss": 0.048, + "num_tokens": 15017710.0, + "reward": 0.21875, + "reward_std": 0.2432974874973297, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000046730041504, + "sampling/importance_sampling_ratio/min": 2.2531810827786103e-05, + "sampling/sampling_logp_difference/max": 10.700582504272461, + "sampling/sampling_logp_difference/mean": 0.02083735726773739, + "step": 21 + }, + { + "clip_ratio/high_max": 8.891734069038648e-06, + "clip_ratio/high_mean": 2.222933517259662e-06, + "clip_ratio/low_mean": 3.576970004814939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.799263345172221e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16131.0, + "completions/max_terminated_length": 16131.0, + "completions/mean_length": 5016.484375, + "completions/mean_terminated_length": 5016.484375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "entropy": 1.0073698610067368, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0024441592395305634, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 15680364.0, + "reward": 0.2734375, + "reward_std": 0.24541422724723816, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999669790267944, + "sampling/importance_sampling_ratio/min": 0.0009118849993683398, + "sampling/sampling_logp_difference/max": 6.999996662139893, + "sampling/sampling_logp_difference/mean": 0.019295595586299896, + "step": 22 + }, + { + "clip_ratio/high_max": 7.065739737299737e-06, + "clip_ratio/high_mean": 1.7664349343249341e-06, + "clip_ratio/low_mean": 4.2640075662347954e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.440651059667289e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14751.0, + "completions/mean_length": 6798.171875, + "completions/mean_terminated_length": 6408.50390625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0817051529884338, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0035431634169071913, + "learning_rate": 1e-05, + "loss": -0.0282, + "num_tokens": 16572210.0, + "reward": 0.3046875, + "reward_std": 0.3645517826080322, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999493360519409, + "sampling/importance_sampling_ratio/min": 0.00014901062240824103, + "sampling/sampling_logp_difference/max": 8.811492919921875, + "sampling/sampling_logp_difference/mean": 0.021285930648446083, + "step": 23 + }, + { + "clip_ratio/high_max": 1.8304424429516075e-05, + "clip_ratio/high_mean": 4.576106107379019e-06, + "clip_ratio/low_mean": 3.600540730985813e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0581513530923985e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14866.0, + "completions/mean_length": 5388.6875, + "completions/mean_terminated_length": 5302.1103515625, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "entropy": 1.1402523145079613, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003914100583642721, + "learning_rate": 1e-05, + "loss": 0.0017, + "num_tokens": 17282394.0, + "reward": 0.234375, + "reward_std": 0.26538968086242676, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000356435775757, + "sampling/importance_sampling_ratio/min": 4.936015557177598e-06, + "sampling/sampling_logp_difference/max": 12.218952178955078, + "sampling/sampling_logp_difference/mean": 0.020141229033470154, + "step": 24 + }, + { + "clip_ratio/high_max": 3.6923258903698297e-06, + "clip_ratio/high_mean": 9.230814725924574e-07, + "clip_ratio/low_mean": 4.0747915363681386e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1670996779430425e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15553.0, + "completions/mean_length": 5140.625, + "completions/mean_terminated_length": 4962.1591796875, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "entropy": 0.9437280669808388, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0026927352882921696, + "learning_rate": 1e-05, + "loss": 0.0467, + "num_tokens": 17963970.0, + "reward": 0.3125, + "reward_std": 0.3009189963340759, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999961256980896, + "sampling/importance_sampling_ratio/min": 6.243770621949807e-05, + "sampling/sampling_logp_difference/max": 9.681341171264648, + "sampling/sampling_logp_difference/mean": 0.02010953240096569, + "step": 25 + }, + { + "clip_ratio/high_max": 9.832700470724376e-06, + "clip_ratio/high_mean": 2.458175117681094e-06, + "clip_ratio/low_mean": 1.5558874792986899e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.014062596979784e-06, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12937.0, + "completions/max_terminated_length": 12937.0, + "completions/mean_length": 5454.8515625, + "completions/mean_terminated_length": 5454.8515625, + "completions/min_length": 717.0, + "completions/min_terminated_length": 717.0, + "entropy": 1.1385098099708557, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0027164353523403406, + "learning_rate": 1e-05, + "loss": 0.009, + "num_tokens": 18680591.0, + "reward": 0.296875, + "reward_std": 0.23356688022613525, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000078558921814, + "sampling/importance_sampling_ratio/min": 0.005307729355990887, + "sampling/sampling_logp_difference/max": 5.238591194152832, + "sampling/sampling_logp_difference/mean": 0.020798511803150177, + "step": 26 + }, + { + "clip_ratio/high_max": 1.8564560832601273e-05, + "clip_ratio/high_mean": 4.641140208150318e-06, + "clip_ratio/low_mean": 1.8977171066580922e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.361831138841808e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15341.0, + "completions/mean_length": 6053.4296875, + "completions/mean_terminated_length": 5972.08642578125, + "completions/min_length": 639.0, + "completions/min_terminated_length": 639.0, + "entropy": 1.006893776357174, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016045555239543319, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 19474438.0, + "reward": 0.2578125, + "reward_std": 0.26143792271614075, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999610185623169, + "sampling/importance_sampling_ratio/min": 4.606551374308765e-05, + "sampling/sampling_logp_difference/max": 9.985445976257324, + "sampling/sampling_logp_difference/mean": 0.01937020570039749, + "step": 27 + }, + { + "clip_ratio/high_max": 3.951194685214432e-06, + "clip_ratio/high_mean": 9.87798671303608e-07, + "clip_ratio/low_mean": 3.949826844973359e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.048606700735036e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16240.0, + "completions/mean_length": 5732.6328125, + "completions/mean_terminated_length": 5563.56396484375, + "completions/min_length": 658.0, + "completions/min_terminated_length": 658.0, + "entropy": 1.0205800458788872, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017355874879285693, + "learning_rate": 1e-05, + "loss": 0.0254, + "num_tokens": 20229199.0, + "reward": 0.2578125, + "reward_std": 0.32695505023002625, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999966561794281, + "sampling/importance_sampling_ratio/min": 9.611312270862982e-05, + "sampling/sampling_logp_difference/max": 9.249984741210938, + "sampling/sampling_logp_difference/mean": 0.020152747631072998, + "step": 28 + }, + { + "clip_ratio/high_max": 1.1344701988491579e-05, + "clip_ratio/high_mean": 2.8361754971228947e-06, + "clip_ratio/low_mean": 6.441893049213832e-05, + "clip_ratio/low_min": 3.704581786223571e-06, + "clip_ratio/region_mean": 6.72551062734783e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11633.0, + "completions/mean_length": 4968.0546875, + "completions/mean_terminated_length": 4786.849609375, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 1.0484329834580421, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002361088991165161, + "learning_rate": 1e-05, + "loss": 0.1348, + "num_tokens": 20885790.0, + "reward": 0.265625, + "reward_std": 0.3180084228515625, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000216960906982, + "sampling/importance_sampling_ratio/min": 0.006972009316086769, + "sampling/sampling_logp_difference/max": 4.965851783752441, + "sampling/sampling_logp_difference/mean": 0.018748482689261436, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.939045106766571e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.939045106766571e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12655.0, + "completions/mean_length": 4634.640625, + "completions/mean_terminated_length": 4542.1259765625, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 1.0479918718338013, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002287437906488776, + "learning_rate": 1e-05, + "loss": -0.0157, + "num_tokens": 21497480.0, + "reward": 0.34375, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999415874481201, + "sampling/importance_sampling_ratio/min": 8.729176670385641e-07, + "sampling/sampling_logp_difference/max": 13.951424598693848, + "sampling/sampling_logp_difference/mean": 0.019327208399772644, + "step": 30 + }, + { + "clip_ratio/high_max": 2.4600531105534174e-05, + "clip_ratio/high_mean": 7.4163915542158065e-06, + "clip_ratio/low_mean": 3.8106682723082486e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.552307382255094e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15999.0, + "completions/mean_length": 5922.8359375, + "completions/mean_terminated_length": 5840.46435546875, + "completions/min_length": 565.0, + "completions/min_terminated_length": 565.0, + "entropy": 1.1925376057624817, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002192641608417034, + "learning_rate": 1e-05, + "loss": 0.0432, + "num_tokens": 22276267.0, + "reward": 0.1953125, + "reward_std": 0.22461041808128357, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999226987361908, + "sampling/importance_sampling_ratio/min": 1.546916053030145e-07, + "sampling/sampling_logp_difference/max": 15.681832313537598, + "sampling/sampling_logp_difference/mean": 0.026596486568450928, + "step": 31 + }, + { + "clip_ratio/high_max": 1.3442309864331037e-05, + "clip_ratio/high_mean": 3.360577466082759e-06, + "clip_ratio/low_mean": 2.185166863455379e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5212245873262873e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15082.0, + "completions/mean_length": 5835.5, + "completions/mean_terminated_length": 5752.44091796875, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "entropy": 1.229158878326416, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0007279868004843593, + "learning_rate": 1e-05, + "loss": 0.0081, + "num_tokens": 23044019.0, + "reward": 0.1796875, + "reward_std": 0.1633366346359253, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998399019241333, + "sampling/importance_sampling_ratio/min": 1.414701245039396e-07, + "sampling/sampling_logp_difference/max": 15.771177291870117, + "sampling/sampling_logp_difference/mean": 0.020945575088262558, + "step": 32 + }, + { + "clip_ratio/high_max": 1.277465526072774e-05, + "clip_ratio/high_mean": 3.193663815181935e-06, + "clip_ratio/low_mean": 3.348547249970579e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.667913586014038e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14729.0, + "completions/max_terminated_length": 14729.0, + "completions/mean_length": 5070.1484375, + "completions/mean_terminated_length": 5070.1484375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 1.0323031097650528, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022168844006955624, + "learning_rate": 1e-05, + "loss": 0.0657, + "num_tokens": 23714878.0, + "reward": 0.3515625, + "reward_std": 0.27564430236816406, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999499917030334, + "sampling/importance_sampling_ratio/min": 0.0037885017227381468, + "sampling/sampling_logp_difference/max": 5.575784683227539, + "sampling/sampling_logp_difference/mean": 0.01919984258711338, + "step": 33 + }, + { + "clip_ratio/high_max": 1.2069132026226725e-05, + "clip_ratio/high_mean": 3.0172830065566814e-06, + "clip_ratio/low_mean": 3.323697501400602e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6254257338441676e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15334.0, + "completions/mean_length": 4792.2578125, + "completions/mean_terminated_length": 4700.984375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.9981634542346001, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001841123914346099, + "learning_rate": 1e-05, + "loss": 0.0577, + "num_tokens": 24347119.0, + "reward": 0.4375, + "reward_std": 0.3524719774723053, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999489784240723, + "sampling/importance_sampling_ratio/min": 4.2607393879734445e-06, + "sampling/sampling_logp_difference/max": 12.366067886352539, + "sampling/sampling_logp_difference/mean": 0.018039174377918243, + "step": 34 + }, + { + "clip_ratio/high_max": 1.3947896150057204e-05, + "clip_ratio/high_mean": 4.6235029458330246e-06, + "clip_ratio/low_mean": 4.1055162455450045e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5678665628656745e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16032.0, + "completions/mean_length": 6841.375, + "completions/mean_terminated_length": 6453.46337890625, + "completions/min_length": 652.0, + "completions/min_terminated_length": 652.0, + "entropy": 1.0972845032811165, + "epoch": 0.03219871205151794, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00202017929404974, + "learning_rate": 1e-05, + "loss": -0.0092, + "num_tokens": 25241911.0, + "reward": 0.25, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999304413795471, + "sampling/importance_sampling_ratio/min": 0.00026355183217674494, + "sampling/sampling_logp_difference/max": 8.241260528564453, + "sampling/sampling_logp_difference/mean": 0.02115095779299736, + "step": 35 + }, + { + "clip_ratio/high_max": 4.14414989791112e-06, + "clip_ratio/high_mean": 1.03603747447778e-06, + "clip_ratio/low_mean": 4.4157833031022165e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.519387027812627e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16218.0, + "completions/max_terminated_length": 16218.0, + "completions/mean_length": 5645.6640625, + "completions/mean_terminated_length": 5645.6640625, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "entropy": 1.0653726011514664, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003633195301517844, + "learning_rate": 1e-05, + "loss": -0.0409, + "num_tokens": 25982588.0, + "reward": 0.3671875, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999817967414856, + "sampling/importance_sampling_ratio/min": 0.0007106869597919285, + "sampling/sampling_logp_difference/max": 7.249278545379639, + "sampling/sampling_logp_difference/mean": 0.02010509930551052, + "step": 36 + }, + { + "clip_ratio/high_max": 7.0509927354578394e-06, + "clip_ratio/high_mean": 1.7627481838644599e-06, + "clip_ratio/low_mean": 3.606558789215342e-05, + "clip_ratio/low_min": 3.3240260108868824e-06, + "clip_ratio/region_mean": 3.782833596233104e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15415.0, + "completions/mean_length": 7335.1875, + "completions/mean_terminated_length": 7118.01611328125, + "completions/min_length": 487.0, + "completions/min_terminated_length": 487.0, + "entropy": 0.9340982511639595, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017444937257096171, + "learning_rate": 1e-05, + "loss": 0.0434, + "num_tokens": 26946156.0, + "reward": 0.171875, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998713731765747, + "sampling/importance_sampling_ratio/min": 2.5868248485494405e-05, + "sampling/sampling_logp_difference/max": 10.562494277954102, + "sampling/sampling_logp_difference/mean": 0.01965884119272232, + "step": 37 + }, + { + "clip_ratio/high_max": 1.1849869679281255e-05, + "clip_ratio/high_mean": 2.962467419820314e-06, + "clip_ratio/low_mean": 2.5232500775018707e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8194967853778508e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14151.0, + "completions/mean_length": 5998.8671875, + "completions/mean_terminated_length": 5917.09423828125, + "completions/min_length": 752.0, + "completions/min_terminated_length": 752.0, + "entropy": 0.975816160440445, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0020293404813855886, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 27733059.0, + "reward": 0.2734375, + "reward_std": 0.2908889353275299, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999889612197876, + "sampling/importance_sampling_ratio/min": 0.00892679300159216, + "sampling/sampling_logp_difference/max": 4.718698024749756, + "sampling/sampling_logp_difference/mean": 0.01972467266023159, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.05586318315909e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.05586318315909e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15984.0, + "completions/max_terminated_length": 15984.0, + "completions/mean_length": 5599.4375, + "completions/mean_terminated_length": 5599.4375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "entropy": 1.006210096180439, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035929102450609207, + "learning_rate": 1e-05, + "loss": 0.02, + "num_tokens": 28468843.0, + "reward": 0.2578125, + "reward_std": 0.3306073546409607, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999805688858032, + "sampling/importance_sampling_ratio/min": 0.009500927291810513, + "sampling/sampling_logp_difference/max": 4.656365871429443, + "sampling/sampling_logp_difference/mean": 0.019885972142219543, + "step": 39 + }, + { + "clip_ratio/high_max": 1.1638733667496126e-05, + "clip_ratio/high_mean": 2.9096834168740315e-06, + "clip_ratio/low_mean": 3.210125066743785e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5010934084311884e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14843.0, + "completions/max_terminated_length": 14843.0, + "completions/mean_length": 5035.7734375, + "completions/mean_terminated_length": 5035.7734375, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "entropy": 1.004905492067337, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023170222993940115, + "learning_rate": 1e-05, + "loss": 0.043, + "num_tokens": 29133270.0, + "reward": 0.3046875, + "reward_std": 0.3037971258163452, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998915195465088, + "sampling/importance_sampling_ratio/min": 4.264977542334236e-05, + "sampling/sampling_logp_difference/max": 10.062488555908203, + "sampling/sampling_logp_difference/mean": 0.019529584795236588, + "step": 40 + }, + { + "clip_ratio/high_max": 9.932905413734261e-06, + "clip_ratio/high_mean": 2.4832263534335652e-06, + "clip_ratio/low_mean": 4.655256179830758e-05, + "clip_ratio/low_min": 1.288991325054667e-05, + "clip_ratio/region_mean": 4.903578792436747e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15933.0, + "completions/mean_length": 4865.6171875, + "completions/mean_terminated_length": 4774.92138671875, + "completions/min_length": 687.0, + "completions/min_terminated_length": 687.0, + "entropy": 0.9472262933850288, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0024069426581263542, + "learning_rate": 1e-05, + "loss": 0.0435, + "num_tokens": 29774973.0, + "reward": 0.4296875, + "reward_std": 0.40373340249061584, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000076293945312, + "sampling/importance_sampling_ratio/min": 4.94040648391092e-07, + "sampling/sampling_logp_difference/max": 14.520648002624512, + "sampling/sampling_logp_difference/mean": 0.017961984500288963, + "step": 41 + }, + { + "clip_ratio/high_max": 1.4300524526333902e-05, + "clip_ratio/high_mean": 4.549106392914837e-06, + "clip_ratio/low_mean": 8.310655789500743e-05, + "clip_ratio/low_min": 3.895901500072796e-06, + "clip_ratio/region_mean": 8.765566417423543e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14604.0, + "completions/max_terminated_length": 14604.0, + "completions/mean_length": 5928.3828125, + "completions/mean_terminated_length": 5928.3828125, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "entropy": 0.9451013877987862, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0019357368582859635, + "learning_rate": 1e-05, + "loss": 0.0659, + "num_tokens": 30557014.0, + "reward": 0.2734375, + "reward_std": 0.3227117359638214, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000040054321289, + "sampling/importance_sampling_ratio/min": 4.787445504916832e-06, + "sampling/sampling_logp_difference/max": 12.249513626098633, + "sampling/sampling_logp_difference/mean": 0.020681140944361687, + "step": 42 + }, + { + "clip_ratio/high_max": 1.6088630218291655e-05, + "clip_ratio/high_mean": 4.022157554572914e-06, + "clip_ratio/low_mean": 4.4498895476863254e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.852105257668882e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15935.0, + "completions/max_terminated_length": 15935.0, + "completions/mean_length": 5253.890625, + "completions/mean_terminated_length": 5253.890625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 1.0573822036385536, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0027430339250713587, + "learning_rate": 1e-05, + "loss": -0.0295, + "num_tokens": 31252752.0, + "reward": 0.3828125, + "reward_std": 0.3564237058162689, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000245571136475, + "sampling/importance_sampling_ratio/min": 0.0019366396591067314, + "sampling/sampling_logp_difference/max": 6.246800899505615, + "sampling/sampling_logp_difference/mean": 0.019426241517066956, + "step": 43 + }, + { + "clip_ratio/high_max": 1.80760021066817e-05, + "clip_ratio/high_mean": 4.519000526670425e-06, + "clip_ratio/low_mean": 2.491120585546014e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9430206382130564e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12610.0, + "completions/mean_length": 4434.7890625, + "completions/mean_terminated_length": 4340.70068359375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 1.0309192687273026, + "epoch": 0.040478380864765406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027177443262189627, + "learning_rate": 1e-05, + "loss": 0.0552, + "num_tokens": 31839885.0, + "reward": 0.359375, + "reward_std": 0.34010058641433716, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999918520450592, + "sampling/importance_sampling_ratio/min": 0.0010315371910110116, + "sampling/sampling_logp_difference/max": 6.876705169677734, + "sampling/sampling_logp_difference/mean": 0.01883832737803459, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.9404036808955425e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9404036808955425e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14982.0, + "completions/mean_length": 6810.578125, + "completions/mean_terminated_length": 6735.19677734375, + "completions/min_length": 1260.0, + "completions/min_terminated_length": 1260.0, + "entropy": 1.134837955236435, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0025250029284507036, + "learning_rate": 1e-05, + "loss": -0.0016, + "num_tokens": 32734551.0, + "reward": 0.2421875, + "reward_std": 0.21436068415641785, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000019073486328, + "sampling/importance_sampling_ratio/min": 0.0014875066699460149, + "sampling/sampling_logp_difference/max": 6.510653972625732, + "sampling/sampling_logp_difference/mean": 0.02130994386970997, + "step": 45 + }, + { + "clip_ratio/high_max": 1.1104832083219662e-05, + "clip_ratio/high_mean": 2.7762080208049156e-06, + "clip_ratio/low_mean": 2.9984376055836037e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.276058407664095e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16351.0, + "completions/mean_length": 6623.3359375, + "completions/mean_terminated_length": 6308.4755859375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.990560457110405, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0018365891883149743, + "learning_rate": 1e-05, + "loss": 0.0687, + "num_tokens": 33600498.0, + "reward": 0.3203125, + "reward_std": 0.26826781034469604, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999293088912964, + "sampling/importance_sampling_ratio/min": 5.727278562517313e-07, + "sampling/sampling_logp_difference/max": 14.372855186462402, + "sampling/sampling_logp_difference/mean": 0.019745903089642525, + "step": 46 + }, + { + "clip_ratio/high_max": 1.5849275314394617e-05, + "clip_ratio/high_mean": 3.962318828598654e-06, + "clip_ratio/low_mean": 2.2989276772023004e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.695159548693482e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14530.0, + "completions/mean_length": 5414.046875, + "completions/mean_terminated_length": 5239.9208984375, + "completions/min_length": 534.0, + "completions/min_terminated_length": 534.0, + "entropy": 1.213307112455368, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0016015933360904455, + "learning_rate": 1e-05, + "loss": 0.0239, + "num_tokens": 34322776.0, + "reward": 0.2109375, + "reward_std": 0.2369818240404129, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999943733215332, + "sampling/importance_sampling_ratio/min": 0.0006993028800934553, + "sampling/sampling_logp_difference/max": 7.2654266357421875, + "sampling/sampling_logp_difference/mean": 0.021634424105286598, + "step": 47 + }, + { + "clip_ratio/high_max": 3.0635404527856736e-05, + "clip_ratio/high_mean": 7.658851131964184e-06, + "clip_ratio/low_mean": 4.565159474623215e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3310446219256846e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16066.0, + "completions/max_terminated_length": 16066.0, + "completions/mean_length": 6082.1015625, + "completions/mean_terminated_length": 6082.1015625, + "completions/min_length": 475.0, + "completions/min_terminated_length": 475.0, + "entropy": 0.8880708515644073, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002024279674515128, + "learning_rate": 1e-05, + "loss": 0.0368, + "num_tokens": 35118853.0, + "reward": 0.4765625, + "reward_std": 0.3619031310081482, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999208450317383, + "sampling/importance_sampling_ratio/min": 3.121717236354016e-05, + "sampling/sampling_logp_difference/max": 10.374542236328125, + "sampling/sampling_logp_difference/mean": 0.01861739531159401, + "step": 48 + }, + { + "clip_ratio/high_max": 1.718443036224926e-05, + "clip_ratio/high_mean": 4.296107590562315e-06, + "clip_ratio/low_mean": 3.4419200915181136e-05, + "clip_ratio/low_min": 3.7744964629382594e-06, + "clip_ratio/region_mean": 3.871530816468294e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16287.0, + "completions/mean_length": 6382.3828125, + "completions/mean_terminated_length": 6059.75, + "completions/min_length": 670.0, + "completions/min_terminated_length": 670.0, + "entropy": 0.8597949668765068, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002679568249732256, + "learning_rate": 1e-05, + "loss": 0.0749, + "num_tokens": 35956350.0, + "reward": 0.46875, + "reward_std": 0.39530590176582336, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000333786010742, + "sampling/importance_sampling_ratio/min": 0.0005964707233943045, + "sampling/sampling_logp_difference/max": 7.424480438232422, + "sampling/sampling_logp_difference/mean": 0.01830567792057991, + "step": 49 + }, + { + "clip_ratio/high_max": 7.470714990631677e-06, + "clip_ratio/high_mean": 1.8676787476579193e-06, + "clip_ratio/low_mean": 2.8441645326893195e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0309323619803763e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16314.0, + "completions/max_terminated_length": 16314.0, + "completions/mean_length": 6112.7890625, + "completions/mean_terminated_length": 6112.7890625, + "completions/min_length": 464.0, + "completions/min_terminated_length": 464.0, + "entropy": 0.9591199606657028, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0011262348853051662, + "learning_rate": 1e-05, + "loss": 0.018, + "num_tokens": 36756171.0, + "reward": 0.359375, + "reward_std": 0.2743412256240845, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999510049819946, + "sampling/importance_sampling_ratio/min": 1.2219889867992606e-05, + "sampling/sampling_logp_difference/max": 11.312445640563965, + "sampling/sampling_logp_difference/mean": 0.01950032450258732, + "step": 50 + }, + { + "clip_ratio/high_max": 3.7807756143592997e-06, + "clip_ratio/high_mean": 9.451939035898249e-07, + "clip_ratio/low_mean": 3.906526939090327e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.001046335133651e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16169.0, + "completions/max_terminated_length": 16169.0, + "completions/mean_length": 6744.390625, + "completions/mean_terminated_length": 6744.390625, + "completions/min_length": 719.0, + "completions/min_terminated_length": 719.0, + "entropy": 1.061469852924347, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002305408474057913, + "learning_rate": 1e-05, + "loss": 0.0496, + "num_tokens": 37643573.0, + "reward": 0.234375, + "reward_std": 0.3085102438926697, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998986124992371, + "sampling/importance_sampling_ratio/min": 9.516369573248085e-06, + "sampling/sampling_logp_difference/max": 11.56249713897705, + "sampling/sampling_logp_difference/mean": 0.020016517490148544, + "step": 51 + }, + { + "clip_ratio/high_max": 1.3845812645740807e-05, + "clip_ratio/high_mean": 3.4614531614352018e-06, + "clip_ratio/low_mean": 2.3906941066798026e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7368394228233228e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15606.0, + "completions/max_terminated_length": 15606.0, + "completions/mean_length": 5723.0859375, + "completions/mean_terminated_length": 5723.0859375, + "completions/min_length": 843.0, + "completions/min_terminated_length": 843.0, + "entropy": 1.0918374806642532, + "epoch": 0.04783808647654094, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002079444006085396, + "learning_rate": 1e-05, + "loss": 0.0332, + "num_tokens": 38399000.0, + "reward": 0.34375, + "reward_std": 0.28353503346443176, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999094009399414, + "sampling/importance_sampling_ratio/min": 0.00247886567376554, + "sampling/sampling_logp_difference/max": 5.9999542236328125, + "sampling/sampling_logp_difference/mean": 0.02025545760989189, + "step": 52 + }, + { + "clip_ratio/high_max": 1.6330426660715602e-05, + "clip_ratio/high_mean": 4.082606665178901e-06, + "clip_ratio/low_mean": 4.608668984928954e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.0169297423963144e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15124.0, + "completions/mean_length": 6075.078125, + "completions/mean_terminated_length": 5827.6640625, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "entropy": 1.0526456609368324, + "epoch": 0.04875804967801288, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002548371907323599, + "learning_rate": 1e-05, + "loss": 0.0005, + "num_tokens": 39195762.0, + "reward": 0.28125, + "reward_std": 0.2903746962547302, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99997878074646, + "sampling/importance_sampling_ratio/min": 0.0003802210558205843, + "sampling/sampling_logp_difference/max": 7.874757766723633, + "sampling/sampling_logp_difference/mean": 0.02132822386920452, + "step": 53 + }, + { + "clip_ratio/high_max": 1.2557530681078788e-05, + "clip_ratio/high_mean": 3.139382670269697e-06, + "clip_ratio/low_mean": 5.579355536156072e-05, + "clip_ratio/low_min": 6.314919346550596e-06, + "clip_ratio/region_mean": 5.893293734970939e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14981.0, + "completions/mean_length": 6273.203125, + "completions/mean_terminated_length": 6193.59033203125, + "completions/min_length": 823.0, + "completions/min_terminated_length": 823.0, + "entropy": 0.9629805982112885, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001929077785462141, + "learning_rate": 1e-05, + "loss": 0.0575, + "num_tokens": 40016988.0, + "reward": 0.3828125, + "reward_std": 0.35718512535095215, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000178813934326, + "sampling/importance_sampling_ratio/min": 0.004126251209527254, + "sampling/sampling_logp_difference/max": 5.490386009216309, + "sampling/sampling_logp_difference/mean": 0.01974763534963131, + "step": 54 + }, + { + "clip_ratio/high_max": 5.326855898601934e-06, + "clip_ratio/high_mean": 1.3317139746504836e-06, + "clip_ratio/low_mean": 1.2195182989671594e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.3526897078008915e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12113.0, + "completions/mean_length": 4658.1640625, + "completions/mean_terminated_length": 4565.83447265625, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "entropy": 0.950105108320713, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002910251496359706, + "learning_rate": 1e-05, + "loss": 0.0068, + "num_tokens": 40632681.0, + "reward": 0.390625, + "reward_std": 0.28353503346443176, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000574588775635, + "sampling/importance_sampling_ratio/min": 0.0017036369536072016, + "sampling/sampling_logp_difference/max": 6.374989986419678, + "sampling/sampling_logp_difference/mean": 0.018849056214094162, + "step": 55 + }, + { + "clip_ratio/high_max": 1.1988173810095759e-05, + "clip_ratio/high_mean": 2.9970434525239398e-06, + "clip_ratio/low_mean": 2.1473538311056473e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4470581195146224e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15638.0, + "completions/mean_length": 6582.953125, + "completions/mean_terminated_length": 5756.94921875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.8884479179978371, + "epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018201791681349277, + "learning_rate": 1e-05, + "loss": 0.0232, + "num_tokens": 41498939.0, + "reward": 0.328125, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000053644180298, + "sampling/importance_sampling_ratio/min": 0.00011687594087561592, + "sampling/sampling_logp_difference/max": 9.054397583007812, + "sampling/sampling_logp_difference/mean": 0.018637457862496376, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.9767679873439192e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9767679873439192e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15902.0, + "completions/mean_length": 6408.4453125, + "completions/mean_terminated_length": 6250.103515625, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "entropy": 1.0724121406674385, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0027558596339076757, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 42338436.0, + "reward": 0.2578125, + "reward_std": 0.29196250438690186, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000157356262207, + "sampling/importance_sampling_ratio/min": 2.144563404726796e-05, + "sampling/sampling_logp_difference/max": 10.74998950958252, + "sampling/sampling_logp_difference/mean": 0.020520739257335663, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.615732708160067e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.615732708160067e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16046.0, + "completions/mean_length": 4527.8984375, + "completions/mean_terminated_length": 4243.35205078125, + "completions/min_length": 481.0, + "completions/min_terminated_length": 481.0, + "entropy": 0.9734272584319115, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018782512051984668, + "learning_rate": 1e-05, + "loss": 0.0726, + "num_tokens": 42936215.0, + "reward": 0.4375, + "reward_std": 0.2890765368938446, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999626874923706, + "sampling/importance_sampling_ratio/min": 6.564679324583267e-07, + "sampling/sampling_logp_difference/max": 14.2363920211792, + "sampling/sampling_logp_difference/mean": 0.018541917204856873, + "step": 58 + }, + { + "clip_ratio/high_max": 1.9634914679045323e-05, + "clip_ratio/high_mean": 4.908728669761331e-06, + "clip_ratio/low_mean": 3.605886263358116e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.096759084859514e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14340.0, + "completions/max_terminated_length": 14340.0, + "completions/mean_length": 5389.609375, + "completions/mean_terminated_length": 5389.609375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 1.035320296883583, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003410179866477847, + "learning_rate": 1e-05, + "loss": 0.1109, + "num_tokens": 43643733.0, + "reward": 0.4609375, + "reward_std": 0.3040394186973572, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999303221702576, + "sampling/importance_sampling_ratio/min": 7.063792872941121e-05, + "sampling/sampling_logp_difference/max": 9.557943344116211, + "sampling/sampling_logp_difference/mean": 0.01980186253786087, + "step": 59 + }, + { + "clip_ratio/high_max": 3.324525869174977e-05, + "clip_ratio/high_mean": 9.664479989623942e-06, + "clip_ratio/low_mean": 3.5182122701371554e-05, + "clip_ratio/low_min": 1.1718383575498592e-05, + "clip_ratio/region_mean": 4.484660291836917e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15768.0, + "completions/mean_length": 5338.90625, + "completions/mean_terminated_length": 5251.93701171875, + "completions/min_length": 630.0, + "completions/min_terminated_length": 630.0, + "entropy": 0.9680418893694878, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0013158825458958745, + "learning_rate": 1e-05, + "loss": 0.0851, + "num_tokens": 44345177.0, + "reward": 0.4140625, + "reward_std": 0.3311441242694855, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999946117401123, + "sampling/importance_sampling_ratio/min": 1.941789093962143e-07, + "sampling/sampling_logp_difference/max": 15.454485893249512, + "sampling/sampling_logp_difference/mean": 0.019034607335925102, + "step": 60 + }, + { + "clip_ratio/high_max": 1.678188709774986e-05, + "clip_ratio/high_mean": 4.195471774437465e-06, + "clip_ratio/low_mean": 2.326147910025611e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.74569506473199e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15030.0, + "completions/mean_length": 5197.5859375, + "completions/mean_terminated_length": 5020.02392578125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.9385635256767273, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023898824583739042, + "learning_rate": 1e-05, + "loss": 0.0003, + "num_tokens": 45029716.0, + "reward": 0.328125, + "reward_std": 0.3066929280757904, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999666213989258, + "sampling/importance_sampling_ratio/min": 0.0031843625474721193, + "sampling/sampling_logp_difference/max": 5.749503135681152, + "sampling/sampling_logp_difference/mean": 0.017856482416391373, + "step": 61 + }, + { + "clip_ratio/high_max": 2.8269179438211722e-05, + "clip_ratio/high_mean": 7.0672948595529306e-06, + "clip_ratio/low_mean": 4.551043662104348e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2577731821656926e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15061.0, + "completions/mean_length": 6353.9375, + "completions/mean_terminated_length": 6194.73046875, + "completions/min_length": 1201.0, + "completions/min_terminated_length": 1201.0, + "entropy": 0.9195960611104965, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002777763642370701, + "learning_rate": 1e-05, + "loss": 0.0556, + "num_tokens": 45861388.0, + "reward": 0.4140625, + "reward_std": 0.27776598930358887, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999842643737793, + "sampling/importance_sampling_ratio/min": 0.00033647287636995316, + "sampling/sampling_logp_difference/max": 7.996993064880371, + "sampling/sampling_logp_difference/mean": 0.019472671672701836, + "step": 62 + }, + { + "clip_ratio/high_max": 8.376483492611442e-06, + "clip_ratio/high_mean": 2.0941208731528604e-06, + "clip_ratio/low_mean": 1.1372792755537375e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.3466913628690236e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16202.0, + "completions/mean_length": 7125.265625, + "completions/mean_terminated_length": 6669.91748046875, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "entropy": 0.9209358915686607, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0012601700145751238, + "learning_rate": 1e-05, + "loss": 0.0263, + "num_tokens": 46793902.0, + "reward": 0.265625, + "reward_std": 0.23356688022613525, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999377727508545, + "sampling/importance_sampling_ratio/min": 2.034899989666883e-05, + "sampling/sampling_logp_difference/max": 10.802478790283203, + "sampling/sampling_logp_difference/mean": 0.0191169623285532, + "step": 63 + }, + { + "clip_ratio/high_max": 6.630596089962637e-06, + "clip_ratio/high_mean": 1.6576490224906593e-06, + "clip_ratio/low_mean": 3.7912880316071096e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.957052945224859e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14620.0, + "completions/mean_length": 5895.4453125, + "completions/mean_terminated_length": 5812.8583984375, + "completions/min_length": 708.0, + "completions/min_terminated_length": 708.0, + "entropy": 0.9421789273619652, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036641336046159267, + "learning_rate": 1e-05, + "loss": 0.0572, + "num_tokens": 47567543.0, + "reward": 0.359375, + "reward_std": 0.2937847673892975, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999691247940063, + "sampling/importance_sampling_ratio/min": 2.1912494048592634e-05, + "sampling/sampling_logp_difference/max": 10.728453636169434, + "sampling/sampling_logp_difference/mean": 0.018009435385465622, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.6876661106834945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6876661106834945e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13935.0, + "completions/mean_length": 4643.9921875, + "completions/mean_terminated_length": 4551.55126953125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 1.1234809532761574, + "epoch": 0.05979760809567617, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003017786890268326, + "learning_rate": 1e-05, + "loss": 0.0403, + "num_tokens": 48180998.0, + "reward": 0.328125, + "reward_std": 0.2198973000049591, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999090433120728, + "sampling/importance_sampling_ratio/min": 1.4786172641834128e-06, + "sampling/sampling_logp_difference/max": 13.424403190612793, + "sampling/sampling_logp_difference/mean": 0.0194530226290226, + "step": 65 + }, + { + "clip_ratio/high_max": 1.1807285773102194e-05, + "clip_ratio/high_mean": 2.9518214432755485e-06, + "clip_ratio/low_mean": 1.7793156246170838e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0744977689446387e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16243.0, + "completions/mean_length": 7134.5546875, + "completions/mean_terminated_length": 6679.66357421875, + "completions/min_length": 765.0, + "completions/min_terminated_length": 765.0, + "entropy": 1.0891609117388725, + "epoch": 0.06071757129714812, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021707366686314344, + "learning_rate": 1e-05, + "loss": 0.0079, + "num_tokens": 49113837.0, + "reward": 0.2578125, + "reward_std": 0.21778056025505066, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000441074371338, + "sampling/importance_sampling_ratio/min": 5.227705059951404e-06, + "sampling/sampling_logp_difference/max": 12.161538124084473, + "sampling/sampling_logp_difference/mean": 0.021074742078781128, + "step": 66 + }, + { + "clip_ratio/high_max": 1.785590688996308e-05, + "clip_ratio/high_mean": 4.46397672249077e-06, + "clip_ratio/low_mean": 4.4942946374249004e-05, + "clip_ratio/low_min": 4.320774223742774e-06, + "clip_ratio/region_mean": 4.940692338095687e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16178.0, + "completions/mean_length": 6770.3984375, + "completions/mean_terminated_length": 6694.70068359375, + "completions/min_length": 488.0, + "completions/min_terminated_length": 488.0, + "entropy": 1.14402187615633, + "epoch": 0.061637534498620056, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003913953434675932, + "learning_rate": 1e-05, + "loss": -0.0645, + "num_tokens": 49999984.0, + "reward": 0.2890625, + "reward_std": 0.2772369980812073, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999771118164062, + "sampling/importance_sampling_ratio/min": 0.00039836866199038923, + "sampling/sampling_logp_difference/max": 7.828132629394531, + "sampling/sampling_logp_difference/mean": 0.021658796817064285, + "step": 67 + }, + { + "clip_ratio/high_max": 6.990269412199268e-06, + "clip_ratio/high_mean": 3.4296645026188344e-06, + "clip_ratio/low_mean": 3.069889220341793e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.412855670603676e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16324.0, + "completions/mean_length": 6743.3359375, + "completions/mean_terminated_length": 5926.33056640625, + "completions/min_length": 1195.0, + "completions/min_terminated_length": 1195.0, + "entropy": 0.8485476225614548, + "epoch": 0.062557497700092, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0015872148796916008, + "learning_rate": 1e-05, + "loss": 0.0107, + "num_tokens": 50881939.0, + "reward": 0.2578125, + "reward_std": 0.2603819966316223, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998895525932312, + "sampling/importance_sampling_ratio/min": 0.008705966174602509, + "sampling/sampling_logp_difference/max": 4.743746757507324, + "sampling/sampling_logp_difference/mean": 0.017901426181197166, + "step": 68 + }, + { + "clip_ratio/high_max": 1.300406438531354e-05, + "clip_ratio/high_mean": 3.251016096328385e-06, + "clip_ratio/low_mean": 3.055216484426637e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.380318116796843e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15756.0, + "completions/max_terminated_length": 15756.0, + "completions/mean_length": 5952.0234375, + "completions/mean_terminated_length": 5952.0234375, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "entropy": 1.1280141845345497, + "epoch": 0.06347746090156393, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0037659234367311, + "learning_rate": 1e-05, + "loss": 0.1156, + "num_tokens": 51664814.0, + "reward": 0.2578125, + "reward_std": 0.27092626690864563, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000009536743164, + "sampling/importance_sampling_ratio/min": 0.0037554434966295958, + "sampling/sampling_logp_difference/max": 5.5845489501953125, + "sampling/sampling_logp_difference/mean": 0.01998155191540718, + "step": 69 + }, + { + "clip_ratio/high_max": 9.465616585657699e-06, + "clip_ratio/high_mean": 2.3664041464144248e-06, + "clip_ratio/low_mean": 3.98842666982091e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2250670958310366e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15301.0, + "completions/mean_length": 5533.171875, + "completions/mean_terminated_length": 5360.93701171875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.9313871935009956, + "epoch": 0.06439742410303588, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003427086630836129, + "learning_rate": 1e-05, + "loss": 0.0042, + "num_tokens": 52391076.0, + "reward": 0.421875, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999445080757141, + "sampling/importance_sampling_ratio/min": 2.0617162590497173e-05, + "sampling/sampling_logp_difference/max": 10.789386749267578, + "sampling/sampling_logp_difference/mean": 0.019165968522429466, + "step": 70 + }, + { + "clip_ratio/high_max": 1.4208102129487088e-05, + "clip_ratio/high_mean": 3.552025532371772e-06, + "clip_ratio/low_mean": 3.275496806054434e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.630699370660295e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16334.0, + "completions/mean_length": 7481.671875, + "completions/mean_terminated_length": 7194.5, + "completions/min_length": 1003.0, + "completions/min_terminated_length": 1003.0, + "entropy": 0.9429318532347679, + "epoch": 0.06531738730450783, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002845548093318939, + "learning_rate": 1e-05, + "loss": 0.0136, + "num_tokens": 53366314.0, + "reward": 0.34375, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999762773513794, + "sampling/importance_sampling_ratio/min": 0.00124227290507406, + "sampling/sampling_logp_difference/max": 6.690812587738037, + "sampling/sampling_logp_difference/mean": 0.019388489425182343, + "step": 71 + }, + { + "clip_ratio/high_max": 2.2517269826494157e-05, + "clip_ratio/high_mean": 5.629317456623539e-06, + "clip_ratio/low_mean": 6.0563696024473757e-05, + "clip_ratio/low_min": 6.892558758408995e-06, + "clip_ratio/region_mean": 6.61930134810973e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16230.0, + "completions/mean_length": 6112.03125, + "completions/mean_terminated_length": 5865.50439453125, + "completions/min_length": 468.0, + "completions/min_terminated_length": 468.0, + "entropy": 0.9013729467988014, + "epoch": 0.06623735050597976, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017839284846559167, + "learning_rate": 1e-05, + "loss": 0.0758, + "num_tokens": 54165910.0, + "reward": 0.3828125, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532699584961, + "sampling/importance_sampling_ratio/min": 0.0015448861522600055, + "sampling/sampling_logp_difference/max": 6.472805023193359, + "sampling/sampling_logp_difference/mean": 0.019030068069696426, + "step": 72 + }, + { + "clip_ratio/high_max": 7.458678737748414e-06, + "clip_ratio/high_mean": 1.8646696844371036e-06, + "clip_ratio/low_mean": 2.7964613764197566e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.982928344863467e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15853.0, + "completions/max_terminated_length": 15853.0, + "completions/mean_length": 4590.625, + "completions/mean_terminated_length": 4590.625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.8759121596813202, + "epoch": 0.0671573137074517, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0035294899716973305, + "learning_rate": 1e-05, + "loss": 0.0802, + "num_tokens": 54771526.0, + "reward": 0.4375, + "reward_std": 0.41268986463546753, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999133944511414, + "sampling/importance_sampling_ratio/min": 0.0007238102261908352, + "sampling/sampling_logp_difference/max": 7.230981349945068, + "sampling/sampling_logp_difference/mean": 0.017765047028660774, + "step": 73 + }, + { + "clip_ratio/high_max": 1.460266958019929e-05, + "clip_ratio/high_mean": 3.6506673950498225e-06, + "clip_ratio/low_mean": 3.319967777315469e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.685034562295186e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15157.0, + "completions/mean_length": 5152.234375, + "completions/mean_terminated_length": 5063.79541015625, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.8593896478414536, + "epoch": 0.06807727690892364, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003597866278141737, + "learning_rate": 1e-05, + "loss": 0.048, + "num_tokens": 55449820.0, + "reward": 0.4453125, + "reward_std": 0.33903974294662476, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999961853027344, + "sampling/importance_sampling_ratio/min": 0.0005548940971493721, + "sampling/sampling_logp_difference/max": 7.49673318862915, + "sampling/sampling_logp_difference/mean": 0.018061507493257523, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.4012571227794979e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.4012571227794979e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16195.0, + "completions/mean_length": 6629.2734375, + "completions/mean_terminated_length": 6474.43701171875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.106893703341484, + "epoch": 0.06899724011039558, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0014848506543785334, + "learning_rate": 1e-05, + "loss": -0.0128, + "num_tokens": 56318135.0, + "reward": 0.2109375, + "reward_std": 0.190433531999588, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999266862869263, + "sampling/importance_sampling_ratio/min": 1.3627897033074987e-08, + "sampling/sampling_logp_difference/max": 18.111146926879883, + "sampling/sampling_logp_difference/mean": 0.021642908453941345, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.388627917251142e-05, + "clip_ratio/low_min": 5.944737495156005e-06, + "clip_ratio/region_mean": 4.388627917251142e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14845.0, + "completions/max_terminated_length": 14845.0, + "completions/mean_length": 5802.8828125, + "completions/mean_terminated_length": 5802.8828125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.9879340082406998, + "epoch": 0.06991720331186753, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003300516167655587, + "learning_rate": 1e-05, + "loss": 0.0321, + "num_tokens": 57078080.0, + "reward": 0.3125, + "reward_std": 0.30904704332351685, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000663995742798, + "sampling/importance_sampling_ratio/min": 0.0010333232348784804, + "sampling/sampling_logp_difference/max": 6.874975204467773, + "sampling/sampling_logp_difference/mean": 0.01895206607878208, + "step": 76 + }, + { + "clip_ratio/high_max": 1.071953920472879e-05, + "clip_ratio/high_mean": 2.6798848011821974e-06, + "clip_ratio/low_mean": 4.836337473079766e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.104325930460618e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14713.0, + "completions/max_terminated_length": 14713.0, + "completions/mean_length": 5293.1640625, + "completions/mean_terminated_length": 5293.1640625, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "entropy": 0.9724989607930183, + "epoch": 0.07083716651333946, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002898244420066476, + "learning_rate": 1e-05, + "loss": 0.0648, + "num_tokens": 57774093.0, + "reward": 0.4296875, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.0031829492654651403, + "sampling/sampling_logp_difference/max": 5.7499470710754395, + "sampling/sampling_logp_difference/mean": 0.019694382324814796, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.102629304725269e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.102629304725269e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13987.0, + "completions/mean_length": 5771.5625, + "completions/mean_terminated_length": 5340.16259765625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.9740649163722992, + "epoch": 0.07175712971481141, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002261349931359291, + "learning_rate": 1e-05, + "loss": 0.0738, + "num_tokens": 58531293.0, + "reward": 0.25, + "reward_std": 0.26120057702064514, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999984502792358, + "sampling/importance_sampling_ratio/min": 7.037367322482169e-05, + "sampling/sampling_logp_difference/max": 9.561691284179688, + "sampling/sampling_logp_difference/mean": 0.019619958475232124, + "step": 78 + }, + { + "clip_ratio/high_max": 1.241475092683686e-05, + "clip_ratio/high_mean": 3.955232841690304e-06, + "clip_ratio/low_mean": 3.313706986318721e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.709230361437221e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16331.0, + "completions/mean_length": 6832.59375, + "completions/mean_terminated_length": 6524.48388671875, + "completions/min_length": 674.0, + "completions/min_terminated_length": 674.0, + "entropy": 0.8907959461212158, + "epoch": 0.07267709291628335, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002895365934818983, + "learning_rate": 1e-05, + "loss": 0.061, + "num_tokens": 59425137.0, + "reward": 0.4296875, + "reward_std": 0.36797165870666504, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000522136688232, + "sampling/importance_sampling_ratio/min": 0.000623974425252527, + "sampling/sampling_logp_difference/max": 7.379401206970215, + "sampling/sampling_logp_difference/mean": 0.019336842000484467, + "step": 79 + }, + { + "clip_ratio/high_max": 1.309858976128453e-05, + "clip_ratio/high_mean": 3.2746474403211323e-06, + "clip_ratio/low_mean": 3.091655224807255e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.419119957470684e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15537.0, + "completions/mean_length": 5741.3515625, + "completions/mean_terminated_length": 5572.4208984375, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.9363748207688332, + "epoch": 0.07359705611775529, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003053537104278803, + "learning_rate": 1e-05, + "loss": 0.0503, + "num_tokens": 60177006.0, + "reward": 0.3828125, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999263882637024, + "sampling/importance_sampling_ratio/min": 0.0009319739765487611, + "sampling/sampling_logp_difference/max": 6.978205680847168, + "sampling/sampling_logp_difference/mean": 0.01948600634932518, + "step": 80 + }, + { + "clip_ratio/high_max": 2.1969835415802663e-05, + "clip_ratio/high_mean": 7.355770890171698e-06, + "clip_ratio/low_mean": 3.6011779457112425e-05, + "clip_ratio/low_min": 4.118887773074675e-06, + "clip_ratio/region_mean": 4.336755046097096e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15546.0, + "completions/mean_length": 6333.078125, + "completions/mean_terminated_length": 6091.8564453125, + "completions/min_length": 646.0, + "completions/min_terminated_length": 646.0, + "entropy": 0.8286701366305351, + "epoch": 0.07451701931922723, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001936097047291696, + "learning_rate": 1e-05, + "loss": 0.0924, + "num_tokens": 61007192.0, + "reward": 0.2890625, + "reward_std": 0.3135277032852173, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999134540557861, + "sampling/importance_sampling_ratio/min": 0.00018122897017747164, + "sampling/sampling_logp_difference/max": 8.61574935913086, + "sampling/sampling_logp_difference/mean": 0.017766552045941353, + "step": 81 + }, + { + "clip_ratio/high_max": 3.815369746007491e-05, + "clip_ratio/high_mean": 1.1110751302112476e-05, + "clip_ratio/low_mean": 5.337692005014105e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.448767180700088e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14444.0, + "completions/mean_length": 4467.71875, + "completions/mean_terminated_length": 4373.8896484375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0210246965289116, + "epoch": 0.07543698252069918, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00340029364451766, + "learning_rate": 1e-05, + "loss": -0.0143, + "num_tokens": 61606900.0, + "reward": 0.359375, + "reward_std": 0.3066929280757904, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999921441078186, + "sampling/importance_sampling_ratio/min": 0.004546399228274822, + "sampling/sampling_logp_difference/max": 5.3934197425842285, + "sampling/sampling_logp_difference/mean": 0.019704686477780342, + "step": 82 + }, + { + "clip_ratio/high_max": 1.4954135622247122e-05, + "clip_ratio/high_mean": 3.7385339055617806e-06, + "clip_ratio/low_mean": 3.632040886714094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0058942545329046e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15231.0, + "completions/mean_length": 5543.71875, + "completions/mean_terminated_length": 5283.55224609375, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "entropy": 0.9587382078170776, + "epoch": 0.07635694572217111, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016365943010896444, + "learning_rate": 1e-05, + "loss": 0.0057, + "num_tokens": 62335440.0, + "reward": 0.2421875, + "reward_std": 0.2964382767677307, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000062346458435, + "sampling/importance_sampling_ratio/min": 1.835696679108878e-07, + "sampling/sampling_logp_difference/max": 15.510671615600586, + "sampling/sampling_logp_difference/mean": 0.019060850143432617, + "step": 83 + }, + { + "clip_ratio/high_max": 1.1255708386670449e-05, + "clip_ratio/high_mean": 2.813927096667612e-06, + "clip_ratio/low_mean": 1.205687783567555e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.4870804704969487e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15514.0, + "completions/max_terminated_length": 15514.0, + "completions/mean_length": 5553.65625, + "completions/mean_terminated_length": 5553.65625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 1.0059658586978912, + "epoch": 0.07727690892364306, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0028732717037200928, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 63071644.0, + "reward": 0.3046875, + "reward_std": 0.3098035454750061, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000003457069397, + "sampling/importance_sampling_ratio/min": 0.0030927264597266912, + "sampling/sampling_logp_difference/max": 5.778702259063721, + "sampling/sampling_logp_difference/mean": 0.01885710284113884, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.669913806130353e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.669913806130353e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15925.0, + "completions/mean_length": 5576.2265625, + "completions/mean_terminated_length": 5491.1259765625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.9912052825093269, + "epoch": 0.078196872125115, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003957705572247505, + "learning_rate": 1e-05, + "loss": 0.0033, + "num_tokens": 63804529.0, + "reward": 0.2265625, + "reward_std": 0.23751860857009888, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998383522033691, + "sampling/importance_sampling_ratio/min": 0.0004883196670562029, + "sampling/sampling_logp_difference/max": 7.624540328979492, + "sampling/sampling_logp_difference/mean": 0.019657567143440247, + "step": 85 + }, + { + "clip_ratio/high_max": 7.340359388763318e-06, + "clip_ratio/high_mean": 1.8350898471908295e-06, + "clip_ratio/low_mean": 4.2495241643791815e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4330331377295806e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16246.0, + "completions/mean_length": 6716.9375, + "completions/mean_terminated_length": 6484.92822265625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.974421925842762, + "epoch": 0.07911683532658693, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027452034410089254, + "learning_rate": 1e-05, + "loss": -0.0238, + "num_tokens": 64684825.0, + "reward": 0.328125, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998891949653625, + "sampling/importance_sampling_ratio/min": 0.00023439532378688455, + "sampling/sampling_logp_difference/max": 8.358501434326172, + "sampling/sampling_logp_difference/mean": 0.020278966054320335, + "step": 86 + }, + { + "clip_ratio/high_max": 1.1668500064843101e-05, + "clip_ratio/high_mean": 2.9171250162107754e-06, + "clip_ratio/low_mean": 2.278766351082595e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5704788185976213e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16172.0, + "completions/mean_length": 6033.609375, + "completions/mean_terminated_length": 5869.31787109375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.9376208484172821, + "epoch": 0.08003679852805888, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014822481898590922, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 65476055.0, + "reward": 0.28125, + "reward_std": 0.2751026153564453, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999359846115112, + "sampling/importance_sampling_ratio/min": 0.0031867078505456448, + "sampling/sampling_logp_difference/max": 5.748766899108887, + "sampling/sampling_logp_difference/mean": 0.0203948225826025, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.2838053666873748e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2838053666873748e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15593.0, + "completions/mean_length": 6561.4453125, + "completions/mean_terminated_length": 6405.5322265625, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "entropy": 0.8753902241587639, + "epoch": 0.08095676172953081, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016284709563478827, + "learning_rate": 1e-05, + "loss": 0.0407, + "num_tokens": 66335528.0, + "reward": 0.3125, + "reward_std": 0.28535234928131104, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999115467071533, + "sampling/importance_sampling_ratio/min": 7.897153409430757e-06, + "sampling/sampling_logp_difference/max": 11.749008178710938, + "sampling/sampling_logp_difference/mean": 0.01995038241147995, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.7495306085256743e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7495306085256743e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12974.0, + "completions/mean_length": 5322.03125, + "completions/mean_terminated_length": 5234.92919921875, + "completions/min_length": 548.0, + "completions/min_terminated_length": 548.0, + "entropy": 0.9731436967849731, + "epoch": 0.08187672493100276, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004579639527946711, + "learning_rate": 1e-05, + "loss": 0.0111, + "num_tokens": 67036244.0, + "reward": 0.3828125, + "reward_std": 0.2714630365371704, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000991821289062, + "sampling/importance_sampling_ratio/min": 0.00016946837422437966, + "sampling/sampling_logp_difference/max": 8.682844161987305, + "sampling/sampling_logp_difference/mean": 0.017986822873353958, + "step": 89 + }, + { + "clip_ratio/high_max": 9.390067589265527e-06, + "clip_ratio/high_mean": 2.347516897316382e-06, + "clip_ratio/low_mean": 2.9141255822651146e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.148877271996753e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15122.0, + "completions/mean_length": 5428.1484375, + "completions/mean_terminated_length": 5254.24609375, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "entropy": 0.9560057744383812, + "epoch": 0.0827966881324747, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030614053830504417, + "learning_rate": 1e-05, + "loss": 0.0677, + "num_tokens": 67751911.0, + "reward": 0.40625, + "reward_std": 0.36691081523895264, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998039603233337, + "sampling/importance_sampling_ratio/min": 0.00041119891102425754, + "sampling/sampling_logp_difference/max": 7.796433448791504, + "sampling/sampling_logp_difference/mean": 0.019884781911969185, + "step": 90 + }, + { + "clip_ratio/high_max": 1.3370414308155887e-05, + "clip_ratio/high_mean": 3.3426035770389717e-06, + "clip_ratio/low_mean": 2.5133818439826427e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.84764220168654e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16098.0, + "completions/mean_length": 6381.9140625, + "completions/mean_terminated_length": 6303.1572265625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "entropy": 1.0577945485711098, + "epoch": 0.08371665133394664, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018679362256079912, + "learning_rate": 1e-05, + "loss": 0.0464, + "num_tokens": 68594620.0, + "reward": 0.1875, + "reward_std": 0.21542152762413025, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000123977661133, + "sampling/importance_sampling_ratio/min": 7.031799759715796e-05, + "sampling/sampling_logp_difference/max": 9.562482833862305, + "sampling/sampling_logp_difference/mean": 0.019965168088674545, + "step": 91 + }, + { + "clip_ratio/high_max": 5.103707280795788e-06, + "clip_ratio/high_mean": 1.275926820198947e-06, + "clip_ratio/low_mean": 4.938993617997767e-05, + "clip_ratio/low_min": 4.324361725593917e-06, + "clip_ratio/region_mean": 5.06658626591161e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14491.0, + "completions/mean_length": 5626.5703125, + "completions/mean_terminated_length": 5455.81787109375, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "entropy": 0.8880954682826996, + "epoch": 0.08463661453541858, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003913378342986107, + "learning_rate": 1e-05, + "loss": 0.078, + "num_tokens": 69335061.0, + "reward": 0.359375, + "reward_std": 0.4066115617752075, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000001311302185, + "sampling/importance_sampling_ratio/min": 0.00010254964581690729, + "sampling/sampling_logp_difference/max": 9.185163497924805, + "sampling/sampling_logp_difference/mean": 0.018766846507787704, + "step": 92 + }, + { + "clip_ratio/high_max": 2.656613628460036e-05, + "clip_ratio/high_mean": 6.64153407115009e-06, + "clip_ratio/low_mean": 5.355309394872165e-05, + "clip_ratio/low_min": 6.923673481651349e-06, + "clip_ratio/region_mean": 6.019462853146251e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 6252.5078125, + "completions/mean_terminated_length": 6172.732421875, + "completions/min_length": 583.0, + "completions/min_terminated_length": 583.0, + "entropy": 1.0409839749336243, + "epoch": 0.08555657773689053, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002942018210887909, + "learning_rate": 1e-05, + "loss": 0.0286, + "num_tokens": 70158806.0, + "reward": 0.3515625, + "reward_std": 0.30221226811408997, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998798370361328, + "sampling/importance_sampling_ratio/min": 0.00027446431340649724, + "sampling/sampling_logp_difference/max": 8.200689315795898, + "sampling/sampling_logp_difference/mean": 0.02092035487294197, + "step": 93 + }, + { + "clip_ratio/high_max": 1.0007204764406197e-05, + "clip_ratio/high_mean": 2.501801191101549e-06, + "clip_ratio/low_mean": 6.03029346848416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.280473587594315e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15687.0, + "completions/mean_length": 5936.171875, + "completions/mean_terminated_length": 5770.33349609375, + "completions/min_length": 614.0, + "completions/min_terminated_length": 614.0, + "entropy": 0.9782606735825539, + "epoch": 0.08647654093836246, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018363922135904431, + "learning_rate": 1e-05, + "loss": 0.0037, + "num_tokens": 70938108.0, + "reward": 0.296875, + "reward_std": 0.31824085116386414, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999080300331116, + "sampling/importance_sampling_ratio/min": 0.0001234232186106965, + "sampling/sampling_logp_difference/max": 8.99989128112793, + "sampling/sampling_logp_difference/mean": 0.02028634399175644, + "step": 94 + }, + { + "clip_ratio/high_max": 2.2271185798672377e-05, + "clip_ratio/high_mean": 5.567796449668094e-06, + "clip_ratio/low_mean": 2.026856623160711e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.583636239705811e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15826.0, + "completions/mean_length": 5796.34375, + "completions/mean_terminated_length": 5712.9765625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.9343783929944038, + "epoch": 0.08739650413983441, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036520177964121103, + "learning_rate": 1e-05, + "loss": 0.0465, + "num_tokens": 71697904.0, + "reward": 0.4296875, + "reward_std": 0.2869499623775482, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000145435333252, + "sampling/importance_sampling_ratio/min": 0.0013267943868413568, + "sampling/sampling_logp_difference/max": 6.6249895095825195, + "sampling/sampling_logp_difference/mean": 0.01939292624592781, + "step": 95 + }, + { + "clip_ratio/high_max": 1.3236602853794466e-05, + "clip_ratio/high_mean": 5.30995015424196e-06, + "clip_ratio/low_mean": 2.4116298618537257e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.942624860224896e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16075.0, + "completions/mean_length": 5912.5078125, + "completions/mean_terminated_length": 5746.2939453125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8880549967288971, + "epoch": 0.08831646734130635, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002131880959495902, + "learning_rate": 1e-05, + "loss": 0.0484, + "num_tokens": 72472657.0, + "reward": 0.484375, + "reward_std": 0.3027363121509552, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998900890350342, + "sampling/importance_sampling_ratio/min": 1.3350321736993465e-08, + "sampling/sampling_logp_difference/max": 18.131725311279297, + "sampling/sampling_logp_difference/mean": 0.019045043736696243, + "step": 96 + }, + { + "clip_ratio/high_max": 1.0632415978761856e-05, + "clip_ratio/high_mean": 2.658103994690464e-06, + "clip_ratio/low_mean": 3.596552733142744e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.862363143980474e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14486.0, + "completions/mean_length": 5471.203125, + "completions/mean_terminated_length": 5385.275390625, + "completions/min_length": 757.0, + "completions/min_terminated_length": 757.0, + "entropy": 0.9127756953239441, + "epoch": 0.08923643054277829, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.0030769745353609324, + "learning_rate": 1e-05, + "loss": 0.0799, + "num_tokens": 73191403.0, + "reward": 0.5234375, + "reward_std": 0.4281895160675049, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999668598175049, + "sampling/importance_sampling_ratio/min": 1.3584097757757263e-07, + "sampling/sampling_logp_difference/max": 15.81178092956543, + "sampling/sampling_logp_difference/mean": 0.019179491326212883, + "step": 97 + }, + { + "clip_ratio/high_max": 6.134668183221947e-06, + "clip_ratio/high_mean": 1.5336670458054869e-06, + "clip_ratio/low_mean": 2.465653636818388e-05, + "clip_ratio/low_min": 3.4443801268935204e-06, + "clip_ratio/region_mean": 2.6190203413989366e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14284.0, + "completions/mean_length": 6366.5078125, + "completions/mean_terminated_length": 6207.50048828125, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "entropy": 0.9889310300350189, + "epoch": 0.09015639374425023, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027727377600967884, + "learning_rate": 1e-05, + "loss": 0.011, + "num_tokens": 74026484.0, + "reward": 0.328125, + "reward_std": 0.3174794614315033, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998502731323242, + "sampling/importance_sampling_ratio/min": 0.00011932474444620311, + "sampling/sampling_logp_difference/max": 9.033661842346191, + "sampling/sampling_logp_difference/mean": 0.01946873590350151, + "step": 98 + }, + { + "clip_ratio/high_max": 1.3569412203651154e-05, + "clip_ratio/high_mean": 3.3923530509127886e-06, + "clip_ratio/low_mean": 2.118610348134098e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4578456645940605e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16312.0, + "completions/max_terminated_length": 16312.0, + "completions/mean_length": 4089.6015625, + "completions/mean_terminated_length": 4089.6015625, + "completions/min_length": 566.0, + "completions/min_terminated_length": 566.0, + "entropy": 0.8083604946732521, + "epoch": 0.09107635694572216, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003628374310210347, + "learning_rate": 1e-05, + "loss": -0.002, + "num_tokens": 74567833.0, + "reward": 0.484375, + "reward_std": 0.3174794614315033, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999944269657135, + "sampling/importance_sampling_ratio/min": 0.000612107920460403, + "sampling/sampling_logp_difference/max": 7.39860200881958, + "sampling/sampling_logp_difference/mean": 0.017995744943618774, + "step": 99 + }, + { + "clip_ratio/high_max": 1.947620376085979e-05, + "clip_ratio/high_mean": 5.989323312860506e-06, + "clip_ratio/low_mean": 2.8597964728760417e-05, + "clip_ratio/low_min": 7.570710295112804e-06, + "clip_ratio/region_mean": 3.458728804162092e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16340.0, + "completions/mean_length": 5678.7890625, + "completions/mean_terminated_length": 5508.865234375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "entropy": 0.880424402654171, + "epoch": 0.09199632014719411, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004177773837000132, + "learning_rate": 1e-05, + "loss": 0.0595, + "num_tokens": 75314022.0, + "reward": 0.4765625, + "reward_std": 0.4105730950832367, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999686479568481, + "sampling/importance_sampling_ratio/min": 3.343528805999085e-05, + "sampling/sampling_logp_difference/max": 10.305898666381836, + "sampling/sampling_logp_difference/mean": 0.018467536196112633, + "step": 100 + }, + { + "clip_ratio/high_max": 1.4969179119361797e-05, + "clip_ratio/high_mean": 3.7422947798404493e-06, + "clip_ratio/low_mean": 5.1001184147025924e-05, + "clip_ratio/low_min": 7.801042556820903e-06, + "clip_ratio/region_mean": 5.474347858580586e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15767.0, + "completions/max_terminated_length": 15767.0, + "completions/mean_length": 5253.0234375, + "completions/mean_terminated_length": 5253.0234375, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.9227524027228355, + "epoch": 0.09291628334866606, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0015437579713761806, + "learning_rate": 1e-05, + "loss": 0.0445, + "num_tokens": 76005417.0, + "reward": 0.3515625, + "reward_std": 0.34586966037750244, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999125003814697, + "sampling/importance_sampling_ratio/min": 5.159151623956859e-05, + "sampling/sampling_logp_difference/max": 9.872153282165527, + "sampling/sampling_logp_difference/mean": 0.018250152468681335, + "step": 101 + }, + { + "clip_ratio/high_max": 1.3062932339380495e-05, + "clip_ratio/high_mean": 3.265733084845124e-06, + "clip_ratio/low_mean": 3.931676133106521e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2582495325405034e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15289.0, + "completions/mean_length": 5956.921875, + "completions/mean_terminated_length": 5533.056640625, + "completions/min_length": 606.0, + "completions/min_terminated_length": 606.0, + "entropy": 0.892315685749054, + "epoch": 0.09383624655013799, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019212538609281182, + "learning_rate": 1e-05, + "loss": 0.0688, + "num_tokens": 76787623.0, + "reward": 0.34375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999054074287415, + "sampling/importance_sampling_ratio/min": 0.0012463966850191355, + "sampling/sampling_logp_difference/max": 6.687498569488525, + "sampling/sampling_logp_difference/mean": 0.018439805135130882, + "step": 102 + }, + { + "clip_ratio/high_max": 2.714365291467402e-05, + "clip_ratio/high_mean": 6.785913228668505e-06, + "clip_ratio/low_mean": 3.920890912922914e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5994822471584484e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14715.0, + "completions/mean_length": 5575.09375, + "completions/mean_terminated_length": 5315.68017578125, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "entropy": 1.0225786119699478, + "epoch": 0.09475620975160993, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029739944729954004, + "learning_rate": 1e-05, + "loss": 0.0482, + "num_tokens": 77520091.0, + "reward": 0.3203125, + "reward_std": 0.29719969630241394, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999485015869141, + "sampling/importance_sampling_ratio/min": 1.9004226032848237e-06, + "sampling/sampling_logp_difference/max": 13.173434257507324, + "sampling/sampling_logp_difference/mean": 0.020432481542229652, + "step": 103 + }, + { + "clip_ratio/high_max": 1.1180974752278416e-05, + "clip_ratio/high_mean": 2.795243688069604e-06, + "clip_ratio/low_mean": 5.534062506740156e-05, + "clip_ratio/low_min": 4.409326720633544e-06, + "clip_ratio/region_mean": 5.813586813019356e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16346.0, + "completions/mean_length": 7777.171875, + "completions/mean_terminated_length": 7499.5322265625, + "completions/min_length": 724.0, + "completions/min_terminated_length": 724.0, + "entropy": 0.8798429742455482, + "epoch": 0.09567617295308188, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021529686637222767, + "learning_rate": 1e-05, + "loss": 0.0963, + "num_tokens": 78538993.0, + "reward": 0.3203125, + "reward_std": 0.3164186477661133, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998733401298523, + "sampling/importance_sampling_ratio/min": 2.081840648315847e-06, + "sampling/sampling_logp_difference/max": 13.082258224487305, + "sampling/sampling_logp_difference/mean": 0.019486568868160248, + "step": 104 + }, + { + "clip_ratio/high_max": 1.4091711364017101e-05, + "clip_ratio/high_mean": 3.5229278410042753e-06, + "clip_ratio/low_mean": 4.0216968045569956e-05, + "clip_ratio/low_min": 4.320475454733241e-06, + "clip_ratio/region_mean": 4.3739896682382096e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15763.0, + "completions/mean_length": 6298.4296875, + "completions/mean_terminated_length": 6219.015625, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 1.0422330349683762, + "epoch": 0.09659613615455381, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002485725563019514, + "learning_rate": 1e-05, + "loss": 0.0674, + "num_tokens": 79365144.0, + "reward": 0.421875, + "reward_std": 0.3503503203392029, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999951124191284, + "sampling/importance_sampling_ratio/min": 0.0008047395385801792, + "sampling/sampling_logp_difference/max": 7.1249918937683105, + "sampling/sampling_logp_difference/mean": 0.021251153200864792, + "step": 105 + }, + { + "clip_ratio/high_max": 5.182851054996718e-06, + "clip_ratio/high_mean": 1.2957127637491794e-06, + "clip_ratio/low_mean": 1.3408006566351105e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.4703719102726609e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13199.0, + "completions/max_terminated_length": 13199.0, + "completions/mean_length": 5001.8515625, + "completions/mean_terminated_length": 5001.8515625, + "completions/min_length": 598.0, + "completions/min_terminated_length": 598.0, + "entropy": 0.9210668653249741, + "epoch": 0.09751609935602576, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018336179200559855, + "learning_rate": 1e-05, + "loss": -0.0075, + "num_tokens": 80024661.0, + "reward": 0.3984375, + "reward_std": 0.2969672679901123, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 0.0015512153040617704, + "sampling/sampling_logp_difference/max": 6.468716621398926, + "sampling/sampling_logp_difference/mean": 0.018811997026205063, + "step": 106 + }, + { + "clip_ratio/high_max": 3.179798750352347e-05, + "clip_ratio/high_mean": 7.949496875880868e-06, + "clip_ratio/low_mean": 2.5010467197716935e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.29599640735978e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15916.0, + "completions/mean_length": 6280.1875, + "completions/mean_terminated_length": 6119.81005859375, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "entropy": 1.0198880061507225, + "epoch": 0.0984360625574977, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00276190135627985, + "learning_rate": 1e-05, + "loss": 0.0474, + "num_tokens": 80845941.0, + "reward": 0.2578125, + "reward_std": 0.27092626690864563, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999603629112244, + "sampling/importance_sampling_ratio/min": 0.00043450010707601905, + "sampling/sampling_logp_difference/max": 7.74131441116333, + "sampling/sampling_logp_difference/mean": 0.020783018320798874, + "step": 107 + }, + { + "clip_ratio/high_max": 1.0263617241434986e-05, + "clip_ratio/high_mean": 2.5659043103587464e-06, + "clip_ratio/low_mean": 2.2780154608881276e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.534605857817951e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14812.0, + "completions/mean_length": 5617.109375, + "completions/mean_terminated_length": 5358.7041015625, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 1.0532233864068985, + "epoch": 0.09935602575896964, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020079545211046934, + "learning_rate": 1e-05, + "loss": 0.03, + "num_tokens": 81584099.0, + "reward": 0.3515625, + "reward_std": 0.3037971258163452, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000622272491455, + "sampling/importance_sampling_ratio/min": 0.0014304202049970627, + "sampling/sampling_logp_difference/max": 6.5497870445251465, + "sampling/sampling_logp_difference/mean": 0.019330721348524094, + "step": 108 + }, + { + "clip_ratio/high_max": 3.592160510379472e-06, + "clip_ratio/high_mean": 8.98040127594868e-07, + "clip_ratio/low_mean": 2.2189478841028176e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3087518968623044e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15901.0, + "completions/mean_length": 4336.828125, + "completions/mean_terminated_length": 4241.96826171875, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 0.8131270706653595, + "epoch": 0.10027598896044158, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002346212510019541, + "learning_rate": 1e-05, + "loss": 0.0372, + "num_tokens": 82157581.0, + "reward": 0.59375, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998981952667236, + "sampling/importance_sampling_ratio/min": 0.011126067489385605, + "sampling/sampling_logp_difference/max": 4.498464584350586, + "sampling/sampling_logp_difference/mean": 0.01748315989971161, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.621310563379666e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.621310563379666e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15911.0, + "completions/mean_length": 6185.1640625, + "completions/mean_terminated_length": 6023.2783203125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.9515878483653069, + "epoch": 0.10119595216191353, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0020737929735332727, + "learning_rate": 1e-05, + "loss": 0.052, + "num_tokens": 82970866.0, + "reward": 0.296875, + "reward_std": 0.2580229640007019, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999544024467468, + "sampling/importance_sampling_ratio/min": 0.00021864472364541143, + "sampling/sampling_logp_difference/max": 8.428062438964844, + "sampling/sampling_logp_difference/mean": 0.019794369116425514, + "step": 110 + }, + { + "clip_ratio/high_max": 2.830697485478595e-05, + "clip_ratio/high_mean": 7.076743713696487e-06, + "clip_ratio/low_mean": 3.404362587389187e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1120369132841006e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15649.0, + "completions/mean_length": 6042.359375, + "completions/mean_terminated_length": 5960.92919921875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9405315592885017, + "epoch": 0.10211591536338546, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0013609385350719094, + "learning_rate": 1e-05, + "loss": 0.0023, + "num_tokens": 83762664.0, + "reward": 0.265625, + "reward_std": 0.2937847375869751, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000874996185303, + "sampling/importance_sampling_ratio/min": 0.03007127158343792, + "sampling/sampling_logp_difference/max": 3.5041849613189697, + "sampling/sampling_logp_difference/mean": 0.02063683047890663, + "step": 111 + }, + { + "clip_ratio/high_max": 2.4490228042850504e-05, + "clip_ratio/high_mean": 7.702277343923924e-06, + "clip_ratio/low_mean": 4.2714329822501895e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.04166071095824e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16288.0, + "completions/mean_length": 7036.859375, + "completions/mean_terminated_length": 6963.259765625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.9034569710493088, + "epoch": 0.10303587856485741, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017795560415834188, + "learning_rate": 1e-05, + "loss": 0.0196, + "num_tokens": 84684566.0, + "reward": 0.359375, + "reward_std": 0.2977414131164551, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000296831130981, + "sampling/importance_sampling_ratio/min": 0.03753140941262245, + "sampling/sampling_logp_difference/max": 3.2825770378112793, + "sampling/sampling_logp_difference/mean": 0.019494226202368736, + "step": 112 + }, + { + "clip_ratio/high_max": 2.028518520091893e-05, + "clip_ratio/high_mean": 6.102377255956526e-06, + "clip_ratio/low_mean": 3.518054700180073e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.128292380300991e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16308.0, + "completions/mean_length": 6958.6484375, + "completions/mean_terminated_length": 6413.3798828125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.9195531085133553, + "epoch": 0.10395584176632934, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027138369623571634, + "learning_rate": 1e-05, + "loss": 0.0481, + "num_tokens": 85598345.0, + "reward": 0.421875, + "reward_std": 0.24670752882957458, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999241828918457, + "sampling/importance_sampling_ratio/min": 0.0004585298302117735, + "sampling/sampling_logp_difference/max": 7.687485218048096, + "sampling/sampling_logp_difference/mean": 0.0201261006295681, + "step": 113 + }, + { + "clip_ratio/high_max": 7.460459528374486e-06, + "clip_ratio/high_mean": 3.464071141934255e-06, + "clip_ratio/low_mean": 3.825124849754502e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.171532009422663e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16228.0, + "completions/max_terminated_length": 16228.0, + "completions/mean_length": 5773.890625, + "completions/mean_terminated_length": 5773.890625, + "completions/min_length": 792.0, + "completions/min_terminated_length": 792.0, + "entropy": 0.8253094777464867, + "epoch": 0.10487580496780129, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0019655083306133747, + "learning_rate": 1e-05, + "loss": 0.0056, + "num_tokens": 86356403.0, + "reward": 0.390625, + "reward_std": 0.2635546922683716, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999909520149231, + "sampling/importance_sampling_ratio/min": 2.981063744300627e-06, + "sampling/sampling_logp_difference/max": 12.723230361938477, + "sampling/sampling_logp_difference/mean": 0.018150178715586662, + "step": 114 + }, + { + "clip_ratio/high_max": 7.937012014735956e-06, + "clip_ratio/high_mean": 1.984253003683989e-06, + "clip_ratio/low_mean": 4.778610400535399e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9770356781664304e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15584.0, + "completions/mean_length": 5233.546875, + "completions/mean_terminated_length": 4873.8544921875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8463557213544846, + "epoch": 0.10579576816927323, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0024442693684250116, + "learning_rate": 1e-05, + "loss": 0.1172, + "num_tokens": 87043681.0, + "reward": 0.375, + "reward_std": 0.3987257480621338, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999265670776367, + "sampling/importance_sampling_ratio/min": 4.3303893448864983e-07, + "sampling/sampling_logp_difference/max": 14.652438163757324, + "sampling/sampling_logp_difference/mean": 0.01760055497288704, + "step": 115 + }, + { + "clip_ratio/high_max": 2.0049358681717422e-05, + "clip_ratio/high_mean": 6.392639988916926e-06, + "clip_ratio/low_mean": 2.7909350819754764e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4301990581298014e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16070.0, + "completions/mean_length": 6098.5234375, + "completions/mean_terminated_length": 5851.67236328125, + "completions/min_length": 640.0, + "completions/min_terminated_length": 640.0, + "entropy": 0.9961429908871651, + "epoch": 0.10671573137074516, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001763843116350472, + "learning_rate": 1e-05, + "loss": 0.0279, + "num_tokens": 87845012.0, + "reward": 0.3125, + "reward_std": 0.24329747259616852, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999946117401123, + "sampling/importance_sampling_ratio/min": 0.0012967984657734632, + "sampling/sampling_logp_difference/max": 6.647856712341309, + "sampling/sampling_logp_difference/mean": 0.020430129021406174, + "step": 116 + }, + { + "clip_ratio/high_max": 6.041565939085558e-06, + "clip_ratio/high_mean": 1.5103914847713895e-06, + "clip_ratio/low_mean": 3.8537290720341844e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.004768220511323e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15364.0, + "completions/mean_length": 7306.828125, + "completions/mean_terminated_length": 6937.8369140625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 1.0500907376408577, + "epoch": 0.10763569457221711, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023989977780729532, + "learning_rate": 1e-05, + "loss": 0.0383, + "num_tokens": 88799758.0, + "reward": 0.1875, + "reward_std": 0.23752352595329285, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998784065246582, + "sampling/importance_sampling_ratio/min": 0.00016530237917322665, + "sampling/sampling_logp_difference/max": 8.707734107971191, + "sampling/sampling_logp_difference/mean": 0.021274670958518982, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.1037226335683954e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1037226335683954e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15693.0, + "completions/mean_length": 5156.9765625, + "completions/mean_terminated_length": 4978.77001953125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 1.0691863298416138, + "epoch": 0.10855565777368906, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0032527034636586905, + "learning_rate": 1e-05, + "loss": 0.1168, + "num_tokens": 89482459.0, + "reward": 0.4140625, + "reward_std": 0.3406246304512024, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999943375587463, + "sampling/importance_sampling_ratio/min": 0.00010107864363817498, + "sampling/sampling_logp_difference/max": 9.19961166381836, + "sampling/sampling_logp_difference/mean": 0.019853606820106506, + "step": 118 + }, + { + "clip_ratio/high_max": 2.2721950699633453e-05, + "clip_ratio/high_mean": 5.680487674908363e-06, + "clip_ratio/low_mean": 4.0971160615299596e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6651648517581634e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15311.0, + "completions/mean_length": 6804.8125, + "completions/mean_terminated_length": 6495.80615234375, + "completions/min_length": 607.0, + "completions/min_terminated_length": 607.0, + "entropy": 0.867309644818306, + "epoch": 0.10947562097516099, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019014904974028468, + "learning_rate": 1e-05, + "loss": 0.0593, + "num_tokens": 90372587.0, + "reward": 0.375, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999526143074036, + "sampling/importance_sampling_ratio/min": 0.00012341687397565693, + "sampling/sampling_logp_difference/max": 8.999942779541016, + "sampling/sampling_logp_difference/mean": 0.018908457830548286, + "step": 119 + }, + { + "clip_ratio/high_max": 1.0602929251035675e-05, + "clip_ratio/high_mean": 2.650732312758919e-06, + "clip_ratio/low_mean": 4.483750217332272e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.748823448608164e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15419.0, + "completions/max_terminated_length": 15419.0, + "completions/mean_length": 5354.2890625, + "completions/mean_terminated_length": 5354.2890625, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "entropy": 0.9092740416526794, + "epoch": 0.11039558417663294, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0028308529872447252, + "learning_rate": 1e-05, + "loss": 0.0584, + "num_tokens": 91080912.0, + "reward": 0.3359375, + "reward_std": 0.34245961904525757, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000000238418579, + "sampling/importance_sampling_ratio/min": 0.003619713708758354, + "sampling/sampling_logp_difference/max": 5.6213603019714355, + "sampling/sampling_logp_difference/mean": 0.018408317118883133, + "step": 120 + }, + { + "clip_ratio/high_max": 7.076040446918341e-06, + "clip_ratio/high_mean": 1.7690101117295853e-06, + "clip_ratio/low_mean": 6.420628960768227e-05, + "clip_ratio/low_min": 9.37260915634397e-06, + "clip_ratio/region_mean": 6.59752995488816e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15616.0, + "completions/mean_length": 7653.1328125, + "completions/mean_terminated_length": 7371.49169921875, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "entropy": 0.9067098647356033, + "epoch": 0.11131554737810488, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0026082738768309355, + "learning_rate": 1e-05, + "loss": 0.0373, + "num_tokens": 92080441.0, + "reward": 0.3125, + "reward_std": 0.3395638167858124, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999957084655762, + "sampling/importance_sampling_ratio/min": 3.7638976209564134e-05, + "sampling/sampling_logp_difference/max": 10.187470436096191, + "sampling/sampling_logp_difference/mean": 0.019849080592393875, + "step": 121 + }, + { + "clip_ratio/high_max": 4.642525709641632e-06, + "clip_ratio/high_mean": 1.8333832940697903e-06, + "clip_ratio/low_mean": 4.188668265214801e-05, + "clip_ratio/low_min": 6.032381861587055e-06, + "clip_ratio/region_mean": 4.3720065264096775e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16212.0, + "completions/mean_length": 7864.796875, + "completions/mean_terminated_length": 7220.48779296875, + "completions/min_length": 718.0, + "completions/min_terminated_length": 718.0, + "entropy": 1.0423363894224167, + "epoch": 0.11223551057957681, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001708728028461337, + "learning_rate": 1e-05, + "loss": 0.0394, + "num_tokens": 93107607.0, + "reward": 0.2265625, + "reward_std": 0.23933593928813934, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999992311000824, + "sampling/importance_sampling_ratio/min": 4.743846602650592e-06, + "sampling/sampling_logp_difference/max": 12.258662223815918, + "sampling/sampling_logp_difference/mean": 0.02070365846157074, + "step": 122 + }, + { + "clip_ratio/high_max": 6.424297680496238e-06, + "clip_ratio/high_mean": 1.6060744201240595e-06, + "clip_ratio/low_mean": 4.487338674152852e-05, + "clip_ratio/low_min": 7.803849257470574e-06, + "clip_ratio/region_mean": 4.647946116165258e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16339.0, + "completions/mean_length": 7690.6328125, + "completions/mean_terminated_length": 7622.18115234375, + "completions/min_length": 704.0, + "completions/min_terminated_length": 704.0, + "entropy": 1.061365969479084, + "epoch": 0.11315547378104876, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0026824623346328735, + "learning_rate": 1e-05, + "loss": 0.0538, + "num_tokens": 94111296.0, + "reward": 0.2890625, + "reward_std": 0.2556639611721039, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998635649681091, + "sampling/importance_sampling_ratio/min": 0.00014029098383616656, + "sampling/sampling_logp_difference/max": 8.87179183959961, + "sampling/sampling_logp_difference/mean": 0.021192047744989395, + "step": 123 + }, + { + "clip_ratio/high_max": 5.478851562656928e-06, + "clip_ratio/high_mean": 1.369712890664232e-06, + "clip_ratio/low_mean": 1.5870192100919667e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.72399049915839e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15693.0, + "completions/mean_length": 5871.2265625, + "completions/mean_terminated_length": 5618.92041015625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 1.0346312001347542, + "epoch": 0.1140754369825207, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0012895551044493914, + "learning_rate": 1e-05, + "loss": 0.0298, + "num_tokens": 94883061.0, + "reward": 0.3125, + "reward_std": 0.16675156354904175, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999569654464722, + "sampling/importance_sampling_ratio/min": 0.007269685622304678, + "sampling/sampling_logp_difference/max": 4.924042224884033, + "sampling/sampling_logp_difference/mean": 0.02043779566884041, + "step": 124 + }, + { + "clip_ratio/high_max": 9.75199873209931e-06, + "clip_ratio/high_mean": 3.4236486499139573e-06, + "clip_ratio/low_mean": 3.807359871643712e-05, + "clip_ratio/low_min": 6.6283109845244326e-06, + "clip_ratio/region_mean": 4.1497247366351075e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15671.0, + "completions/mean_length": 7205.0703125, + "completions/mean_terminated_length": 6908.9755859375, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.8426484614610672, + "epoch": 0.11499540018399264, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024157650768756866, + "learning_rate": 1e-05, + "loss": 0.0334, + "num_tokens": 95831798.0, + "reward": 0.3671875, + "reward_std": 0.28247418999671936, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999579191207886, + "sampling/importance_sampling_ratio/min": 0.00780851487070322, + "sampling/sampling_logp_difference/max": 4.852540493011475, + "sampling/sampling_logp_difference/mean": 0.01930900476872921, + "step": 125 + }, + { + "clip_ratio/high_max": 7.827117542547057e-06, + "clip_ratio/high_mean": 1.9567793856367643e-06, + "clip_ratio/low_mean": 2.85506193904439e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0507398662393825e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15557.0, + "completions/mean_length": 6770.2578125, + "completions/mean_terminated_length": 6539.5283203125, + "completions/min_length": 715.0, + "completions/min_terminated_length": 715.0, + "entropy": 0.8648517951369286, + "epoch": 0.11591536338546458, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018663652008399367, + "learning_rate": 1e-05, + "loss": 0.0353, + "num_tokens": 96716079.0, + "reward": 0.3671875, + "reward_std": 0.3135277330875397, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999147057533264, + "sampling/importance_sampling_ratio/min": 0.0013688995968550444, + "sampling/sampling_logp_difference/max": 6.593748092651367, + "sampling/sampling_logp_difference/mean": 0.019091933965682983, + "step": 126 + }, + { + "clip_ratio/high_max": 8.396982593694702e-06, + "clip_ratio/high_mean": 2.0992456484236754e-06, + "clip_ratio/low_mean": 3.30035152273922e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5102760875815875e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16138.0, + "completions/mean_length": 7880.8359375, + "completions/mean_terminated_length": 7745.86572265625, + "completions/min_length": 832.0, + "completions/min_terminated_length": 832.0, + "entropy": 0.9396157637238503, + "epoch": 0.11683532658693652, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0016418134327977896, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 97744506.0, + "reward": 0.2109375, + "reward_std": 0.22225633263587952, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507069587708, + "sampling/importance_sampling_ratio/min": 0.0072977589443326, + "sampling/sampling_logp_difference/max": 4.920187950134277, + "sampling/sampling_logp_difference/mean": 0.02041018195450306, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.872459816671835e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.872459816671835e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16362.0, + "completions/mean_length": 6425.3515625, + "completions/mean_terminated_length": 6267.2783203125, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.9397681280970573, + "epoch": 0.11775528978840846, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002677743323147297, + "learning_rate": 1e-05, + "loss": 0.0076, + "num_tokens": 98587647.0, + "reward": 0.359375, + "reward_std": 0.2567248046398163, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999743700027466, + "sampling/importance_sampling_ratio/min": 5.40250198355352e-07, + "sampling/sampling_logp_difference/max": 14.431233406066895, + "sampling/sampling_logp_difference/mean": 0.020279735326766968, + "step": 128 + }, + { + "clip_ratio/high_max": 1.306506624132453e-05, + "clip_ratio/high_mean": 3.2662665603311325e-06, + "clip_ratio/low_mean": 3.8350387626451266e-05, + "clip_ratio/low_min": 9.45358260651119e-06, + "clip_ratio/region_mean": 4.161665401625214e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16212.0, + "completions/mean_length": 7129.4609375, + "completions/mean_terminated_length": 6907.3525390625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 1.1336064785718918, + "epoch": 0.11867525298988041, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032464349642395973, + "learning_rate": 1e-05, + "loss": 0.066, + "num_tokens": 99522458.0, + "reward": 0.3046875, + "reward_std": 0.28247418999671936, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999245405197144, + "sampling/importance_sampling_ratio/min": 0.0046671414747834206, + "sampling/sampling_logp_difference/max": 5.367208480834961, + "sampling/sampling_logp_difference/mean": 0.021748989820480347, + "step": 129 + }, + { + "clip_ratio/high_max": 9.463296464673476e-06, + "clip_ratio/high_mean": 2.365824116168369e-06, + "clip_ratio/low_mean": 3.497452934198009e-05, + "clip_ratio/low_min": 6.806807050452335e-06, + "clip_ratio/region_mean": 3.734035340130504e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15500.0, + "completions/mean_length": 7264.7421875, + "completions/mean_terminated_length": 7119.99267578125, + "completions/min_length": 730.0, + "completions/min_terminated_length": 730.0, + "entropy": 0.8998278677463531, + "epoch": 0.11959521619135234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0026182979345321655, + "learning_rate": 1e-05, + "loss": 0.1161, + "num_tokens": 100474137.0, + "reward": 0.46875, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000280141830444, + "sampling/importance_sampling_ratio/min": 0.021124430000782013, + "sampling/sampling_logp_difference/max": 3.8573250770568848, + "sampling/sampling_logp_difference/mean": 0.019057951867580414, + "step": 130 + }, + { + "clip_ratio/high_max": 8.944165074353805e-06, + "clip_ratio/high_mean": 2.236041268588451e-06, + "clip_ratio/low_mean": 4.6521246076736134e-05, + "clip_ratio/low_min": 7.112780167517485e-06, + "clip_ratio/region_mean": 4.875728745901142e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15064.0, + "completions/mean_length": 5473.71875, + "completions/mean_terminated_length": 5387.81103515625, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "entropy": 0.9666230976581573, + "epoch": 0.12051517939282429, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0020499166566878557, + "learning_rate": 1e-05, + "loss": 0.0479, + "num_tokens": 101191861.0, + "reward": 0.328125, + "reward_std": 0.345874547958374, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999291896820068, + "sampling/importance_sampling_ratio/min": 1.8367816210229648e-06, + "sampling/sampling_logp_difference/max": 13.20749568939209, + "sampling/sampling_logp_difference/mean": 0.019896289333701134, + "step": 131 + }, + { + "clip_ratio/high_max": 2.054391302408476e-05, + "clip_ratio/high_mean": 5.13597825602119e-06, + "clip_ratio/low_mean": 6.0949954104216886e-05, + "clip_ratio/low_min": 1.2865434428022127e-05, + "clip_ratio/region_mean": 6.608593298551568e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16267.0, + "completions/mean_length": 6679.9765625, + "completions/mean_terminated_length": 5946.05908203125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.8775574564933777, + "epoch": 0.12143514259429623, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0024929519277065992, + "learning_rate": 1e-05, + "loss": 0.1008, + "num_tokens": 102070058.0, + "reward": 0.3671875, + "reward_std": 0.41398313641548157, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998810291290283, + "sampling/importance_sampling_ratio/min": 0.004311627708375454, + "sampling/sampling_logp_difference/max": 5.446439743041992, + "sampling/sampling_logp_difference/mean": 0.018816513940691948, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.7019791250259004e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7019791250259004e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16345.0, + "completions/mean_length": 6549.0625, + "completions/mean_terminated_length": 6313.0244140625, + "completions/min_length": 1004.0, + "completions/min_terminated_length": 1004.0, + "entropy": 0.8732621371746063, + "epoch": 0.12235510579576817, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002134882379323244, + "learning_rate": 1e-05, + "loss": 0.0451, + "num_tokens": 102926522.0, + "reward": 0.3828125, + "reward_std": 0.28513264656066895, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000419616699219, + "sampling/importance_sampling_ratio/min": 0.0017044072737917304, + "sampling/sampling_logp_difference/max": 6.374537944793701, + "sampling/sampling_logp_difference/mean": 0.019951295107603073, + "step": 133 + }, + { + "clip_ratio/high_max": 3.6268677376938285e-06, + "clip_ratio/high_mean": 9.067169344234571e-07, + "clip_ratio/low_mean": 3.5008752547582844e-05, + "clip_ratio/low_min": 3.866736733471043e-06, + "clip_ratio/region_mean": 3.591546965253656e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16306.0, + "completions/mean_length": 6011.8359375, + "completions/mean_terminated_length": 5677.25, + "completions/min_length": 731.0, + "completions/min_terminated_length": 731.0, + "entropy": 0.9975898712873459, + "epoch": 0.12327506899724011, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0037468743976205587, + "learning_rate": 1e-05, + "loss": 0.0818, + "num_tokens": 103714277.0, + "reward": 0.359375, + "reward_std": 0.31116873025894165, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000693798065186, + "sampling/importance_sampling_ratio/min": 0.002192396903410554, + "sampling/sampling_logp_difference/max": 6.122759819030762, + "sampling/sampling_logp_difference/mean": 0.019433926790952682, + "step": 134 + }, + { + "clip_ratio/high_max": 2.6430232992424862e-05, + "clip_ratio/high_mean": 6.607558248106216e-06, + "clip_ratio/low_mean": 3.3786116432565905e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0393675021732633e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15027.0, + "completions/mean_length": 6270.203125, + "completions/mean_terminated_length": 6190.56689453125, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.7808161675930023, + "epoch": 0.12419503219871206, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035926424898207188, + "learning_rate": 1e-05, + "loss": 0.1162, + "num_tokens": 104537295.0, + "reward": 0.4921875, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999791383743286, + "sampling/importance_sampling_ratio/min": 0.00840076245367527, + "sampling/sampling_logp_difference/max": 4.779432773590088, + "sampling/sampling_logp_difference/mean": 0.017456334084272385, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.908255777991144e-05, + "clip_ratio/low_min": 7.643389835720882e-06, + "clip_ratio/region_mean": 4.908255777991144e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14565.0, + "completions/mean_length": 4916.25, + "completions/mean_terminated_length": 4734.22265625, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.8354851230978966, + "epoch": 0.125114995400184, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004259355366230011, + "learning_rate": 1e-05, + "loss": 0.0879, + "num_tokens": 105184551.0, + "reward": 0.4609375, + "reward_std": 0.3656175136566162, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000231266021729, + "sampling/importance_sampling_ratio/min": 0.003178094746544957, + "sampling/sampling_logp_difference/max": 5.751473426818848, + "sampling/sampling_logp_difference/mean": 0.01745998114347458, + "step": 136 + }, + { + "clip_ratio/high_max": 6.184750873217126e-06, + "clip_ratio/high_mean": 2.3343936845776625e-06, + "clip_ratio/low_mean": 3.130356230940379e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.363795599398145e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14083.0, + "completions/mean_length": 5317.515625, + "completions/mean_terminated_length": 5230.3779296875, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 0.9808826446533203, + "epoch": 0.12603495860165592, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021007952746003866, + "learning_rate": 1e-05, + "loss": -0.0037, + "num_tokens": 105889289.0, + "reward": 0.4296875, + "reward_std": 0.3151204586029053, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999961793422699, + "sampling/importance_sampling_ratio/min": 0.004087196197360754, + "sampling/sampling_logp_difference/max": 5.499896049499512, + "sampling/sampling_logp_difference/mean": 0.020308660343289375, + "step": 137 + }, + { + "clip_ratio/high_max": 6.264094281505095e-06, + "clip_ratio/high_mean": 1.5660235703762737e-06, + "clip_ratio/low_mean": 4.276942695469188e-05, + "clip_ratio/low_min": 5.777519618277438e-06, + "clip_ratio/region_mean": 4.4335450525068154e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16052.0, + "completions/mean_length": 7302.3671875, + "completions/mean_terminated_length": 6776.9833984375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.8526253402233124, + "epoch": 0.12695492180312787, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001218521734699607, + "learning_rate": 1e-05, + "loss": 0.0705, + "num_tokens": 106849048.0, + "reward": 0.28125, + "reward_std": 0.22331714630126953, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999129772186279, + "sampling/importance_sampling_ratio/min": 0.010783779434859753, + "sampling/sampling_logp_difference/max": 4.529712200164795, + "sampling/sampling_logp_difference/mean": 0.019228527322411537, + "step": 138 + }, + { + "clip_ratio/high_max": 1.1513777735672193e-05, + "clip_ratio/high_mean": 2.878444433918048e-06, + "clip_ratio/low_mean": 3.477262850992702e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7651072489097714e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14681.0, + "completions/mean_length": 4603.46875, + "completions/mean_terminated_length": 4510.70849609375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "entropy": 0.7025937959551811, + "epoch": 0.12787488500459981, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002826553536579013, + "learning_rate": 1e-05, + "loss": 0.1058, + "num_tokens": 107456676.0, + "reward": 0.625, + "reward_std": 0.35878273844718933, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999932050704956, + "sampling/importance_sampling_ratio/min": 0.0006447202758863568, + "sampling/sampling_logp_difference/max": 7.346693992614746, + "sampling/sampling_logp_difference/mean": 0.016313642263412476, + "step": 139 + }, + { + "clip_ratio/high_max": 4.341936346463626e-06, + "clip_ratio/high_mean": 1.0854840866159066e-06, + "clip_ratio/low_mean": 4.9752483846532414e-05, + "clip_ratio/low_min": 1.0369344636274036e-05, + "clip_ratio/region_mean": 5.083796850158251e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16081.0, + "completions/mean_length": 7055.921875, + "completions/mean_terminated_length": 6755.01611328125, + "completions/min_length": 824.0, + "completions/min_terminated_length": 824.0, + "entropy": 0.8677415996789932, + "epoch": 0.12879484820607176, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0015939075965434313, + "learning_rate": 1e-05, + "loss": 0.0647, + "num_tokens": 108380090.0, + "reward": 0.359375, + "reward_std": 0.2782978415489197, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999587535858154, + "sampling/importance_sampling_ratio/min": 0.007212483324110508, + "sampling/sampling_logp_difference/max": 4.931941986083984, + "sampling/sampling_logp_difference/mean": 0.019018646329641342, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.017062949264073e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.017062949264073e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15374.0, + "completions/mean_length": 6947.546875, + "completions/mean_terminated_length": 6563.951171875, + "completions/min_length": 578.0, + "completions/min_terminated_length": 578.0, + "entropy": 0.9537070691585541, + "epoch": 0.1297148114075437, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0014140130952000618, + "learning_rate": 1e-05, + "loss": 0.0685, + "num_tokens": 109288008.0, + "reward": 0.28125, + "reward_std": 0.35612428188323975, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532103538513, + "sampling/importance_sampling_ratio/min": 0.002557439962401986, + "sampling/sampling_logp_difference/max": 5.968748569488525, + "sampling/sampling_logp_difference/mean": 0.02024715766310692, + "step": 141 + }, + { + "clip_ratio/high_max": 1.4431375348067377e-05, + "clip_ratio/high_mean": 3.607843837016844e-06, + "clip_ratio/low_mean": 2.80186426380169e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.162648749821528e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16053.0, + "completions/mean_length": 5742.4140625, + "completions/mean_terminated_length": 5658.6220703125, + "completions/min_length": 952.0, + "completions/min_terminated_length": 952.0, + "entropy": 0.8954835087060928, + "epoch": 0.13063477460901565, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0012910671066492796, + "learning_rate": 1e-05, + "loss": 0.0939, + "num_tokens": 110041333.0, + "reward": 0.4375, + "reward_std": 0.26932865381240845, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000030279159546, + "sampling/importance_sampling_ratio/min": 2.282886634930037e-05, + "sampling/sampling_logp_difference/max": 10.687484741210938, + "sampling/sampling_logp_difference/mean": 0.017754144966602325, + "step": 142 + }, + { + "clip_ratio/high_max": 3.2560687031946145e-05, + "clip_ratio/high_mean": 9.421434697287623e-06, + "clip_ratio/low_mean": 2.801389479145655e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7435329431900755e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14702.0, + "completions/max_terminated_length": 14702.0, + "completions/mean_length": 5582.1640625, + "completions/mean_terminated_length": 5582.1640625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.9963158369064331, + "epoch": 0.13155473781048757, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002162793418392539, + "learning_rate": 1e-05, + "loss": 0.0158, + "num_tokens": 110775762.0, + "reward": 0.3359375, + "reward_std": 0.24329257011413574, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999851584434509, + "sampling/importance_sampling_ratio/min": 0.0010016339365392923, + "sampling/sampling_logp_difference/max": 6.90612268447876, + "sampling/sampling_logp_difference/mean": 0.020483866333961487, + "step": 143 + }, + { + "clip_ratio/high_max": 1.746983889461262e-05, + "clip_ratio/high_mean": 7.333224402827909e-06, + "clip_ratio/low_mean": 3.6373660350363934e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3706885207939195e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13832.0, + "completions/mean_length": 6047.8984375, + "completions/mean_terminated_length": 5883.83349609375, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.913147509098053, + "epoch": 0.13247470101195952, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00287337857298553, + "learning_rate": 1e-05, + "loss": 0.045, + "num_tokens": 111568589.0, + "reward": 0.4453125, + "reward_std": 0.3453328609466553, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999293088912964, + "sampling/importance_sampling_ratio/min": 9.964095625036862e-06, + "sampling/sampling_logp_difference/max": 11.516522407531738, + "sampling/sampling_logp_difference/mean": 0.018301380798220634, + "step": 144 + }, + { + "clip_ratio/high_max": 2.6439459361426998e-05, + "clip_ratio/high_mean": 6.6098648403567495e-06, + "clip_ratio/low_mean": 4.587054809235269e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.248041247796209e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14709.0, + "completions/mean_length": 6462.28125, + "completions/mean_terminated_length": 6224.16015625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "entropy": 1.1468544080853462, + "epoch": 0.13339466421343146, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017887315480038524, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 112414673.0, + "reward": 0.2734375, + "reward_std": 0.23592589795589447, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999889135360718, + "sampling/importance_sampling_ratio/min": 0.0007102306117303669, + "sampling/sampling_logp_difference/max": 7.249920845031738, + "sampling/sampling_logp_difference/mean": 0.021768372505903244, + "step": 145 + }, + { + "clip_ratio/high_max": 1.6320968370564515e-05, + "clip_ratio/high_mean": 5.031390969634231e-06, + "clip_ratio/low_mean": 3.567474152532668e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0706131812839885e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16126.0, + "completions/mean_length": 6897.0078125, + "completions/mean_terminated_length": 6822.30712890625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.9793258458375931, + "epoch": 0.1343146274149034, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022704254370182753, + "learning_rate": 1e-05, + "loss": 0.0423, + "num_tokens": 113321722.0, + "reward": 0.2890625, + "reward_std": 0.34297874569892883, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000591278076172, + "sampling/importance_sampling_ratio/min": 5.476621663547121e-05, + "sampling/sampling_logp_difference/max": 9.812437057495117, + "sampling/sampling_logp_difference/mean": 0.020364979282021523, + "step": 146 + }, + { + "clip_ratio/high_max": 8.64622779772617e-06, + "clip_ratio/high_mean": 2.1615569494315423e-06, + "clip_ratio/low_mean": 4.702959677160834e-05, + "clip_ratio/low_min": 6.21032540948363e-06, + "clip_ratio/region_mean": 4.9191153607353044e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15914.0, + "completions/mean_length": 6779.7421875, + "completions/mean_terminated_length": 6307.4013671875, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.9858463555574417, + "epoch": 0.13523459061637536, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022105660755187273, + "learning_rate": 1e-05, + "loss": 0.0222, + "num_tokens": 114210841.0, + "reward": 0.390625, + "reward_std": 0.3676722049713135, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999139308929443, + "sampling/importance_sampling_ratio/min": 4.2232295527355745e-06, + "sampling/sampling_logp_difference/max": 12.374910354614258, + "sampling/sampling_logp_difference/mean": 0.021493885666131973, + "step": 147 + }, + { + "clip_ratio/high_max": 9.080286417884054e-06, + "clip_ratio/high_mean": 2.2700716044710134e-06, + "clip_ratio/low_mean": 3.73501702597423e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9620241750526475e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15145.0, + "completions/mean_length": 6204.34375, + "completions/mean_terminated_length": 5960.0322265625, + "completions/min_length": 771.0, + "completions/min_terminated_length": 771.0, + "entropy": 0.9073990881443024, + "epoch": 0.13615455381784727, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021019333507865667, + "learning_rate": 1e-05, + "loss": 0.0985, + "num_tokens": 115023469.0, + "reward": 0.4375, + "reward_std": 0.305637001991272, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999035596847534, + "sampling/importance_sampling_ratio/min": 7.850129009057127e-07, + "sampling/sampling_logp_difference/max": 14.057565689086914, + "sampling/sampling_logp_difference/mean": 0.019073951989412308, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 7.07747756223398e-05, + "clip_ratio/low_min": 6.719346401951043e-06, + "clip_ratio/region_mean": 7.07747756223398e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14484.0, + "completions/mean_length": 6382.890625, + "completions/mean_terminated_length": 5891.0322265625, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "entropy": 0.8928572610020638, + "epoch": 0.13707451701931922, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002703179605305195, + "learning_rate": 1e-05, + "loss": 0.1215, + "num_tokens": 115860183.0, + "reward": 0.46875, + "reward_std": 0.3924228549003601, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999281764030457, + "sampling/importance_sampling_ratio/min": 0.002329134149476886, + "sampling/sampling_logp_difference/max": 6.062258720397949, + "sampling/sampling_logp_difference/mean": 0.018461842089891434, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.991344158293941e-05, + "clip_ratio/low_min": 4.287576302886009e-06, + "clip_ratio/region_mean": 3.991344158293941e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15934.0, + "completions/mean_length": 6856.25, + "completions/mean_terminated_length": 6387.671875, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "entropy": 0.9867237955331802, + "epoch": 0.13799448022079117, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0025988349225372076, + "learning_rate": 1e-05, + "loss": 0.0191, + "num_tokens": 116757023.0, + "reward": 0.34375, + "reward_std": 0.3322049677371979, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999343156814575, + "sampling/importance_sampling_ratio/min": 2.9312623155419715e-05, + "sampling/sampling_logp_difference/max": 10.437492370605469, + "sampling/sampling_logp_difference/mean": 0.019526638090610504, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.096957769661458e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.096957769661458e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15873.0, + "completions/mean_length": 6312.1328125, + "completions/mean_terminated_length": 5816.794921875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.8896873891353607, + "epoch": 0.1389144434222631, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0036364132538437843, + "learning_rate": 1e-05, + "loss": 0.0579, + "num_tokens": 117584064.0, + "reward": 0.2578125, + "reward_std": 0.3090519309043884, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998835325241089, + "sampling/importance_sampling_ratio/min": 0.0009706970304250717, + "sampling/sampling_logp_difference/max": 6.937496185302734, + "sampling/sampling_logp_difference/mean": 0.019127443432807922, + "step": 151 + }, + { + "clip_ratio/high_max": 3.0199071261449717e-06, + "clip_ratio/high_mean": 7.549767815362429e-07, + "clip_ratio/low_mean": 4.133729697741728e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.20922739863272e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16279.0, + "completions/max_terminated_length": 16279.0, + "completions/mean_length": 5875.625, + "completions/mean_terminated_length": 5875.625, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "entropy": 0.9082999676465988, + "epoch": 0.13983440662373506, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0025688125751912594, + "learning_rate": 1e-05, + "loss": 0.0737, + "num_tokens": 118354672.0, + "reward": 0.453125, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999657273292542, + "sampling/importance_sampling_ratio/min": 0.0024201429914683104, + "sampling/sampling_logp_difference/max": 6.023928642272949, + "sampling/sampling_logp_difference/mean": 0.019491348415613174, + "step": 152 + }, + { + "clip_ratio/high_max": 5.6563644648122136e-06, + "clip_ratio/high_mean": 1.4140911162030534e-06, + "clip_ratio/low_mean": 4.235651454109757e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.377060565730062e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13490.0, + "completions/mean_length": 6524.6015625, + "completions/mean_terminated_length": 6123.81298828125, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "entropy": 0.9052172750234604, + "epoch": 0.140754369825207, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0026063446421176195, + "learning_rate": 1e-05, + "loss": 0.0126, + "num_tokens": 119210997.0, + "reward": 0.2109375, + "reward_std": 0.23751860857009888, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999611377716064, + "sampling/importance_sampling_ratio/min": 8.774310117587447e-06, + "sampling/sampling_logp_difference/max": 11.643682479858398, + "sampling/sampling_logp_difference/mean": 0.019871948286890984, + "step": 153 + }, + { + "clip_ratio/high_max": 2.8274008855078137e-05, + "clip_ratio/high_mean": 7.068502213769534e-06, + "clip_ratio/low_mean": 5.824237177876057e-05, + "clip_ratio/low_min": 9.362729997519637e-06, + "clip_ratio/region_mean": 6.531087387884327e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14731.0, + "completions/mean_length": 6606.34375, + "completions/mean_terminated_length": 6208.8779296875, + "completions/min_length": 1123.0, + "completions/min_terminated_length": 1123.0, + "entropy": 0.923908606171608, + "epoch": 0.14167433302667892, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002111563691869378, + "learning_rate": 1e-05, + "loss": 0.0834, + "num_tokens": 120076777.0, + "reward": 0.3359375, + "reward_std": 0.32879000902175903, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999362230300903, + "sampling/importance_sampling_ratio/min": 7.220578579492098e-10, + "sampling/sampling_logp_difference/max": 21.04891586303711, + "sampling/sampling_logp_difference/mean": 0.01944371685385704, + "step": 154 + }, + { + "clip_ratio/high_max": 2.226728611276485e-05, + "clip_ratio/high_mean": 6.534373824251816e-06, + "clip_ratio/low_mean": 2.137331728135905e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7907691105610866e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16118.0, + "completions/mean_length": 7156.2578125, + "completions/mean_terminated_length": 6934.79248046875, + "completions/min_length": 516.0, + "completions/min_terminated_length": 516.0, + "entropy": 1.0026871338486671, + "epoch": 0.14259429622815087, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002556675113737583, + "learning_rate": 1e-05, + "loss": 0.034, + "num_tokens": 121013298.0, + "reward": 0.2890625, + "reward_std": 0.26013973355293274, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999322295188904, + "sampling/importance_sampling_ratio/min": 1.3007297638978343e-05, + "sampling/sampling_logp_difference/max": 11.25, + "sampling/sampling_logp_difference/mean": 0.02018606849014759, + "step": 155 + }, + { + "clip_ratio/high_max": 9.798196060728515e-06, + "clip_ratio/high_mean": 2.4495490151821286e-06, + "clip_ratio/low_mean": 6.042695122232544e-05, + "clip_ratio/low_min": 1.0388962436991278e-05, + "clip_ratio/region_mean": 6.287649966907338e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15184.0, + "completions/mean_length": 6177.3828125, + "completions/mean_terminated_length": 5848.13671875, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.7995355725288391, + "epoch": 0.14351425942962281, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0032885256223380566, + "learning_rate": 1e-05, + "loss": 0.0569, + "num_tokens": 121820851.0, + "reward": 0.4609375, + "reward_std": 0.35141900181770325, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999898672103882, + "sampling/importance_sampling_ratio/min": 1.3007570487388875e-05, + "sampling/sampling_logp_difference/max": 11.249979019165039, + "sampling/sampling_logp_difference/mean": 0.018013037741184235, + "step": 156 + }, + { + "clip_ratio/high_max": 1.836798173826537e-05, + "clip_ratio/high_mean": 4.591995434566343e-06, + "clip_ratio/low_mean": 5.0241384542459855e-05, + "clip_ratio/low_min": 7.033341489659506e-06, + "clip_ratio/region_mean": 5.483338100020774e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15941.0, + "completions/mean_length": 6033.359375, + "completions/mean_terminated_length": 5612.6015625, + "completions/min_length": 551.0, + "completions/min_terminated_length": 551.0, + "entropy": 0.8770530596375465, + "epoch": 0.14443422263109476, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0035782051272690296, + "learning_rate": 1e-05, + "loss": 0.1015, + "num_tokens": 122615329.0, + "reward": 0.421875, + "reward_std": 0.3253750801086426, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000176429748535, + "sampling/importance_sampling_ratio/min": 8.344570233020931e-05, + "sampling/sampling_logp_difference/max": 9.391314506530762, + "sampling/sampling_logp_difference/mean": 0.018681444227695465, + "step": 157 + }, + { + "clip_ratio/high_max": 1.2653852763833129e-05, + "clip_ratio/high_mean": 4.80866970065108e-06, + "clip_ratio/low_mean": 3.11289915089219e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.593766109588614e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14860.0, + "completions/mean_length": 8237.46875, + "completions/mean_terminated_length": 7974.67724609375, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "entropy": 0.9543669074773788, + "epoch": 0.1453541858325667, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0026586023159325123, + "learning_rate": 1e-05, + "loss": 0.019, + "num_tokens": 123688709.0, + "reward": 0.328125, + "reward_std": 0.30327308177948, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999228119850159, + "sampling/importance_sampling_ratio/min": 0.00017198453133460134, + "sampling/sampling_logp_difference/max": 8.668106079101562, + "sampling/sampling_logp_difference/mean": 0.020768223330378532, + "step": 158 + }, + { + "clip_ratio/high_max": 4.32630758950836e-06, + "clip_ratio/high_mean": 1.08157689737709e-06, + "clip_ratio/low_mean": 3.721513610344118e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.829671300081827e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15920.0, + "completions/mean_length": 6649.1015625, + "completions/mean_terminated_length": 6000.10888671875, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.8519875407218933, + "epoch": 0.14627414903403863, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0028182135429233313, + "learning_rate": 1e-05, + "loss": 0.0528, + "num_tokens": 124557298.0, + "reward": 0.4140625, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999065399169922, + "sampling/importance_sampling_ratio/min": 6.050919910194352e-05, + "sampling/sampling_logp_difference/max": 9.712715148925781, + "sampling/sampling_logp_difference/mean": 0.019195500761270523, + "step": 159 + }, + { + "clip_ratio/high_max": 9.812353937377338e-06, + "clip_ratio/high_mean": 2.4530884843443346e-06, + "clip_ratio/low_mean": 1.864515820670931e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1098246747897065e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14946.0, + "completions/mean_length": 6262.125, + "completions/mean_terminated_length": 5587.33349609375, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "entropy": 0.9227473363280296, + "epoch": 0.14719411223551057, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0018271139124408364, + "learning_rate": 1e-05, + "loss": 0.0162, + "num_tokens": 125378002.0, + "reward": 0.421875, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998780488967896, + "sampling/importance_sampling_ratio/min": 1.1365813179509132e-06, + "sampling/sampling_logp_difference/max": 13.687485694885254, + "sampling/sampling_logp_difference/mean": 0.018991345539689064, + "step": 160 + }, + { + "clip_ratio/high_max": 1.976754219867871e-05, + "clip_ratio/high_mean": 5.881085598957725e-06, + "clip_ratio/low_mean": 4.014476598968031e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6025852043385385e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16334.0, + "completions/mean_length": 6543.2734375, + "completions/mean_terminated_length": 6465.78759765625, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "entropy": 0.9931852892041206, + "epoch": 0.14811407543698252, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0028531099669635296, + "learning_rate": 1e-05, + "loss": 0.0306, + "num_tokens": 126236133.0, + "reward": 0.2734375, + "reward_std": 0.3148259222507477, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000286102294922, + "sampling/importance_sampling_ratio/min": 1.9964969396824017e-05, + "sampling/sampling_logp_difference/max": 10.821531295776367, + "sampling/sampling_logp_difference/mean": 0.020335232838988304, + "step": 161 + }, + { + "clip_ratio/high_max": 2.1589371499430854e-05, + "clip_ratio/high_mean": 8.165637723323016e-06, + "clip_ratio/low_mean": 6.554757646881626e-05, + "clip_ratio/low_min": 5.570906523644226e-06, + "clip_ratio/region_mean": 7.371321362370509e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13107.0, + "completions/mean_length": 5567.2890625, + "completions/mean_terminated_length": 5482.1181640625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9842768535017967, + "epoch": 0.14903403863845446, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017482106341049075, + "learning_rate": 1e-05, + "loss": 0.0019, + "num_tokens": 126974666.0, + "reward": 0.25, + "reward_std": 0.30904704332351685, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999868631362915, + "sampling/importance_sampling_ratio/min": 0.011517977342009544, + "sampling/sampling_logp_difference/max": 4.463846206665039, + "sampling/sampling_logp_difference/mean": 0.020022576674818993, + "step": 162 + }, + { + "clip_ratio/high_max": 1.0515780559217092e-05, + "clip_ratio/high_mean": 2.628945139804273e-06, + "clip_ratio/low_mean": 5.164334470464382e-05, + "clip_ratio/low_min": 3.369817250131746e-06, + "clip_ratio/region_mean": 5.427229007182177e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14865.0, + "completions/mean_length": 7232.6328125, + "completions/mean_terminated_length": 6937.42724609375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9599866047501564, + "epoch": 0.1499540018399264, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001637064153328538, + "learning_rate": 1e-05, + "loss": 0.0918, + "num_tokens": 127921331.0, + "reward": 0.3671875, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000075101852417, + "sampling/importance_sampling_ratio/min": 0.00023060032981447875, + "sampling/sampling_logp_difference/max": 8.374824523925781, + "sampling/sampling_logp_difference/mean": 0.01991824433207512, + "step": 163 + }, + { + "clip_ratio/high_max": 1.7373587070323993e-05, + "clip_ratio/high_mean": 4.343396767580998e-06, + "clip_ratio/low_mean": 2.182850187182339e-05, + "clip_ratio/low_min": 4.473072294786107e-06, + "clip_ratio/region_mean": 2.6171898525717552e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15075.0, + "completions/max_terminated_length": 15075.0, + "completions/mean_length": 4948.546875, + "completions/mean_terminated_length": 4948.546875, + "completions/min_length": 609.0, + "completions/min_terminated_length": 609.0, + "entropy": 0.9903113394975662, + "epoch": 0.15087396504139836, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00162114470731467, + "learning_rate": 1e-05, + "loss": 0.0172, + "num_tokens": 128575785.0, + "reward": 0.4140625, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999828040599823, + "sampling/importance_sampling_ratio/min": 3.263082589910482e-06, + "sampling/sampling_logp_difference/max": 12.632838249206543, + "sampling/sampling_logp_difference/mean": 0.019144343212246895, + "step": 164 + }, + { + "clip_ratio/high_max": 1.2063027497788426e-05, + "clip_ratio/high_mean": 4.366232360553113e-06, + "clip_ratio/low_mean": 3.965049324961001e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4016725382789446e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16265.0, + "completions/mean_length": 6205.234375, + "completions/mean_terminated_length": 6125.08642578125, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 0.9164782017469406, + "epoch": 0.15179392824287027, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0021650632843375206, + "learning_rate": 1e-05, + "loss": 0.0544, + "num_tokens": 129389191.0, + "reward": 0.5078125, + "reward_std": 0.3214311897754669, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99993896484375, + "sampling/importance_sampling_ratio/min": 0.0009118906455114484, + "sampling/sampling_logp_difference/max": 6.999990463256836, + "sampling/sampling_logp_difference/mean": 0.01929439604282379, + "step": 165 + }, + { + "clip_ratio/high_max": 2.6859754598262953e-05, + "clip_ratio/high_mean": 6.714938649565738e-06, + "clip_ratio/low_mean": 1.6451138890261063e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.31660775398268e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15741.0, + "completions/max_terminated_length": 15741.0, + "completions/mean_length": 4911.25, + "completions/mean_terminated_length": 4911.25, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.9057909473776817, + "epoch": 0.15271389144434222, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019606768619269133, + "learning_rate": 1e-05, + "loss": 0.062, + "num_tokens": 130036711.0, + "reward": 0.296875, + "reward_std": 0.26303553581237793, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999849796295166, + "sampling/importance_sampling_ratio/min": 0.0008691518451087177, + "sampling/sampling_logp_difference/max": 7.047992706298828, + "sampling/sampling_logp_difference/mean": 0.020085586234927177, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.847699741119868e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.847699741119868e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15152.0, + "completions/mean_length": 6222.0859375, + "completions/mean_terminated_length": 5978.2001953125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.102900318801403, + "epoch": 0.15363385464581417, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0013436009176075459, + "learning_rate": 1e-05, + "loss": 0.0116, + "num_tokens": 130854714.0, + "reward": 0.21875, + "reward_std": 0.1825428307056427, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999322891235352, + "sampling/importance_sampling_ratio/min": 3.319984534755349e-05, + "sampling/sampling_logp_difference/max": 10.312965393066406, + "sampling/sampling_logp_difference/mean": 0.02261950448155403, + "step": 167 + }, + { + "clip_ratio/high_max": 1.0113483313034521e-05, + "clip_ratio/high_mean": 3.4217127904412337e-06, + "clip_ratio/low_mean": 3.916404375559068e-05, + "clip_ratio/low_min": 4.7332350732176565e-06, + "clip_ratio/region_mean": 4.258575745552662e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15274.0, + "completions/mean_length": 6490.7734375, + "completions/mean_terminated_length": 6333.73828125, + "completions/min_length": 613.0, + "completions/min_terminated_length": 613.0, + "entropy": 0.9576810225844383, + "epoch": 0.1545538178472861, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025689650792628527, + "learning_rate": 1e-05, + "loss": 0.0914, + "num_tokens": 131703429.0, + "reward": 0.3515625, + "reward_std": 0.3385029733181, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999434947967529, + "sampling/importance_sampling_ratio/min": 0.00037599547067657113, + "sampling/sampling_logp_difference/max": 7.8859333992004395, + "sampling/sampling_logp_difference/mean": 0.01931593380868435, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.780203212500055e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.780203212500055e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14897.0, + "completions/mean_length": 6957.453125, + "completions/mean_terminated_length": 6653.37060546875, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "entropy": 0.9904302433133125, + "epoch": 0.15547378104875806, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002132449997588992, + "learning_rate": 1e-05, + "loss": 0.0848, + "num_tokens": 132614583.0, + "reward": 0.34375, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999384880065918, + "sampling/importance_sampling_ratio/min": 9.969094350026353e-08, + "sampling/sampling_logp_difference/max": 16.121191024780273, + "sampling/sampling_logp_difference/mean": 0.019748074933886528, + "step": 169 + }, + { + "clip_ratio/high_max": 1.6620725091343047e-05, + "clip_ratio/high_mean": 6.429913469219173e-06, + "clip_ratio/low_mean": 6.847188262781856e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.49017954149167e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15031.0, + "completions/mean_length": 6781.3828125, + "completions/mean_terminated_length": 6391.0322265625, + "completions/min_length": 651.0, + "completions/min_terminated_length": 651.0, + "entropy": 0.7702180370688438, + "epoch": 0.15639374425023, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0037141458597034216, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 133500672.0, + "reward": 0.4140625, + "reward_std": 0.39294689893722534, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999319314956665, + "sampling/importance_sampling_ratio/min": 0.0015879785642027855, + "sampling/sampling_logp_difference/max": 6.445293426513672, + "sampling/sampling_logp_difference/mean": 0.017618997022509575, + "step": 170 + }, + { + "clip_ratio/high_max": 8.414747526330757e-06, + "clip_ratio/high_mean": 2.1036868815826892e-06, + "clip_ratio/low_mean": 2.6748189156933222e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8851876209046168e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16383.0, + "completions/mean_length": 7167.6953125, + "completions/mean_terminated_length": 7095.1259765625, + "completions/min_length": 569.0, + "completions/min_terminated_length": 569.0, + "entropy": 1.0333677157759666, + "epoch": 0.15731370745170192, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021144442725926638, + "learning_rate": 1e-05, + "loss": 0.0466, + "num_tokens": 134437361.0, + "reward": 0.3046875, + "reward_std": 0.24671243131160736, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999521970748901, + "sampling/importance_sampling_ratio/min": 0.0020202873274683952, + "sampling/sampling_logp_difference/max": 6.20451545715332, + "sampling/sampling_logp_difference/mean": 0.021626941859722137, + "step": 171 + }, + { + "clip_ratio/high_max": 7.359868050116347e-06, + "clip_ratio/high_mean": 1.8399670125290868e-06, + "clip_ratio/low_mean": 3.642534238679218e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.826530939932127e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15035.0, + "completions/mean_length": 5934.9453125, + "completions/mean_terminated_length": 5684.16845703125, + "completions/min_length": 496.0, + "completions/min_terminated_length": 496.0, + "entropy": 0.8884351700544357, + "epoch": 0.15823367065317387, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0025075129233300686, + "learning_rate": 1e-05, + "loss": 0.06, + "num_tokens": 135215690.0, + "reward": 0.5078125, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000145435333252, + "sampling/importance_sampling_ratio/min": 8.12270229744172e-07, + "sampling/sampling_logp_difference/max": 14.023432731628418, + "sampling/sampling_logp_difference/mean": 0.018633443862199783, + "step": 172 + }, + { + "clip_ratio/high_max": 6.931506504770368e-06, + "clip_ratio/high_mean": 1.732876626192592e-06, + "clip_ratio/low_mean": 6.461201871843514e-05, + "clip_ratio/low_min": 9.272769602830522e-06, + "clip_ratio/region_mean": 6.634489625412243e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16331.0, + "completions/mean_length": 7267.296875, + "completions/mean_terminated_length": 7048.49609375, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "entropy": 1.072906270623207, + "epoch": 0.15915363385464582, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0023191061336547136, + "learning_rate": 1e-05, + "loss": 0.1216, + "num_tokens": 136165880.0, + "reward": 0.3046875, + "reward_std": 0.3400956988334656, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999949932098389, + "sampling/importance_sampling_ratio/min": 8.937300299294293e-05, + "sampling/sampling_logp_difference/max": 9.322691917419434, + "sampling/sampling_logp_difference/mean": 0.02122514694929123, + "step": 173 + }, + { + "clip_ratio/high_max": 7.245442930070567e-06, + "clip_ratio/high_mean": 1.8113607325176417e-06, + "clip_ratio/low_mean": 5.239449455984868e-05, + "clip_ratio/low_min": 7.146442158045829e-06, + "clip_ratio/region_mean": 5.420585534920974e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16230.0, + "completions/mean_length": 7433.1640625, + "completions/mean_terminated_length": 7362.68505859375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "entropy": 1.0957217290997505, + "epoch": 0.16007359705611776, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029631280340254307, + "learning_rate": 1e-05, + "loss": 0.0352, + "num_tokens": 137140413.0, + "reward": 0.265625, + "reward_std": 0.28749164938926697, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999362230300903, + "sampling/importance_sampling_ratio/min": 0.0086804935708642, + "sampling/sampling_logp_difference/max": 4.746676921844482, + "sampling/sampling_logp_difference/mean": 0.022480733692646027, + "step": 174 + }, + { + "clip_ratio/high_max": 6.239364211069187e-06, + "clip_ratio/high_mean": 1.5598410527672968e-06, + "clip_ratio/low_mean": 3.690561521807467e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.846545632768539e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15985.0, + "completions/mean_length": 7073.90625, + "completions/mean_terminated_length": 6926.12744140625, + "completions/min_length": 1398.0, + "completions/min_terminated_length": 1398.0, + "entropy": 0.9333122596144676, + "epoch": 0.1609935602575897, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.000832411227747798, + "learning_rate": 1e-05, + "loss": 0.0312, + "num_tokens": 138064537.0, + "reward": 0.3671875, + "reward_std": 0.13888052105903625, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998854994773865, + "sampling/importance_sampling_ratio/min": 0.0002638234291225672, + "sampling/sampling_logp_difference/max": 8.240230560302734, + "sampling/sampling_logp_difference/mean": 0.019753674045205116, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.8504628946611774e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8504628946611774e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15719.0, + "completions/mean_length": 5680.59375, + "completions/mean_terminated_length": 5596.31494140625, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "entropy": 0.9720541462302208, + "epoch": 0.16191352345906163, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002570893382653594, + "learning_rate": 1e-05, + "loss": 0.0289, + "num_tokens": 138809293.0, + "reward": 0.3515625, + "reward_std": 0.3703257441520691, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000011920928955, + "sampling/importance_sampling_ratio/min": 1.1064497584811761e-07, + "sampling/sampling_logp_difference/max": 16.016939163208008, + "sampling/sampling_logp_difference/mean": 0.019471734762191772, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.860648109821341e-05, + "clip_ratio/low_min": 6.799404218327254e-06, + "clip_ratio/region_mean": 3.860648109821341e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15983.0, + "completions/mean_length": 8024.34375, + "completions/mean_terminated_length": 7540.72705078125, + "completions/min_length": 640.0, + "completions/min_terminated_length": 640.0, + "entropy": 1.0136078596115112, + "epoch": 0.16283348666053357, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017353243893012404, + "learning_rate": 1e-05, + "loss": 0.0753, + "num_tokens": 139856281.0, + "reward": 0.3046875, + "reward_std": 0.2551271915435791, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999872446060181, + "sampling/importance_sampling_ratio/min": 0.0012184304650872946, + "sampling/sampling_logp_difference/max": 6.71019172668457, + "sampling/sampling_logp_difference/mean": 0.021411728113889694, + "step": 177 + }, + { + "clip_ratio/high_max": 2.0505477323240484e-05, + "clip_ratio/high_mean": 5.126369330810121e-06, + "clip_ratio/low_mean": 5.543978954847262e-05, + "clip_ratio/low_min": 6.273411372603732e-06, + "clip_ratio/region_mean": 6.056615916349983e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15508.0, + "completions/mean_length": 7543.96875, + "completions/mean_terminated_length": 7032.5615234375, + "completions/min_length": 747.0, + "completions/min_terminated_length": 747.0, + "entropy": 0.9921196177601814, + "epoch": 0.16375344986200552, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019490106496959925, + "learning_rate": 1e-05, + "loss": 0.0702, + "num_tokens": 140843861.0, + "reward": 0.296875, + "reward_std": 0.34717273712158203, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999728202819824, + "sampling/importance_sampling_ratio/min": 0.002482798881828785, + "sampling/sampling_logp_difference/max": 5.998368740081787, + "sampling/sampling_logp_difference/mean": 0.020561274141073227, + "step": 178 + }, + { + "clip_ratio/high_max": 2.1780562747153454e-05, + "clip_ratio/high_mean": 7.637661838089116e-06, + "clip_ratio/low_mean": 5.0004296554106986e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.76419583921961e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16320.0, + "completions/max_terminated_length": 16320.0, + "completions/mean_length": 6285.1796875, + "completions/mean_terminated_length": 6285.1796875, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 0.8724544793367386, + "epoch": 0.16467341306347746, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0027221282944083214, + "learning_rate": 1e-05, + "loss": 0.0227, + "num_tokens": 141666372.0, + "reward": 0.3984375, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999271631240845, + "sampling/importance_sampling_ratio/min": 0.0001951520098373294, + "sampling/sampling_logp_difference/max": 8.541731834411621, + "sampling/sampling_logp_difference/mean": 0.01924072578549385, + "step": 179 + }, + { + "clip_ratio/high_max": 1.2773067282978445e-05, + "clip_ratio/high_mean": 3.1932668207446113e-06, + "clip_ratio/low_mean": 5.425560334515467e-05, + "clip_ratio/low_min": 8.365065696125384e-06, + "clip_ratio/region_mean": 5.744886925640458e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16314.0, + "completions/mean_length": 7659.6796875, + "completions/mean_terminated_length": 7230.6142578125, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "entropy": 0.9285296350717545, + "epoch": 0.1655933762649494, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0016997806960716844, + "learning_rate": 1e-05, + "loss": 0.0352, + "num_tokens": 142665635.0, + "reward": 0.328125, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000026226043701, + "sampling/importance_sampling_ratio/min": 1.8975185867020627e-07, + "sampling/sampling_logp_difference/max": 15.477548599243164, + "sampling/sampling_logp_difference/mean": 0.020274491980671883, + "step": 180 + }, + { + "clip_ratio/high_max": 2.486542780388845e-05, + "clip_ratio/high_mean": 6.216356950972113e-06, + "clip_ratio/low_mean": 3.3204854901214276e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9421211965873226e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14834.0, + "completions/max_terminated_length": 14834.0, + "completions/mean_length": 5331.03125, + "completions/mean_terminated_length": 5331.03125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.7720941603183746, + "epoch": 0.16651333946642136, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030591271352022886, + "learning_rate": 1e-05, + "loss": -0.0544, + "num_tokens": 143364919.0, + "reward": 0.5390625, + "reward_std": 0.2680353820323944, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000004768371582, + "sampling/importance_sampling_ratio/min": 2.998966630585187e-09, + "sampling/sampling_logp_difference/max": 19.624998092651367, + "sampling/sampling_logp_difference/mean": 0.01690140925347805, + "step": 181 + }, + { + "clip_ratio/high_max": 1.0562233001110144e-05, + "clip_ratio/high_mean": 3.6131090155322454e-06, + "clip_ratio/low_mean": 5.028249574934307e-05, + "clip_ratio/low_min": 3.0328762932185782e-06, + "clip_ratio/region_mean": 5.3895605788056855e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15895.0, + "completions/mean_length": 7086.65625, + "completions/mean_terminated_length": 6708.71533203125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.8584504351019859, + "epoch": 0.16743330266789327, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0015365247381851077, + "learning_rate": 1e-05, + "loss": 0.0465, + "num_tokens": 144293867.0, + "reward": 0.2578125, + "reward_std": 0.3440523147583008, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998915791511536, + "sampling/importance_sampling_ratio/min": 0.00015850062482059002, + "sampling/sampling_logp_difference/max": 8.749752044677734, + "sampling/sampling_logp_difference/mean": 0.019430743530392647, + "step": 182 + }, + { + "clip_ratio/high_max": 6.546216354763601e-06, + "clip_ratio/high_mean": 1.6365540886909002e-06, + "clip_ratio/low_mean": 3.201156800969329e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.364812232575787e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16118.0, + "completions/mean_length": 5455.6484375, + "completions/mean_terminated_length": 5369.5986328125, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "entropy": 0.8517125397920609, + "epoch": 0.16835326586936522, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003156432416290045, + "learning_rate": 1e-05, + "loss": 0.0352, + "num_tokens": 145013318.0, + "reward": 0.390625, + "reward_std": 0.25726157426834106, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000219345092773, + "sampling/importance_sampling_ratio/min": 0.10733240842819214, + "sampling/sampling_logp_difference/max": 2.2318246364593506, + "sampling/sampling_logp_difference/mean": 0.01860412396490574, + "step": 183 + }, + { + "clip_ratio/high_max": 4.192453593532264e-05, + "clip_ratio/high_mean": 1.196126476088466e-05, + "clip_ratio/low_mean": 4.6358243707800284e-05, + "clip_ratio/low_min": 5.576871444645803e-06, + "clip_ratio/region_mean": 5.8319507388659986e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15806.0, + "completions/mean_length": 6670.2890625, + "completions/mean_terminated_length": 6192.5654296875, + "completions/min_length": 795.0, + "completions/min_terminated_length": 795.0, + "entropy": 0.8807757273316383, + "epoch": 0.16927322907083717, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028573600575327873, + "learning_rate": 1e-05, + "loss": 0.1163, + "num_tokens": 145886291.0, + "reward": 0.46875, + "reward_std": 0.38269224762916565, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999493360519409, + "sampling/importance_sampling_ratio/min": 0.0006086408975534141, + "sampling/sampling_logp_difference/max": 7.404282093048096, + "sampling/sampling_logp_difference/mean": 0.01879466325044632, + "step": 184 + }, + { + "clip_ratio/high_max": 5.954649168415926e-06, + "clip_ratio/high_mean": 1.4886622921039816e-06, + "clip_ratio/low_mean": 2.10815471746173e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.257020946672128e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12881.0, + "completions/max_terminated_length": 12881.0, + "completions/mean_length": 5849.8359375, + "completions/mean_terminated_length": 5849.8359375, + "completions/min_length": 509.0, + "completions/min_terminated_length": 509.0, + "entropy": 0.879327155649662, + "epoch": 0.1701931922723091, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0028504019137471914, + "learning_rate": 1e-05, + "loss": 0.0731, + "num_tokens": 146658174.0, + "reward": 0.4140625, + "reward_std": 0.2596206068992615, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999953508377075, + "sampling/importance_sampling_ratio/min": 0.0004885811940766871, + "sampling/sampling_logp_difference/max": 7.62400484085083, + "sampling/sampling_logp_difference/mean": 0.019282957538962364, + "step": 185 + }, + { + "clip_ratio/high_max": 1.0011702670453815e-05, + "clip_ratio/high_mean": 3.558776029422006e-06, + "clip_ratio/low_mean": 2.338160857107141e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.694038448680658e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15885.0, + "completions/mean_length": 6376.7578125, + "completions/mean_terminated_length": 6297.96044921875, + "completions/min_length": 527.0, + "completions/min_terminated_length": 527.0, + "entropy": 1.0437361896038055, + "epoch": 0.17111315547378106, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0026664668694138527, + "learning_rate": 1e-05, + "loss": 0.0443, + "num_tokens": 147494367.0, + "reward": 0.25, + "reward_std": 0.250127375125885, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999197721481323, + "sampling/importance_sampling_ratio/min": 5.43163696420379e-06, + "sampling/sampling_logp_difference/max": 12.123270034790039, + "sampling/sampling_logp_difference/mean": 0.020121946930885315, + "step": 186 + }, + { + "clip_ratio/high_max": 4.071263447258389e-06, + "clip_ratio/high_mean": 1.0178158618145972e-06, + "clip_ratio/low_mean": 5.679830292137922e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.781611889688065e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15314.0, + "completions/max_terminated_length": 15314.0, + "completions/mean_length": 6753.0390625, + "completions/mean_terminated_length": 6753.0390625, + "completions/min_length": 611.0, + "completions/min_terminated_length": 611.0, + "entropy": 0.8704448491334915, + "epoch": 0.17203311867525298, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0013236560625955462, + "learning_rate": 1e-05, + "loss": 0.042, + "num_tokens": 148377476.0, + "reward": 0.390625, + "reward_std": 0.32719242572784424, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999928891658783, + "sampling/importance_sampling_ratio/min": 0.0005196586716920137, + "sampling/sampling_logp_difference/max": 7.562338352203369, + "sampling/sampling_logp_difference/mean": 0.019745871424674988, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.1118761626203195e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1118761626203195e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14670.0, + "completions/mean_length": 6334.5625, + "completions/mean_terminated_length": 6255.43310546875, + "completions/min_length": 835.0, + "completions/min_terminated_length": 835.0, + "entropy": 0.9675566852092743, + "epoch": 0.17295308187672492, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003227849490940571, + "learning_rate": 1e-05, + "loss": 0.0365, + "num_tokens": 149213140.0, + "reward": 0.265625, + "reward_std": 0.22331714630126953, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999805092811584, + "sampling/importance_sampling_ratio/min": 2.0039660739712417e-06, + "sampling/sampling_logp_difference/max": 13.120382308959961, + "sampling/sampling_logp_difference/mean": 0.02062838338315487, + "step": 188 + }, + { + "clip_ratio/high_max": 2.159174937332864e-05, + "clip_ratio/high_mean": 7.343517381741549e-06, + "clip_ratio/low_mean": 2.7624131234915694e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.496764873034408e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15878.0, + "completions/mean_length": 5986.3125, + "completions/mean_terminated_length": 5650.90283203125, + "completions/min_length": 482.0, + "completions/min_terminated_length": 482.0, + "entropy": 0.9257830232381821, + "epoch": 0.17387304507819687, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023177729453891516, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 149998732.0, + "reward": 0.4375, + "reward_std": 0.32589423656463623, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000395774841309, + "sampling/importance_sampling_ratio/min": 0.00015848006296437234, + "sampling/sampling_logp_difference/max": 8.749881744384766, + "sampling/sampling_logp_difference/mean": 0.018431315198540688, + "step": 189 + }, + { + "clip_ratio/high_max": 1.0338640322515857e-05, + "clip_ratio/high_mean": 2.5846600806289644e-06, + "clip_ratio/low_mean": 4.149641688400152e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.408107668041339e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15871.0, + "completions/mean_length": 7341.390625, + "completions/mean_terminated_length": 7049.693359375, + "completions/min_length": 789.0, + "completions/min_terminated_length": 789.0, + "entropy": 0.9617493599653244, + "epoch": 0.17479300827966882, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001992360921576619, + "learning_rate": 1e-05, + "loss": 0.0342, + "num_tokens": 150958414.0, + "reward": 0.2890625, + "reward_std": 0.29119330644607544, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999643564224243, + "sampling/importance_sampling_ratio/min": 0.0011714966967701912, + "sampling/sampling_logp_difference/max": 6.7494730949401855, + "sampling/sampling_logp_difference/mean": 0.02040865272283554, + "step": 190 + }, + { + "clip_ratio/high_max": 1.402321640853188e-05, + "clip_ratio/high_mean": 4.2662558144002105e-06, + "clip_ratio/low_mean": 4.847697437071474e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.274322995774128e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15966.0, + "completions/mean_length": 6194.53125, + "completions/mean_terminated_length": 5605.0576171875, + "completions/min_length": 1022.0, + "completions/min_terminated_length": 1022.0, + "entropy": 0.7917485684156418, + "epoch": 0.17571297148114076, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002244317904114723, + "learning_rate": 1e-05, + "loss": 0.042, + "num_tokens": 151770450.0, + "reward": 0.46875, + "reward_std": 0.29432153701782227, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999660849571228, + "sampling/importance_sampling_ratio/min": 0.0007107177516445518, + "sampling/sampling_logp_difference/max": 7.249235153198242, + "sampling/sampling_logp_difference/mean": 0.016992967575788498, + "step": 191 + }, + { + "clip_ratio/high_max": 1.0843792097148253e-05, + "clip_ratio/high_mean": 2.710948024287063e-06, + "clip_ratio/low_mean": 5.327871485860669e-05, + "clip_ratio/low_min": 8.019090955713182e-06, + "clip_ratio/region_mean": 5.598966299658059e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15010.0, + "completions/mean_length": 6883.328125, + "completions/mean_terminated_length": 6808.51953125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.8912994414567947, + "epoch": 0.1766329346826127, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0028390102088451385, + "learning_rate": 1e-05, + "loss": 0.0662, + "num_tokens": 152668740.0, + "reward": 0.3359375, + "reward_std": 0.3684907555580139, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999127388000488, + "sampling/importance_sampling_ratio/min": 0.00014138928963802755, + "sampling/sampling_logp_difference/max": 8.863993644714355, + "sampling/sampling_logp_difference/mean": 0.018673548474907875, + "step": 192 + }, + { + "clip_ratio/high_max": 1.0902768735832069e-05, + "clip_ratio/high_mean": 2.7256921839580173e-06, + "clip_ratio/low_mean": 3.64547792059966e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.918047127626778e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15506.0, + "completions/mean_length": 7799.5234375, + "completions/mean_terminated_length": 7227.2255859375, + "completions/min_length": 908.0, + "completions/min_terminated_length": 908.0, + "entropy": 0.81409652531147, + "epoch": 0.17755289788408463, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0031472526025027037, + "learning_rate": 1e-05, + "loss": 0.0106, + "num_tokens": 153684919.0, + "reward": 0.265625, + "reward_std": 0.2924865484237671, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999836802482605, + "sampling/importance_sampling_ratio/min": 0.0033896781969815493, + "sampling/sampling_logp_difference/max": 5.687020301818848, + "sampling/sampling_logp_difference/mean": 0.020041968673467636, + "step": 193 + }, + { + "clip_ratio/high_max": 9.558767487760633e-06, + "clip_ratio/high_mean": 2.3896918719401583e-06, + "clip_ratio/low_mean": 2.064374041310657e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.303343228504673e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14882.0, + "completions/max_terminated_length": 14882.0, + "completions/mean_length": 6441.78125, + "completions/mean_terminated_length": 6441.78125, + "completions/min_length": 688.0, + "completions/min_terminated_length": 688.0, + "entropy": 1.0110936611890793, + "epoch": 0.17847286108555657, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0008370456052944064, + "learning_rate": 1e-05, + "loss": 0.0398, + "num_tokens": 154527195.0, + "reward": 0.3984375, + "reward_std": 0.14677615463733673, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999023079872131, + "sampling/importance_sampling_ratio/min": 0.00020978205429855734, + "sampling/sampling_logp_difference/max": 8.469441413879395, + "sampling/sampling_logp_difference/mean": 0.021425459533929825, + "step": 194 + }, + { + "clip_ratio/high_max": 4.3503982851689216e-06, + "clip_ratio/high_mean": 1.0875995712922304e-06, + "clip_ratio/low_mean": 2.6103265497567918e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7190865182546986e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15901.0, + "completions/mean_length": 7140.2890625, + "completions/mean_terminated_length": 6918.4404296875, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.993028812110424, + "epoch": 0.17939282428702852, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004406601656228304, + "learning_rate": 1e-05, + "loss": 0.0508, + "num_tokens": 155457592.0, + "reward": 0.296875, + "reward_std": 0.24882915616035461, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998899698257446, + "sampling/importance_sampling_ratio/min": 0.005102821160107851, + "sampling/sampling_logp_difference/max": 5.277961730957031, + "sampling/sampling_logp_difference/mean": 0.020247166976332664, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.063482140281849e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.063482140281849e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15280.0, + "completions/max_terminated_length": 15280.0, + "completions/mean_length": 6220.5703125, + "completions/mean_terminated_length": 6220.5703125, + "completions/min_length": 467.0, + "completions/min_terminated_length": 467.0, + "entropy": 0.9336734637618065, + "epoch": 0.18031278748850046, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0013446965022012591, + "learning_rate": 1e-05, + "loss": 0.0119, + "num_tokens": 156277609.0, + "reward": 0.3671875, + "reward_std": 0.32089442014694214, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999518394470215, + "sampling/importance_sampling_ratio/min": 0.0036465052980929613, + "sampling/sampling_logp_difference/max": 5.613986015319824, + "sampling/sampling_logp_difference/mean": 0.018678557127714157, + "step": 196 + }, + { + "clip_ratio/high_max": 1.0170509995077737e-05, + "clip_ratio/high_mean": 2.542627498769434e-06, + "clip_ratio/low_mean": 2.2835527090592223e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5378154816735332e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16143.0, + "completions/mean_length": 7230.3046875, + "completions/mean_terminated_length": 6935.02392578125, + "completions/min_length": 881.0, + "completions/min_terminated_length": 881.0, + "entropy": 0.9315059334039688, + "epoch": 0.1812327506899724, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0007178800296969712, + "learning_rate": 1e-05, + "loss": 0.0817, + "num_tokens": 157222744.0, + "reward": 0.4453125, + "reward_std": 0.17517909407615662, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999822378158569, + "sampling/importance_sampling_ratio/min": 0.005948656238615513, + "sampling/sampling_logp_difference/max": 5.124589920043945, + "sampling/sampling_logp_difference/mean": 0.019229095429182053, + "step": 197 + }, + { + "clip_ratio/high_max": 8.961743105828646e-06, + "clip_ratio/high_mean": 2.2404357764571614e-06, + "clip_ratio/low_mean": 4.256807665115048e-05, + "clip_ratio/low_min": 4.9592349569138605e-06, + "clip_ratio/region_mean": 4.480851271182473e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15489.0, + "completions/mean_length": 7101.7890625, + "completions/mean_terminated_length": 6802.36279296875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.8410197496414185, + "epoch": 0.18215271389144433, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0028408628422766924, + "learning_rate": 1e-05, + "loss": 0.0557, + "num_tokens": 158151901.0, + "reward": 0.3359375, + "reward_std": 0.3237774670124054, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999966025352478, + "sampling/importance_sampling_ratio/min": 1.1856438959512161e-06, + "sampling/sampling_logp_difference/max": 13.645224571228027, + "sampling/sampling_logp_difference/mean": 0.018435407429933548, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.0979279042876442e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0979279042876442e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15693.0, + "completions/mean_length": 6822.109375, + "completions/mean_terminated_length": 6670.33349609375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.9384881108999252, + "epoch": 0.18307267709291627, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003448180854320526, + "learning_rate": 1e-05, + "loss": 0.0354, + "num_tokens": 159043939.0, + "reward": 0.390625, + "reward_std": 0.2906692624092102, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999669790267944, + "sampling/importance_sampling_ratio/min": 0.0018930588848888874, + "sampling/sampling_logp_difference/max": 6.269561290740967, + "sampling/sampling_logp_difference/mean": 0.01985720731317997, + "step": 199 + }, + { + "clip_ratio/high_max": 1.87569592071668e-05, + "clip_ratio/high_mean": 5.608627873243677e-06, + "clip_ratio/low_mean": 2.393421118540573e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.954283939970992e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16130.0, + "completions/mean_length": 6969.671875, + "completions/mean_terminated_length": 6665.98388671875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.8700083270668983, + "epoch": 0.18399264029438822, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002675072755664587, + "learning_rate": 1e-05, + "loss": 0.0837, + "num_tokens": 159955905.0, + "reward": 0.34375, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999998807907104, + "sampling/importance_sampling_ratio/min": 4.222963980282657e-06, + "sampling/sampling_logp_difference/max": 12.37497329711914, + "sampling/sampling_logp_difference/mean": 0.018493790179491043, + "step": 200 + }, + { + "clip_ratio/high_max": 1.0003448096540524e-05, + "clip_ratio/high_mean": 2.500862024135131e-06, + "clip_ratio/low_mean": 2.7816862200324977e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0317724281303526e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16309.0, + "completions/mean_length": 6642.921875, + "completions/mean_terminated_length": 6409.13623046875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 1.0049321055412292, + "epoch": 0.18491260349586017, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034180639777332544, + "learning_rate": 1e-05, + "loss": 0.036, + "num_tokens": 160825383.0, + "reward": 0.296875, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999150037765503, + "sampling/importance_sampling_ratio/min": 0.000667327141854912, + "sampling/sampling_logp_difference/max": 7.312230110168457, + "sampling/sampling_logp_difference/mean": 0.020563330501317978, + "step": 201 + }, + { + "clip_ratio/high_max": 5.628348844766151e-06, + "clip_ratio/high_mean": 1.4070872111915378e-06, + "clip_ratio/low_mean": 3.0009771876393643e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1416859314958856e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15930.0, + "completions/mean_length": 6327.296875, + "completions/mean_terminated_length": 6085.9365234375, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "entropy": 0.8458633497357368, + "epoch": 0.1858325666973321, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0016060187481343746, + "learning_rate": 1e-05, + "loss": 0.1058, + "num_tokens": 161653685.0, + "reward": 0.484375, + "reward_std": 0.21884137392044067, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999157190322876, + "sampling/importance_sampling_ratio/min": 4.0065486246021464e-05, + "sampling/sampling_logp_difference/max": 10.124995231628418, + "sampling/sampling_logp_difference/mean": 0.018988098949193954, + "step": 202 + }, + { + "clip_ratio/high_max": 1.1031161648134002e-05, + "clip_ratio/high_mean": 2.7577904120335006e-06, + "clip_ratio/low_mean": 5.184456858842168e-05, + "clip_ratio/low_min": 3.209077931387583e-06, + "clip_ratio/region_mean": 5.460235854570783e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16008.0, + "completions/mean_length": 6871.4921875, + "completions/mean_terminated_length": 6643.1923828125, + "completions/min_length": 546.0, + "completions/min_terminated_length": 546.0, + "entropy": 0.8635450080037117, + "epoch": 0.18675252989880406, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0027431908529251814, + "learning_rate": 1e-05, + "loss": 0.0519, + "num_tokens": 162555796.0, + "reward": 0.296875, + "reward_std": 0.2906692326068878, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999676942825317, + "sampling/importance_sampling_ratio/min": 1.8959757653647102e-05, + "sampling/sampling_logp_difference/max": 10.873191833496094, + "sampling/sampling_logp_difference/mean": 0.019010700285434723, + "step": 203 + }, + { + "clip_ratio/high_max": 1.122018943533476e-05, + "clip_ratio/high_mean": 2.80504735883369e-06, + "clip_ratio/low_mean": 3.166110184338322e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4466149031686655e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15032.0, + "completions/mean_length": 5741.7734375, + "completions/mean_terminated_length": 5657.9765625, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.820662334561348, + "epoch": 0.18767249310027598, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021551409736275673, + "learning_rate": 1e-05, + "loss": 0.0325, + "num_tokens": 163312831.0, + "reward": 0.3828125, + "reward_std": 0.28513264656066895, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999495148658752, + "sampling/importance_sampling_ratio/min": 0.00020485777349676937, + "sampling/sampling_logp_difference/max": 8.493194580078125, + "sampling/sampling_logp_difference/mean": 0.018189631402492523, + "step": 204 + }, + { + "clip_ratio/high_max": 5.249454261502251e-06, + "clip_ratio/high_mean": 2.6246168545185355e-06, + "clip_ratio/low_mean": 5.6316800055356e-05, + "clip_ratio/low_min": 6.944251708773663e-06, + "clip_ratio/region_mean": 5.894141622775351e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15114.0, + "completions/max_terminated_length": 15114.0, + "completions/mean_length": 6707.234375, + "completions/mean_terminated_length": 6707.234375, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.9361380413174629, + "epoch": 0.18859245630174792, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0021163856144994497, + "learning_rate": 1e-05, + "loss": 0.0268, + "num_tokens": 164189605.0, + "reward": 0.21875, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998763799667358, + "sampling/importance_sampling_ratio/min": 6.894206876495446e-07, + "sampling/sampling_logp_difference/max": 14.187414169311523, + "sampling/sampling_logp_difference/mean": 0.020120715722441673, + "step": 205 + }, + { + "clip_ratio/high_max": 1.2976960988453357e-05, + "clip_ratio/high_mean": 3.244240247113339e-06, + "clip_ratio/low_mean": 4.118970764466212e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.44339480054623e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15672.0, + "completions/mean_length": 7074.59375, + "completions/mean_terminated_length": 6774.2900390625, + "completions/min_length": 987.0, + "completions/min_terminated_length": 987.0, + "entropy": 0.9206110090017319, + "epoch": 0.18951241950321987, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003191466676071286, + "learning_rate": 1e-05, + "loss": 0.0538, + "num_tokens": 165114649.0, + "reward": 0.4296875, + "reward_std": 0.27092626690864563, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999928891658783, + "sampling/importance_sampling_ratio/min": 0.0015704745892435312, + "sampling/sampling_logp_difference/max": 6.4563775062561035, + "sampling/sampling_logp_difference/mean": 0.020029421895742416, + "step": 206 + }, + { + "clip_ratio/high_max": 2.4998532580866595e-05, + "clip_ratio/high_mean": 6.947302438220504e-06, + "clip_ratio/low_mean": 4.305635661694396e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.000365831620002e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15481.0, + "completions/mean_length": 6510.3984375, + "completions/mean_terminated_length": 6432.6533203125, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "entropy": 0.9344880431890488, + "epoch": 0.19043238270469182, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002458518138155341, + "learning_rate": 1e-05, + "loss": 0.061, + "num_tokens": 165971100.0, + "reward": 0.484375, + "reward_std": 0.30904704332351685, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999246597290039, + "sampling/importance_sampling_ratio/min": 0.0011708823731169105, + "sampling/sampling_logp_difference/max": 6.749997615814209, + "sampling/sampling_logp_difference/mean": 0.02032654918730259, + "step": 207 + }, + { + "clip_ratio/high_max": 1.9761582279897993e-05, + "clip_ratio/high_mean": 4.940395569974498e-06, + "clip_ratio/low_mean": 2.598603293790802e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.092642862156936e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16364.0, + "completions/max_terminated_length": 16364.0, + "completions/mean_length": 5363.4609375, + "completions/mean_terminated_length": 5363.4609375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.8528282344341278, + "epoch": 0.19135234590616376, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0020360907074064016, + "learning_rate": 1e-05, + "loss": 0.0227, + "num_tokens": 166676943.0, + "reward": 0.46875, + "reward_std": 0.3079911470413208, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 0.0005493607022799551, + "sampling/sampling_logp_difference/max": 7.506755352020264, + "sampling/sampling_logp_difference/mean": 0.01911250874400139, + "step": 208 + }, + { + "clip_ratio/high_max": 6.622867658734322e-06, + "clip_ratio/high_mean": 1.6557169146835804e-06, + "clip_ratio/low_mean": 4.006644434184636e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.172216131337336e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14735.0, + "completions/mean_length": 4550.203125, + "completions/mean_terminated_length": 4266.1923828125, + "completions/min_length": 561.0, + "completions/min_terminated_length": 561.0, + "entropy": 0.7535714656114578, + "epoch": 0.1922723091076357, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0015881177969276905, + "learning_rate": 1e-05, + "loss": 0.0952, + "num_tokens": 167278489.0, + "reward": 0.5546875, + "reward_std": 0.3243142366409302, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999875009059906, + "sampling/importance_sampling_ratio/min": 7.485204696422443e-05, + "sampling/sampling_logp_difference/max": 9.49999713897705, + "sampling/sampling_logp_difference/mean": 0.016919689252972603, + "step": 209 + }, + { + "clip_ratio/high_max": 2.8397119422152173e-05, + "clip_ratio/high_mean": 7.099279855538043e-06, + "clip_ratio/low_mean": 2.2654034410152235e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9753314493063954e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16087.0, + "completions/mean_length": 5080.078125, + "completions/mean_terminated_length": 4991.07080078125, + "completions/min_length": 684.0, + "completions/min_terminated_length": 684.0, + "entropy": 0.922355130314827, + "epoch": 0.19319227230910763, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0021621519699692726, + "learning_rate": 1e-05, + "loss": 0.0634, + "num_tokens": 167949827.0, + "reward": 0.5546875, + "reward_std": 0.21829968690872192, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998918771743774, + "sampling/importance_sampling_ratio/min": 9.328075248049572e-05, + "sampling/sampling_logp_difference/max": 9.27989673614502, + "sampling/sampling_logp_difference/mean": 0.018358757719397545, + "step": 210 + }, + { + "clip_ratio/high_max": 1.3618362117995275e-05, + "clip_ratio/high_mean": 4.41220004177012e-06, + "clip_ratio/low_mean": 6.229132804946858e-05, + "clip_ratio/low_min": 1.1466368505352875e-05, + "clip_ratio/region_mean": 6.670352740911767e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15665.0, + "completions/max_terminated_length": 15665.0, + "completions/mean_length": 6371.9453125, + "completions/mean_terminated_length": 6371.9453125, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "entropy": 0.8835635632276535, + "epoch": 0.19411223551057957, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.003488079411908984, + "learning_rate": 1e-05, + "loss": 0.0487, + "num_tokens": 168781948.0, + "reward": 0.46875, + "reward_std": 0.4673760235309601, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999973773956299, + "sampling/importance_sampling_ratio/min": 4.154009047852014e-08, + "sampling/sampling_logp_difference/max": 16.996606826782227, + "sampling/sampling_logp_difference/mean": 0.01854466274380684, + "step": 211 + }, + { + "clip_ratio/high_max": 1.3789490822091466e-05, + "clip_ratio/high_mean": 3.4473727055228665e-06, + "clip_ratio/low_mean": 3.9819827861720114e-05, + "clip_ratio/low_min": 9.205373771692393e-06, + "clip_ratio/region_mean": 4.3267199771435116e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15957.0, + "completions/mean_length": 7045.234375, + "completions/mean_terminated_length": 6665.609375, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "entropy": 0.8657141029834747, + "epoch": 0.19503219871205152, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002579214284196496, + "learning_rate": 1e-05, + "loss": 0.0787, + "num_tokens": 169704370.0, + "reward": 0.390625, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999009370803833, + "sampling/importance_sampling_ratio/min": 0.00038033726741559803, + "sampling/sampling_logp_difference/max": 7.874452114105225, + "sampling/sampling_logp_difference/mean": 0.020650357007980347, + "step": 212 + }, + { + "clip_ratio/high_max": 1.0065672540804371e-05, + "clip_ratio/high_mean": 2.516418135201093e-06, + "clip_ratio/low_mean": 2.5041783715096244e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7558201850297337e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13301.0, + "completions/mean_length": 4835.1015625, + "completions/mean_terminated_length": 4744.16552734375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.8166600242257118, + "epoch": 0.19595216191352346, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015265591209754348, + "learning_rate": 1e-05, + "loss": 0.0399, + "num_tokens": 170343191.0, + "reward": 0.4765625, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999908983707428, + "sampling/importance_sampling_ratio/min": 0.0008047395385801792, + "sampling/sampling_logp_difference/max": 7.1249918937683105, + "sampling/sampling_logp_difference/mean": 0.01807256042957306, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.965024677654583e-05, + "clip_ratio/low_min": 3.7946631437080214e-06, + "clip_ratio/region_mean": 3.965024677654583e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16332.0, + "completions/mean_length": 6042.6328125, + "completions/mean_terminated_length": 5622.251953125, + "completions/min_length": 607.0, + "completions/min_terminated_length": 607.0, + "entropy": 0.8976519927382469, + "epoch": 0.1968721251149954, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019487867830321193, + "learning_rate": 1e-05, + "loss": 0.1108, + "num_tokens": 171136048.0, + "reward": 0.3828125, + "reward_std": 0.33220988512039185, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999610185623169, + "sampling/importance_sampling_ratio/min": 0.0011446340940892696, + "sampling/sampling_logp_difference/max": 6.772670269012451, + "sampling/sampling_logp_difference/mean": 0.019680369645357132, + "step": 214 + }, + { + "clip_ratio/high_max": 5.620756382995751e-06, + "clip_ratio/high_mean": 1.4051890957489377e-06, + "clip_ratio/low_mean": 4.3911951024711016e-05, + "clip_ratio/low_min": 3.7100794543221127e-06, + "clip_ratio/region_mean": 4.531714012045995e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16298.0, + "completions/mean_length": 6418.3359375, + "completions/mean_terminated_length": 6339.8662109375, + "completions/min_length": 763.0, + "completions/min_terminated_length": 763.0, + "entropy": 0.8599612265825272, + "epoch": 0.19779208831646733, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018101281020790339, + "learning_rate": 1e-05, + "loss": 0.0698, + "num_tokens": 171976483.0, + "reward": 0.390625, + "reward_std": 0.3061561584472656, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999486207962036, + "sampling/importance_sampling_ratio/min": 4.0352391806663945e-05, + "sampling/sampling_logp_difference/max": 10.117859840393066, + "sampling/sampling_logp_difference/mean": 0.01834172010421753, + "step": 215 + }, + { + "clip_ratio/high_max": 8.747987521928735e-06, + "clip_ratio/high_mean": 2.1869968804821838e-06, + "clip_ratio/low_mean": 1.736767285365204e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9554669734134222e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15075.0, + "completions/mean_length": 5835.1484375, + "completions/mean_terminated_length": 5752.08642578125, + "completions/min_length": 561.0, + "completions/min_terminated_length": 561.0, + "entropy": 0.930196188390255, + "epoch": 0.19871205151793928, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0009842904983088374, + "learning_rate": 1e-05, + "loss": 0.0174, + "num_tokens": 172743158.0, + "reward": 0.3515625, + "reward_std": 0.12863078713417053, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000227689743042, + "sampling/importance_sampling_ratio/min": 0.02929825149476528, + "sampling/sampling_logp_difference/max": 3.5302274227142334, + "sampling/sampling_logp_difference/mean": 0.020194582641124725, + "step": 216 + }, + { + "clip_ratio/high_max": 3.4560856420284836e-05, + "clip_ratio/high_mean": 1.2245807511135354e-05, + "clip_ratio/low_mean": 4.938034498991328e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.162615136418026e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15807.0, + "completions/mean_length": 4960.5234375, + "completions/mean_terminated_length": 4870.57470703125, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "entropy": 0.7726479545235634, + "epoch": 0.19963201471941122, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0032878813799470663, + "learning_rate": 1e-05, + "loss": -0.0492, + "num_tokens": 173400993.0, + "reward": 0.5, + "reward_std": 0.3924228549003601, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999999403953552, + "sampling/importance_sampling_ratio/min": 1.9806284399237484e-06, + "sampling/sampling_logp_difference/max": 13.132096290588379, + "sampling/sampling_logp_difference/mean": 0.018239401280879974, + "step": 217 + }, + { + "clip_ratio/high_max": 9.530344868835527e-06, + "clip_ratio/high_mean": 2.382586217208882e-06, + "clip_ratio/low_mean": 1.8789201192248584e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1171787466300884e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15588.0, + "completions/max_terminated_length": 15588.0, + "completions/mean_length": 6778.453125, + "completions/mean_terminated_length": 6778.453125, + "completions/min_length": 709.0, + "completions/min_terminated_length": 709.0, + "entropy": 0.9891144260764122, + "epoch": 0.20055197792088317, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021506824996322393, + "learning_rate": 1e-05, + "loss": 0.0872, + "num_tokens": 174286163.0, + "reward": 0.3203125, + "reward_std": 0.23910348117351532, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00002121925354, + "sampling/importance_sampling_ratio/min": 3.8179036891961005e-06, + "sampling/sampling_logp_difference/max": 12.475809097290039, + "sampling/sampling_logp_difference/mean": 0.019467821344733238, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.731942322498071e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.731942322498071e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16363.0, + "completions/mean_length": 7835.8203125, + "completions/mean_terminated_length": 7768.51171875, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "entropy": 1.1394712179899216, + "epoch": 0.2014719411223551, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0019394620321691036, + "learning_rate": 1e-05, + "loss": 0.0144, + "num_tokens": 175314884.0, + "reward": 0.1171875, + "reward_std": 0.1633366346359253, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999979138374329, + "sampling/importance_sampling_ratio/min": 0.0006493349210359156, + "sampling/sampling_logp_difference/max": 7.339561939239502, + "sampling/sampling_logp_difference/mean": 0.02314554899930954, + "step": 219 + }, + { + "clip_ratio/high_max": 2.6689051992434543e-05, + "clip_ratio/high_mean": 1.0311606502000359e-05, + "clip_ratio/low_mean": 4.749879690280068e-05, + "clip_ratio/low_min": 1.1613257356657414e-05, + "clip_ratio/region_mean": 5.781040522379044e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15977.0, + "completions/max_terminated_length": 15977.0, + "completions/mean_length": 6552.640625, + "completions/mean_terminated_length": 6552.640625, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "entropy": 0.9301942139863968, + "epoch": 0.20239190432382706, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0029180990532040596, + "learning_rate": 1e-05, + "loss": 0.0895, + "num_tokens": 176170070.0, + "reward": 0.4921875, + "reward_std": 0.3527093529701233, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000029802322388, + "sampling/importance_sampling_ratio/min": 0.004631850868463516, + "sampling/sampling_logp_difference/max": 5.374798774719238, + "sampling/sampling_logp_difference/mean": 0.01968369632959366, + "step": 220 + }, + { + "clip_ratio/high_max": 6.5973504206340294e-06, + "clip_ratio/high_mean": 1.6493376051585074e-06, + "clip_ratio/low_mean": 3.3509465310999076e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.515880302984442e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15006.0, + "completions/mean_length": 6035.296875, + "completions/mean_terminated_length": 5953.81103515625, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "entropy": 0.9439655765891075, + "epoch": 0.20331186752529898, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0013513187877833843, + "learning_rate": 1e-05, + "loss": 0.0062, + "num_tokens": 176962084.0, + "reward": 0.453125, + "reward_std": 0.23645779490470886, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000133514404297, + "sampling/importance_sampling_ratio/min": 7.028038817225024e-05, + "sampling/sampling_logp_difference/max": 9.563017845153809, + "sampling/sampling_logp_difference/mean": 0.020156048238277435, + "step": 221 + }, + { + "clip_ratio/high_max": 4.21926688431995e-06, + "clip_ratio/high_mean": 1.0548167210799875e-06, + "clip_ratio/low_mean": 3.7025285053005064e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8080101546711376e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15470.0, + "completions/mean_length": 7192.4296875, + "completions/mean_terminated_length": 6895.92724609375, + "completions/min_length": 703.0, + "completions/min_terminated_length": 703.0, + "entropy": 0.8545770645141602, + "epoch": 0.20423183072677092, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035121457185596228, + "learning_rate": 1e-05, + "loss": 0.076, + "num_tokens": 177901579.0, + "reward": 0.328125, + "reward_std": 0.30221715569496155, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998663663864136, + "sampling/importance_sampling_ratio/min": 0.000296071550110355, + "sampling/sampling_logp_difference/max": 8.124909400939941, + "sampling/sampling_logp_difference/mean": 0.018486706539988518, + "step": 222 + }, + { + "clip_ratio/high_max": 3.974942046625074e-06, + "clip_ratio/high_mean": 9.937355116562685e-07, + "clip_ratio/low_mean": 3.2998319056787295e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.399205434106989e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16252.0, + "completions/mean_length": 6525.328125, + "completions/mean_terminated_length": 6124.56884765625, + "completions/min_length": 730.0, + "completions/min_terminated_length": 730.0, + "entropy": 0.8625697493553162, + "epoch": 0.20515179392824287, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002456578193232417, + "learning_rate": 1e-05, + "loss": 0.0748, + "num_tokens": 178756773.0, + "reward": 0.3984375, + "reward_std": 0.27958327531814575, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999394416809082, + "sampling/importance_sampling_ratio/min": 0.0001488614798290655, + "sampling/sampling_logp_difference/max": 8.812494277954102, + "sampling/sampling_logp_difference/mean": 0.018010437488555908, + "step": 223 + }, + { + "clip_ratio/high_max": 1.2826577403757256e-05, + "clip_ratio/high_mean": 4.401672981657612e-06, + "clip_ratio/low_mean": 7.05404337395521e-05, + "clip_ratio/low_min": 1.734040552037186e-05, + "clip_ratio/region_mean": 7.494210694858339e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14933.0, + "completions/mean_length": 7227.640625, + "completions/mean_terminated_length": 6932.27392578125, + "completions/min_length": 500.0, + "completions/min_terminated_length": 500.0, + "entropy": 0.7740364670753479, + "epoch": 0.20607175712971482, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003040029900148511, + "learning_rate": 1e-05, + "loss": 0.1685, + "num_tokens": 179700639.0, + "reward": 0.515625, + "reward_std": 0.33797892928123474, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996599555015564, + "sampling/importance_sampling_ratio/min": 3.1452334496862022e-06, + "sampling/sampling_logp_difference/max": 12.669622421264648, + "sampling/sampling_logp_difference/mean": 0.018948577344417572, + "step": 224 + }, + { + "clip_ratio/high_max": 7.97244683781173e-06, + "clip_ratio/high_mean": 1.9931117094529327e-06, + "clip_ratio/low_mean": 2.7227763212067657e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.922087492152059e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15639.0, + "completions/mean_length": 7019.4375, + "completions/mean_terminated_length": 6870.7939453125, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "entropy": 0.9501559659838676, + "epoch": 0.20699172033118676, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001853659632615745, + "learning_rate": 1e-05, + "loss": 0.0498, + "num_tokens": 180615847.0, + "reward": 0.390625, + "reward_std": 0.24670752882957458, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999617338180542, + "sampling/importance_sampling_ratio/min": 0.0061973449774086475, + "sampling/sampling_logp_difference/max": 5.083634376525879, + "sampling/sampling_logp_difference/mean": 0.021023310720920563, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.039616189606022e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.039616189606022e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16384.0, + "completions/mean_length": 6705.03125, + "completions/mean_terminated_length": 6229.01611328125, + "completions/min_length": 1130.0, + "completions/min_terminated_length": 1130.0, + "entropy": 0.9054799973964691, + "epoch": 0.20791168353265868, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014863376272842288, + "learning_rate": 1e-05, + "loss": 0.0327, + "num_tokens": 181493971.0, + "reward": 0.3515625, + "reward_std": 0.2396402806043625, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999639987945557, + "sampling/importance_sampling_ratio/min": 0.0023789836559444666, + "sampling/sampling_logp_difference/max": 6.04108190536499, + "sampling/sampling_logp_difference/mean": 0.019701875746250153, + "step": 226 + }, + { + "clip_ratio/high_max": 1.4479510582532384e-05, + "clip_ratio/high_mean": 3.619877645633096e-06, + "clip_ratio/low_mean": 2.6611398709519563e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0231276070935564e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15328.0, + "completions/max_terminated_length": 15328.0, + "completions/mean_length": 5421.390625, + "completions/mean_terminated_length": 5421.390625, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "entropy": 0.9483538940548897, + "epoch": 0.20883164673413063, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0039733098819851875, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 182208309.0, + "reward": 0.484375, + "reward_std": 0.309583842754364, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999675154685974, + "sampling/importance_sampling_ratio/min": 0.011960627511143684, + "sampling/sampling_logp_difference/max": 5.5837554931640625, + "sampling/sampling_logp_difference/mean": 0.01952577941119671, + "step": 227 + }, + { + "clip_ratio/high_max": 4.601678483595606e-06, + "clip_ratio/high_mean": 1.1504196208989015e-06, + "clip_ratio/low_mean": 4.089345225111174e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2043871189889614e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15753.0, + "completions/mean_length": 6497.28125, + "completions/mean_terminated_length": 6340.349609375, + "completions/min_length": 532.0, + "completions/min_terminated_length": 532.0, + "entropy": 0.8902791813015938, + "epoch": 0.20975160993560257, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015076796989887953, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 183058249.0, + "reward": 0.4453125, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000579357147217, + "sampling/importance_sampling_ratio/min": 0.011128061451017857, + "sampling/sampling_logp_difference/max": 4.498285293579102, + "sampling/sampling_logp_difference/mean": 0.019255032762885094, + "step": 228 + }, + { + "clip_ratio/high_max": 5.255413270788267e-06, + "clip_ratio/high_mean": 1.3138533176970668e-06, + "clip_ratio/low_mean": 3.985653711424675e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1170390431943815e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14710.0, + "completions/max_terminated_length": 14710.0, + "completions/mean_length": 4411.4453125, + "completions/mean_terminated_length": 4411.4453125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 1.104304239153862, + "epoch": 0.21067157313707452, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002237006789073348, + "learning_rate": 1e-05, + "loss": 0.1124, + "num_tokens": 183645026.0, + "reward": 0.3203125, + "reward_std": 0.22461041808128357, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000056028366089, + "sampling/importance_sampling_ratio/min": 4.804155082638317e-07, + "sampling/sampling_logp_difference/max": 14.548614501953125, + "sampling/sampling_logp_difference/mean": 0.020417846739292145, + "step": 229 + }, + { + "clip_ratio/high_max": 4.956973498337902e-06, + "clip_ratio/high_mean": 1.2392433745844755e-06, + "clip_ratio/low_mean": 4.839278165036376e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9632024911261396e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15486.0, + "completions/mean_length": 5763.3828125, + "completions/mean_terminated_length": 5508.48828125, + "completions/min_length": 553.0, + "completions/min_terminated_length": 553.0, + "entropy": 0.7673545032739639, + "epoch": 0.21159153633854647, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0027243588119745255, + "learning_rate": 1e-05, + "loss": 0.0747, + "num_tokens": 184402387.0, + "reward": 0.4375, + "reward_std": 0.3661494255065918, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999270439147949, + "sampling/importance_sampling_ratio/min": 0.0008851620368659496, + "sampling/sampling_logp_difference/max": 7.029739856719971, + "sampling/sampling_logp_difference/mean": 0.01735807955265045, + "step": 230 + }, + { + "clip_ratio/high_max": 1.412869187333854e-05, + "clip_ratio/high_mean": 3.532172968334635e-06, + "clip_ratio/low_mean": 4.364474455087475e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.717691729183571e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15837.0, + "completions/mean_length": 6143.3125, + "completions/mean_terminated_length": 5980.76220703125, + "completions/min_length": 623.0, + "completions/min_terminated_length": 623.0, + "entropy": 0.9383679181337357, + "epoch": 0.2125114995400184, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016755202086642385, + "learning_rate": 1e-05, + "loss": 0.1134, + "num_tokens": 185207315.0, + "reward": 0.40625, + "reward_std": 0.266974538564682, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999443292617798, + "sampling/importance_sampling_ratio/min": 0.00010746628686320037, + "sampling/sampling_logp_difference/max": 9.138333320617676, + "sampling/sampling_logp_difference/mean": 0.01892942003905773, + "step": 231 + }, + { + "clip_ratio/high_max": 5.389092621044256e-06, + "clip_ratio/high_mean": 1.347273155261064e-06, + "clip_ratio/low_mean": 4.616663244405572e-05, + "clip_ratio/low_min": 5.818554200232029e-06, + "clip_ratio/region_mean": 4.7513905599316786e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16101.0, + "completions/mean_length": 6852.234375, + "completions/mean_terminated_length": 6623.47216796875, + "completions/min_length": 884.0, + "completions/min_terminated_length": 884.0, + "entropy": 0.9856249913573265, + "epoch": 0.21343146274149033, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0036351638846099377, + "learning_rate": 1e-05, + "loss": 0.0413, + "num_tokens": 186104113.0, + "reward": 0.375, + "reward_std": 0.304571270942688, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999874830245972, + "sampling/importance_sampling_ratio/min": 0.0006267272983677685, + "sampling/sampling_logp_difference/max": 7.374999046325684, + "sampling/sampling_logp_difference/mean": 0.021776381880044937, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.837307613390294e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.837307613390294e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16206.0, + "completions/mean_length": 6634.1484375, + "completions/mean_terminated_length": 6479.38916015625, + "completions/min_length": 623.0, + "completions/min_terminated_length": 623.0, + "entropy": 1.0182439163327217, + "epoch": 0.21435142594296228, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003553485032171011, + "learning_rate": 1e-05, + "loss": 0.0886, + "num_tokens": 186973796.0, + "reward": 0.34375, + "reward_std": 0.24381662905216217, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999936819076538, + "sampling/importance_sampling_ratio/min": 0.00038018118357285857, + "sampling/sampling_logp_difference/max": 7.8748626708984375, + "sampling/sampling_logp_difference/mean": 0.02058180794119835, + "step": 233 + }, + { + "clip_ratio/high_max": 1.4436222500080476e-05, + "clip_ratio/high_mean": 3.609055625020119e-06, + "clip_ratio/low_mean": 5.134189859745675e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.495095410879003e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14911.0, + "completions/mean_length": 6424.2421875, + "completions/mean_terminated_length": 6266.1513671875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.9030232205986977, + "epoch": 0.21527138914443422, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002669632900506258, + "learning_rate": 1e-05, + "loss": 0.0828, + "num_tokens": 187820443.0, + "reward": 0.34375, + "reward_std": 0.2817176878452301, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999942183494568, + "sampling/importance_sampling_ratio/min": 0.004488746635615826, + "sampling/sampling_logp_difference/max": 5.406181812286377, + "sampling/sampling_logp_difference/mean": 0.01908625289797783, + "step": 234 + }, + { + "clip_ratio/high_max": 1.4932538306311471e-05, + "clip_ratio/high_mean": 3.733134576577868e-06, + "clip_ratio/low_mean": 2.516909023597691e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8902224585181102e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14473.0, + "completions/mean_length": 6582.21875, + "completions/mean_terminated_length": 6505.03955078125, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "entropy": 0.9906348586082458, + "epoch": 0.21619135234590617, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021964670158922672, + "learning_rate": 1e-05, + "loss": 0.0122, + "num_tokens": 188682111.0, + "reward": 0.2734375, + "reward_std": 0.22908620536327362, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999570250511169, + "sampling/importance_sampling_ratio/min": 0.01623692736029625, + "sampling/sampling_logp_difference/max": 4.9629387855529785, + "sampling/sampling_logp_difference/mean": 0.020555656403303146, + "step": 235 + }, + { + "clip_ratio/high_max": 1.3005691471335012e-05, + "clip_ratio/high_mean": 3.251422867833753e-06, + "clip_ratio/low_mean": 4.822792686809407e-05, + "clip_ratio/low_min": 4.575235379888909e-06, + "clip_ratio/region_mean": 5.147934950855415e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16333.0, + "completions/mean_length": 6687.8359375, + "completions/mean_terminated_length": 6611.48828125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.9669140502810478, + "epoch": 0.21711131554737811, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0032587468158453703, + "learning_rate": 1e-05, + "loss": 0.0237, + "num_tokens": 189556570.0, + "reward": 0.375, + "reward_std": 0.36956924200057983, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000240802764893, + "sampling/importance_sampling_ratio/min": 0.002121176104992628, + "sampling/sampling_logp_difference/max": 6.155784606933594, + "sampling/sampling_logp_difference/mean": 0.020776130259037018, + "step": 236 + }, + { + "clip_ratio/high_max": 2.541685034884722e-05, + "clip_ratio/high_mean": 6.354212587211805e-06, + "clip_ratio/low_mean": 4.488310526085115e-05, + "clip_ratio/low_min": 4.259959951014025e-06, + "clip_ratio/region_mean": 5.123731762068928e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14579.0, + "completions/mean_length": 5933.890625, + "completions/mean_terminated_length": 5851.6064453125, + "completions/min_length": 631.0, + "completions/min_terminated_length": 631.0, + "entropy": 0.777520164847374, + "epoch": 0.21803127874885003, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0023373132571578026, + "learning_rate": 1e-05, + "loss": 0.0561, + "num_tokens": 190333676.0, + "reward": 0.5390625, + "reward_std": 0.3577219247817993, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999416470527649, + "sampling/importance_sampling_ratio/min": 1.3007656889385544e-05, + "sampling/sampling_logp_difference/max": 11.249972343444824, + "sampling/sampling_logp_difference/mean": 0.017036860808730125, + "step": 237 + }, + { + "clip_ratio/high_max": 9.352454981126357e-06, + "clip_ratio/high_mean": 2.3381137452815892e-06, + "clip_ratio/low_mean": 3.286883497821691e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5206948496124824e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16254.0, + "completions/mean_length": 6691.53125, + "completions/mean_terminated_length": 6537.68310546875, + "completions/min_length": 797.0, + "completions/min_terminated_length": 797.0, + "entropy": 1.0021202191710472, + "epoch": 0.21895124195032198, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0033220481127500534, + "learning_rate": 1e-05, + "loss": 0.0139, + "num_tokens": 191208240.0, + "reward": 0.2265625, + "reward_std": 0.23987272381782532, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999876618385315, + "sampling/importance_sampling_ratio/min": 0.006665683817118406, + "sampling/sampling_logp_difference/max": 5.010782718658447, + "sampling/sampling_logp_difference/mean": 0.02151130512356758, + "step": 238 + }, + { + "clip_ratio/high_max": 2.0475443307077512e-05, + "clip_ratio/high_mean": 5.118860826769378e-06, + "clip_ratio/low_mean": 4.199072691335459e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7109587512750295e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15653.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 5480.5078125, + "completions/mean_terminated_length": 5480.5078125, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "entropy": 0.774504691362381, + "epoch": 0.21987120515179392, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002824194496497512, + "learning_rate": 1e-05, + "loss": 0.0472, + "num_tokens": 191927753.0, + "reward": 0.5078125, + "reward_std": 0.323777437210083, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999160766601562, + "sampling/importance_sampling_ratio/min": 2.561557721492136e-06, + "sampling/sampling_logp_difference/max": 12.874895095825195, + "sampling/sampling_logp_difference/mean": 0.01758616417646408, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.71521939541708e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.71521939541708e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16232.0, + "completions/mean_length": 6245.171875, + "completions/mean_terminated_length": 6001.84033203125, + "completions/min_length": 620.0, + "completions/min_terminated_length": 620.0, + "entropy": 0.9671605005860329, + "epoch": 0.22079116835326587, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020431289449334145, + "learning_rate": 1e-05, + "loss": 0.0527, + "num_tokens": 192746327.0, + "reward": 0.3359375, + "reward_std": 0.28247418999671936, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999756813049316, + "sampling/importance_sampling_ratio/min": 7.518127677030861e-05, + "sampling/sampling_logp_difference/max": 9.49560832977295, + "sampling/sampling_logp_difference/mean": 0.02066320925951004, + "step": 240 + }, + { + "clip_ratio/high_max": 1.1142639777972363e-05, + "clip_ratio/high_mean": 2.7856599444930907e-06, + "clip_ratio/low_mean": 4.276063509678352e-05, + "clip_ratio/low_min": 3.055412889807485e-06, + "clip_ratio/region_mean": 4.554629526865028e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16371.0, + "completions/max_terminated_length": 15709.0, + "completions/mean_length": 6828.8515625, + "completions/mean_terminated_length": 6677.38916015625, + "completions/min_length": 593.0, + "completions/min_terminated_length": 593.0, + "entropy": 0.9914879351854324, + "epoch": 0.22171113155473782, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019144542748108506, + "learning_rate": 1e-05, + "loss": 0.0781, + "num_tokens": 193643468.0, + "reward": 0.34375, + "reward_std": 0.3264309763908386, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000360012054443, + "sampling/importance_sampling_ratio/min": 0.0003172139695379883, + "sampling/sampling_logp_difference/max": 8.055933952331543, + "sampling/sampling_logp_difference/mean": 0.020327996462583542, + "step": 241 + }, + { + "clip_ratio/high_max": 1.3134391338098794e-05, + "clip_ratio/high_mean": 3.2835978345246986e-06, + "clip_ratio/low_mean": 5.683154779489996e-05, + "clip_ratio/low_min": 4.3356108108127955e-06, + "clip_ratio/region_mean": 6.011514608417201e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16289.0, + "completions/mean_length": 6280.125, + "completions/mean_terminated_length": 5954.193359375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.8634965419769287, + "epoch": 0.22263109475620976, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022551591973751783, + "learning_rate": 1e-05, + "loss": 0.041, + "num_tokens": 194465324.0, + "reward": 0.46875, + "reward_std": 0.304571270942688, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999502897262573, + "sampling/importance_sampling_ratio/min": 0.003390352241694927, + "sampling/sampling_logp_difference/max": 5.686821460723877, + "sampling/sampling_logp_difference/mean": 0.019659511744976044, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.619306153268553e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.619306153268553e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15646.0, + "completions/mean_length": 6910.5625, + "completions/mean_terminated_length": 6525.46337890625, + "completions/min_length": 1225.0, + "completions/min_terminated_length": 1225.0, + "entropy": 0.9886282533407211, + "epoch": 0.22355105795768168, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0012924466282129288, + "learning_rate": 1e-05, + "loss": 0.0753, + "num_tokens": 195369580.0, + "reward": 0.3984375, + "reward_std": 0.2590838074684143, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000083327293396, + "sampling/importance_sampling_ratio/min": 1.0787954124680255e-05, + "sampling/sampling_logp_difference/max": 11.437080383300781, + "sampling/sampling_logp_difference/mean": 0.020975295454263687, + "step": 243 + }, + { + "clip_ratio/high_max": 1.377244143441203e-05, + "clip_ratio/high_mean": 3.4431103586030076e-06, + "clip_ratio/low_mean": 2.4107489650759817e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7550600123049662e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12768.0, + "completions/mean_length": 5647.53125, + "completions/mean_terminated_length": 5562.9921875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 0.8360519111156464, + "epoch": 0.22447102115915363, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0019667574670165777, + "learning_rate": 1e-05, + "loss": 0.0333, + "num_tokens": 196110328.0, + "reward": 0.4921875, + "reward_std": 0.33508312702178955, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999792575836182, + "sampling/importance_sampling_ratio/min": 0.00731487525627017, + "sampling/sampling_logp_difference/max": 4.917845249176025, + "sampling/sampling_logp_difference/mean": 0.017768483608961105, + "step": 244 + }, + { + "clip_ratio/high_max": 1.114784731726104e-05, + "clip_ratio/high_mean": 2.78696182931526e-06, + "clip_ratio/low_mean": 2.6054579166157055e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8841540995472315e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15671.0, + "completions/mean_length": 6249.6171875, + "completions/mean_terminated_length": 6088.75439453125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.837661437690258, + "epoch": 0.22539098436062557, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017836211482062936, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 196926255.0, + "reward": 0.4453125, + "reward_std": 0.2585548758506775, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999443888664246, + "sampling/importance_sampling_ratio/min": 8.313281432492658e-05, + "sampling/sampling_logp_difference/max": 9.395071029663086, + "sampling/sampling_logp_difference/mean": 0.018142729997634888, + "step": 245 + }, + { + "clip_ratio/high_max": 3.1028919238451635e-06, + "clip_ratio/high_mean": 7.757229809612909e-07, + "clip_ratio/low_mean": 5.6368714012933196e-05, + "clip_ratio/low_min": 5.583348411164479e-06, + "clip_ratio/region_mean": 5.7144436595990555e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14663.0, + "completions/mean_length": 5561.796875, + "completions/mean_terminated_length": 5476.58251953125, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "entropy": 1.0337117239832878, + "epoch": 0.22631094756209752, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032067650463432074, + "learning_rate": 1e-05, + "loss": 0.0781, + "num_tokens": 197657021.0, + "reward": 0.421875, + "reward_std": 0.3603675961494446, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000261068344116, + "sampling/importance_sampling_ratio/min": 0.0026236141566187143, + "sampling/sampling_logp_difference/max": 5.943202495574951, + "sampling/sampling_logp_difference/mean": 0.02046290785074234, + "step": 246 + }, + { + "clip_ratio/high_max": 2.244927713945799e-05, + "clip_ratio/high_mean": 5.612319284864498e-06, + "clip_ratio/low_mean": 3.963059293710103e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5242911710374756e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14806.0, + "completions/mean_length": 7230.09375, + "completions/mean_terminated_length": 7010.400390625, + "completions/min_length": 858.0, + "completions/min_terminated_length": 858.0, + "entropy": 0.9666887000203133, + "epoch": 0.22723091076356947, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002695069881156087, + "learning_rate": 1e-05, + "loss": 0.0321, + "num_tokens": 198604673.0, + "reward": 0.390625, + "reward_std": 0.26827272772789, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999954104423523, + "sampling/importance_sampling_ratio/min": 0.004087009001523256, + "sampling/sampling_logp_difference/max": 5.499941825866699, + "sampling/sampling_logp_difference/mean": 0.021222755312919617, + "step": 247 + }, + { + "clip_ratio/high_max": 6.0509246395668015e-06, + "clip_ratio/high_mean": 3.018646339114639e-06, + "clip_ratio/low_mean": 4.125545319766388e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4274099309404846e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14964.0, + "completions/mean_length": 7186.09375, + "completions/mean_terminated_length": 7040.095703125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.9754119142889977, + "epoch": 0.2281508739650414, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0014327351236715913, + "learning_rate": 1e-05, + "loss": 0.0222, + "num_tokens": 199545181.0, + "reward": 0.328125, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999538660049438, + "sampling/importance_sampling_ratio/min": 3.340628245496191e-05, + "sampling/sampling_logp_difference/max": 10.306766510009766, + "sampling/sampling_logp_difference/mean": 0.02061491459608078, + "step": 248 + }, + { + "clip_ratio/high_max": 1.3521318351195077e-05, + "clip_ratio/high_mean": 3.3803295877987694e-06, + "clip_ratio/low_mean": 4.744600971662294e-05, + "clip_ratio/low_min": 4.111165708309272e-06, + "clip_ratio/region_mean": 5.08263395886388e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15812.0, + "completions/mean_length": 7464.1328125, + "completions/mean_terminated_length": 7322.5478515625, + "completions/min_length": 994.0, + "completions/min_terminated_length": 994.0, + "entropy": 1.0257701128721237, + "epoch": 0.22907083716651333, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017415130278095603, + "learning_rate": 1e-05, + "loss": 0.0424, + "num_tokens": 200521262.0, + "reward": 0.296875, + "reward_std": 0.2869548797607422, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000050067901611, + "sampling/importance_sampling_ratio/min": 0.004382971208542585, + "sampling/sampling_logp_difference/max": 5.430028438568115, + "sampling/sampling_logp_difference/mean": 0.02146603912115097, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.6656134000168095e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6656134000168095e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15796.0, + "completions/mean_length": 7929.0390625, + "completions/mean_terminated_length": 6973.2607421875, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.8728866130113602, + "epoch": 0.22999080036798528, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018543615005910397, + "learning_rate": 1e-05, + "loss": 0.0556, + "num_tokens": 201553491.0, + "reward": 0.25, + "reward_std": 0.3237725794315338, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999157786369324, + "sampling/importance_sampling_ratio/min": 0.0002044498542090878, + "sampling/sampling_logp_difference/max": 8.495187759399414, + "sampling/sampling_logp_difference/mean": 0.01925993338227272, + "step": 250 + }, + { + "clip_ratio/high_max": 1.5812252968316898e-05, + "clip_ratio/high_mean": 3.9530632420792244e-06, + "clip_ratio/low_mean": 4.320342043229175e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.715648356068414e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15938.0, + "completions/mean_length": 6577.84375, + "completions/mean_terminated_length": 6261.51611328125, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "entropy": 0.759723886847496, + "epoch": 0.23091076356945722, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001268691150471568, + "learning_rate": 1e-05, + "loss": 0.117, + "num_tokens": 202411655.0, + "reward": 0.515625, + "reward_std": 0.34822866320610046, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999426603317261, + "sampling/importance_sampling_ratio/min": 0.0004213420324958861, + "sampling/sampling_logp_difference/max": 7.77206563949585, + "sampling/sampling_logp_difference/mean": 0.018232906237244606, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.175654944698181e-05, + "clip_ratio/low_min": 8.377270660275826e-06, + "clip_ratio/region_mean": 3.175654944698181e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16117.0, + "completions/max_terminated_length": 16117.0, + "completions/mean_length": 6513.65625, + "completions/mean_terminated_length": 6513.65625, + "completions/min_length": 858.0, + "completions/min_terminated_length": 858.0, + "entropy": 1.0247815549373627, + "epoch": 0.23183072677092917, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004479583352804184, + "learning_rate": 1e-05, + "loss": -0.0114, + "num_tokens": 203265811.0, + "reward": 0.328125, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999909400939941, + "sampling/importance_sampling_ratio/min": 0.011329792439937592, + "sampling/sampling_logp_difference/max": 4.480319499969482, + "sampling/sampling_logp_difference/mean": 0.02229863964021206, + "step": 252 + }, + { + "clip_ratio/high_max": 5.371261522668647e-06, + "clip_ratio/high_mean": 1.3428153806671617e-06, + "clip_ratio/low_mean": 4.290480364943505e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4247618916415377e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16035.0, + "completions/max_terminated_length": 16035.0, + "completions/mean_length": 6013.6171875, + "completions/mean_terminated_length": 6013.6171875, + "completions/min_length": 535.0, + "completions/min_terminated_length": 535.0, + "entropy": 0.8476304411888123, + "epoch": 0.23275068997240111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017210334772244096, + "learning_rate": 1e-05, + "loss": 0.0986, + "num_tokens": 204054186.0, + "reward": 0.5078125, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998961687088013, + "sampling/importance_sampling_ratio/min": 3.32363242705469e-06, + "sampling/sampling_logp_difference/max": 12.614452362060547, + "sampling/sampling_logp_difference/mean": 0.018720701336860657, + "step": 253 + }, + { + "clip_ratio/high_max": 1.4894108517182758e-05, + "clip_ratio/high_mean": 3.7235271292956895e-06, + "clip_ratio/low_mean": 3.136672694381559e-05, + "clip_ratio/low_min": 3.941974227927858e-06, + "clip_ratio/region_mean": 3.509025418679812e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14989.0, + "completions/max_terminated_length": 14989.0, + "completions/mean_length": 7090.2109375, + "completions/mean_terminated_length": 7090.2109375, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "entropy": 0.9804464280605316, + "epoch": 0.23367065317387303, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003268485888838768, + "learning_rate": 1e-05, + "loss": 0.0441, + "num_tokens": 204982085.0, + "reward": 0.3828125, + "reward_std": 0.23751860857009888, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999740719795227, + "sampling/importance_sampling_ratio/min": 6.605670205317438e-05, + "sampling/sampling_logp_difference/max": 9.62499713897705, + "sampling/sampling_logp_difference/mean": 0.021524619311094284, + "step": 254 + }, + { + "clip_ratio/high_max": 1.3869113445252879e-05, + "clip_ratio/high_mean": 3.4672783613132196e-06, + "clip_ratio/low_mean": 3.1164222662027896e-05, + "clip_ratio/low_min": 2.928154799519689e-06, + "clip_ratio/region_mean": 3.46315009664977e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15576.0, + "completions/mean_length": 6272.65625, + "completions/mean_terminated_length": 6112.1591796875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.8322838544845581, + "epoch": 0.23459061637534498, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002127156127244234, + "learning_rate": 1e-05, + "loss": 0.0142, + "num_tokens": 205805529.0, + "reward": 0.4296875, + "reward_std": 0.3385029733181, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999648928642273, + "sampling/importance_sampling_ratio/min": 0.00019322636944707483, + "sampling/sampling_logp_difference/max": 8.551648139953613, + "sampling/sampling_logp_difference/mean": 0.018514126539230347, + "step": 255 + }, + { + "clip_ratio/high_max": 7.213966455310583e-06, + "clip_ratio/high_mean": 4.349803020886611e-06, + "clip_ratio/low_mean": 3.907777556833025e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3427579043964215e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16355.0, + "completions/mean_length": 6422.7109375, + "completions/mean_terminated_length": 5846.43798828125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.8222996592521667, + "epoch": 0.23551057957681693, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001939435489475727, + "learning_rate": 1e-05, + "loss": 0.1001, + "num_tokens": 206647908.0, + "reward": 0.4609375, + "reward_std": 0.26143795251846313, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000030994415283, + "sampling/importance_sampling_ratio/min": 6.205694808159024e-05, + "sampling/sampling_logp_difference/max": 9.687458038330078, + "sampling/sampling_logp_difference/mean": 0.018810249865055084, + "step": 256 + }, + { + "clip_ratio/high_max": 2.1247945142022218e-05, + "clip_ratio/high_mean": 6.189401005940454e-06, + "clip_ratio/low_mean": 4.7238423121598316e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.342782378647826e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15664.0, + "completions/mean_length": 6179.8046875, + "completions/mean_terminated_length": 6099.45654296875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.031787522137165, + "epoch": 0.23643054277828887, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002615252509713173, + "learning_rate": 1e-05, + "loss": 0.0147, + "num_tokens": 207459043.0, + "reward": 0.5, + "reward_std": 0.3232533931732178, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000026226043701, + "sampling/importance_sampling_ratio/min": 1.9359204088686965e-05, + "sampling/sampling_logp_difference/max": 10.85234260559082, + "sampling/sampling_logp_difference/mean": 0.020463883876800537, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.5109407349409594e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5109407349409594e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16052.0, + "completions/mean_length": 7093.5390625, + "completions/mean_terminated_length": 6474.17529296875, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "entropy": 0.8378612920641899, + "epoch": 0.23735050597976082, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002656357828527689, + "learning_rate": 1e-05, + "loss": 0.0542, + "num_tokens": 208389800.0, + "reward": 0.3828125, + "reward_std": 0.2619747221469879, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998178482055664, + "sampling/importance_sampling_ratio/min": 2.1559546439675614e-05, + "sampling/sampling_logp_difference/max": 10.744691848754883, + "sampling/sampling_logp_difference/mean": 0.01860899105668068, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.7354818396597693e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7354818396597693e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15948.0, + "completions/mean_length": 7782.46875, + "completions/mean_terminated_length": 7576.34423828125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 1.0068294331431389, + "epoch": 0.23827046918123276, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0026847824919968843, + "learning_rate": 1e-05, + "loss": 0.0232, + "num_tokens": 209407212.0, + "reward": 0.3203125, + "reward_std": 0.2188364565372467, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 2.5824127078521997e-05, + "sampling/sampling_logp_difference/max": 10.564201354980469, + "sampling/sampling_logp_difference/mean": 0.021435359492897987, + "step": 259 + }, + { + "clip_ratio/high_max": 1.5335908301494783e-05, + "clip_ratio/high_mean": 3.833977075373696e-06, + "clip_ratio/low_mean": 3.303791140751855e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6871888482892246e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16173.0, + "completions/mean_length": 6713.3359375, + "completions/mean_terminated_length": 6637.18896484375, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.8899351507425308, + "epoch": 0.23919043238270468, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019718443509191275, + "learning_rate": 1e-05, + "loss": 0.0167, + "num_tokens": 210286983.0, + "reward": 0.4140625, + "reward_std": 0.29719969630241394, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000264644622803, + "sampling/importance_sampling_ratio/min": 8.772138971835375e-05, + "sampling/sampling_logp_difference/max": 9.341344833374023, + "sampling/sampling_logp_difference/mean": 0.019354315474629402, + "step": 260 + }, + { + "clip_ratio/high_max": 2.0819897144974675e-05, + "clip_ratio/high_mean": 5.204974286243669e-06, + "clip_ratio/low_mean": 3.656347121250292e-05, + "clip_ratio/low_min": 5.0166554501629435e-06, + "clip_ratio/region_mean": 4.176844549874659e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14552.0, + "completions/mean_length": 6275.5390625, + "completions/mean_terminated_length": 6115.087890625, + "completions/min_length": 663.0, + "completions/min_terminated_length": 663.0, + "entropy": 0.901648998260498, + "epoch": 0.24011039558417663, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.0029727297369390726, + "learning_rate": 1e-05, + "loss": 0.0593, + "num_tokens": 211107380.0, + "reward": 0.40625, + "reward_std": 0.4373784065246582, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999792575836182, + "sampling/importance_sampling_ratio/min": 0.00043164435192011297, + "sampling/sampling_logp_difference/max": 7.747908592224121, + "sampling/sampling_logp_difference/mean": 0.019338306039571762, + "step": 261 + }, + { + "clip_ratio/high_max": 4.363734251455753e-05, + "clip_ratio/high_mean": 1.2403264463500818e-05, + "clip_ratio/low_mean": 4.217202859990721e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4575292381287e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16382.0, + "completions/mean_length": 5959.7578125, + "completions/mean_terminated_length": 5877.67724609375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "entropy": 0.8542912155389786, + "epoch": 0.24103035878564857, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028311724308878183, + "learning_rate": 1e-05, + "loss": 0.0733, + "num_tokens": 211890237.0, + "reward": 0.515625, + "reward_std": 0.3816363215446472, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999786615371704, + "sampling/importance_sampling_ratio/min": 0.0007836154545657337, + "sampling/sampling_logp_difference/max": 7.151592254638672, + "sampling/sampling_logp_difference/mean": 0.018685901537537575, + "step": 262 + }, + { + "clip_ratio/high_max": 1.514913219580194e-05, + "clip_ratio/high_mean": 3.787283048950485e-06, + "clip_ratio/low_mean": 3.2207174626819324e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5994458357890835e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16008.0, + "completions/mean_length": 6882.1875, + "completions/mean_terminated_length": 6575.67724609375, + "completions/min_length": 1170.0, + "completions/min_terminated_length": 1170.0, + "entropy": 0.9642625227570534, + "epoch": 0.24195032198712052, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002143653342500329, + "learning_rate": 1e-05, + "loss": 0.0127, + "num_tokens": 212792813.0, + "reward": 0.359375, + "reward_std": 0.3214184641838074, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999080896377563, + "sampling/importance_sampling_ratio/min": 0.0034667642321437597, + "sampling/sampling_logp_difference/max": 5.664533615112305, + "sampling/sampling_logp_difference/mean": 0.020183943212032318, + "step": 263 + }, + { + "clip_ratio/high_max": 1.7900180637298035e-05, + "clip_ratio/high_mean": 4.475045159324509e-06, + "clip_ratio/low_mean": 3.741970294868224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1894748392223846e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 6941.8828125, + "completions/mean_terminated_length": 6715.2724609375, + "completions/min_length": 978.0, + "completions/min_terminated_length": 978.0, + "entropy": 0.9488044381141663, + "epoch": 0.24287028518859247, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014945612056180835, + "learning_rate": 1e-05, + "loss": 0.0948, + "num_tokens": 213703638.0, + "reward": 0.3984375, + "reward_std": 0.24329257011413574, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999405145645142, + "sampling/importance_sampling_ratio/min": 0.0005360813229344785, + "sampling/sampling_logp_difference/max": 7.531224727630615, + "sampling/sampling_logp_difference/mean": 0.02019106224179268, + "step": 264 + }, + { + "clip_ratio/high_max": 4.028359853691654e-06, + "clip_ratio/high_mean": 1.0070899634229136e-06, + "clip_ratio/low_mean": 4.494676113608875e-05, + "clip_ratio/low_min": 3.771535375562962e-06, + "clip_ratio/region_mean": 4.595385098582483e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14403.0, + "completions/mean_length": 6453.2109375, + "completions/mean_terminated_length": 6295.57958984375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.9140987247228622, + "epoch": 0.24379024839006438, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001788914087228477, + "learning_rate": 1e-05, + "loss": 0.0573, + "num_tokens": 214551065.0, + "reward": 0.3984375, + "reward_std": 0.34245961904525757, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999093413352966, + "sampling/importance_sampling_ratio/min": 6.614608719246462e-05, + "sampling/sampling_logp_difference/max": 9.623644828796387, + "sampling/sampling_logp_difference/mean": 0.01938386633992195, + "step": 265 + }, + { + "clip_ratio/high_max": 1.3890341051592259e-05, + "clip_ratio/high_mean": 3.4725852628980647e-06, + "clip_ratio/low_mean": 2.701378042502256e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0486365801607462e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16074.0, + "completions/mean_length": 7625.375, + "completions/mean_terminated_length": 7556.4091796875, + "completions/min_length": 627.0, + "completions/min_terminated_length": 627.0, + "entropy": 0.9313022494316101, + "epoch": 0.24471021159153633, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023314026184380054, + "learning_rate": 1e-05, + "loss": 0.0199, + "num_tokens": 215546625.0, + "reward": 0.3515625, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999741315841675, + "sampling/importance_sampling_ratio/min": 3.250058568937675e-07, + "sampling/sampling_logp_difference/max": 14.939422607421875, + "sampling/sampling_logp_difference/mean": 0.020401259884238243, + "step": 266 + }, + { + "clip_ratio/high_max": 2.9235679903649725e-05, + "clip_ratio/high_mean": 7.308919975912431e-06, + "clip_ratio/low_mean": 2.5110286742346943e-05, + "clip_ratio/low_min": 3.1065162602317287e-06, + "clip_ratio/region_mean": 3.24192064908857e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16084.0, + "completions/mean_length": 6315.3046875, + "completions/mean_terminated_length": 6155.484375, + "completions/min_length": 920.0, + "completions/min_terminated_length": 920.0, + "entropy": 0.8942855522036552, + "epoch": 0.24563017479300828, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003379981964826584, + "learning_rate": 1e-05, + "loss": 0.034, + "num_tokens": 216377176.0, + "reward": 0.421875, + "reward_std": 0.31587696075439453, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999739527702332, + "sampling/importance_sampling_ratio/min": 0.008766444399952888, + "sampling/sampling_logp_difference/max": 4.736824035644531, + "sampling/sampling_logp_difference/mean": 0.01958339475095272, + "step": 267 + }, + { + "clip_ratio/high_max": 1.070113876266987e-05, + "clip_ratio/high_mean": 2.6752846906674677e-06, + "clip_ratio/low_mean": 3.970586050172642e-05, + "clip_ratio/low_min": 5.915619567531394e-06, + "clip_ratio/region_mean": 4.238114468080312e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15699.0, + "completions/mean_length": 7196.7109375, + "completions/mean_terminated_length": 6823.24365234375, + "completions/min_length": 741.0, + "completions/min_terminated_length": 741.0, + "entropy": 1.0663049817085266, + "epoch": 0.24655013799448022, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025235258508473635, + "learning_rate": 1e-05, + "loss": 0.0662, + "num_tokens": 217316755.0, + "reward": 0.3359375, + "reward_std": 0.2893138825893402, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999923586845398, + "sampling/importance_sampling_ratio/min": 0.0007813565316610038, + "sampling/sampling_logp_difference/max": 7.154479026794434, + "sampling/sampling_logp_difference/mean": 0.02093672752380371, + "step": 268 + }, + { + "clip_ratio/high_max": 3.7446132409968413e-05, + "clip_ratio/high_mean": 1.0083826055051759e-05, + "clip_ratio/low_mean": 5.169025735085597e-05, + "clip_ratio/low_min": 5.641812549583847e-06, + "clip_ratio/region_mean": 6.177408295116038e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16286.0, + "completions/max_terminated_length": 16286.0, + "completions/mean_length": 6770.59375, + "completions/mean_terminated_length": 6770.59375, + "completions/min_length": 1048.0, + "completions/min_terminated_length": 1048.0, + "entropy": 1.0205552130937576, + "epoch": 0.24747010119595217, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038966729771345854, + "learning_rate": 1e-05, + "loss": 0.0849, + "num_tokens": 218203975.0, + "reward": 0.4140625, + "reward_std": 0.27564430236816406, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994924068450928, + "sampling/importance_sampling_ratio/min": 2.5875104370243207e-07, + "sampling/sampling_logp_difference/max": 15.167399406433105, + "sampling/sampling_logp_difference/mean": 0.025428105145692825, + "step": 269 + }, + { + "clip_ratio/high_max": 3.3825838272605324e-06, + "clip_ratio/high_mean": 8.456459568151331e-07, + "clip_ratio/low_mean": 2.8302461942075752e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9148108296794817e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15749.0, + "completions/mean_length": 7115.6953125, + "completions/mean_terminated_length": 6968.57958984375, + "completions/min_length": 540.0, + "completions/min_terminated_length": 540.0, + "entropy": 1.0728939920663834, + "epoch": 0.24839006439742412, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0025828159414231777, + "learning_rate": 1e-05, + "loss": 0.0422, + "num_tokens": 219134568.0, + "reward": 0.2890625, + "reward_std": 0.21990221738815308, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999753832817078, + "sampling/importance_sampling_ratio/min": 0.0019932277500629425, + "sampling/sampling_logp_difference/max": 6.2179999351501465, + "sampling/sampling_logp_difference/mean": 0.02109808847308159, + "step": 270 + }, + { + "clip_ratio/high_max": 8.590399147578864e-06, + "clip_ratio/high_mean": 2.147599786894716e-06, + "clip_ratio/low_mean": 4.2856369077526324e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5003969148638134e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15560.0, + "completions/mean_length": 6160.125, + "completions/mean_terminated_length": 5914.75244140625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.8673425689339638, + "epoch": 0.24931002759889603, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002692030044272542, + "learning_rate": 1e-05, + "loss": 0.0386, + "num_tokens": 219943376.0, + "reward": 0.4375, + "reward_std": 0.34717273712158203, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998801350593567, + "sampling/importance_sampling_ratio/min": 0.0021331151947379112, + "sampling/sampling_logp_difference/max": 6.150171756744385, + "sampling/sampling_logp_difference/mean": 0.01947931945323944, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.4606903429667e-05, + "clip_ratio/low_min": 4.498344424064271e-06, + "clip_ratio/region_mean": 4.4606903429667e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14763.0, + "completions/max_terminated_length": 14763.0, + "completions/mean_length": 5778.0234375, + "completions/mean_terminated_length": 5778.0234375, + "completions/min_length": 506.0, + "completions/min_terminated_length": 506.0, + "entropy": 1.1366781443357468, + "epoch": 0.250229990800368, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002457446651533246, + "learning_rate": 1e-05, + "loss": 0.0399, + "num_tokens": 220702603.0, + "reward": 0.3828125, + "reward_std": 0.3400956988334656, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996986985206604, + "sampling/importance_sampling_ratio/min": 1.4515491386646318e-07, + "sampling/sampling_logp_difference/max": 15.745464324951172, + "sampling/sampling_logp_difference/mean": 0.021183129400014877, + "step": 272 + }, + { + "clip_ratio/high_max": 6.248437784961425e-06, + "clip_ratio/high_mean": 2.4186024347727653e-06, + "clip_ratio/low_mean": 1.783873301519634e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.025733522259543e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16265.0, + "completions/mean_length": 7509.078125, + "completions/mean_terminated_length": 7296.08056640625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 1.071702554821968, + "epoch": 0.2511499540018399, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002503670286387205, + "learning_rate": 1e-05, + "loss": -0.0088, + "num_tokens": 221683925.0, + "reward": 0.2734375, + "reward_std": 0.22673209011554718, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999822378158569, + "sampling/importance_sampling_ratio/min": 0.00013993355969432741, + "sampling/sampling_logp_difference/max": 8.874342918395996, + "sampling/sampling_logp_difference/mean": 0.021589912474155426, + "step": 273 + }, + { + "clip_ratio/high_max": 2.347871304664295e-05, + "clip_ratio/high_mean": 6.97559880791232e-06, + "clip_ratio/low_mean": 2.81686479866039e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.514424770401092e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15153.0, + "completions/mean_length": 7383.03125, + "completions/mean_terminated_length": 7092.67724609375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "entropy": 0.8432145267724991, + "epoch": 0.25206991720331184, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002299589104950428, + "learning_rate": 1e-05, + "loss": 0.0212, + "num_tokens": 222648865.0, + "reward": 0.3125, + "reward_std": 0.2845909595489502, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999674558639526, + "sampling/importance_sampling_ratio/min": 2.8099755581934005e-05, + "sampling/sampling_logp_difference/max": 10.47974967956543, + "sampling/sampling_logp_difference/mean": 0.018576428294181824, + "step": 274 + }, + { + "clip_ratio/high_max": 9.285309715778567e-06, + "clip_ratio/high_mean": 3.327153194732091e-06, + "clip_ratio/low_mean": 3.823394035862293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.156109298492083e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16256.0, + "completions/mean_length": 6628.921875, + "completions/mean_terminated_length": 6552.1103515625, + "completions/min_length": 903.0, + "completions/min_terminated_length": 903.0, + "entropy": 0.9039670825004578, + "epoch": 0.2529898804047838, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024530349764972925, + "learning_rate": 1e-05, + "loss": 0.1161, + "num_tokens": 223519175.0, + "reward": 0.59375, + "reward_std": 0.3537701964378357, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999820590019226, + "sampling/importance_sampling_ratio/min": 0.0003009368374478072, + "sampling/sampling_logp_difference/max": 8.108610153198242, + "sampling/sampling_logp_difference/mean": 0.01871109940111637, + "step": 275 + }, + { + "clip_ratio/high_max": 1.5403714087369735e-05, + "clip_ratio/high_mean": 3.850928521842434e-06, + "clip_ratio/low_mean": 3.431152225630285e-05, + "clip_ratio/low_min": 4.570718374452554e-06, + "clip_ratio/region_mean": 3.816245106236238e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16081.0, + "completions/mean_length": 7335.3359375, + "completions/mean_terminated_length": 7118.16845703125, + "completions/min_length": 647.0, + "completions/min_terminated_length": 647.0, + "entropy": 0.8435061648488045, + "epoch": 0.25390984360625574, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019706569146364927, + "learning_rate": 1e-05, + "loss": 0.0068, + "num_tokens": 224479306.0, + "reward": 0.34375, + "reward_std": 0.28223684430122375, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999961793422699, + "sampling/importance_sampling_ratio/min": 3.288762854936067e-06, + "sampling/sampling_logp_difference/max": 12.624999046325684, + "sampling/sampling_logp_difference/mean": 0.018783386796712875, + "step": 276 + }, + { + "clip_ratio/high_max": 1.979319677047897e-05, + "clip_ratio/high_mean": 4.948299192619743e-06, + "clip_ratio/low_mean": 2.4465696469633258e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9413995889626676e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16333.0, + "completions/mean_length": 6052.1953125, + "completions/mean_terminated_length": 5718.9111328125, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.8186529725790024, + "epoch": 0.2548298068077277, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001542358542792499, + "learning_rate": 1e-05, + "loss": 0.0906, + "num_tokens": 225273523.0, + "reward": 0.46875, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004768371582, + "sampling/importance_sampling_ratio/min": 0.0017039870144799352, + "sampling/sampling_logp_difference/max": 6.374784469604492, + "sampling/sampling_logp_difference/mean": 0.0183861143887043, + "step": 277 + }, + { + "clip_ratio/high_max": 2.5990090307459468e-05, + "clip_ratio/high_mean": 6.497522576864867e-06, + "clip_ratio/low_mean": 5.721013076254167e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.370765299834602e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13648.0, + "completions/mean_length": 6560.75, + "completions/mean_terminated_length": 6404.82568359375, + "completions/min_length": 703.0, + "completions/min_terminated_length": 703.0, + "entropy": 1.0198248624801636, + "epoch": 0.25574977000919963, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002488402184098959, + "learning_rate": 1e-05, + "loss": 0.0646, + "num_tokens": 226134235.0, + "reward": 0.375, + "reward_std": 0.3805803954601288, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549388885498, + "sampling/importance_sampling_ratio/min": 5.428586973721394e-06, + "sampling/sampling_logp_difference/max": 12.123831748962402, + "sampling/sampling_logp_difference/mean": 0.020803291350603104, + "step": 278 + }, + { + "clip_ratio/high_max": 1.1638113846856868e-05, + "clip_ratio/high_mean": 2.909528461714217e-06, + "clip_ratio/low_mean": 3.2134936191141605e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.504446431179531e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12694.0, + "completions/max_terminated_length": 12694.0, + "completions/mean_length": 5217.140625, + "completions/mean_terminated_length": 5217.140625, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "entropy": 0.8947679325938225, + "epoch": 0.25666973321067155, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0035258245188742876, + "learning_rate": 1e-05, + "loss": 0.1095, + "num_tokens": 226821989.0, + "reward": 0.6015625, + "reward_std": 0.4092749357223511, + "rewards/accuracy_reward/mean": 0.6015625, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998952150344849, + "sampling/importance_sampling_ratio/min": 1.0208474122919142e-05, + "sampling/sampling_logp_difference/max": 11.492292404174805, + "sampling/sampling_logp_difference/mean": 0.018339669331908226, + "step": 279 + }, + { + "clip_ratio/high_max": 1.1735807220247807e-05, + "clip_ratio/high_mean": 2.9339518050619517e-06, + "clip_ratio/low_mean": 1.676440933806589e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9698360574693652e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16124.0, + "completions/mean_length": 7622.609375, + "completions/mean_terminated_length": 7483.5400390625, + "completions/min_length": 835.0, + "completions/min_terminated_length": 835.0, + "entropy": 0.760207436978817, + "epoch": 0.2575896964121435, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001208966481499374, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 227815683.0, + "reward": 0.4609375, + "reward_std": 0.28513264656066895, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998728632926941, + "sampling/importance_sampling_ratio/min": 4.0069728129310533e-05, + "sampling/sampling_logp_difference/max": 10.124889373779297, + "sampling/sampling_logp_difference/mean": 0.018406979739665985, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.5826797437057394e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.5826797437057394e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15682.0, + "completions/mean_length": 5981.90625, + "completions/mean_terminated_length": 5816.7939453125, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + "entropy": 0.902967743575573, + "epoch": 0.25850965961361544, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001788424444384873, + "learning_rate": 1e-05, + "loss": 0.0531, + "num_tokens": 228599647.0, + "reward": 0.4609375, + "reward_std": 0.23592591285705566, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999909520149231, + "sampling/importance_sampling_ratio/min": 0.0013331151567399502, + "sampling/sampling_logp_difference/max": 6.620236873626709, + "sampling/sampling_logp_difference/mean": 0.018927905708551407, + "step": 281 + }, + { + "clip_ratio/high_max": 1.6327461935361498e-05, + "clip_ratio/high_mean": 4.0818654838403745e-06, + "clip_ratio/low_mean": 3.461411097305245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.86959764000494e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15850.0, + "completions/mean_length": 6156.0, + "completions/mean_terminated_length": 5993.6513671875, + "completions/min_length": 734.0, + "completions/min_terminated_length": 734.0, + "entropy": 0.8951378241181374, + "epoch": 0.2594296228150874, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0039085340686142445, + "learning_rate": 1e-05, + "loss": 0.0263, + "num_tokens": 229405495.0, + "reward": 0.5234375, + "reward_std": 0.304566353559494, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99994957447052, + "sampling/importance_sampling_ratio/min": 0.007635246496647596, + "sampling/sampling_logp_difference/max": 4.8749799728393555, + "sampling/sampling_logp_difference/mean": 0.018469247967004776, + "step": 282 + }, + { + "clip_ratio/high_max": 1.3168388704798417e-05, + "clip_ratio/high_mean": 3.2920971761996043e-06, + "clip_ratio/low_mean": 3.1043596322888334e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4335693726461614e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15808.0, + "completions/mean_length": 7229.234375, + "completions/mean_terminated_length": 6933.9189453125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 1.0803911909461021, + "epoch": 0.26034958601655933, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001881407224573195, + "learning_rate": 1e-05, + "loss": 0.0616, + "num_tokens": 230350725.0, + "reward": 0.2890625, + "reward_std": 0.22225631773471832, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000948905944824, + "sampling/importance_sampling_ratio/min": 3.536981239449233e-05, + "sampling/sampling_logp_difference/max": 10.249651908874512, + "sampling/sampling_logp_difference/mean": 0.021804997697472572, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.664479729399318e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.664479729399318e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16135.0, + "completions/mean_length": 7486.2734375, + "completions/mean_terminated_length": 6971.52880859375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 0.9674680531024933, + "epoch": 0.2612695492180313, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0015280995285138488, + "learning_rate": 1e-05, + "loss": 0.0263, + "num_tokens": 231330664.0, + "reward": 0.234375, + "reward_std": 0.22620804607868195, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999110102653503, + "sampling/importance_sampling_ratio/min": 0.010103696957230568, + "sampling/sampling_logp_difference/max": 4.59485387802124, + "sampling/sampling_logp_difference/mean": 0.02071535401046276, + "step": 284 + }, + { + "clip_ratio/high_max": 7.207103408291005e-06, + "clip_ratio/high_mean": 3.596102942537982e-06, + "clip_ratio/low_mean": 4.2366073103039525e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.596217695507221e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15616.0, + "completions/mean_length": 6439.40625, + "completions/mean_terminated_length": 6361.1025390625, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "entropy": 0.8368510156869888, + "epoch": 0.2621895124195032, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024581989273428917, + "learning_rate": 1e-05, + "loss": 0.026, + "num_tokens": 232174804.0, + "reward": 0.40625, + "reward_std": 0.3527044653892517, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999905228614807, + "sampling/importance_sampling_ratio/min": 0.0010985663393512368, + "sampling/sampling_logp_difference/max": 6.813749313354492, + "sampling/sampling_logp_difference/mean": 0.018181029707193375, + "step": 285 + }, + { + "clip_ratio/high_max": 2.0772107973243692e-05, + "clip_ratio/high_mean": 6.365107253714086e-06, + "clip_ratio/low_mean": 6.206619241311273e-05, + "clip_ratio/low_min": 1.0199641110375524e-05, + "clip_ratio/region_mean": 6.843129881417553e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15297.0, + "completions/mean_length": 6642.3984375, + "completions/mean_terminated_length": 6163.302734375, + "completions/min_length": 488.0, + "completions/min_terminated_length": 488.0, + "entropy": 1.080193243920803, + "epoch": 0.26310947562097514, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0026200765278190374, + "learning_rate": 1e-05, + "loss": 0.1, + "num_tokens": 233042999.0, + "reward": 0.3828125, + "reward_std": 0.31800350546836853, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999808669090271, + "sampling/importance_sampling_ratio/min": 0.00035727949580177665, + "sampling/sampling_logp_difference/max": 7.936992168426514, + "sampling/sampling_logp_difference/mean": 0.020303232595324516, + "step": 286 + }, + { + "clip_ratio/high_max": 2.1764372377219843e-05, + "clip_ratio/high_mean": 5.441093094304961e-06, + "clip_ratio/low_mean": 8.049383222896722e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 8.593492520958534e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16205.0, + "completions/mean_length": 5594.3984375, + "completions/mean_terminated_length": 5509.44091796875, + "completions/min_length": 475.0, + "completions/min_terminated_length": 475.0, + "entropy": 0.8376244381070137, + "epoch": 0.2640294388224471, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028024003840982914, + "learning_rate": 1e-05, + "loss": 0.0317, + "num_tokens": 233778538.0, + "reward": 0.390625, + "reward_std": 0.3566610813140869, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999902844429016, + "sampling/importance_sampling_ratio/min": 0.030517347157001495, + "sampling/sampling_logp_difference/max": 3.489459991455078, + "sampling/sampling_logp_difference/mean": 0.01896265149116516, + "step": 287 + }, + { + "clip_ratio/high_max": 1.9571571556298295e-05, + "clip_ratio/high_mean": 4.892892889074574e-06, + "clip_ratio/low_mean": 1.3305952052178327e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8198844827566063e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16069.0, + "completions/mean_length": 6939.7890625, + "completions/mean_terminated_length": 6635.13671875, + "completions/min_length": 1303.0, + "completions/min_terminated_length": 1303.0, + "entropy": 0.923162192106247, + "epoch": 0.26494940202391903, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0004863851936534047, + "learning_rate": 1e-05, + "loss": 0.0663, + "num_tokens": 234683871.0, + "reward": 0.5234375, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999563694000244, + "sampling/importance_sampling_ratio/min": 4.343670661910437e-05, + "sampling/sampling_logp_difference/max": 10.044205665588379, + "sampling/sampling_logp_difference/mean": 0.018946819007396698, + "step": 288 + }, + { + "clip_ratio/high_max": 2.6291640551789897e-05, + "clip_ratio/high_mean": 6.572910137947474e-06, + "clip_ratio/low_mean": 4.438247970028897e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.0955390179296955e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15671.0, + "completions/mean_length": 5808.1796875, + "completions/mean_terminated_length": 5640.31005859375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.8330265805125237, + "epoch": 0.265869365225391, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003028205828741193, + "learning_rate": 1e-05, + "loss": 0.0318, + "num_tokens": 235446758.0, + "reward": 0.5078125, + "reward_std": 0.20411095023155212, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99998539686203, + "sampling/importance_sampling_ratio/min": 0.05524001643061638, + "sampling/sampling_logp_difference/max": 3.001615524291992, + "sampling/sampling_logp_difference/mean": 0.018604904413223267, + "step": 289 + }, + { + "clip_ratio/high_max": 4.42854116045055e-06, + "clip_ratio/high_mean": 1.1071352901126374e-06, + "clip_ratio/low_mean": 3.1940794087859103e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.30479292642849e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16162.0, + "completions/mean_length": 7106.125, + "completions/mean_terminated_length": 6806.83837890625, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "entropy": 1.0014382004737854, + "epoch": 0.2667893284268629, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022615960333496332, + "learning_rate": 1e-05, + "loss": 0.0369, + "num_tokens": 236377494.0, + "reward": 0.34375, + "reward_std": 0.33614397048950195, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999083280563354, + "sampling/importance_sampling_ratio/min": 0.0008234601118601859, + "sampling/sampling_logp_difference/max": 7.101995468139648, + "sampling/sampling_logp_difference/mean": 0.02129078283905983, + "step": 290 + }, + { + "clip_ratio/high_max": 9.011766906041885e-06, + "clip_ratio/high_mean": 2.252941726510471e-06, + "clip_ratio/low_mean": 2.9379379270721984e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.163232122460613e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16129.0, + "completions/mean_length": 6830.2109375, + "completions/mean_terminated_length": 6360.35205078125, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "entropy": 0.8726402744650841, + "epoch": 0.26770929162833484, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002122451551258564, + "learning_rate": 1e-05, + "loss": 0.0083, + "num_tokens": 237269977.0, + "reward": 0.484375, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999386072158813, + "sampling/importance_sampling_ratio/min": 0.0003835389798041433, + "sampling/sampling_logp_difference/max": 7.866069316864014, + "sampling/sampling_logp_difference/mean": 0.018967002630233765, + "step": 291 + }, + { + "clip_ratio/high_max": 3.987113814218901e-06, + "clip_ratio/high_mean": 9.967784535547253e-07, + "clip_ratio/low_mean": 2.8655875098593242e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9652653552147967e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16246.0, + "completions/max_terminated_length": 16246.0, + "completions/mean_length": 6704.171875, + "completions/mean_terminated_length": 6704.171875, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "entropy": 0.9421284720301628, + "epoch": 0.2686292548298068, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.001218589604832232, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 238147359.0, + "reward": 0.3515625, + "reward_std": 0.2012200504541397, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000030994415283, + "sampling/importance_sampling_ratio/min": 0.002478870330378413, + "sampling/sampling_logp_difference/max": 5.99995231628418, + "sampling/sampling_logp_difference/mean": 0.02092663012444973, + "step": 292 + }, + { + "clip_ratio/high_max": 8.067639100772794e-06, + "clip_ratio/high_mean": 2.0169097751931986e-06, + "clip_ratio/low_mean": 4.687528951308195e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.889219928827515e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15821.0, + "completions/mean_length": 6142.8203125, + "completions/mean_terminated_length": 5639.1552734375, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "entropy": 1.1285494044423103, + "epoch": 0.26954921803127874, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003979295492172241, + "learning_rate": 1e-05, + "loss": 0.0365, + "num_tokens": 238953104.0, + "reward": 0.265625, + "reward_std": 0.2756393849849701, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999568462371826, + "sampling/importance_sampling_ratio/min": 0.00349772023037076, + "sampling/sampling_logp_difference/max": 5.655643939971924, + "sampling/sampling_logp_difference/mean": 0.022049173712730408, + "step": 293 + }, + { + "clip_ratio/high_max": 1.4033725619810866e-05, + "clip_ratio/high_mean": 3.5084314049527165e-06, + "clip_ratio/low_mean": 2.4028336156334262e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7536767788660654e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15710.0, + "completions/mean_length": 5622.296875, + "completions/mean_terminated_length": 5275.14501953125, + "completions/min_length": 396.0, + "completions/min_terminated_length": 396.0, + "entropy": 0.9032362103462219, + "epoch": 0.2704691812327507, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022260278929024935, + "learning_rate": 1e-05, + "loss": 0.0068, + "num_tokens": 239699350.0, + "reward": 0.53125, + "reward_std": 0.2748701572418213, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999663829803467, + "sampling/importance_sampling_ratio/min": 9.907654748531058e-05, + "sampling/sampling_logp_difference/max": 9.21961784362793, + "sampling/sampling_logp_difference/mean": 0.018553178757429123, + "step": 294 + }, + { + "clip_ratio/high_max": 2.0970909417883377e-05, + "clip_ratio/high_mean": 7.081109117734741e-06, + "clip_ratio/low_mean": 2.478300689290336e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.186411640854203e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15532.0, + "completions/mean_length": 7203.6640625, + "completions/mean_terminated_length": 6752.171875, + "completions/min_length": 1073.0, + "completions/min_terminated_length": 1073.0, + "entropy": 0.9958974272012711, + "epoch": 0.27138914443422263, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001666489290073514, + "learning_rate": 1e-05, + "loss": 0.0249, + "num_tokens": 240640387.0, + "reward": 0.484375, + "reward_std": 0.30327308177948, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999366998672485, + "sampling/importance_sampling_ratio/min": 0.003141714259982109, + "sampling/sampling_logp_difference/max": 5.762986660003662, + "sampling/sampling_logp_difference/mean": 0.02084190584719181, + "step": 295 + }, + { + "clip_ratio/high_max": 2.8518336421257118e-05, + "clip_ratio/high_mean": 1.1702542110469949e-05, + "clip_ratio/low_mean": 4.6755864048009244e-05, + "clip_ratio/low_min": 9.262003914045636e-06, + "clip_ratio/region_mean": 5.8458407011130475e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16027.0, + "completions/mean_length": 7692.4765625, + "completions/mean_terminated_length": 7412.2578125, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9312580227851868, + "epoch": 0.27230910763569455, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0019504680531099439, + "learning_rate": 1e-05, + "loss": 0.0514, + "num_tokens": 241647840.0, + "reward": 0.3828125, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998970031738281, + "sampling/importance_sampling_ratio/min": 0.00011594472016440704, + "sampling/sampling_logp_difference/max": 9.062397003173828, + "sampling/sampling_logp_difference/mean": 0.02081790193915367, + "step": 296 + }, + { + "clip_ratio/high_max": 2.4005360501178075e-05, + "clip_ratio/high_mean": 6.001340125294519e-06, + "clip_ratio/low_mean": 3.910731970790948e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.510866097007238e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14196.0, + "completions/mean_length": 6142.09375, + "completions/mean_terminated_length": 6061.44873046875, + "completions/min_length": 967.0, + "completions/min_terminated_length": 967.0, + "entropy": 0.8636585548520088, + "epoch": 0.2732290708371665, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025938916951417923, + "learning_rate": 1e-05, + "loss": 0.0119, + "num_tokens": 242452692.0, + "reward": 0.515625, + "reward_std": 0.2869548797607422, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999980926513672, + "sampling/importance_sampling_ratio/min": 2.320722842341638e-06, + "sampling/sampling_logp_difference/max": 12.973631858825684, + "sampling/sampling_logp_difference/mean": 0.019208990037441254, + "step": 297 + }, + { + "clip_ratio/high_max": 4.168055966147222e-06, + "clip_ratio/high_mean": 1.0420139915368054e-06, + "clip_ratio/low_mean": 3.8637008401565254e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.967902239310206e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16030.0, + "completions/max_terminated_length": 16030.0, + "completions/mean_length": 6112.6171875, + "completions/mean_terminated_length": 6112.6171875, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "entropy": 0.8610381335020065, + "epoch": 0.27414903403863844, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0014701929176226258, + "learning_rate": 1e-05, + "loss": 0.0377, + "num_tokens": 243255243.0, + "reward": 0.3984375, + "reward_std": 0.2998581528663635, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999271035194397, + "sampling/importance_sampling_ratio/min": 4.6073862904449925e-05, + "sampling/sampling_logp_difference/max": 9.985264778137207, + "sampling/sampling_logp_difference/mean": 0.018754754215478897, + "step": 298 + }, + { + "clip_ratio/high_max": 8.054383215494454e-06, + "clip_ratio/high_mean": 2.0135958038736135e-06, + "clip_ratio/low_mean": 4.2183424454833585e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4197020486080874e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16365.0, + "completions/mean_length": 7204.4375, + "completions/mean_terminated_length": 7132.1572265625, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "entropy": 1.0613816231489182, + "epoch": 0.2750689972401104, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023235646076500416, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 244198291.0, + "reward": 0.3203125, + "reward_std": 0.3119252324104309, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999468326568604, + "sampling/importance_sampling_ratio/min": 3.256353693359415e-07, + "sampling/sampling_logp_difference/max": 14.937487602233887, + "sampling/sampling_logp_difference/mean": 0.02158042974770069, + "step": 299 + }, + { + "clip_ratio/high_max": 1.0963113709294703e-05, + "clip_ratio/high_mean": 3.833359528471192e-06, + "clip_ratio/low_mean": 4.1291930529041565e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5125290171199595e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16356.0, + "completions/mean_length": 6308.59375, + "completions/mean_terminated_length": 6066.7841796875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.8048126623034477, + "epoch": 0.27598896044158233, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002957145916298032, + "learning_rate": 1e-05, + "loss": 0.0926, + "num_tokens": 245022975.0, + "reward": 0.484375, + "reward_std": 0.3692649006843567, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999489188194275, + "sampling/importance_sampling_ratio/min": 0.0005304187070578337, + "sampling/sampling_logp_difference/max": 7.541843891143799, + "sampling/sampling_logp_difference/mean": 0.017426976934075356, + "step": 300 + }, + { + "clip_ratio/high_max": 1.863301304183551e-05, + "clip_ratio/high_mean": 4.658253260458878e-06, + "clip_ratio/low_mean": 7.454315527866129e-05, + "clip_ratio/low_min": 8.290224286611192e-06, + "clip_ratio/region_mean": 7.920140842543333e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16265.0, + "completions/mean_length": 6183.75, + "completions/mean_terminated_length": 5938.9443359375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.8879657089710236, + "epoch": 0.2769089236430543, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002814161591231823, + "learning_rate": 1e-05, + "loss": 0.0791, + "num_tokens": 245831183.0, + "reward": 0.46875, + "reward_std": 0.3156445026397705, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999352097511292, + "sampling/importance_sampling_ratio/min": 7.562734390376136e-05, + "sampling/sampling_logp_difference/max": 9.489692687988281, + "sampling/sampling_logp_difference/mean": 0.01883331872522831, + "step": 301 + }, + { + "clip_ratio/high_max": 9.606681487639435e-06, + "clip_ratio/high_mean": 2.4016703719098587e-06, + "clip_ratio/low_mean": 3.564927715160593e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.805094752351579e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15525.0, + "completions/mean_length": 5656.8984375, + "completions/mean_terminated_length": 5310.86279296875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8461362943053246, + "epoch": 0.2778288868445262, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00238890596665442, + "learning_rate": 1e-05, + "loss": 0.1344, + "num_tokens": 246576170.0, + "reward": 0.3984375, + "reward_std": 0.37609970569610596, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999550580978394, + "sampling/importance_sampling_ratio/min": 0.000344505300745368, + "sampling/sampling_logp_difference/max": 7.973401069641113, + "sampling/sampling_logp_difference/mean": 0.01883539929986, + "step": 302 + }, + { + "clip_ratio/high_max": 3.868412022711709e-06, + "clip_ratio/high_mean": 9.671030056779273e-07, + "clip_ratio/low_mean": 4.4275341792854306e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.524244479853223e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14949.0, + "completions/mean_length": 7402.484375, + "completions/mean_terminated_length": 7331.763671875, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "entropy": 0.9303053691983223, + "epoch": 0.27874885004599814, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002722573932260275, + "learning_rate": 1e-05, + "loss": 0.0331, + "num_tokens": 247542448.0, + "reward": 0.359375, + "reward_std": 0.33114904165267944, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998664259910583, + "sampling/importance_sampling_ratio/min": 0.0015035009710118175, + "sampling/sampling_logp_difference/max": 6.4999589920043945, + "sampling/sampling_logp_difference/mean": 0.020525872707366943, + "step": 303 + }, + { + "clip_ratio/high_max": 3.7332376905396814e-06, + "clip_ratio/high_mean": 9.333094226349203e-07, + "clip_ratio/low_mean": 2.2581028019885707e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3514337442520628e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15655.0, + "completions/mean_length": 6920.7734375, + "completions/mean_terminated_length": 6455.36865234375, + "completions/min_length": 909.0, + "completions/min_terminated_length": 909.0, + "entropy": 0.9233825877308846, + "epoch": 0.2796688132474701, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0024008466862142086, + "learning_rate": 1e-05, + "loss": 0.0349, + "num_tokens": 248446787.0, + "reward": 0.328125, + "reward_std": 0.2359210103750229, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999996423721313, + "sampling/importance_sampling_ratio/min": 0.00010231315536657348, + "sampling/sampling_logp_difference/max": 9.187472343444824, + "sampling/sampling_logp_difference/mean": 0.01887384243309498, + "step": 304 + }, + { + "clip_ratio/high_max": 1.1328072105243336e-05, + "clip_ratio/high_mean": 2.832018026310834e-06, + "clip_ratio/low_mean": 3.6861969306301035e-05, + "clip_ratio/low_min": 4.25054395236657e-06, + "clip_ratio/region_mean": 3.969398790104606e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16050.0, + "completions/mean_length": 6658.7109375, + "completions/mean_terminated_length": 6504.341796875, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "entropy": 0.9102077335119247, + "epoch": 0.28058877644894203, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016227345913648605, + "learning_rate": 1e-05, + "loss": 0.0684, + "num_tokens": 249318094.0, + "reward": 0.5078125, + "reward_std": 0.2624938488006592, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998591542243958, + "sampling/importance_sampling_ratio/min": 0.0038418183103203773, + "sampling/sampling_logp_difference/max": 5.561809539794922, + "sampling/sampling_logp_difference/mean": 0.019931891933083534, + "step": 305 + }, + { + "clip_ratio/high_max": 5.2942118600185495e-06, + "clip_ratio/high_mean": 1.3235529650046374e-06, + "clip_ratio/low_mean": 4.644989053304016e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7773443156984285e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16176.0, + "completions/mean_length": 8597.84375, + "completions/mean_terminated_length": 8346.6767578125, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "entropy": 0.9965319409966469, + "epoch": 0.281508739650414, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023056245408952236, + "learning_rate": 1e-05, + "loss": 0.0458, + "num_tokens": 250435674.0, + "reward": 0.296875, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000011682510376, + "sampling/importance_sampling_ratio/min": 0.005126871634274721, + "sampling/sampling_logp_difference/max": 5.27325963973999, + "sampling/sampling_logp_difference/mean": 0.02132929116487503, + "step": 306 + }, + { + "clip_ratio/high_max": 8.388911510337493e-06, + "clip_ratio/high_mean": 2.0972278775843733e-06, + "clip_ratio/low_mean": 4.1705150920279266e-05, + "clip_ratio/low_min": 5.85781890549697e-06, + "clip_ratio/region_mean": 4.380237885470706e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14905.0, + "completions/max_terminated_length": 14905.0, + "completions/mean_length": 6053.0390625, + "completions/mean_terminated_length": 6053.0390625, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "entropy": 1.0717384740710258, + "epoch": 0.2824287028518859, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022299408446997404, + "learning_rate": 1e-05, + "loss": 0.0054, + "num_tokens": 251232847.0, + "reward": 0.3515625, + "reward_std": 0.26143795251846313, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000006914138794, + "sampling/importance_sampling_ratio/min": 0.0024789744056761265, + "sampling/sampling_logp_difference/max": 5.999910354614258, + "sampling/sampling_logp_difference/mean": 0.021233227103948593, + "step": 307 + }, + { + "clip_ratio/high_max": 1.0162047374251415e-05, + "clip_ratio/high_mean": 2.5405118435628538e-06, + "clip_ratio/low_mean": 5.296576864566305e-05, + "clip_ratio/low_min": 8.864200026437175e-06, + "clip_ratio/region_mean": 5.550628043238248e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15929.0, + "completions/mean_length": 6553.7109375, + "completions/mean_terminated_length": 6476.30712890625, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "entropy": 0.9829569607973099, + "epoch": 0.28334866605335784, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0026091893669217825, + "learning_rate": 1e-05, + "loss": 0.0384, + "num_tokens": 252088154.0, + "reward": 0.4140625, + "reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999917149543762, + "sampling/importance_sampling_ratio/min": 0.0010629174066707492, + "sampling/sampling_logp_difference/max": 6.846737861633301, + "sampling/sampling_logp_difference/mean": 0.020414084196090698, + "step": 308 + }, + { + "clip_ratio/high_max": 9.021045798363048e-06, + "clip_ratio/high_mean": 2.255261449590762e-06, + "clip_ratio/low_mean": 3.9386548451147974e-05, + "clip_ratio/low_min": 4.476596132008126e-06, + "clip_ratio/region_mean": 4.1641809502834803e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15218.0, + "completions/mean_length": 6391.7421875, + "completions/mean_terminated_length": 5985.552734375, + "completions/min_length": 468.0, + "completions/min_terminated_length": 468.0, + "entropy": 0.7887687161564827, + "epoch": 0.2842686292548298, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0018632705323398113, + "learning_rate": 1e-05, + "loss": 0.1007, + "num_tokens": 252926073.0, + "reward": 0.4609375, + "reward_std": 0.33903977274894714, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999994158744812, + "sampling/importance_sampling_ratio/min": 0.0001141107059083879, + "sampling/sampling_logp_difference/max": 9.078341484069824, + "sampling/sampling_logp_difference/mean": 0.016558727249503136, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.388932546182332e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.388932546182332e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15492.0, + "completions/mean_length": 7519.140625, + "completions/mean_terminated_length": 7306.38427734375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.8663278818130493, + "epoch": 0.28518859245630174, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0014314674772322178, + "learning_rate": 1e-05, + "loss": 0.0432, + "num_tokens": 253908571.0, + "reward": 0.296875, + "reward_std": 0.21436560153961182, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999785423278809, + "sampling/importance_sampling_ratio/min": 9.006411971768102e-08, + "sampling/sampling_logp_difference/max": 16.22274398803711, + "sampling/sampling_logp_difference/mean": 0.019052794203162193, + "step": 310 + }, + { + "clip_ratio/high_max": 4.941101906297263e-06, + "clip_ratio/high_mean": 1.2352754765743157e-06, + "clip_ratio/low_mean": 1.9741319533750357e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0976595237698348e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15343.0, + "completions/max_terminated_length": 15343.0, + "completions/mean_length": 5273.7265625, + "completions/mean_terminated_length": 5273.7265625, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "entropy": 0.973240926861763, + "epoch": 0.2861085556577737, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00404210714623332, + "learning_rate": 1e-05, + "loss": 0.0706, + "num_tokens": 254601856.0, + "reward": 0.4921875, + "reward_std": 0.25460803508758545, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999933123588562, + "sampling/importance_sampling_ratio/min": 5.1447856094455346e-05, + "sampling/sampling_logp_difference/max": 9.8749418258667, + "sampling/sampling_logp_difference/mean": 0.01859421283006668, + "step": 311 + }, + { + "clip_ratio/high_max": 9.725902600621339e-06, + "clip_ratio/high_mean": 2.4314756501553347e-06, + "clip_ratio/low_mean": 2.9865542501283926e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2297018492499774e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16018.0, + "completions/mean_length": 6674.5390625, + "completions/mean_terminated_length": 6598.08642578125, + "completions/min_length": 719.0, + "completions/min_terminated_length": 719.0, + "entropy": 0.9493648260831833, + "epoch": 0.28702851885924563, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003681440372020006, + "learning_rate": 1e-05, + "loss": 0.0347, + "num_tokens": 255474357.0, + "reward": 0.359375, + "reward_std": 0.21884137392044067, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998538494110107, + "sampling/importance_sampling_ratio/min": 4.5425484131556004e-05, + "sampling/sampling_logp_difference/max": 9.99943733215332, + "sampling/sampling_logp_difference/mean": 0.020322658121585846, + "step": 312 + }, + { + "clip_ratio/high_max": 1.3442999488688656e-05, + "clip_ratio/high_mean": 4.46992856950601e-06, + "clip_ratio/low_mean": 4.9175514504895546e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3645443131244974e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15713.0, + "completions/mean_length": 7113.59375, + "completions/mean_terminated_length": 6736.74755859375, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "entropy": 0.8717286512255669, + "epoch": 0.28794848206071755, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0014825655380263925, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 256405745.0, + "reward": 0.3984375, + "reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999269247055054, + "sampling/importance_sampling_ratio/min": 0.0015039225108921528, + "sampling/sampling_logp_difference/max": 6.499678611755371, + "sampling/sampling_logp_difference/mean": 0.019822338595986366, + "step": 313 + }, + { + "clip_ratio/high_max": 2.0328425534898997e-05, + "clip_ratio/high_mean": 6.525457763473241e-06, + "clip_ratio/low_mean": 1.983899721835769e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.636445498183093e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15655.0, + "completions/mean_length": 5819.9765625, + "completions/mean_terminated_length": 5736.79541015625, + "completions/min_length": 608.0, + "completions/min_terminated_length": 608.0, + "entropy": 0.9206694886088371, + "epoch": 0.2888684452621895, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002185023855417967, + "learning_rate": 1e-05, + "loss": 0.0957, + "num_tokens": 257171214.0, + "reward": 0.4375, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999966025352478, + "sampling/importance_sampling_ratio/min": 0.0011616232804954052, + "sampling/sampling_logp_difference/max": 6.757936954498291, + "sampling/sampling_logp_difference/mean": 0.018492478877305984, + "step": 314 + }, + { + "clip_ratio/high_max": 2.2664371726932586e-05, + "clip_ratio/high_mean": 6.88441667762163e-06, + "clip_ratio/low_mean": 4.306056735003949e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.994498453925189e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16170.0, + "completions/mean_length": 6754.7109375, + "completions/mean_terminated_length": 6523.6083984375, + "completions/min_length": 531.0, + "completions/min_terminated_length": 531.0, + "entropy": 0.8881036639213562, + "epoch": 0.28978840846366144, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022363397292792797, + "learning_rate": 1e-05, + "loss": 0.1086, + "num_tokens": 258064049.0, + "reward": 0.5234375, + "reward_std": 0.3448137044906616, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999760389328003, + "sampling/importance_sampling_ratio/min": 0.0005261205951683223, + "sampling/sampling_logp_difference/max": 7.549980163574219, + "sampling/sampling_logp_difference/mean": 0.01989433914422989, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.3297232107543095e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3297232107543095e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15599.0, + "completions/mean_length": 7953.421875, + "completions/mean_terminated_length": 7610.71533203125, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.9007300287485123, + "epoch": 0.2907083716651334, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001413302612490952, + "learning_rate": 1e-05, + "loss": 0.0583, + "num_tokens": 259098655.0, + "reward": 0.3203125, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999911785125732, + "sampling/importance_sampling_ratio/min": 0.00017562482389621437, + "sampling/sampling_logp_difference/max": 8.647160530090332, + "sampling/sampling_logp_difference/mean": 0.019421691074967384, + "step": 316 + }, + { + "clip_ratio/high_max": 3.664743485387589e-05, + "clip_ratio/high_mean": 1.2026366050577053e-05, + "clip_ratio/low_mean": 3.211230455235636e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4138670659776835e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15430.0, + "completions/mean_length": 6669.390625, + "completions/mean_terminated_length": 6515.19091796875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.8598581254482269, + "epoch": 0.29162833486660533, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018268795683979988, + "learning_rate": 1e-05, + "loss": 0.0602, + "num_tokens": 259971017.0, + "reward": 0.4453125, + "reward_std": 0.2896084189414978, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999293088912964, + "sampling/importance_sampling_ratio/min": 1.7517091066565627e-07, + "sampling/sampling_logp_difference/max": 15.557503700256348, + "sampling/sampling_logp_difference/mean": 0.01863129623234272, + "step": 317 + }, + { + "clip_ratio/high_max": 5.219860668148613e-06, + "clip_ratio/high_mean": 1.3049651670371532e-06, + "clip_ratio/low_mean": 2.3785564053468988e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.509052933419298e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11342.0, + "completions/max_terminated_length": 11342.0, + "completions/mean_length": 5268.2890625, + "completions/mean_terminated_length": 5268.2890625, + "completions/min_length": 818.0, + "completions/min_terminated_length": 818.0, + "entropy": 0.8647450804710388, + "epoch": 0.29254829806807725, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027839087415486574, + "learning_rate": 1e-05, + "loss": 0.1259, + "num_tokens": 260663534.0, + "reward": 0.6171875, + "reward_std": 0.3345640003681183, + "rewards/accuracy_reward/mean": 0.6171875, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998882412910461, + "sampling/importance_sampling_ratio/min": 0.008392918854951859, + "sampling/sampling_logp_difference/max": 4.780366897583008, + "sampling/sampling_logp_difference/mean": 0.017936093732714653, + "step": 318 + }, + { + "clip_ratio/high_max": 3.5293785458634375e-06, + "clip_ratio/high_mean": 8.823446364658594e-07, + "clip_ratio/low_mean": 3.2431569934487925e-05, + "clip_ratio/low_min": 3.789371476159431e-06, + "clip_ratio/region_mean": 3.331391440042353e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14955.0, + "completions/mean_length": 7037.0, + "completions/mean_terminated_length": 6496.26416015625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.9258207008242607, + "epoch": 0.2934682612695492, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002726807491853833, + "learning_rate": 1e-05, + "loss": 0.1071, + "num_tokens": 261583222.0, + "reward": 0.4375, + "reward_std": 0.2782978415489197, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999408721923828, + "sampling/importance_sampling_ratio/min": 0.0004893821314908564, + "sampling/sampling_logp_difference/max": 7.622366905212402, + "sampling/sampling_logp_difference/mean": 0.019336845725774765, + "step": 319 + }, + { + "clip_ratio/high_max": 3.219348491256824e-05, + "clip_ratio/high_mean": 8.04837122814206e-06, + "clip_ratio/low_mean": 3.258790718518867e-05, + "clip_ratio/low_min": 6.961073722777655e-06, + "clip_ratio/region_mean": 4.0636279095451755e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15453.0, + "completions/mean_length": 6469.78125, + "completions/mean_terminated_length": 6391.71630859375, + "completions/min_length": 652.0, + "completions/min_terminated_length": 652.0, + "entropy": 0.9932648614048958, + "epoch": 0.29438822447102114, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00209408369846642, + "learning_rate": 1e-05, + "loss": 0.0446, + "num_tokens": 262430162.0, + "reward": 0.375, + "reward_std": 0.3640199303627014, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999074339866638, + "sampling/importance_sampling_ratio/min": 0.003386466298252344, + "sampling/sampling_logp_difference/max": 5.6879682540893555, + "sampling/sampling_logp_difference/mean": 0.020799942314624786, + "step": 320 + }, + { + "clip_ratio/high_max": 2.827135813276982e-05, + "clip_ratio/high_mean": 8.08931497431331e-06, + "clip_ratio/low_mean": 4.0315980186278466e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.840529436478391e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15815.0, + "completions/max_terminated_length": 15815.0, + "completions/mean_length": 5471.6953125, + "completions/mean_terminated_length": 5471.6953125, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "entropy": 0.979861818253994, + "epoch": 0.2953081876724931, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0032934497576206923, + "learning_rate": 1e-05, + "loss": 0.0511, + "num_tokens": 263148331.0, + "reward": 0.4453125, + "reward_std": 0.3440523147583008, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000145435333252, + "sampling/importance_sampling_ratio/min": 4.68981761514442e-06, + "sampling/sampling_logp_difference/max": 12.270116806030273, + "sampling/sampling_logp_difference/mean": 0.019479844719171524, + "step": 321 + }, + { + "clip_ratio/high_max": 1.3237559869594406e-05, + "clip_ratio/high_mean": 3.3093899673986016e-06, + "clip_ratio/low_mean": 5.419432636699639e-05, + "clip_ratio/low_min": 3.509559974190779e-06, + "clip_ratio/region_mean": 5.750371656176867e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16121.0, + "completions/mean_length": 6640.65625, + "completions/mean_terminated_length": 6161.47509765625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8560378029942513, + "epoch": 0.29622815087396503, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0014544804580509663, + "learning_rate": 1e-05, + "loss": 0.1159, + "num_tokens": 264017391.0, + "reward": 0.515625, + "reward_std": 0.31983357667922974, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999976396560669, + "sampling/importance_sampling_ratio/min": 0.00810791365802288, + "sampling/sampling_logp_difference/max": 4.814914703369141, + "sampling/sampling_logp_difference/mean": 0.01882140152156353, + "step": 322 + }, + { + "clip_ratio/high_max": 3.979497705586255e-06, + "clip_ratio/high_mean": 9.948744263965636e-07, + "clip_ratio/low_mean": 3.569043906281877e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.668531348921533e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16249.0, + "completions/mean_length": 5950.7421875, + "completions/mean_terminated_length": 5700.34423828125, + "completions/min_length": 873.0, + "completions/min_terminated_length": 873.0, + "entropy": 0.9033292010426521, + "epoch": 0.297148114075437, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001294711953960359, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 264799326.0, + "reward": 0.5546875, + "reward_std": 0.22621294856071472, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000641345977783, + "sampling/importance_sampling_ratio/min": 0.0011992956278845668, + "sampling/sampling_logp_difference/max": 6.726020812988281, + "sampling/sampling_logp_difference/mean": 0.019538050517439842, + "step": 323 + }, + { + "clip_ratio/high_max": 3.0064740258239908e-05, + "clip_ratio/high_mean": 7.516185064559977e-06, + "clip_ratio/low_mean": 3.826810700502392e-05, + "clip_ratio/low_min": 4.875575541518629e-06, + "clip_ratio/region_mean": 4.578429286539176e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15068.0, + "completions/mean_length": 6356.0703125, + "completions/mean_terminated_length": 6196.89697265625, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.8268664851784706, + "epoch": 0.2980680772769089, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022473863791674376, + "learning_rate": 1e-05, + "loss": 0.072, + "num_tokens": 265630895.0, + "reward": 0.4375, + "reward_std": 0.31011277437210083, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999737739562988, + "sampling/importance_sampling_ratio/min": 2.5895053113345057e-05, + "sampling/sampling_logp_difference/max": 10.561458587646484, + "sampling/sampling_logp_difference/mean": 0.01843554526567459, + "step": 324 + }, + { + "clip_ratio/high_max": 1.8887641999754123e-05, + "clip_ratio/high_mean": 5.5906657507875934e-06, + "clip_ratio/low_mean": 7.594743829031358e-05, + "clip_ratio/low_min": 8.592850917921169e-06, + "clip_ratio/region_mean": 8.153810449584853e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15371.0, + "completions/mean_length": 7955.546875, + "completions/mean_terminated_length": 7821.76220703125, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "entropy": 0.9475079327821732, + "epoch": 0.29898804047838085, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0023036333732306957, + "learning_rate": 1e-05, + "loss": 0.0499, + "num_tokens": 266666285.0, + "reward": 0.421875, + "reward_std": 0.36008089780807495, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998997449874878, + "sampling/importance_sampling_ratio/min": 1.0642166614616144e-07, + "sampling/sampling_logp_difference/max": 16.055856704711914, + "sampling/sampling_logp_difference/mean": 0.020778125151991844, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.9688118729609414e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9688118729609414e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16247.0, + "completions/mean_length": 7701.7578125, + "completions/mean_terminated_length": 6965.974609375, + "completions/min_length": 685.0, + "completions/min_terminated_length": 685.0, + "entropy": 0.8349794074892998, + "epoch": 0.2999080036798528, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0020953568164259195, + "learning_rate": 1e-05, + "loss": 0.0344, + "num_tokens": 267669230.0, + "reward": 0.46875, + "reward_std": 0.17176413536071777, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999356269836426, + "sampling/importance_sampling_ratio/min": 0.010210023261606693, + "sampling/sampling_logp_difference/max": 4.584385395050049, + "sampling/sampling_logp_difference/mean": 0.018453046679496765, + "step": 326 + }, + { + "clip_ratio/high_max": 1.9330177565279882e-05, + "clip_ratio/high_mean": 4.832544391319971e-06, + "clip_ratio/low_mean": 3.980111284818122e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4633657012127514e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16374.0, + "completions/mean_length": 7335.40625, + "completions/mean_terminated_length": 7118.240234375, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "entropy": 0.9238340929150581, + "epoch": 0.30082796688132474, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016563549870625138, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 268627714.0, + "reward": 0.390625, + "reward_std": 0.32036250829696655, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999822378158569, + "sampling/importance_sampling_ratio/min": 0.0011709382524713874, + "sampling/sampling_logp_difference/max": 6.749949932098389, + "sampling/sampling_logp_difference/mean": 0.019696014001965523, + "step": 327 + }, + { + "clip_ratio/high_max": 1.5036271179269534e-05, + "clip_ratio/high_mean": 3.7590677948173834e-06, + "clip_ratio/low_mean": 4.6864498017384904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.062356603957596e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15040.0, + "completions/max_terminated_length": 15040.0, + "completions/mean_length": 6259.875, + "completions/mean_terminated_length": 6259.875, + "completions/min_length": 1012.0, + "completions/min_terminated_length": 1012.0, + "entropy": 1.0842352360486984, + "epoch": 0.3017479300827967, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017849374562501907, + "learning_rate": 1e-05, + "loss": 0.0279, + "num_tokens": 269447338.0, + "reward": 0.3984375, + "reward_std": 0.2977364957332611, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998852014541626, + "sampling/importance_sampling_ratio/min": 0.009620909579098225, + "sampling/sampling_logp_difference/max": 4.6438164710998535, + "sampling/sampling_logp_difference/mean": 0.020421095192432404, + "step": 328 + }, + { + "clip_ratio/high_max": 1.4728739188285545e-05, + "clip_ratio/high_mean": 3.682184797071386e-06, + "clip_ratio/low_mean": 2.7205874630453764e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.08880598822725e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15784.0, + "completions/max_terminated_length": 15784.0, + "completions/mean_length": 7626.125, + "completions/mean_terminated_length": 7626.125, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "entropy": 1.1077729761600494, + "epoch": 0.30266789328426863, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017999790143221617, + "learning_rate": 1e-05, + "loss": 0.0556, + "num_tokens": 270444594.0, + "reward": 0.390625, + "reward_std": 0.24381662905216217, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99991375207901, + "sampling/importance_sampling_ratio/min": 2.4265028741865535e-07, + "sampling/sampling_logp_difference/max": 15.231644630432129, + "sampling/sampling_logp_difference/mean": 0.021409697830677032, + "step": 329 + }, + { + "clip_ratio/high_max": 1.5701789834565716e-05, + "clip_ratio/high_mean": 3.925447458641429e-06, + "clip_ratio/low_mean": 3.2665291655575857e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.659073934159096e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15531.0, + "completions/max_terminated_length": 15531.0, + "completions/mean_length": 5581.5625, + "completions/mean_terminated_length": 5581.5625, + "completions/min_length": 532.0, + "completions/min_terminated_length": 532.0, + "entropy": 0.8401889503002167, + "epoch": 0.30358785648574055, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0031031551770865917, + "learning_rate": 1e-05, + "loss": 0.0458, + "num_tokens": 271177242.0, + "reward": 0.625, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999693036079407, + "sampling/importance_sampling_ratio/min": 0.00020852939633186907, + "sampling/sampling_logp_difference/max": 8.475430488586426, + "sampling/sampling_logp_difference/mean": 0.017869479954242706, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.981169902544934e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.981169902544934e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15973.0, + "completions/mean_length": 6442.84375, + "completions/mean_terminated_length": 6364.56689453125, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "entropy": 0.8304163441061974, + "epoch": 0.3045078196872125, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002635185606777668, + "learning_rate": 1e-05, + "loss": 0.037, + "num_tokens": 272021830.0, + "reward": 0.4609375, + "reward_std": 0.2501322627067566, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000011920928955, + "sampling/importance_sampling_ratio/min": 0.0004586660652421415, + "sampling/sampling_logp_difference/max": 7.687188148498535, + "sampling/sampling_logp_difference/mean": 0.01730487309396267, + "step": 331 + }, + { + "clip_ratio/high_max": 2.2348198399413377e-05, + "clip_ratio/high_mean": 6.557516371685779e-06, + "clip_ratio/low_mean": 5.170885208372056e-05, + "clip_ratio/low_min": 4.756469024869148e-06, + "clip_ratio/region_mean": 5.826636891015369e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15868.0, + "completions/mean_length": 6052.265625, + "completions/mean_terminated_length": 5888.27001953125, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.9033217504620552, + "epoch": 0.30542778288868444, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0031849017832428217, + "learning_rate": 1e-05, + "loss": 0.0572, + "num_tokens": 272818080.0, + "reward": 0.3359375, + "reward_std": 0.3469353914260864, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999919533729553, + "sampling/importance_sampling_ratio/min": 2.2380504560715053e-07, + "sampling/sampling_logp_difference/max": 15.312490463256836, + "sampling/sampling_logp_difference/mean": 0.019191090017557144, + "step": 332 + }, + { + "clip_ratio/high_max": 3.71780379282427e-06, + "clip_ratio/high_mean": 9.294509482060676e-07, + "clip_ratio/low_mean": 6.115805626905058e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.20875071035698e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16068.0, + "completions/max_terminated_length": 16068.0, + "completions/mean_length": 6337.5859375, + "completions/mean_terminated_length": 6337.5859375, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "entropy": 1.0558827072381973, + "epoch": 0.3063477460901564, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002086545340716839, + "learning_rate": 1e-05, + "loss": 0.0052, + "num_tokens": 273648579.0, + "reward": 0.3203125, + "reward_std": 0.31276631355285645, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000107288360596, + "sampling/importance_sampling_ratio/min": 7.982287934282795e-05, + "sampling/sampling_logp_difference/max": 9.435700416564941, + "sampling/sampling_logp_difference/mean": 0.021268527954816818, + "step": 333 + }, + { + "clip_ratio/high_max": 1.228984365297947e-05, + "clip_ratio/high_mean": 3.0724609132448677e-06, + "clip_ratio/low_mean": 3.2620800709537434e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.56932616227823e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15556.0, + "completions/mean_length": 6439.78125, + "completions/mean_terminated_length": 6361.48046875, + "completions/min_length": 890.0, + "completions/min_terminated_length": 890.0, + "entropy": 0.989262692630291, + "epoch": 0.30726770929162833, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002226081909611821, + "learning_rate": 1e-05, + "loss": 0.0583, + "num_tokens": 274493159.0, + "reward": 0.3984375, + "reward_std": 0.18884867429733276, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000194311141968, + "sampling/importance_sampling_ratio/min": 0.03169185668230057, + "sampling/sampling_logp_difference/max": 3.451695442199707, + "sampling/sampling_logp_difference/mean": 0.019788069650530815, + "step": 334 + }, + { + "clip_ratio/high_max": 7.10556764715875e-06, + "clip_ratio/high_mean": 1.7763919117896876e-06, + "clip_ratio/low_mean": 3.469589137239382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.647228299996641e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16050.0, + "completions/mean_length": 7641.5234375, + "completions/mean_terminated_length": 7572.68505859375, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "entropy": 1.1427540630102158, + "epoch": 0.30818767249310025, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022452943958342075, + "learning_rate": 1e-05, + "loss": 0.0418, + "num_tokens": 275490762.0, + "reward": 0.203125, + "reward_std": 0.2567248046398163, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999743700027466, + "sampling/importance_sampling_ratio/min": 0.0003476575657259673, + "sampling/sampling_logp_difference/max": 7.964292526245117, + "sampling/sampling_logp_difference/mean": 0.022936880588531494, + "step": 335 + }, + { + "clip_ratio/high_max": 3.430955530348001e-06, + "clip_ratio/high_mean": 8.577388825870003e-07, + "clip_ratio/low_mean": 1.611294828762766e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6970687056527822e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15717.0, + "completions/mean_length": 6291.046875, + "completions/mean_terminated_length": 6211.57470703125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 1.1789169162511826, + "epoch": 0.3091076356945722, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.001387307420372963, + "learning_rate": 1e-05, + "loss": -0.0026, + "num_tokens": 276314904.0, + "reward": 0.28125, + "reward_std": 0.1712273508310318, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000487565994263, + "sampling/importance_sampling_ratio/min": 0.012205099686980247, + "sampling/sampling_logp_difference/max": 4.4059014320373535, + "sampling/sampling_logp_difference/mean": 0.020597899332642555, + "step": 336 + }, + { + "clip_ratio/high_max": 1.1513505342009012e-05, + "clip_ratio/high_mean": 2.878376335502253e-06, + "clip_ratio/low_mean": 5.239053416516981e-05, + "clip_ratio/low_min": 5.946967576164752e-06, + "clip_ratio/region_mean": 5.526891072804574e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15962.0, + "completions/mean_length": 7677.5, + "completions/mean_terminated_length": 7019.025390625, + "completions/min_length": 647.0, + "completions/min_terminated_length": 647.0, + "entropy": 0.9808845967054367, + "epoch": 0.31002759889604414, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018187003443017602, + "learning_rate": 1e-05, + "loss": 0.0705, + "num_tokens": 277320888.0, + "reward": 0.25, + "reward_std": 0.2880108058452606, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999768733978271, + "sampling/importance_sampling_ratio/min": 0.0001234103983733803, + "sampling/sampling_logp_difference/max": 8.999995231628418, + "sampling/sampling_logp_difference/mean": 0.0210642758756876, + "step": 337 + }, + { + "clip_ratio/high_max": 1.7702866443869425e-05, + "clip_ratio/high_mean": 4.425716610967356e-06, + "clip_ratio/low_mean": 4.517976913120947e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.960548540111631e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15783.0, + "completions/mean_length": 7066.1484375, + "completions/mean_terminated_length": 6992.779296875, + "completions/min_length": 580.0, + "completions/min_terminated_length": 580.0, + "entropy": 1.0734655261039734, + "epoch": 0.3109475620975161, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019406796200200915, + "learning_rate": 1e-05, + "loss": 0.0484, + "num_tokens": 278245739.0, + "reward": 0.3359375, + "reward_std": 0.29249146580696106, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999477863311768, + "sampling/importance_sampling_ratio/min": 0.004089824389666319, + "sampling/sampling_logp_difference/max": 5.499253273010254, + "sampling/sampling_logp_difference/mean": 0.020316962152719498, + "step": 338 + }, + { + "clip_ratio/high_max": 1.661570968281012e-05, + "clip_ratio/high_mean": 5.1870877086912515e-06, + "clip_ratio/low_mean": 1.647002238769346e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.165711032375839e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14474.0, + "completions/max_terminated_length": 14474.0, + "completions/mean_length": 5187.5078125, + "completions/mean_terminated_length": 5187.5078125, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "entropy": 0.9958596602082253, + "epoch": 0.31186752529898804, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0023044368717819452, + "learning_rate": 1e-05, + "loss": -0.002, + "num_tokens": 278933796.0, + "reward": 0.453125, + "reward_std": 0.22331714630126953, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999975562095642, + "sampling/importance_sampling_ratio/min": 1.0969968570861965e-05, + "sampling/sampling_logp_difference/max": 11.42034912109375, + "sampling/sampling_logp_difference/mean": 0.019379254430532455, + "step": 339 + }, + { + "clip_ratio/high_max": 1.5325686035794206e-05, + "clip_ratio/high_mean": 3.8314215089485515e-06, + "clip_ratio/low_mean": 2.3057583121044445e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.688900440261932e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15550.0, + "completions/mean_length": 6871.0859375, + "completions/mean_terminated_length": 6484.3818359375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.8953125178813934, + "epoch": 0.31278748850046, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026841885410249233, + "learning_rate": 1e-05, + "loss": 0.0285, + "num_tokens": 279832175.0, + "reward": 0.4296875, + "reward_std": 0.3595392107963562, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000001311302185, + "sampling/importance_sampling_ratio/min": 0.004663798026740551, + "sampling/sampling_logp_difference/max": 5.36792516708374, + "sampling/sampling_logp_difference/mean": 0.019127724692225456, + "step": 340 + }, + { + "clip_ratio/high_max": 1.315804820478661e-05, + "clip_ratio/high_mean": 4.150227596255718e-06, + "clip_ratio/low_mean": 3.6840762675183214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0990990044065256e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14255.0, + "completions/mean_length": 6459.2109375, + "completions/mean_terminated_length": 6381.06298828125, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.8647114709019661, + "epoch": 0.3137074517019319, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0014444541884586215, + "learning_rate": 1e-05, + "loss": 0.0198, + "num_tokens": 280678482.0, + "reward": 0.2734375, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999886751174927, + "sampling/importance_sampling_ratio/min": 0.0019316815305501223, + "sampling/sampling_logp_difference/max": 6.249364376068115, + "sampling/sampling_logp_difference/mean": 0.01974722556769848, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.500776003624196e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.500776003624196e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16081.0, + "completions/mean_length": 6280.0546875, + "completions/mean_terminated_length": 6037.56005859375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.9132707491517067, + "epoch": 0.31462741490340385, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001992191653698683, + "learning_rate": 1e-05, + "loss": 0.0252, + "num_tokens": 281499753.0, + "reward": 0.375, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999694228172302, + "sampling/importance_sampling_ratio/min": 2.558048436185345e-05, + "sampling/sampling_logp_difference/max": 10.573680877685547, + "sampling/sampling_logp_difference/mean": 0.01896769367158413, + "step": 342 + }, + { + "clip_ratio/high_max": 1.2855523436883232e-05, + "clip_ratio/high_mean": 3.213880859220808e-06, + "clip_ratio/low_mean": 2.9316923928490723e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2530804674024694e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16381.0, + "completions/mean_length": 6220.578125, + "completions/mean_terminated_length": 5892.7255859375, + "completions/min_length": 798.0, + "completions/min_terminated_length": 798.0, + "entropy": 0.8257150128483772, + "epoch": 0.3155473781048758, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003750045085325837, + "learning_rate": 1e-05, + "loss": 0.0631, + "num_tokens": 282316795.0, + "reward": 0.515625, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999854564666748, + "sampling/importance_sampling_ratio/min": 2.2095075280503806e-07, + "sampling/sampling_logp_difference/max": 15.325325965881348, + "sampling/sampling_logp_difference/mean": 0.017498498782515526, + "step": 343 + }, + { + "clip_ratio/high_max": 9.090150342672132e-06, + "clip_ratio/high_mean": 2.272537585668033e-06, + "clip_ratio/low_mean": 5.6543332675573765e-05, + "clip_ratio/low_min": 4.705262199422577e-06, + "clip_ratio/region_mean": 5.881586980649445e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16134.0, + "completions/mean_length": 6845.09375, + "completions/mean_terminated_length": 6693.68310546875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.9700654074549675, + "epoch": 0.31646734130634774, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002124012913554907, + "learning_rate": 1e-05, + "loss": 0.0657, + "num_tokens": 283212095.0, + "reward": 0.4296875, + "reward_std": 0.3527093529701233, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999914169311523, + "sampling/importance_sampling_ratio/min": 4.450856749826926e-07, + "sampling/sampling_logp_difference/max": 14.624999046325684, + "sampling/sampling_logp_difference/mean": 0.02086886763572693, + "step": 344 + }, + { + "clip_ratio/high_max": 4.2354217839601915e-06, + "clip_ratio/high_mean": 1.0588554459900479e-06, + "clip_ratio/low_mean": 5.4464956633637485e-05, + "clip_ratio/low_min": 7.402143637591507e-06, + "clip_ratio/region_mean": 5.552381219331437e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15416.0, + "completions/max_terminated_length": 15416.0, + "completions/mean_length": 4986.3828125, + "completions/mean_terminated_length": 4986.3828125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9103464111685753, + "epoch": 0.3173873045078197, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0035143878776580095, + "learning_rate": 1e-05, + "loss": 0.0862, + "num_tokens": 283871808.0, + "reward": 0.4296875, + "reward_std": 0.40715324878692627, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999771118164062, + "sampling/importance_sampling_ratio/min": 0.0028091762214899063, + "sampling/sampling_logp_difference/max": 5.874864101409912, + "sampling/sampling_logp_difference/mean": 0.01833461783826351, + "step": 345 + }, + { + "clip_ratio/high_max": 1.915729558277235e-05, + "clip_ratio/high_mean": 4.789323895693087e-06, + "clip_ratio/low_mean": 2.4886074015739723e-05, + "clip_ratio/low_min": 2.922677595051937e-06, + "clip_ratio/region_mean": 2.9675398081963067e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15954.0, + "completions/mean_length": 6467.9921875, + "completions/mean_terminated_length": 6310.595703125, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.926672600209713, + "epoch": 0.31830726770929163, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0014899170491844416, + "learning_rate": 1e-05, + "loss": 0.0667, + "num_tokens": 284718943.0, + "reward": 0.390625, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999134540557861, + "sampling/importance_sampling_ratio/min": 0.00027431987109594047, + "sampling/sampling_logp_difference/max": 8.201215744018555, + "sampling/sampling_logp_difference/mean": 0.01909649185836315, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.792281761936465e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.792281761936465e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15890.0, + "completions/mean_length": 6009.3671875, + "completions/mean_terminated_length": 5927.67724609375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 1.0197014585137367, + "epoch": 0.31922723091076355, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001638311194255948, + "learning_rate": 1e-05, + "loss": 0.0342, + "num_tokens": 285507622.0, + "reward": 0.4140625, + "reward_std": 0.2637920379638672, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998466968536377, + "sampling/importance_sampling_ratio/min": 2.144540849258192e-05, + "sampling/sampling_logp_difference/max": 10.75, + "sampling/sampling_logp_difference/mean": 0.0198800191283226, + "step": 347 + }, + { + "clip_ratio/high_max": 1.3140848295734031e-05, + "clip_ratio/high_mean": 3.2852120739335078e-06, + "clip_ratio/low_mean": 5.1451362480747775e-05, + "clip_ratio/low_min": 7.097433353919769e-06, + "clip_ratio/region_mean": 5.473657506627205e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15174.0, + "completions/max_terminated_length": 15174.0, + "completions/mean_length": 6360.421875, + "completions/mean_terminated_length": 6360.421875, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "entropy": 0.9253586605191231, + "epoch": 0.3201471941122355, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017278637969866395, + "learning_rate": 1e-05, + "loss": 0.0638, + "num_tokens": 286341012.0, + "reward": 0.390625, + "reward_std": 0.30904704332351685, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998660087585449, + "sampling/importance_sampling_ratio/min": 5.007527215639129e-05, + "sampling/sampling_logp_difference/max": 9.901983261108398, + "sampling/sampling_logp_difference/mean": 0.02024514600634575, + "step": 348 + }, + { + "clip_ratio/high_max": 2.1974663468427025e-05, + "clip_ratio/high_mean": 6.800322353228694e-06, + "clip_ratio/low_mean": 3.598067922894188e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.27810022642916e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16158.0, + "completions/max_terminated_length": 16158.0, + "completions/mean_length": 5470.5234375, + "completions/mean_terminated_length": 5470.5234375, + "completions/min_length": 990.0, + "completions/min_terminated_length": 990.0, + "entropy": 0.9031187370419502, + "epoch": 0.32106715731370744, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00106104149017483, + "learning_rate": 1e-05, + "loss": 0.0475, + "num_tokens": 287065039.0, + "reward": 0.3828125, + "reward_std": 0.24541422724723816, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999252557754517, + "sampling/importance_sampling_ratio/min": 1.6605448536211043e-06, + "sampling/sampling_logp_difference/max": 13.308364868164062, + "sampling/sampling_logp_difference/mean": 0.018382512032985687, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.3466772088577272e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.3466772088577272e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15190.0, + "completions/max_terminated_length": 15190.0, + "completions/mean_length": 5533.265625, + "completions/mean_terminated_length": 5533.265625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 1.0052079856395721, + "epoch": 0.3219871205151794, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0033145309425890446, + "learning_rate": 1e-05, + "loss": 0.0298, + "num_tokens": 287793249.0, + "reward": 0.484375, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999661445617676, + "sampling/importance_sampling_ratio/min": 0.04231228679418564, + "sampling/sampling_logp_difference/max": 3.162677764892578, + "sampling/sampling_logp_difference/mean": 0.020278627052903175, + "step": 350 + }, + { + "clip_ratio/high_max": 3.310516694909893e-05, + "clip_ratio/high_mean": 8.276291737274732e-06, + "clip_ratio/low_mean": 3.8735864336558734e-05, + "clip_ratio/low_min": 3.0842873002256965e-06, + "clip_ratio/region_mean": 4.7012156073833467e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15579.0, + "completions/mean_length": 6025.6796875, + "completions/mean_terminated_length": 5604.609375, + "completions/min_length": 583.0, + "completions/min_terminated_length": 583.0, + "entropy": 0.8798701837658882, + "epoch": 0.32290708371665133, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0023973146453499794, + "learning_rate": 1e-05, + "loss": 0.023, + "num_tokens": 288582232.0, + "reward": 0.453125, + "reward_std": 0.36691081523895264, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998465776443481, + "sampling/importance_sampling_ratio/min": 5.531576334760757e-06, + "sampling/sampling_logp_difference/max": 12.105037689208984, + "sampling/sampling_logp_difference/mean": 0.01999252662062645, + "step": 351 + }, + { + "clip_ratio/high_max": 1.2754688668792369e-05, + "clip_ratio/high_mean": 4.434933430275123e-06, + "clip_ratio/low_mean": 2.503601820080803e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.947095174476999e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14890.0, + "completions/mean_length": 6893.5390625, + "completions/mean_terminated_length": 6818.81103515625, + "completions/min_length": 780.0, + "completions/min_terminated_length": 780.0, + "entropy": 0.8881499394774437, + "epoch": 0.32382704691812325, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016761437291279435, + "learning_rate": 1e-05, + "loss": 0.0687, + "num_tokens": 289483997.0, + "reward": 0.3515625, + "reward_std": 0.26143792271614075, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00004243850708, + "sampling/importance_sampling_ratio/min": 4.540014560916461e-05, + "sampling/sampling_logp_difference/max": 9.999995231628418, + "sampling/sampling_logp_difference/mean": 0.019294647499918938, + "step": 352 + }, + { + "clip_ratio/high_max": 1.8526947997088428e-05, + "clip_ratio/high_mean": 4.631736999272107e-06, + "clip_ratio/low_mean": 4.962505795447214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.425679569270869e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15948.0, + "completions/mean_length": 6087.828125, + "completions/mean_terminated_length": 6006.755859375, + "completions/min_length": 608.0, + "completions/min_terminated_length": 608.0, + "entropy": 0.8525711894035339, + "epoch": 0.3247470101195952, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002270620781928301, + "learning_rate": 1e-05, + "loss": 0.0636, + "num_tokens": 290282639.0, + "reward": 0.4765625, + "reward_std": 0.3645517826080322, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999156594276428, + "sampling/importance_sampling_ratio/min": 0.0006376233650371432, + "sampling/sampling_logp_difference/max": 7.357762813568115, + "sampling/sampling_logp_difference/mean": 0.01862185075879097, + "step": 353 + }, + { + "clip_ratio/high_max": 1.1926310435228515e-05, + "clip_ratio/high_mean": 2.981577608807129e-06, + "clip_ratio/low_mean": 5.369399366372818e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.6675571954656334e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15920.0, + "completions/mean_length": 7951.0, + "completions/mean_terminated_length": 7678.96728515625, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.9653833135962486, + "epoch": 0.32566697332106714, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0013396133435890079, + "learning_rate": 1e-05, + "loss": 0.0635, + "num_tokens": 291320703.0, + "reward": 0.375, + "reward_std": 0.3429914712905884, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999598264694214, + "sampling/importance_sampling_ratio/min": 2.4461383873131126e-05, + "sampling/sampling_logp_difference/max": 10.618414878845215, + "sampling/sampling_logp_difference/mean": 0.0205213762819767, + "step": 354 + }, + { + "clip_ratio/high_max": 1.886164773168275e-05, + "clip_ratio/high_mean": 4.715411932920688e-06, + "clip_ratio/low_mean": 4.581529401548323e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.0530706175777595e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15959.0, + "completions/mean_length": 6017.2578125, + "completions/mean_terminated_length": 5852.70654296875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.9492783322930336, + "epoch": 0.3265869365225391, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003194117220118642, + "learning_rate": 1e-05, + "loss": 0.0868, + "num_tokens": 292113384.0, + "reward": 0.5703125, + "reward_std": 0.36743485927581787, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999614357948303, + "sampling/importance_sampling_ratio/min": 0.004017275292426348, + "sampling/sampling_logp_difference/max": 5.517151355743408, + "sampling/sampling_logp_difference/mean": 0.02062429115176201, + "step": 355 + }, + { + "clip_ratio/high_max": 1.4877897228871007e-05, + "clip_ratio/high_mean": 3.7194743072177516e-06, + "clip_ratio/low_mean": 3.613741432673123e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.985688817920163e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15690.0, + "completions/mean_length": 6696.0, + "completions/mean_terminated_length": 6619.71630859375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 1.0417355075478554, + "epoch": 0.32750689972401104, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001876713940873742, + "learning_rate": 1e-05, + "loss": 0.0404, + "num_tokens": 292990600.0, + "reward": 0.34375, + "reward_std": 0.28011518716812134, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998572468757629, + "sampling/importance_sampling_ratio/min": 3.398728586034849e-05, + "sampling/sampling_logp_difference/max": 10.28952407836914, + "sampling/sampling_logp_difference/mean": 0.020289337262511253, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.8955274046893464e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8955274046893464e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14436.0, + "completions/mean_length": 5184.203125, + "completions/mean_terminated_length": 5096.015625, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "entropy": 1.0320965945720673, + "epoch": 0.32842686292548295, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002229714998975396, + "learning_rate": 1e-05, + "loss": 0.0351, + "num_tokens": 293673106.0, + "reward": 0.375, + "reward_std": 0.26932865381240845, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000356435775757, + "sampling/importance_sampling_ratio/min": 5.736888851970434e-05, + "sampling/sampling_logp_difference/max": 9.766008377075195, + "sampling/sampling_logp_difference/mean": 0.01969832368195057, + "step": 357 + }, + { + "clip_ratio/high_max": 1.2176971722510643e-05, + "clip_ratio/high_mean": 3.044242930627661e-06, + "clip_ratio/low_mean": 4.728799405029349e-05, + "clip_ratio/low_min": 5.63901312489179e-06, + "clip_ratio/region_mean": 5.033223698092115e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15582.0, + "completions/mean_length": 6664.2890625, + "completions/mean_terminated_length": 6510.00830078125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.8329441174864769, + "epoch": 0.32934682612695493, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001597537542693317, + "learning_rate": 1e-05, + "loss": 0.0328, + "num_tokens": 294545927.0, + "reward": 0.4609375, + "reward_std": 0.2998581528663635, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000386238098145, + "sampling/importance_sampling_ratio/min": 0.00012341710680630058, + "sampling/sampling_logp_difference/max": 8.999940872192383, + "sampling/sampling_logp_difference/mean": 0.018238451331853867, + "step": 358 + }, + { + "clip_ratio/high_max": 3.2730224575061584e-06, + "clip_ratio/high_mean": 8.182556143765396e-07, + "clip_ratio/low_mean": 5.867890376975993e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.94971597820404e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16322.0, + "completions/mean_length": 7486.4921875, + "completions/mean_terminated_length": 7345.26220703125, + "completions/min_length": 952.0, + "completions/min_terminated_length": 952.0, + "entropy": 1.0071435943245888, + "epoch": 0.33026678932842685, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0018223393708467484, + "learning_rate": 1e-05, + "loss": 0.1035, + "num_tokens": 295523558.0, + "reward": 0.359375, + "reward_std": 0.36561262607574463, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999614357948303, + "sampling/importance_sampling_ratio/min": 8.459773198410403e-06, + "sampling/sampling_logp_difference/max": 11.680188179016113, + "sampling/sampling_logp_difference/mean": 0.021324433386325836, + "step": 359 + }, + { + "clip_ratio/high_max": 1.9864856540152687e-05, + "clip_ratio/high_mean": 4.966214135038172e-06, + "clip_ratio/low_mean": 4.498222278925823e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.994843698113982e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14737.0, + "completions/mean_length": 6103.015625, + "completions/mean_terminated_length": 6022.06298828125, + "completions/min_length": 573.0, + "completions/min_terminated_length": 573.0, + "entropy": 0.9639975428581238, + "epoch": 0.3311867525298988, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002672795206308365, + "learning_rate": 1e-05, + "loss": 0.0559, + "num_tokens": 296323888.0, + "reward": 0.375, + "reward_std": 0.32589420676231384, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998803734779358, + "sampling/importance_sampling_ratio/min": 0.0057671889662742615, + "sampling/sampling_logp_difference/max": 5.1555705070495605, + "sampling/sampling_logp_difference/mean": 0.019866492599248886, + "step": 360 + }, + { + "clip_ratio/high_max": 1.1948508017667336e-05, + "clip_ratio/high_mean": 2.987127004416834e-06, + "clip_ratio/low_mean": 4.0038267286490736e-05, + "clip_ratio/low_min": 3.0986614092398668e-06, + "clip_ratio/region_mean": 4.302539394984706e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15805.0, + "completions/mean_length": 6524.640625, + "completions/mean_terminated_length": 6368.14306640625, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "entropy": 0.8653942495584488, + "epoch": 0.33210671573137074, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0016479750629514456, + "learning_rate": 1e-05, + "loss": 0.027, + "num_tokens": 297179234.0, + "reward": 0.46875, + "reward_std": 0.28011518716812134, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 0.0009119793539866805, + "sampling/sampling_logp_difference/max": 6.9998931884765625, + "sampling/sampling_logp_difference/mean": 0.018908966332674026, + "step": 361 + }, + { + "clip_ratio/high_max": 7.669039405300282e-06, + "clip_ratio/high_mean": 1.9172598513250705e-06, + "clip_ratio/low_mean": 2.1955054876343638e-05, + "clip_ratio/low_min": 3.4466595479898388e-06, + "clip_ratio/region_mean": 2.387231518241606e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16294.0, + "completions/mean_length": 8057.3203125, + "completions/mean_terminated_length": 7857.48046875, + "completions/min_length": 637.0, + "completions/min_terminated_length": 637.0, + "entropy": 1.0029005706310272, + "epoch": 0.3330266789328427, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018210343550890684, + "learning_rate": 1e-05, + "loss": 0.0309, + "num_tokens": 298230699.0, + "reward": 0.25, + "reward_std": 0.19438526034355164, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999086856842041, + "sampling/importance_sampling_ratio/min": 0.0046700225211679935, + "sampling/sampling_logp_difference/max": 5.366591453552246, + "sampling/sampling_logp_difference/mean": 0.020166225731372833, + "step": 362 + }, + { + "clip_ratio/high_max": 6.953715910640312e-06, + "clip_ratio/high_mean": 1.738428977660078e-06, + "clip_ratio/low_mean": 2.961834002235264e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1356769113699556e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15821.0, + "completions/mean_length": 6875.3125, + "completions/mean_terminated_length": 6647.1044921875, + "completions/min_length": 714.0, + "completions/min_terminated_length": 714.0, + "entropy": 0.8582051023840904, + "epoch": 0.33394664213431463, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021944146137684584, + "learning_rate": 1e-05, + "loss": 0.072, + "num_tokens": 299131579.0, + "reward": 0.4375, + "reward_std": 0.23356688022613525, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999915361404419, + "sampling/importance_sampling_ratio/min": 5.424213668447919e-06, + "sampling/sampling_logp_difference/max": 12.124637603759766, + "sampling/sampling_logp_difference/mean": 0.018997181206941605, + "step": 363 + }, + { + "clip_ratio/high_max": 1.4359977967615123e-05, + "clip_ratio/high_mean": 5.290952628911327e-06, + "clip_ratio/low_mean": 1.991117466104697e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5202126892054366e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16093.0, + "completions/mean_length": 7046.46875, + "completions/mean_terminated_length": 6745.2578125, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8899112716317177, + "epoch": 0.33486660533578655, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0021380677353590727, + "learning_rate": 1e-05, + "loss": 0.0001, + "num_tokens": 300051471.0, + "reward": 0.390625, + "reward_std": 0.19568344950675964, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000321865081787, + "sampling/importance_sampling_ratio/min": 0.00043609709246084094, + "sampling/sampling_logp_difference/max": 7.737645626068115, + "sampling/sampling_logp_difference/mean": 0.018849756568670273, + "step": 364 + }, + { + "clip_ratio/high_max": 1.1736750366253546e-05, + "clip_ratio/high_mean": 2.9341875915633864e-06, + "clip_ratio/low_mean": 2.6090394442235265e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.902458214748549e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14683.0, + "completions/mean_length": 7227.8203125, + "completions/mean_terminated_length": 7008.072265625, + "completions/min_length": 869.0, + "completions/min_terminated_length": 869.0, + "entropy": 0.9667621031403542, + "epoch": 0.3357865685372585, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001994286896660924, + "learning_rate": 1e-05, + "loss": 0.0231, + "num_tokens": 300994584.0, + "reward": 0.4296875, + "reward_std": 0.26826781034469604, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000085830688477, + "sampling/importance_sampling_ratio/min": 0.005131956655532122, + "sampling/sampling_logp_difference/max": 5.272268295288086, + "sampling/sampling_logp_difference/mean": 0.019861025735735893, + "step": 365 + }, + { + "clip_ratio/high_max": 5.608902483800193e-06, + "clip_ratio/high_mean": 1.4022256209500483e-06, + "clip_ratio/low_mean": 1.2587312312462018e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.3989537819725228e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16342.0, + "completions/mean_length": 6763.484375, + "completions/mean_terminated_length": 6372.40625, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "entropy": 0.9238758087158203, + "epoch": 0.33670653173873044, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019569231662899256, + "learning_rate": 1e-05, + "loss": 0.0202, + "num_tokens": 301878446.0, + "reward": 0.4765625, + "reward_std": 0.2664504647254944, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999585151672363, + "sampling/importance_sampling_ratio/min": 6.425123189046644e-08, + "sampling/sampling_logp_difference/max": 16.56046485900879, + "sampling/sampling_logp_difference/mean": 0.019518161192536354, + "step": 366 + }, + { + "clip_ratio/high_max": 4.044129582325695e-06, + "clip_ratio/high_mean": 1.0110323955814238e-06, + "clip_ratio/low_mean": 3.2966671312806284e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3977703822074545e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16018.0, + "completions/max_terminated_length": 16018.0, + "completions/mean_length": 6098.703125, + "completions/mean_terminated_length": 6098.703125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.7785998061299324, + "epoch": 0.3376264949402024, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0024868762120604515, + "learning_rate": 1e-05, + "loss": 0.0405, + "num_tokens": 302677272.0, + "reward": 0.4921875, + "reward_std": 0.2477683573961258, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999961853027344, + "sampling/importance_sampling_ratio/min": 0.003617732785642147, + "sampling/sampling_logp_difference/max": 5.621907711029053, + "sampling/sampling_logp_difference/mean": 0.017242450267076492, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.291554517341865e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.291554517341865e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15935.0, + "completions/mean_length": 6799.1875, + "completions/mean_terminated_length": 6569.15234375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.8998014703392982, + "epoch": 0.33854645814167433, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0017842436209321022, + "learning_rate": 1e-05, + "loss": 0.0286, + "num_tokens": 303565408.0, + "reward": 0.3046875, + "reward_std": 0.17806214094161987, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000123977661133, + "sampling/importance_sampling_ratio/min": 0.002333547454327345, + "sampling/sampling_logp_difference/max": 6.060365676879883, + "sampling/sampling_logp_difference/mean": 0.01987488754093647, + "step": 368 + }, + { + "clip_ratio/high_max": 2.6103274649358355e-05, + "clip_ratio/high_mean": 7.854475143176387e-06, + "clip_ratio/low_mean": 5.6201750339823775e-05, + "clip_ratio/low_min": 6.543817562487675e-06, + "clip_ratio/region_mean": 6.405622525562649e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15338.0, + "completions/mean_length": 5483.4140625, + "completions/mean_terminated_length": 5131.7822265625, + "completions/min_length": 526.0, + "completions/min_terminated_length": 526.0, + "entropy": 0.8604720532894135, + "epoch": 0.33946642134314625, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004101207479834557, + "learning_rate": 1e-05, + "loss": 0.083, + "num_tokens": 304283925.0, + "reward": 0.4375, + "reward_std": 0.3174794614315033, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999923825263977, + "sampling/importance_sampling_ratio/min": 8.628804062027484e-05, + "sampling/sampling_logp_difference/max": 9.357819557189941, + "sampling/sampling_logp_difference/mean": 0.018733445554971695, + "step": 369 + }, + { + "clip_ratio/high_max": 8.375103107027826e-06, + "clip_ratio/high_mean": 2.0937757767569565e-06, + "clip_ratio/low_mean": 4.883176779912901e-05, + "clip_ratio/low_min": 7.539494390584878e-06, + "clip_ratio/region_mean": 5.092554329166887e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16319.0, + "completions/mean_length": 7857.9140625, + "completions/mean_terminated_length": 7722.57958984375, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "entropy": 0.9493537694215775, + "epoch": 0.3403863845446182, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025712712667882442, + "learning_rate": 1e-05, + "loss": 0.011, + "num_tokens": 305311730.0, + "reward": 0.3125, + "reward_std": 0.3227166533470154, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999322295188904, + "sampling/importance_sampling_ratio/min": 0.00010902724170591682, + "sampling/sampling_logp_difference/max": 9.123912811279297, + "sampling/sampling_logp_difference/mean": 0.020730353891849518, + "step": 370 + }, + { + "clip_ratio/high_max": 1.7927761746250326e-05, + "clip_ratio/high_mean": 4.4819404365625815e-06, + "clip_ratio/low_mean": 1.4648778403625329e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.913071884018791e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14578.0, + "completions/mean_length": 6591.28125, + "completions/mean_terminated_length": 6514.17333984375, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.8540837243199348, + "epoch": 0.34130634774609014, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001778970006853342, + "learning_rate": 1e-05, + "loss": 0.0552, + "num_tokens": 306172870.0, + "reward": 0.53125, + "reward_std": 0.25855979323387146, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999608397483826, + "sampling/importance_sampling_ratio/min": 0.005589231848716736, + "sampling/sampling_logp_difference/max": 5.18691349029541, + "sampling/sampling_logp_difference/mean": 0.018087508156895638, + "step": 371 + }, + { + "clip_ratio/high_max": 1.5696539094278705e-05, + "clip_ratio/high_mean": 3.924134773569676e-06, + "clip_ratio/low_mean": 4.2228432448609965e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.615256762008357e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15171.0, + "completions/mean_length": 7443.5859375, + "completions/mean_terminated_length": 7301.6748046875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "entropy": 1.1251945495605469, + "epoch": 0.3422263109475621, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024547462817281485, + "learning_rate": 1e-05, + "loss": -0.0017, + "num_tokens": 307145857.0, + "reward": 0.2734375, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000030994415283, + "sampling/importance_sampling_ratio/min": 0.0008770838030613959, + "sampling/sampling_logp_difference/max": 7.038908004760742, + "sampling/sampling_logp_difference/mean": 0.021768298000097275, + "step": 372 + }, + { + "clip_ratio/high_max": 7.035515409370419e-06, + "clip_ratio/high_mean": 1.7588788523426047e-06, + "clip_ratio/low_mean": 2.2691801063956518e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4450679802612285e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14811.0, + "completions/max_terminated_length": 14811.0, + "completions/mean_length": 6497.890625, + "completions/mean_terminated_length": 6497.890625, + "completions/min_length": 1079.0, + "completions/min_terminated_length": 1079.0, + "entropy": 1.0804385766386986, + "epoch": 0.34314627414903404, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003075090004131198, + "learning_rate": 1e-05, + "loss": 0.012, + "num_tokens": 307998003.0, + "reward": 0.3515625, + "reward_std": 0.20753081142902374, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999311566352844, + "sampling/importance_sampling_ratio/min": 0.0032886455301195383, + "sampling/sampling_logp_difference/max": 5.717279434204102, + "sampling/sampling_logp_difference/mean": 0.021208221092820168, + "step": 373 + }, + { + "clip_ratio/high_max": 1.0550694696576102e-05, + "clip_ratio/high_mean": 3.640079512479133e-06, + "clip_ratio/low_mean": 3.440372779550671e-05, + "clip_ratio/low_min": 4.334107870818116e-06, + "clip_ratio/region_mean": 3.804380708061217e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16155.0, + "completions/mean_length": 7146.578125, + "completions/mean_terminated_length": 6692.2783203125, + "completions/min_length": 1089.0, + "completions/min_terminated_length": 1089.0, + "entropy": 0.900071032345295, + "epoch": 0.34406623735050595, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023383013904094696, + "learning_rate": 1e-05, + "loss": 0.0524, + "num_tokens": 308930389.0, + "reward": 0.453125, + "reward_std": 0.322716623544693, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000137090682983, + "sampling/importance_sampling_ratio/min": 0.003526465967297554, + "sampling/sampling_logp_difference/max": 5.647459030151367, + "sampling/sampling_logp_difference/mean": 0.019267898052930832, + "step": 374 + }, + { + "clip_ratio/high_max": 2.1745769345216104e-05, + "clip_ratio/high_mean": 6.434876752337004e-06, + "clip_ratio/low_mean": 3.9315604908551904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5750481831419165e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14293.0, + "completions/mean_length": 6189.109375, + "completions/mean_terminated_length": 6108.83447265625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.9284940734505653, + "epoch": 0.34498620055197793, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018437084509059787, + "learning_rate": 1e-05, + "loss": 0.0197, + "num_tokens": 309741419.0, + "reward": 0.4296875, + "reward_std": 0.3050953149795532, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000801086425781, + "sampling/importance_sampling_ratio/min": 4.7444238589378074e-05, + "sampling/sampling_logp_difference/max": 9.955955505371094, + "sampling/sampling_logp_difference/mean": 0.019703445956110954, + "step": 375 + }, + { + "clip_ratio/high_max": 1.630432370802737e-05, + "clip_ratio/high_mean": 4.076080927006842e-06, + "clip_ratio/low_mean": 3.713273554240004e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1208816355720046e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15556.0, + "completions/mean_length": 5456.7421875, + "completions/mean_terminated_length": 5194.48828125, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "entropy": 0.9236080572009087, + "epoch": 0.34590616375344985, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030215675942599773, + "learning_rate": 1e-05, + "loss": 0.0431, + "num_tokens": 310458386.0, + "reward": 0.46875, + "reward_std": 0.30168038606643677, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999932646751404, + "sampling/importance_sampling_ratio/min": 0.00015846964379306883, + "sampling/sampling_logp_difference/max": 8.749947547912598, + "sampling/sampling_logp_difference/mean": 0.01910843700170517, + "step": 376 + }, + { + "clip_ratio/high_max": 2.3289825548999943e-05, + "clip_ratio/high_mean": 5.822456387249986e-06, + "clip_ratio/low_mean": 3.062871041947801e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.645116612460697e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15118.0, + "completions/mean_length": 6246.25, + "completions/mean_terminated_length": 6085.33349609375, + "completions/min_length": 514.0, + "completions/min_terminated_length": 514.0, + "entropy": 1.0128052979707718, + "epoch": 0.3468261269549218, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002812379039824009, + "learning_rate": 1e-05, + "loss": 0.0117, + "num_tokens": 311279114.0, + "reward": 0.390625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999204277992249, + "sampling/importance_sampling_ratio/min": 0.0007136549684219062, + "sampling/sampling_logp_difference/max": 7.245110988616943, + "sampling/sampling_logp_difference/mean": 0.02073795720934868, + "step": 377 + }, + { + "clip_ratio/high_max": 1.566006790199026e-05, + "clip_ratio/high_mean": 3.915016975497565e-06, + "clip_ratio/low_mean": 1.4384278813395213e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.829929567520594e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15893.0, + "completions/mean_length": 7661.859375, + "completions/mean_terminated_length": 7452.5283203125, + "completions/min_length": 1176.0, + "completions/min_terminated_length": 1176.0, + "entropy": 0.9746306762099266, + "epoch": 0.34774609015639374, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0018165848450735211, + "learning_rate": 1e-05, + "loss": 0.0255, + "num_tokens": 312280648.0, + "reward": 0.3984375, + "reward_std": 0.15991678833961487, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999211430549622, + "sampling/importance_sampling_ratio/min": 2.2834767150925472e-05, + "sampling/sampling_logp_difference/max": 10.687226295471191, + "sampling/sampling_logp_difference/mean": 0.02064785361289978, + "step": 378 + }, + { + "clip_ratio/high_max": 6.112351002229843e-06, + "clip_ratio/high_mean": 1.5280877505574608e-06, + "clip_ratio/low_mean": 1.7822256495492184e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9350344246049644e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15283.0, + "completions/mean_length": 6575.921875, + "completions/mean_terminated_length": 6498.69287109375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 1.0576276555657387, + "epoch": 0.3486660533578657, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0009623004007153213, + "learning_rate": 1e-05, + "loss": -0.0131, + "num_tokens": 313142142.0, + "reward": 0.296875, + "reward_std": 0.17176413536071777, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999088048934937, + "sampling/importance_sampling_ratio/min": 0.00010695109085645527, + "sampling/sampling_logp_difference/max": 9.143138885498047, + "sampling/sampling_logp_difference/mean": 0.02001393586397171, + "step": 379 + }, + { + "clip_ratio/high_max": 2.1532956907321932e-05, + "clip_ratio/high_mean": 7.117228278730181e-06, + "clip_ratio/low_mean": 4.647828791348729e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.359551732908585e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16045.0, + "completions/mean_length": 7349.8203125, + "completions/mean_terminated_length": 7133.00048828125, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "entropy": 0.9633770063519478, + "epoch": 0.34958601655933763, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0016735537210479379, + "learning_rate": 1e-05, + "loss": 0.0769, + "num_tokens": 314106551.0, + "reward": 0.3125, + "reward_std": 0.27670514583587646, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999736547470093, + "sampling/importance_sampling_ratio/min": 0.0006543444469571114, + "sampling/sampling_logp_difference/max": 7.331876754760742, + "sampling/sampling_logp_difference/mean": 0.01907072216272354, + "step": 380 + }, + { + "clip_ratio/high_max": 1.9804372868748032e-05, + "clip_ratio/high_mean": 4.951093217187008e-06, + "clip_ratio/low_mean": 2.807680073146912e-05, + "clip_ratio/low_min": 3.144654101561173e-06, + "clip_ratio/region_mean": 3.302789434656006e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16343.0, + "completions/mean_length": 7472.6640625, + "completions/mean_terminated_length": 7402.49609375, + "completions/min_length": 942.0, + "completions/min_terminated_length": 942.0, + "entropy": 1.0234674662351608, + "epoch": 0.35050597976080955, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0029567319434136152, + "learning_rate": 1e-05, + "loss": 0.0645, + "num_tokens": 315081020.0, + "reward": 0.328125, + "reward_std": 0.1841355264186859, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999366998672485, + "sampling/importance_sampling_ratio/min": 1.3551310985349119e-05, + "sampling/sampling_logp_difference/max": 11.209027290344238, + "sampling/sampling_logp_difference/mean": 0.020730063319206238, + "step": 381 + }, + { + "clip_ratio/high_max": 2.2943146859688568e-05, + "clip_ratio/high_mean": 6.9194542788864055e-06, + "clip_ratio/low_mean": 3.046788117444521e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.738733437330666e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16302.0, + "completions/mean_length": 7663.28125, + "completions/mean_terminated_length": 7234.39306640625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.989475853741169, + "epoch": 0.3514259429622815, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002559094922617078, + "learning_rate": 1e-05, + "loss": 0.002, + "num_tokens": 316083520.0, + "reward": 0.2890625, + "reward_std": 0.3227117359638214, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999620914459229, + "sampling/importance_sampling_ratio/min": 0.003966364543884993, + "sampling/sampling_logp_difference/max": 5.529905319213867, + "sampling/sampling_logp_difference/mean": 0.02191789261996746, + "step": 382 + }, + { + "clip_ratio/high_max": 1.007244372885907e-05, + "clip_ratio/high_mean": 2.5181109322147677e-06, + "clip_ratio/low_mean": 4.157553627237576e-05, + "clip_ratio/low_min": 7.249949248944176e-06, + "clip_ratio/region_mean": 4.4093647659337876e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15227.0, + "completions/mean_length": 6828.703125, + "completions/mean_terminated_length": 6440.2763671875, + "completions/min_length": 799.0, + "completions/min_terminated_length": 799.0, + "entropy": 0.9493783265352249, + "epoch": 0.35234590616375344, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001576121780090034, + "learning_rate": 1e-05, + "loss": 0.0414, + "num_tokens": 316982154.0, + "reward": 0.4375, + "reward_std": 0.25726157426834106, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999561309814453, + "sampling/importance_sampling_ratio/min": 0.002232425380498171, + "sampling/sampling_logp_difference/max": 6.104666709899902, + "sampling/sampling_logp_difference/mean": 0.020356670022010803, + "step": 383 + }, + { + "clip_ratio/high_max": 4.308265033614589e-06, + "clip_ratio/high_mean": 1.0770662584036472e-06, + "clip_ratio/low_mean": 3.2841844813447096e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.391891118553758e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15194.0, + "completions/mean_length": 6555.2890625, + "completions/mean_terminated_length": 5986.685546875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.9516563713550568, + "epoch": 0.3532658693652254, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002562758279964328, + "learning_rate": 1e-05, + "loss": -0.0459, + "num_tokens": 317841415.0, + "reward": 0.2734375, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999120831489563, + "sampling/importance_sampling_ratio/min": 5.153654274181463e-05, + "sampling/sampling_logp_difference/max": 9.87321949005127, + "sampling/sampling_logp_difference/mean": 0.019885078072547913, + "step": 384 + }, + { + "clip_ratio/high_max": 1.579595573275583e-05, + "clip_ratio/high_mean": 3.948988933188957e-06, + "clip_ratio/low_mean": 5.6516228141845204e-05, + "clip_ratio/low_min": 1.2799536079910467e-05, + "clip_ratio/region_mean": 6.046521548341843e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16382.0, + "completions/mean_length": 8033.5625, + "completions/mean_terminated_length": 7764.193359375, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "entropy": 1.0841791555285454, + "epoch": 0.35418583256669733, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0015623728977516294, + "learning_rate": 1e-05, + "loss": 0.069, + "num_tokens": 318892079.0, + "reward": 0.234375, + "reward_std": 0.26249873638153076, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999390840530396, + "sampling/importance_sampling_ratio/min": 0.0027189957909286022, + "sampling/sampling_logp_difference/max": 5.907492637634277, + "sampling/sampling_logp_difference/mean": 0.022173013538122177, + "step": 385 + }, + { + "clip_ratio/high_max": 1.592646640347084e-05, + "clip_ratio/high_mean": 3.98161660086771e-06, + "clip_ratio/low_mean": 3.5816001627608784e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.979761731898179e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14983.0, + "completions/mean_length": 6105.0390625, + "completions/mean_terminated_length": 6024.1025390625, + "completions/min_length": 1010.0, + "completions/min_terminated_length": 1010.0, + "entropy": 0.7882698476314545, + "epoch": 0.35510579576816925, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0015339057426899672, + "learning_rate": 1e-05, + "loss": 0.0568, + "num_tokens": 319692740.0, + "reward": 0.5625, + "reward_std": 0.2109457552433014, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999640583992004, + "sampling/importance_sampling_ratio/min": 0.005946483928710222, + "sampling/sampling_logp_difference/max": 5.124955177307129, + "sampling/sampling_logp_difference/mean": 0.017854198813438416, + "step": 386 + }, + { + "clip_ratio/high_max": 3.630976607382763e-06, + "clip_ratio/high_mean": 9.077441518456908e-07, + "clip_ratio/low_mean": 2.5168051195123553e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6075795346969244e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14242.0, + "completions/max_terminated_length": 14242.0, + "completions/mean_length": 7078.359375, + "completions/mean_terminated_length": 7078.359375, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "entropy": 1.0915816724300385, + "epoch": 0.3560257589696412, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.000674036389682442, + "learning_rate": 1e-05, + "loss": 0.0477, + "num_tokens": 320618618.0, + "reward": 0.375, + "reward_std": 0.250127375125885, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999241828918457, + "sampling/importance_sampling_ratio/min": 0.012588412500917912, + "sampling/sampling_logp_difference/max": 4.374978542327881, + "sampling/sampling_logp_difference/mean": 0.021491196006536484, + "step": 387 + }, + { + "clip_ratio/high_max": 2.3060737021296518e-05, + "clip_ratio/high_mean": 8.880587984094745e-06, + "clip_ratio/low_mean": 4.042122702685447e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.930181512463605e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15486.0, + "completions/mean_length": 7647.6875, + "completions/mean_terminated_length": 7065.26708984375, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "entropy": 0.8284596502780914, + "epoch": 0.35694572217111314, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001767225214280188, + "learning_rate": 1e-05, + "loss": 0.0847, + "num_tokens": 321617138.0, + "reward": 0.4765625, + "reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999255537986755, + "sampling/importance_sampling_ratio/min": 0.0026657104026526213, + "sampling/sampling_logp_difference/max": 5.9272847175598145, + "sampling/sampling_logp_difference/mean": 0.018413839861750603, + "step": 388 + }, + { + "clip_ratio/high_max": 9.76903538685292e-06, + "clip_ratio/high_mean": 3.700462343658728e-06, + "clip_ratio/low_mean": 2.6322781820908858e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0023243880350492e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14298.0, + "completions/mean_length": 6616.8984375, + "completions/mean_terminated_length": 6461.865234375, + "completions/min_length": 981.0, + "completions/min_terminated_length": 981.0, + "entropy": 0.9324140176177025, + "epoch": 0.3578656853725851, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0007780150044709444, + "learning_rate": 1e-05, + "loss": 0.0039, + "num_tokens": 322482213.0, + "reward": 0.5078125, + "reward_std": 0.19332444667816162, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999249577522278, + "sampling/importance_sampling_ratio/min": 8.851349093674798e-07, + "sampling/sampling_logp_difference/max": 13.937525749206543, + "sampling/sampling_logp_difference/mean": 0.019632574170827866, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.183885348154945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.183885348154945e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15911.0, + "completions/mean_length": 6376.375, + "completions/mean_terminated_length": 6297.57470703125, + "completions/min_length": 715.0, + "completions/min_terminated_length": 715.0, + "entropy": 1.0122736915946007, + "epoch": 0.35878564857405704, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.00017013182514347136, + "learning_rate": 1e-05, + "loss": 0.0068, + "num_tokens": 323316413.0, + "reward": 0.484375, + "reward_std": 0.1173202246427536, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999897480010986, + "sampling/importance_sampling_ratio/min": 0.001820300007238984, + "sampling/sampling_logp_difference/max": 6.308753967285156, + "sampling/sampling_logp_difference/mean": 0.020268389955163002, + "step": 390 + }, + { + "clip_ratio/high_max": 1.2158910067228135e-05, + "clip_ratio/high_mean": 4.907883408122871e-06, + "clip_ratio/low_mean": 3.3955970252463885e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.886385343321308e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15910.0, + "completions/mean_length": 7434.703125, + "completions/mean_terminated_length": 7364.236328125, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "entropy": 1.056224174797535, + "epoch": 0.35970561177552896, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0019504460506141186, + "learning_rate": 1e-05, + "loss": 0.0176, + "num_tokens": 324289663.0, + "reward": 0.3046875, + "reward_std": 0.23250606656074524, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999295473098755, + "sampling/importance_sampling_ratio/min": 0.0005411410820670426, + "sampling/sampling_logp_difference/max": 7.5218305587768555, + "sampling/sampling_logp_difference/mean": 0.021627606824040413, + "step": 391 + }, + { + "clip_ratio/high_max": 2.5075807570829056e-05, + "clip_ratio/high_mean": 7.3508283549017506e-06, + "clip_ratio/low_mean": 3.88432285944873e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.619405763151008e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15569.0, + "completions/mean_length": 6783.9140625, + "completions/mean_terminated_length": 6708.32275390625, + "completions/min_length": 738.0, + "completions/min_terminated_length": 738.0, + "entropy": 0.9994921758770943, + "epoch": 0.36062557497700093, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003350428305566311, + "learning_rate": 1e-05, + "loss": 0.1046, + "num_tokens": 325174860.0, + "reward": 0.40625, + "reward_std": 0.33797895908355713, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999490976333618, + "sampling/importance_sampling_ratio/min": 0.0019297851249575615, + "sampling/sampling_logp_difference/max": 6.250346660614014, + "sampling/sampling_logp_difference/mean": 0.02060745656490326, + "step": 392 + }, + { + "clip_ratio/high_max": 5.086883902549744e-06, + "clip_ratio/high_mean": 2.125662831531372e-06, + "clip_ratio/low_mean": 3.603865525292349e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.816431808445486e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15520.0, + "completions/mean_length": 6797.28125, + "completions/mean_terminated_length": 6645.111328125, + "completions/min_length": 982.0, + "completions/min_terminated_length": 982.0, + "entropy": 0.9564928039908409, + "epoch": 0.36154553817847285, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030228395480662584, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 326065824.0, + "reward": 0.46875, + "reward_std": 0.27722427248954773, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999678134918213, + "sampling/importance_sampling_ratio/min": 1.927352604980115e-05, + "sampling/sampling_logp_difference/max": 10.856778144836426, + "sampling/sampling_logp_difference/mean": 0.020122073590755463, + "step": 393 + }, + { + "clip_ratio/high_max": 8.678096946823644e-06, + "clip_ratio/high_mean": 2.169524236705911e-06, + "clip_ratio/low_mean": 2.1449313862831332e-05, + "clip_ratio/low_min": 3.5140985801263014e-06, + "clip_ratio/region_mean": 2.361883775847673e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15744.0, + "completions/mean_length": 7028.4765625, + "completions/mean_terminated_length": 6954.81103515625, + "completions/min_length": 920.0, + "completions/min_terminated_length": 920.0, + "entropy": 0.9178477674722672, + "epoch": 0.3624655013799448, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0027565474156290293, + "learning_rate": 1e-05, + "loss": 0.0955, + "num_tokens": 326985805.0, + "reward": 0.40625, + "reward_std": 0.2987973093986511, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999858736991882, + "sampling/importance_sampling_ratio/min": 0.003095855936408043, + "sampling/sampling_logp_difference/max": 5.777690887451172, + "sampling/sampling_logp_difference/mean": 0.019194945693016052, + "step": 394 + }, + { + "clip_ratio/high_max": 1.1162969258293742e-05, + "clip_ratio/high_mean": 2.7907423145734356e-06, + "clip_ratio/low_mean": 4.0257837554236175e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.304857930037542e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15646.0, + "completions/mean_length": 6254.71875, + "completions/mean_terminated_length": 6174.96044921875, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "entropy": 0.9090404361486435, + "epoch": 0.36338546458141674, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022540187928825617, + "learning_rate": 1e-05, + "loss": 0.0586, + "num_tokens": 327805417.0, + "reward": 0.4140625, + "reward_std": 0.2767002284526825, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999850392341614, + "sampling/importance_sampling_ratio/min": 0.007726692594587803, + "sampling/sampling_logp_difference/max": 4.86307430267334, + "sampling/sampling_logp_difference/mean": 0.01917862705886364, + "step": 395 + }, + { + "clip_ratio/high_max": 2.4049867988651386e-05, + "clip_ratio/high_mean": 6.012466997162846e-06, + "clip_ratio/low_mean": 2.1124733166288934e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7137200504512293e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16051.0, + "completions/mean_length": 7654.546875, + "completions/mean_terminated_length": 7225.22900390625, + "completions/min_length": 1708.0, + "completions/min_terminated_length": 1708.0, + "entropy": 0.9535491093993187, + "epoch": 0.36430542778288866, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0013819639571011066, + "learning_rate": 1e-05, + "loss": 0.044, + "num_tokens": 328804303.0, + "reward": 0.5078125, + "reward_std": 0.2301519513130188, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999759793281555, + "sampling/importance_sampling_ratio/min": 0.00017957323871087283, + "sampling/sampling_logp_difference/max": 8.624927520751953, + "sampling/sampling_logp_difference/mean": 0.019935712218284607, + "step": 396 + }, + { + "clip_ratio/high_max": 4.677968718169723e-06, + "clip_ratio/high_mean": 1.1694921795424307e-06, + "clip_ratio/low_mean": 4.5318136926653096e-05, + "clip_ratio/low_min": 1.0762409146991558e-05, + "clip_ratio/region_mean": 4.648762910619553e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15948.0, + "completions/mean_length": 6929.859375, + "completions/mean_terminated_length": 6702.96044921875, + "completions/min_length": 645.0, + "completions/min_terminated_length": 645.0, + "entropy": 0.8612276986241341, + "epoch": 0.36522539098436063, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0015145445941016078, + "learning_rate": 1e-05, + "loss": 0.0486, + "num_tokens": 329711437.0, + "reward": 0.4375, + "reward_std": 0.30904704332351685, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998670220375061, + "sampling/importance_sampling_ratio/min": 6.962344286876032e-06, + "sampling/sampling_logp_difference/max": 11.874994277954102, + "sampling/sampling_logp_difference/mean": 0.01896081678569317, + "step": 397 + }, + { + "clip_ratio/high_max": 1.5800192159076687e-05, + "clip_ratio/high_mean": 5.8905598052660935e-06, + "clip_ratio/low_mean": 1.027900856342967e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.616956859606944e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15568.0, + "completions/mean_length": 6751.09375, + "completions/mean_terminated_length": 6675.244140625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 1.008638858795166, + "epoch": 0.36614535418583255, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0010175694478675723, + "learning_rate": 1e-05, + "loss": -0.0079, + "num_tokens": 330594657.0, + "reward": 0.40625, + "reward_std": 0.17017142474651337, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999219179153442, + "sampling/importance_sampling_ratio/min": 6.605670205317438e-05, + "sampling/sampling_logp_difference/max": 9.62499713897705, + "sampling/sampling_logp_difference/mean": 0.019827818498015404, + "step": 398 + }, + { + "clip_ratio/high_max": 7.255490572788403e-06, + "clip_ratio/high_mean": 1.8138726431971008e-06, + "clip_ratio/low_mean": 4.20189051055786e-05, + "clip_ratio/low_min": 7.900641321612056e-06, + "clip_ratio/region_mean": 4.383277814667963e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16280.0, + "completions/mean_length": 7907.796875, + "completions/mean_terminated_length": 7563.2353515625, + "completions/min_length": 821.0, + "completions/min_terminated_length": 821.0, + "entropy": 0.8603325337171555, + "epoch": 0.3670653173873045, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0014811329310759902, + "learning_rate": 1e-05, + "loss": 0.0714, + "num_tokens": 331626943.0, + "reward": 0.28125, + "reward_std": 0.2161829173564911, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998874068260193, + "sampling/importance_sampling_ratio/min": 3.0665268013763125e-07, + "sampling/sampling_logp_difference/max": 14.997550010681152, + "sampling/sampling_logp_difference/mean": 0.018387217074632645, + "step": 399 + }, + { + "clip_ratio/high_max": 1.2884957641290384e-05, + "clip_ratio/high_mean": 4.083570104285172e-06, + "clip_ratio/low_mean": 1.6143149423442082e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.022671930035358e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16206.0, + "completions/mean_length": 7498.40625, + "completions/mean_terminated_length": 7137.203125, + "completions/min_length": 646.0, + "completions/min_terminated_length": 646.0, + "entropy": 1.0180751085281372, + "epoch": 0.36798528058877644, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001668943208642304, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 332605987.0, + "reward": 0.3359375, + "reward_std": 0.22673210501670837, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999725222587585, + "sampling/importance_sampling_ratio/min": 3.239733814552892e-08, + "sampling/sampling_logp_difference/max": 17.245189666748047, + "sampling/sampling_logp_difference/mean": 0.020663965493440628, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.8121567652306112e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8121567652306112e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15740.0, + "completions/mean_length": 6650.4453125, + "completions/mean_terminated_length": 6495.94482421875, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "entropy": 0.9293805658817291, + "epoch": 0.3689052437902484, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0036925526801496744, + "learning_rate": 1e-05, + "loss": 0.0495, + "num_tokens": 333475324.0, + "reward": 0.3828125, + "reward_std": 0.19674427807331085, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999297857284546, + "sampling/importance_sampling_ratio/min": 0.0019147126004099846, + "sampling/sampling_logp_difference/max": 6.258187770843506, + "sampling/sampling_logp_difference/mean": 0.01987956464290619, + "step": 401 + }, + { + "clip_ratio/high_max": 9.03130421647802e-06, + "clip_ratio/high_mean": 2.257826054119505e-06, + "clip_ratio/low_mean": 3.9613908143110166e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.187173419722967e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14410.0, + "completions/mean_length": 6967.6328125, + "completions/mean_terminated_length": 6663.87890625, + "completions/min_length": 658.0, + "completions/min_terminated_length": 658.0, + "entropy": 0.8103456348180771, + "epoch": 0.36982520699172033, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015530216041952372, + "learning_rate": 1e-05, + "loss": 0.0314, + "num_tokens": 334389053.0, + "reward": 0.4765625, + "reward_std": 0.29932138323783875, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999937415122986, + "sampling/importance_sampling_ratio/min": 1.2903526112495456e-05, + "sampling/sampling_logp_difference/max": 11.258009910583496, + "sampling/sampling_logp_difference/mean": 0.018520750105381012, + "step": 402 + }, + { + "clip_ratio/high_max": 7.21459082342335e-06, + "clip_ratio/high_mean": 1.8036477058558376e-06, + "clip_ratio/low_mean": 2.5680752742118784e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7484400334287784e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15788.0, + "completions/mean_length": 6583.15625, + "completions/mean_terminated_length": 6427.587890625, + "completions/min_length": 938.0, + "completions/min_terminated_length": 938.0, + "entropy": 1.0669879838824272, + "epoch": 0.37074517019319225, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023163470905274153, + "learning_rate": 1e-05, + "loss": 0.0332, + "num_tokens": 335249113.0, + "reward": 0.3671875, + "reward_std": 0.2867175340652466, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999435544013977, + "sampling/importance_sampling_ratio/min": 0.0013276290846988559, + "sampling/sampling_logp_difference/max": 6.62436056137085, + "sampling/sampling_logp_difference/mean": 0.020729750394821167, + "step": 403 + }, + { + "clip_ratio/high_max": 1.915673669827811e-05, + "clip_ratio/high_mean": 4.789184174569527e-06, + "clip_ratio/low_mean": 4.268036605026282e-05, + "clip_ratio/low_min": 6.225874585652491e-06, + "clip_ratio/region_mean": 4.746955005430209e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15942.0, + "completions/mean_length": 7847.734375, + "completions/mean_terminated_length": 7712.23876953125, + "completions/min_length": 1127.0, + "completions/min_terminated_length": 1127.0, + "entropy": 1.0450394004583359, + "epoch": 0.3716651333946642, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0011931186309084296, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 336270823.0, + "reward": 0.2734375, + "reward_std": 0.2409384548664093, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000035047531128, + "sampling/importance_sampling_ratio/min": 0.004087730310857296, + "sampling/sampling_logp_difference/max": 5.499765396118164, + "sampling/sampling_logp_difference/mean": 0.02191723883152008, + "step": 404 + }, + { + "clip_ratio/high_max": 7.73082024352334e-06, + "clip_ratio/high_mean": 1.932705060880835e-06, + "clip_ratio/low_mean": 2.2936642153581488e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4869347271305742e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15621.0, + "completions/mean_length": 6286.1953125, + "completions/mean_terminated_length": 6206.68505859375, + "completions/min_length": 918.0, + "completions/min_terminated_length": 918.0, + "entropy": 1.0122173130512238, + "epoch": 0.37258509659613614, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0032431832514703274, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 337095136.0, + "reward": 0.4453125, + "reward_std": 0.24275578558444977, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999330639839172, + "sampling/importance_sampling_ratio/min": 2.1024358431986911e-07, + "sampling/sampling_logp_difference/max": 15.374999046325684, + "sampling/sampling_logp_difference/mean": 0.021477293223142624, + "step": 405 + }, + { + "clip_ratio/high_max": 9.451312507735565e-06, + "clip_ratio/high_mean": 2.3628281269338913e-06, + "clip_ratio/low_mean": 1.8447401316734613e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.081022921629483e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15792.0, + "completions/max_terminated_length": 15792.0, + "completions/mean_length": 7430.8125, + "completions/mean_terminated_length": 7430.8125, + "completions/min_length": 534.0, + "completions/min_terminated_length": 534.0, + "entropy": 1.1211064383387566, + "epoch": 0.3735050597976081, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0012266195844858885, + "learning_rate": 1e-05, + "loss": 0.0132, + "num_tokens": 338069448.0, + "reward": 0.234375, + "reward_std": 0.17965975403785706, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999444484710693, + "sampling/importance_sampling_ratio/min": 0.0013370488304644823, + "sampling/sampling_logp_difference/max": 6.617290496826172, + "sampling/sampling_logp_difference/mean": 0.02237049862742424, + "step": 406 + }, + { + "clip_ratio/high_max": 1.1666743375826627e-05, + "clip_ratio/high_mean": 2.9166858439566568e-06, + "clip_ratio/low_mean": 3.927663362901512e-05, + "clip_ratio/low_min": 4.591199740389129e-06, + "clip_ratio/region_mean": 4.2193319245598104e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15672.0, + "completions/max_terminated_length": 15672.0, + "completions/mean_length": 6209.578125, + "completions/mean_terminated_length": 6209.578125, + "completions/min_length": 757.0, + "completions/min_terminated_length": 757.0, + "entropy": 0.9696918427944183, + "epoch": 0.37442502299908004, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002120936056599021, + "learning_rate": 1e-05, + "loss": 0.0554, + "num_tokens": 338883986.0, + "reward": 0.4921875, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999944806098938, + "sampling/importance_sampling_ratio/min": 0.000961031299084425, + "sampling/sampling_logp_difference/max": 6.947503566741943, + "sampling/sampling_logp_difference/mean": 0.0204964317381382, + "step": 407 + }, + { + "clip_ratio/high_max": 3.829187789960997e-06, + "clip_ratio/high_mean": 9.572969474902493e-07, + "clip_ratio/low_mean": 4.5606326921188156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.656362375499157e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15322.0, + "completions/max_terminated_length": 15322.0, + "completions/mean_length": 6625.140625, + "completions/mean_terminated_length": 6625.140625, + "completions/min_length": 1063.0, + "completions/min_terminated_length": 1063.0, + "entropy": 1.0780328214168549, + "epoch": 0.37534498620055196, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021016194950789213, + "learning_rate": 1e-05, + "loss": 0.0664, + "num_tokens": 339753228.0, + "reward": 0.359375, + "reward_std": 0.2398776412010193, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000028610229492, + "sampling/importance_sampling_ratio/min": 0.00479263998568058, + "sampling/sampling_logp_difference/max": 5.340673923492432, + "sampling/sampling_logp_difference/mean": 0.02143041603267193, + "step": 408 + }, + { + "clip_ratio/high_max": 1.7951345853362e-05, + "clip_ratio/high_mean": 4.4878364633405e-06, + "clip_ratio/low_mean": 3.357411151228007e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8061947634560056e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16335.0, + "completions/mean_length": 7494.2109375, + "completions/mean_terminated_length": 7207.443359375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 1.0134501904249191, + "epoch": 0.37626494940202393, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017506639705970883, + "learning_rate": 1e-05, + "loss": 0.0361, + "num_tokens": 340731983.0, + "reward": 0.34375, + "reward_std": 0.2756394147872925, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999791383743286, + "sampling/importance_sampling_ratio/min": 6.919008654904246e-08, + "sampling/sampling_logp_difference/max": 16.486408233642578, + "sampling/sampling_logp_difference/mean": 0.020142192021012306, + "step": 409 + }, + { + "clip_ratio/high_max": 2.0409703665791312e-05, + "clip_ratio/high_mean": 7.713539844189654e-06, + "clip_ratio/low_mean": 3.658559990071808e-05, + "clip_ratio/low_min": 3.80390133614128e-06, + "clip_ratio/region_mean": 4.429913997228141e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15238.0, + "completions/mean_length": 6724.828125, + "completions/mean_terminated_length": 6493.00830078125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.961749866604805, + "epoch": 0.37718491260349585, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0014797865878790617, + "learning_rate": 1e-05, + "loss": -0.0195, + "num_tokens": 341613265.0, + "reward": 0.5, + "reward_std": 0.3145885467529297, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999843835830688, + "sampling/importance_sampling_ratio/min": 1.6481149941682816e-05, + "sampling/sampling_logp_difference/max": 11.013293266296387, + "sampling/sampling_logp_difference/mean": 0.021053435280919075, + "step": 410 + }, + { + "clip_ratio/high_max": 8.271860679087695e-06, + "clip_ratio/high_mean": 2.0679651697719237e-06, + "clip_ratio/low_mean": 2.1166565488783817e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.323453065855574e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14961.0, + "completions/mean_length": 6513.5625, + "completions/mean_terminated_length": 6195.1611328125, + "completions/min_length": 791.0, + "completions/min_terminated_length": 791.0, + "entropy": 0.8742869198322296, + "epoch": 0.3781048758049678, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018223582301288843, + "learning_rate": 1e-05, + "loss": 0.0163, + "num_tokens": 342466337.0, + "reward": 0.5, + "reward_std": 0.20593318343162537, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999690651893616, + "sampling/importance_sampling_ratio/min": 0.0027132700197398663, + "sampling/sampling_logp_difference/max": 5.909600734710693, + "sampling/sampling_logp_difference/mean": 0.01892159879207611, + "step": 411 + }, + { + "clip_ratio/high_max": 1.867416995082749e-05, + "clip_ratio/high_mean": 4.668542487706873e-06, + "clip_ratio/low_mean": 5.194308118916524e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.6611622540003737e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15859.0, + "completions/max_terminated_length": 15859.0, + "completions/mean_length": 7088.0390625, + "completions/mean_terminated_length": 7088.0390625, + "completions/min_length": 748.0, + "completions/min_terminated_length": 748.0, + "entropy": 0.8695354089140892, + "epoch": 0.37902483900643974, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00121080141980201, + "learning_rate": 1e-05, + "loss": 0.0095, + "num_tokens": 343393318.0, + "reward": 0.515625, + "reward_std": 0.3009189963340759, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999420642852783, + "sampling/importance_sampling_ratio/min": 0.0003100235771853477, + "sampling/sampling_logp_difference/max": 8.078862190246582, + "sampling/sampling_logp_difference/mean": 0.01892455853521824, + "step": 412 + }, + { + "clip_ratio/high_max": 3.6179024164084694e-05, + "clip_ratio/high_mean": 9.044756041021174e-06, + "clip_ratio/low_mean": 3.288474886176118e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1929504845938936e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15178.0, + "completions/mean_length": 6221.6484375, + "completions/mean_terminated_length": 6141.6298828125, + "completions/min_length": 722.0, + "completions/min_terminated_length": 722.0, + "entropy": 0.937163233757019, + "epoch": 0.37994480220791166, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002599990228191018, + "learning_rate": 1e-05, + "loss": 0.066, + "num_tokens": 344207225.0, + "reward": 0.390625, + "reward_std": 0.348238468170166, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999527335166931, + "sampling/importance_sampling_ratio/min": 3.535756695782766e-05, + "sampling/sampling_logp_difference/max": 10.249998092651367, + "sampling/sampling_logp_difference/mean": 0.019875720143318176, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.69036411534762e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.69036411534762e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16277.0, + "completions/mean_length": 6664.46875, + "completions/mean_terminated_length": 6587.93701171875, + "completions/min_length": 1317.0, + "completions/min_terminated_length": 1317.0, + "entropy": 1.0893034785985947, + "epoch": 0.38086476540938363, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0012395181693136692, + "learning_rate": 1e-05, + "loss": 0.0358, + "num_tokens": 345082629.0, + "reward": 0.3984375, + "reward_std": 0.23145011067390442, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999253153800964, + "sampling/importance_sampling_ratio/min": 0.0004444181395228952, + "sampling/sampling_logp_difference/max": 7.71874475479126, + "sampling/sampling_logp_difference/mean": 0.022249475121498108, + "step": 414 + }, + { + "clip_ratio/high_max": 3.8116729683679296e-06, + "clip_ratio/high_mean": 9.529182420919824e-07, + "clip_ratio/low_mean": 1.930760379309504e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0260522319404117e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16020.0, + "completions/mean_length": 5986.390625, + "completions/mean_terminated_length": 5904.51953125, + "completions/min_length": 598.0, + "completions/min_terminated_length": 598.0, + "entropy": 0.9476369470357895, + "epoch": 0.38178472861085555, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0011368105188012123, + "learning_rate": 1e-05, + "loss": 0.0414, + "num_tokens": 345869327.0, + "reward": 0.40625, + "reward_std": 0.19568344950675964, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999204277992249, + "sampling/importance_sampling_ratio/min": 0.0007102401577867568, + "sampling/sampling_logp_difference/max": 7.249907493591309, + "sampling/sampling_logp_difference/mean": 0.019328134134411812, + "step": 415 + }, + { + "clip_ratio/high_max": 2.638578052938101e-06, + "clip_ratio/high_mean": 6.596445132345252e-07, + "clip_ratio/low_mean": 2.8019193905493012e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8678838418727537e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15247.0, + "completions/mean_length": 7780.8046875, + "completions/mean_terminated_length": 7574.328125, + "completions/min_length": 771.0, + "completions/min_terminated_length": 771.0, + "entropy": 0.9548748508095741, + "epoch": 0.3827046918123275, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0016439391765743494, + "learning_rate": 1e-05, + "loss": 0.0134, + "num_tokens": 346885974.0, + "reward": 0.3828125, + "reward_std": 0.22567617893218994, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999086856842041, + "sampling/importance_sampling_ratio/min": 0.0041214353404939175, + "sampling/sampling_logp_difference/max": 5.491553783416748, + "sampling/sampling_logp_difference/mean": 0.020669173449277878, + "step": 416 + }, + { + "clip_ratio/high_max": 8.280869224108756e-06, + "clip_ratio/high_mean": 2.070217306027189e-06, + "clip_ratio/low_mean": 3.338867099955678e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5458888532957644e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15766.0, + "completions/mean_length": 7118.4921875, + "completions/mean_terminated_length": 6582.470703125, + "completions/min_length": 833.0, + "completions/min_terminated_length": 833.0, + "entropy": 0.9908356294035912, + "epoch": 0.38362465501379944, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002354196272790432, + "learning_rate": 1e-05, + "loss": 0.037, + "num_tokens": 347818245.0, + "reward": 0.421875, + "reward_std": 0.1820138692855835, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998934268951416, + "sampling/importance_sampling_ratio/min": 7.691462087677792e-05, + "sampling/sampling_logp_difference/max": 9.472814559936523, + "sampling/sampling_logp_difference/mean": 0.020420750603079796, + "step": 417 + }, + { + "clip_ratio/high_max": 4.261557478457689e-06, + "clip_ratio/high_mean": 1.0653893696144223e-06, + "clip_ratio/low_mean": 3.0260198514042713e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1325587883657136e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15213.0, + "completions/mean_length": 7016.0546875, + "completions/mean_terminated_length": 6791.22412109375, + "completions/min_length": 907.0, + "completions/min_terminated_length": 907.0, + "entropy": 0.9372202381491661, + "epoch": 0.3845446182152714, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002695834031328559, + "learning_rate": 1e-05, + "loss": 0.0356, + "num_tokens": 348734852.0, + "reward": 0.484375, + "reward_std": 0.2782978415489197, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999836087226868, + "sampling/importance_sampling_ratio/min": 3.6898933331031003e-07, + "sampling/sampling_logp_difference/max": 14.812498092651367, + "sampling/sampling_logp_difference/mean": 0.01997985690832138, + "step": 418 + }, + { + "clip_ratio/high_max": 1.4203505088516977e-05, + "clip_ratio/high_mean": 4.557706688501639e-06, + "clip_ratio/low_mean": 3.802522951446008e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.258293620296172e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15005.0, + "completions/max_terminated_length": 15005.0, + "completions/mean_length": 6170.859375, + "completions/mean_terminated_length": 6170.859375, + "completions/min_length": 894.0, + "completions/min_terminated_length": 894.0, + "entropy": 0.7692223712801933, + "epoch": 0.38546458141674333, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003598283976316452, + "learning_rate": 1e-05, + "loss": 0.0745, + "num_tokens": 349543850.0, + "reward": 0.625, + "reward_std": 0.37875327467918396, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999639987945557, + "sampling/importance_sampling_ratio/min": 0.00013984176621306688, + "sampling/sampling_logp_difference/max": 8.874999046325684, + "sampling/sampling_logp_difference/mean": 0.017690379172563553, + "step": 419 + }, + { + "clip_ratio/high_max": 3.7454306038853247e-06, + "clip_ratio/high_mean": 9.363576509713312e-07, + "clip_ratio/low_mean": 2.0118780639677425e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1055138290648756e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14385.0, + "completions/mean_length": 6198.5859375, + "completions/mean_terminated_length": 6118.3857421875, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "entropy": 1.0641538202762604, + "epoch": 0.38638454461821525, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003362868446856737, + "learning_rate": 1e-05, + "loss": 0.0385, + "num_tokens": 350358493.0, + "reward": 0.4375, + "reward_std": 0.2432974874973297, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000051498413086, + "sampling/importance_sampling_ratio/min": 9.425564826415211e-07, + "sampling/sampling_logp_difference/max": 13.874670028686523, + "sampling/sampling_logp_difference/mean": 0.01945672184228897, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.529027955868514e-05, + "clip_ratio/low_min": 1.1817648100986844e-05, + "clip_ratio/region_mean": 4.529027955868514e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16169.0, + "completions/mean_length": 7429.953125, + "completions/mean_terminated_length": 6833.01708984375, + "completions/min_length": 1152.0, + "completions/min_terminated_length": 1152.0, + "entropy": 0.7885174229741096, + "epoch": 0.3873045078196872, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0020358162000775337, + "learning_rate": 1e-05, + "loss": 0.0665, + "num_tokens": 351327135.0, + "reward": 0.3984375, + "reward_std": 0.31800347566604614, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999483823776245, + "sampling/importance_sampling_ratio/min": 4.07999541494064e-05, + "sampling/sampling_logp_difference/max": 10.106829643249512, + "sampling/sampling_logp_difference/mean": 0.017557526007294655, + "step": 421 + }, + { + "clip_ratio/high_max": 1.2953943951288238e-05, + "clip_ratio/high_mean": 4.294050768294255e-06, + "clip_ratio/low_mean": 2.7448330115475983e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.174238065639656e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16185.0, + "completions/max_terminated_length": 16185.0, + "completions/mean_length": 7466.75, + "completions/mean_terminated_length": 7466.75, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "entropy": 0.9798530638217926, + "epoch": 0.38822447102115915, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019255588995292783, + "learning_rate": 1e-05, + "loss": 0.0395, + "num_tokens": 352300247.0, + "reward": 0.265625, + "reward_std": 0.26538968086242676, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999645352363586, + "sampling/importance_sampling_ratio/min": 0.0010790677042677999, + "sampling/sampling_logp_difference/max": 6.831657886505127, + "sampling/sampling_logp_difference/mean": 0.020764775574207306, + "step": 422 + }, + { + "clip_ratio/high_max": 1.4318582771011279e-05, + "clip_ratio/high_mean": 3.5796456927528197e-06, + "clip_ratio/low_mean": 1.4836090599601448e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8415736349197687e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16182.0, + "completions/mean_length": 6264.40625, + "completions/mean_terminated_length": 6021.5361328125, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "entropy": 0.8464985340833664, + "epoch": 0.3891444342226311, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016904048388823867, + "learning_rate": 1e-05, + "loss": 0.0434, + "num_tokens": 353122747.0, + "reward": 0.2890625, + "reward_std": 0.2738093435764313, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999396800994873, + "sampling/importance_sampling_ratio/min": 1.569278902024962e-05, + "sampling/sampling_logp_difference/max": 11.062309265136719, + "sampling/sampling_logp_difference/mean": 0.018584076315164566, + "step": 423 + }, + { + "clip_ratio/high_max": 1.6524649709026562e-05, + "clip_ratio/high_mean": 5.198334406486538e-06, + "clip_ratio/low_mean": 5.1570618779805955e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.676895318629249e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16051.0, + "completions/max_terminated_length": 16051.0, + "completions/mean_length": 5848.3359375, + "completions/mean_terminated_length": 5848.3359375, + "completions/min_length": 705.0, + "completions/min_terminated_length": 705.0, + "entropy": 1.0793062299489975, + "epoch": 0.39006439742410304, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0015838779509067535, + "learning_rate": 1e-05, + "loss": -0.0144, + "num_tokens": 353888374.0, + "reward": 0.4921875, + "reward_std": 0.3243142366409302, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999014139175415, + "sampling/importance_sampling_ratio/min": 0.0002261155314045027, + "sampling/sampling_logp_difference/max": 8.394464492797852, + "sampling/sampling_logp_difference/mean": 0.020625369623303413, + "step": 424 + }, + { + "clip_ratio/high_max": 2.2546613308804808e-05, + "clip_ratio/high_mean": 5.636653327201202e-06, + "clip_ratio/low_mean": 4.848485787078971e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.048513922796701e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14583.0, + "completions/mean_length": 5917.984375, + "completions/mean_terminated_length": 5751.857421875, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "entropy": 0.8621423915028572, + "epoch": 0.39098436062557496, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.002542395843192935, + "learning_rate": 1e-05, + "loss": 0.053, + "num_tokens": 354665052.0, + "reward": 0.6484375, + "reward_std": 0.13941732048988342, + "rewards/accuracy_reward/mean": 0.6484375, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999158382415771, + "sampling/importance_sampling_ratio/min": 0.00038012932054698467, + "sampling/sampling_logp_difference/max": 7.874999046325684, + "sampling/sampling_logp_difference/mean": 0.0170799158513546, + "step": 425 + }, + { + "clip_ratio/high_max": 1.1686064681271091e-05, + "clip_ratio/high_mean": 2.9215161703177728e-06, + "clip_ratio/low_mean": 1.6330765674865688e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9252282072557136e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15497.0, + "completions/mean_length": 6513.65625, + "completions/mean_terminated_length": 6435.93701171875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 1.0047430396080017, + "epoch": 0.39190432382704693, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0008743361104279757, + "learning_rate": 1e-05, + "loss": 0.0568, + "num_tokens": 355526744.0, + "reward": 0.3125, + "reward_std": 0.16097761690616608, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999683499336243, + "sampling/importance_sampling_ratio/min": 5.006812898500357e-06, + "sampling/sampling_logp_difference/max": 12.204710960388184, + "sampling/sampling_logp_difference/mean": 0.020237455144524574, + "step": 426 + }, + { + "clip_ratio/high_max": 1.7667963220446836e-05, + "clip_ratio/high_mean": 4.416990805111709e-06, + "clip_ratio/low_mean": 2.390649478911655e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.832348559422826e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13700.0, + "completions/max_terminated_length": 13700.0, + "completions/mean_length": 6363.9375, + "completions/mean_terminated_length": 6363.9375, + "completions/min_length": 1118.0, + "completions/min_terminated_length": 1118.0, + "entropy": 0.910186342895031, + "epoch": 0.39282428702851885, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034290661569684744, + "learning_rate": 1e-05, + "loss": 0.0773, + "num_tokens": 356359920.0, + "reward": 0.4296875, + "reward_std": 0.23646268248558044, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999669790267944, + "sampling/importance_sampling_ratio/min": 0.0023352939169853926, + "sampling/sampling_logp_difference/max": 6.059617519378662, + "sampling/sampling_logp_difference/mean": 0.019128751009702682, + "step": 427 + }, + { + "clip_ratio/high_max": 1.9295963738841238e-05, + "clip_ratio/high_mean": 4.823990934710309e-06, + "clip_ratio/low_mean": 3.187764491485723e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.67016357358807e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14673.0, + "completions/max_terminated_length": 14673.0, + "completions/mean_length": 6206.5859375, + "completions/mean_terminated_length": 6206.5859375, + "completions/min_length": 988.0, + "completions/min_terminated_length": 988.0, + "entropy": 0.8695667088031769, + "epoch": 0.3937442502299908, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022478618193417788, + "learning_rate": 1e-05, + "loss": 0.0683, + "num_tokens": 357172435.0, + "reward": 0.5390625, + "reward_std": 0.3332657814025879, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000020146369934, + "sampling/importance_sampling_ratio/min": 1.993246769416146e-06, + "sampling/sampling_logp_difference/max": 13.12574577331543, + "sampling/sampling_logp_difference/mean": 0.019101407378911972, + "step": 428 + }, + { + "clip_ratio/high_max": 2.577107125034672e-06, + "clip_ratio/high_mean": 6.44276781258668e-07, + "clip_ratio/low_mean": 3.719566507243144e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.783994179684669e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14648.0, + "completions/mean_length": 6773.65625, + "completions/mean_terminated_length": 6697.984375, + "completions/min_length": 1150.0, + "completions/min_terminated_length": 1150.0, + "entropy": 1.0704292133450508, + "epoch": 0.39466421343146274, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030995130073279142, + "learning_rate": 1e-05, + "loss": 0.0409, + "num_tokens": 358060623.0, + "reward": 0.3515625, + "reward_std": 0.26826781034469604, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999589323997498, + "sampling/importance_sampling_ratio/min": 1.8965129129355773e-05, + "sampling/sampling_logp_difference/max": 10.872908592224121, + "sampling/sampling_logp_difference/mean": 0.02080383338034153, + "step": 429 + }, + { + "clip_ratio/high_max": 1.0044732334790751e-05, + "clip_ratio/high_mean": 3.6204799016559264e-06, + "clip_ratio/low_mean": 3.683777390506293e-05, + "clip_ratio/low_min": 4.640285169443814e-06, + "clip_ratio/region_mean": 4.045825380671886e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15821.0, + "completions/mean_length": 6753.4609375, + "completions/mean_terminated_length": 6442.79833984375, + "completions/min_length": 901.0, + "completions/min_terminated_length": 901.0, + "entropy": 0.8907509669661522, + "epoch": 0.39558417663293466, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0025211002212017775, + "learning_rate": 1e-05, + "loss": 0.0812, + "num_tokens": 358942514.0, + "reward": 0.5078125, + "reward_std": 0.33691808581352234, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999513030052185, + "sampling/importance_sampling_ratio/min": 0.001427572569809854, + "sampling/sampling_logp_difference/max": 6.551779747009277, + "sampling/sampling_logp_difference/mean": 0.019076799973845482, + "step": 430 + }, + { + "clip_ratio/high_max": 2.213625748481718e-05, + "clip_ratio/high_mean": 5.534064371204295e-06, + "clip_ratio/low_mean": 4.042425916850334e-05, + "clip_ratio/low_min": 4.858519787376281e-06, + "clip_ratio/region_mean": 4.59583234260208e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16165.0, + "completions/max_terminated_length": 16165.0, + "completions/mean_length": 5878.4921875, + "completions/mean_terminated_length": 5878.4921875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "entropy": 0.8234230354428291, + "epoch": 0.39650413983440663, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023358019534498453, + "learning_rate": 1e-05, + "loss": 0.0311, + "num_tokens": 359716041.0, + "reward": 0.53125, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998354911804199, + "sampling/importance_sampling_ratio/min": 0.0008571944781579077, + "sampling/sampling_logp_difference/max": 7.061845779418945, + "sampling/sampling_logp_difference/mean": 0.018851958215236664, + "step": 431 + }, + { + "clip_ratio/high_max": 7.793237045916612e-06, + "clip_ratio/high_mean": 1.948309261479153e-06, + "clip_ratio/low_mean": 5.3089813718543155e-05, + "clip_ratio/low_min": 3.7982376852596644e-06, + "clip_ratio/region_mean": 5.503812303686573e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15028.0, + "completions/mean_length": 6296.0078125, + "completions/mean_terminated_length": 6135.88134765625, + "completions/min_length": 1187.0, + "completions/min_terminated_length": 1187.0, + "entropy": 0.9341304004192352, + "epoch": 0.39742410303587855, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002632992109283805, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 360544066.0, + "reward": 0.390625, + "reward_std": 0.30433881282806396, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999693036079407, + "sampling/importance_sampling_ratio/min": 0.00015875507961027324, + "sampling/sampling_logp_difference/max": 8.748147964477539, + "sampling/sampling_logp_difference/mean": 0.01882069557905197, + "step": 432 + }, + { + "clip_ratio/high_max": 1.8652748622116633e-05, + "clip_ratio/high_mean": 4.663187155529158e-06, + "clip_ratio/low_mean": 3.725770324081168e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1920890453184256e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15766.0, + "completions/mean_length": 7325.359375, + "completions/mean_terminated_length": 6957.12158203125, + "completions/min_length": 945.0, + "completions/min_terminated_length": 945.0, + "entropy": 0.7979409247636795, + "epoch": 0.3983440662373505, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002110559493303299, + "learning_rate": 1e-05, + "loss": 0.0474, + "num_tokens": 361502504.0, + "reward": 0.4921875, + "reward_std": 0.21436071395874023, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999271631240845, + "sampling/importance_sampling_ratio/min": 1.778415753506124e-05, + "sampling/sampling_logp_difference/max": 10.937202453613281, + "sampling/sampling_logp_difference/mean": 0.018452363088726997, + "step": 433 + }, + { + "clip_ratio/high_max": 5.034029982198263e-06, + "clip_ratio/high_mean": 1.2585074955495656e-06, + "clip_ratio/low_mean": 2.1098365436955646e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2356872932505212e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16265.0, + "completions/mean_length": 5471.5625, + "completions/mean_terminated_length": 5385.6376953125, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "entropy": 0.8691592514514923, + "epoch": 0.39926402943882244, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038794223219156265, + "learning_rate": 1e-05, + "loss": -0.041, + "num_tokens": 362220856.0, + "reward": 0.546875, + "reward_std": 0.22567126154899597, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000011920928955, + "sampling/importance_sampling_ratio/min": 0.0027285523246973753, + "sampling/sampling_logp_difference/max": 5.903984069824219, + "sampling/sampling_logp_difference/mean": 0.01814887300133705, + "step": 434 + }, + { + "clip_ratio/high_max": 1.2709096154139843e-05, + "clip_ratio/high_mean": 3.1772740385349607e-06, + "clip_ratio/low_mean": 4.124845816022571e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.442573271035144e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15321.0, + "completions/mean_length": 5305.328125, + "completions/mean_terminated_length": 5218.09423828125, + "completions/min_length": 542.0, + "completions/min_terminated_length": 542.0, + "entropy": 0.7804318591952324, + "epoch": 0.40018399264029436, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0029884849209338427, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 362921226.0, + "reward": 0.6328125, + "reward_std": 0.3505876660346985, + "rewards/accuracy_reward/mean": 0.6328125, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999871015548706, + "sampling/importance_sampling_ratio/min": 0.0024799995590001345, + "sampling/sampling_logp_difference/max": 5.999496936798096, + "sampling/sampling_logp_difference/mean": 0.017358118668198586, + "step": 435 + }, + { + "clip_ratio/high_max": 4.018904746772023e-06, + "clip_ratio/high_mean": 1.9869055449817097e-06, + "clip_ratio/low_mean": 3.535901299756006e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.734591876991544e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15577.0, + "completions/max_terminated_length": 15577.0, + "completions/mean_length": 7197.6328125, + "completions/mean_terminated_length": 7197.6328125, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "entropy": 0.9466754496097565, + "epoch": 0.40110395584176634, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023567057214677334, + "learning_rate": 1e-05, + "loss": 0.1036, + "num_tokens": 363863579.0, + "reward": 0.375, + "reward_std": 0.2924865484237671, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999738335609436, + "sampling/importance_sampling_ratio/min": 3.132333574740187e-07, + "sampling/sampling_logp_difference/max": 14.976317405700684, + "sampling/sampling_logp_difference/mean": 0.020331334322690964, + "step": 436 + }, + { + "clip_ratio/high_max": 3.7869606330787065e-06, + "clip_ratio/high_mean": 9.467401582696766e-07, + "clip_ratio/low_mean": 4.479868130147224e-05, + "clip_ratio/low_min": 5.061343472334556e-06, + "clip_ratio/region_mean": 4.57454214028985e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15503.0, + "completions/max_terminated_length": 15503.0, + "completions/mean_length": 6058.7890625, + "completions/mean_terminated_length": 6058.7890625, + "completions/min_length": 732.0, + "completions/min_terminated_length": 732.0, + "entropy": 0.9345398098230362, + "epoch": 0.40202391904323825, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018098369473591447, + "learning_rate": 1e-05, + "loss": 0.1307, + "num_tokens": 364660120.0, + "reward": 0.4296875, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999293684959412, + "sampling/importance_sampling_ratio/min": 0.004112724680453539, + "sampling/sampling_logp_difference/max": 5.493669509887695, + "sampling/sampling_logp_difference/mean": 0.019891154021024704, + "step": 437 + }, + { + "clip_ratio/high_max": 1.2886742979389965e-05, + "clip_ratio/high_mean": 3.221685744847491e-06, + "clip_ratio/low_mean": 4.962291495758109e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.284460121401935e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16003.0, + "completions/mean_length": 6929.984375, + "completions/mean_terminated_length": 6625.01611328125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.9930986166000366, + "epoch": 0.4029438822447102, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0033301038201898336, + "learning_rate": 1e-05, + "loss": 0.0313, + "num_tokens": 365564662.0, + "reward": 0.3828125, + "reward_std": 0.30457618832588196, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + "sampling/importance_sampling_ratio/min": 0.0009120093891397119, + "sampling/sampling_logp_difference/max": 6.9998602867126465, + "sampling/sampling_logp_difference/mean": 0.02060488425195217, + "step": 438 + }, + { + "clip_ratio/high_max": 1.3284722399475868e-05, + "clip_ratio/high_mean": 3.321180599868967e-06, + "clip_ratio/low_mean": 2.590538883850968e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.922656926784839e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14903.0, + "completions/max_terminated_length": 14903.0, + "completions/mean_length": 6197.3671875, + "completions/mean_terminated_length": 6197.3671875, + "completions/min_length": 845.0, + "completions/min_terminated_length": 845.0, + "entropy": 0.9469878897070885, + "epoch": 0.40386384544618215, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003049476072192192, + "learning_rate": 1e-05, + "loss": 0.0372, + "num_tokens": 366379725.0, + "reward": 0.421875, + "reward_std": 0.3253750801086426, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999247789382935, + "sampling/importance_sampling_ratio/min": 0.0005533178336918354, + "sampling/sampling_logp_difference/max": 7.49957799911499, + "sampling/sampling_logp_difference/mean": 0.019666746258735657, + "step": 439 + }, + { + "clip_ratio/high_max": 1.4212190535545233e-05, + "clip_ratio/high_mean": 3.553047633886308e-06, + "clip_ratio/low_mean": 4.362488289189059e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7177931264741346e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15647.0, + "completions/mean_length": 6331.6015625, + "completions/mean_terminated_length": 6007.33056640625, + "completions/min_length": 600.0, + "completions/min_terminated_length": 600.0, + "entropy": 0.9937634319067001, + "epoch": 0.4047838086476541, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001529635745100677, + "learning_rate": 1e-05, + "loss": 0.0863, + "num_tokens": 367207994.0, + "reward": 0.3671875, + "reward_std": 0.2732901871204376, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998211860656738, + "sampling/importance_sampling_ratio/min": 0.0013787593925371766, + "sampling/sampling_logp_difference/max": 6.586571216583252, + "sampling/sampling_logp_difference/mean": 0.02042214572429657, + "step": 440 + }, + { + "clip_ratio/high_max": 1.3438677797239507e-05, + "clip_ratio/high_mean": 4.353689405434125e-06, + "clip_ratio/low_mean": 2.1308957457222277e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5662646748969564e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14323.0, + "completions/mean_length": 6679.5, + "completions/mean_terminated_length": 6525.4609375, + "completions/min_length": 894.0, + "completions/min_terminated_length": 894.0, + "entropy": 1.034226231276989, + "epoch": 0.40570377184912604, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002576075494289398, + "learning_rate": 1e-05, + "loss": 0.0037, + "num_tokens": 368085602.0, + "reward": 0.4921875, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999170899391174, + "sampling/importance_sampling_ratio/min": 0.02749871462583542, + "sampling/sampling_logp_difference/max": 3.593616008758545, + "sampling/sampling_logp_difference/mean": 0.02129797264933586, + "step": 441 + }, + { + "clip_ratio/high_max": 1.2707126188615803e-05, + "clip_ratio/high_mean": 3.1767815471539507e-06, + "clip_ratio/low_mean": 5.362682486520498e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.6803606184985256e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14425.0, + "completions/mean_length": 7171.984375, + "completions/mean_terminated_length": 6874.822265625, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.994599312543869, + "epoch": 0.40662373505059796, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003648000070825219, + "learning_rate": 1e-05, + "loss": 0.0468, + "num_tokens": 369021400.0, + "reward": 0.34375, + "reward_std": 0.3174794614315033, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999703764915466, + "sampling/importance_sampling_ratio/min": 2.1446328901220113e-05, + "sampling/sampling_logp_difference/max": 10.749957084655762, + "sampling/sampling_logp_difference/mean": 0.02128203772008419, + "step": 442 + }, + { + "clip_ratio/high_max": 4.010523753095185e-06, + "clip_ratio/high_mean": 1.0026309382737963e-06, + "clip_ratio/low_mean": 5.049121273259516e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.149384355718212e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15812.0, + "completions/mean_length": 7633.953125, + "completions/mean_terminated_length": 7203.62255859375, + "completions/min_length": 746.0, + "completions/min_terminated_length": 746.0, + "entropy": 0.9781397357583046, + "epoch": 0.40754369825206993, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002637698082253337, + "learning_rate": 1e-05, + "loss": 0.1255, + "num_tokens": 370022274.0, + "reward": 0.3671875, + "reward_std": 0.3106446862220764, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999527931213379, + "sampling/importance_sampling_ratio/min": 0.0006269909208640456, + "sampling/sampling_logp_difference/max": 7.374578475952148, + "sampling/sampling_logp_difference/mean": 0.02037280797958374, + "step": 443 + }, + { + "clip_ratio/high_max": 8.796280781098176e-06, + "clip_ratio/high_mean": 2.199070195274544e-06, + "clip_ratio/low_mean": 2.404907445452409e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6248144422424957e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14346.0, + "completions/mean_length": 6966.890625, + "completions/mean_terminated_length": 6892.740234375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 1.0748675763607025, + "epoch": 0.40846366145354185, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002537182765081525, + "learning_rate": 1e-05, + "loss": -0.001, + "num_tokens": 370936076.0, + "reward": 0.421875, + "reward_std": 0.24329747259616852, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999483823776245, + "sampling/importance_sampling_ratio/min": 0.001600474352017045, + "sampling/sampling_logp_difference/max": 6.437455177307129, + "sampling/sampling_logp_difference/mean": 0.0208933986723423, + "step": 444 + }, + { + "clip_ratio/high_max": 1.888703832264582e-05, + "clip_ratio/high_mean": 4.721759580661455e-06, + "clip_ratio/low_mean": 3.932560184694012e-05, + "clip_ratio/low_min": 3.3643752885836875e-06, + "clip_ratio/region_mean": 4.404736250762653e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16226.0, + "completions/mean_length": 7487.2890625, + "completions/mean_terminated_length": 7346.07177734375, + "completions/min_length": 792.0, + "completions/min_terminated_length": 792.0, + "entropy": 0.9402988106012344, + "epoch": 0.4093836246550138, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016896538436412811, + "learning_rate": 1e-05, + "loss": 0.0569, + "num_tokens": 371915793.0, + "reward": 0.3125, + "reward_std": 0.32849061489105225, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999174475669861, + "sampling/importance_sampling_ratio/min": 4.222911684337305e-06, + "sampling/sampling_logp_difference/max": 12.374985694885254, + "sampling/sampling_logp_difference/mean": 0.018897607922554016, + "step": 445 + }, + { + "clip_ratio/high_max": 1.2214306025271071e-05, + "clip_ratio/high_mean": 3.0535765063177678e-06, + "clip_ratio/low_mean": 1.0073189514514524e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.3126766020832292e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14307.0, + "completions/max_terminated_length": 14307.0, + "completions/mean_length": 5188.9375, + "completions/mean_terminated_length": 5188.9375, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "entropy": 0.8868530839681625, + "epoch": 0.41030358785648574, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001575644128024578, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 372605969.0, + "reward": 0.5390625, + "reward_std": 0.1938612163066864, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999008774757385, + "sampling/importance_sampling_ratio/min": 0.0020112686324864626, + "sampling/sampling_logp_difference/max": 6.20898962020874, + "sampling/sampling_logp_difference/mean": 0.017719607800245285, + "step": 446 + }, + { + "clip_ratio/high_max": 1.6542175217182375e-05, + "clip_ratio/high_mean": 6.5401112578911125e-06, + "clip_ratio/low_mean": 3.020691053734481e-05, + "clip_ratio/low_min": 4.941101906297263e-06, + "clip_ratio/region_mean": 3.674702134048857e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14139.0, + "completions/mean_length": 7290.9140625, + "completions/mean_terminated_length": 7146.57958984375, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "entropy": 1.06352149695158, + "epoch": 0.41122355105795766, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0020332508720457554, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 373557094.0, + "reward": 0.40625, + "reward_std": 0.250127375125885, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998309016227722, + "sampling/importance_sampling_ratio/min": 8.97010977496393e-06, + "sampling/sampling_logp_difference/max": 11.621612548828125, + "sampling/sampling_logp_difference/mean": 0.022010326385498047, + "step": 447 + }, + { + "clip_ratio/high_max": 8.10710616860888e-06, + "clip_ratio/high_mean": 2.02677654215222e-06, + "clip_ratio/low_mean": 5.330761632649228e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.5334393664452364e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15476.0, + "completions/mean_length": 6881.6640625, + "completions/mean_terminated_length": 6495.39013671875, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "entropy": 0.9094375595450401, + "epoch": 0.41214351425942963, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0019624519627541304, + "learning_rate": 1e-05, + "loss": 0.0492, + "num_tokens": 374459827.0, + "reward": 0.4609375, + "reward_std": 0.3124620020389557, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999911785125732, + "sampling/importance_sampling_ratio/min": 3.292101524721147e-08, + "sampling/sampling_logp_difference/max": 17.229154586791992, + "sampling/sampling_logp_difference/mean": 0.019491354003548622, + "step": 448 + }, + { + "clip_ratio/high_max": 2.0297283754189266e-05, + "clip_ratio/high_mean": 5.0743209385473165e-06, + "clip_ratio/low_mean": 3.7426975950438646e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.250129745742015e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14792.0, + "completions/mean_length": 6641.203125, + "completions/mean_terminated_length": 6245.154296875, + "completions/min_length": 925.0, + "completions/min_terminated_length": 925.0, + "entropy": 0.7556380406022072, + "epoch": 0.41306347746090155, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0010716031538322568, + "learning_rate": 1e-05, + "loss": 0.1355, + "num_tokens": 375331749.0, + "reward": 0.625, + "reward_std": 0.34876543283462524, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000039339065552, + "sampling/importance_sampling_ratio/min": 0.00010258897236781195, + "sampling/sampling_logp_difference/max": 9.18478012084961, + "sampling/sampling_logp_difference/mean": 0.017056716606020927, + "step": 449 + }, + { + "clip_ratio/high_max": 2.1341018509701826e-05, + "clip_ratio/high_mean": 5.335254627425456e-06, + "clip_ratio/low_mean": 4.72563451694441e-05, + "clip_ratio/low_min": 6.4834025579330046e-06, + "clip_ratio/region_mean": 5.259159979686956e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15757.0, + "completions/max_terminated_length": 15757.0, + "completions/mean_length": 6514.875, + "completions/mean_terminated_length": 6514.875, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "entropy": 0.9535354822874069, + "epoch": 0.4139834406623735, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025929149705916643, + "learning_rate": 1e-05, + "loss": 0.0227, + "num_tokens": 376183309.0, + "reward": 0.421875, + "reward_std": 0.28277361392974854, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998977184295654, + "sampling/importance_sampling_ratio/min": 0.002191081177443266, + "sampling/sampling_logp_difference/max": 6.1233601570129395, + "sampling/sampling_logp_difference/mean": 0.019740387797355652, + "step": 450 + }, + { + "clip_ratio/high_max": 1.2529956165963085e-05, + "clip_ratio/high_mean": 4.370210831439181e-06, + "clip_ratio/low_mean": 6.38160736343707e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.075181819487625e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15798.0, + "completions/mean_length": 6045.640625, + "completions/mean_terminated_length": 5964.236328125, + "completions/min_length": 1031.0, + "completions/min_terminated_length": 1031.0, + "entropy": 1.0733412355184555, + "epoch": 0.41490340386384544, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0023648168426007032, + "learning_rate": 1e-05, + "loss": 0.005, + "num_tokens": 376978175.0, + "reward": 0.421875, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999704360961914, + "sampling/importance_sampling_ratio/min": 0.0001392154226778075, + "sampling/sampling_logp_difference/max": 8.879487991333008, + "sampling/sampling_logp_difference/mean": 0.020569145679473877, + "step": 451 + }, + { + "clip_ratio/high_max": 4.286840976419626e-06, + "clip_ratio/high_mean": 1.0717102441049065e-06, + "clip_ratio/low_mean": 2.4207001501963532e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5278711859755276e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 7767.7578125, + "completions/mean_terminated_length": 7489.814453125, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "entropy": 1.0381295159459114, + "epoch": 0.41582336706531736, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0015338027151301503, + "learning_rate": 1e-05, + "loss": 0.0556, + "num_tokens": 377994592.0, + "reward": 0.4140625, + "reward_std": 0.14230038225650787, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999648332595825, + "sampling/importance_sampling_ratio/min": 8.825274733226252e-08, + "sampling/sampling_logp_difference/max": 16.243061065673828, + "sampling/sampling_logp_difference/mean": 0.02027149498462677, + "step": 452 + }, + { + "clip_ratio/high_max": 7.272515631484566e-06, + "clip_ratio/high_mean": 1.8181289078711416e-06, + "clip_ratio/low_mean": 2.767900923572597e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.949713825728395e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15264.0, + "completions/max_terminated_length": 15264.0, + "completions/mean_length": 7002.21875, + "completions/mean_terminated_length": 7002.21875, + "completions/min_length": 703.0, + "completions/min_terminated_length": 703.0, + "entropy": 1.0032588243484497, + "epoch": 0.41674333026678934, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002184878336265683, + "learning_rate": 1e-05, + "loss": 0.0439, + "num_tokens": 378909468.0, + "reward": 0.4453125, + "reward_std": 0.17859894037246704, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999321103096008, + "sampling/importance_sampling_ratio/min": 1.3040186786383856e-05, + "sampling/sampling_logp_difference/max": 11.247474670410156, + "sampling/sampling_logp_difference/mean": 0.02025642991065979, + "step": 453 + }, + { + "clip_ratio/high_max": 4.38227471022401e-06, + "clip_ratio/high_mean": 1.0955686775560025e-06, + "clip_ratio/low_mean": 2.8486808901106997e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9582377578663e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16348.0, + "completions/mean_length": 8433.3984375, + "completions/mean_terminated_length": 8042.384765625, + "completions/min_length": 1429.0, + "completions/min_terminated_length": 1429.0, + "entropy": 0.9339399412274361, + "epoch": 0.41766329346826125, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0015065330080688, + "learning_rate": 1e-05, + "loss": 0.0026, + "num_tokens": 380009687.0, + "reward": 0.3359375, + "reward_std": 0.17358636856079102, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999430179595947, + "sampling/importance_sampling_ratio/min": 0.0004234187363181263, + "sampling/sampling_logp_difference/max": 7.767148971557617, + "sampling/sampling_logp_difference/mean": 0.020081156864762306, + "step": 454 + }, + { + "clip_ratio/high_max": 1.8815874227584573e-05, + "clip_ratio/high_mean": 4.703968556896143e-06, + "clip_ratio/low_mean": 2.8154490735232685e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.285845917844199e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15186.0, + "completions/max_terminated_length": 15186.0, + "completions/mean_length": 7050.3203125, + "completions/mean_terminated_length": 7050.3203125, + "completions/min_length": 873.0, + "completions/min_terminated_length": 873.0, + "entropy": 0.9537717178463936, + "epoch": 0.41858325666973323, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0013606940628960729, + "learning_rate": 1e-05, + "loss": 0.0125, + "num_tokens": 380930480.0, + "reward": 0.578125, + "reward_std": 0.28407180309295654, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999956488609314, + "sampling/importance_sampling_ratio/min": 0.00011017238284694031, + "sampling/sampling_logp_difference/max": 9.11346435546875, + "sampling/sampling_logp_difference/mean": 0.020253805443644524, + "step": 455 + }, + { + "clip_ratio/high_max": 4.247366632625926e-06, + "clip_ratio/high_mean": 1.0618416581564816e-06, + "clip_ratio/low_mean": 2.397758157712815e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5039423462658306e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15698.0, + "completions/max_terminated_length": 15698.0, + "completions/mean_length": 6561.1640625, + "completions/mean_terminated_length": 6561.1640625, + "completions/min_length": 848.0, + "completions/min_terminated_length": 848.0, + "entropy": 0.9863667339086533, + "epoch": 0.41950321987120515, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017187768826261163, + "learning_rate": 1e-05, + "loss": 0.0332, + "num_tokens": 381790981.0, + "reward": 0.4375, + "reward_std": 0.21884137392044067, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998313188552856, + "sampling/importance_sampling_ratio/min": 0.010767512023448944, + "sampling/sampling_logp_difference/max": 4.531221866607666, + "sampling/sampling_logp_difference/mean": 0.02073034644126892, + "step": 456 + }, + { + "clip_ratio/high_max": 2.9292289127624827e-05, + "clip_ratio/high_mean": 8.657401849632151e-06, + "clip_ratio/low_mean": 4.3774077425950964e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2431478707148926e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15705.0, + "completions/mean_length": 7120.1875, + "completions/mean_terminated_length": 6973.14306640625, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 0.9760185852646828, + "epoch": 0.4204231830726771, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0016811270033940673, + "learning_rate": 1e-05, + "loss": 0.0804, + "num_tokens": 382722173.0, + "reward": 0.421875, + "reward_std": 0.27670514583587646, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999004602432251, + "sampling/importance_sampling_ratio/min": 0.0008047398878261447, + "sampling/sampling_logp_difference/max": 7.124991416931152, + "sampling/sampling_logp_difference/mean": 0.02018534392118454, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.610178137274488e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.610178137274488e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16061.0, + "completions/mean_length": 7057.1015625, + "completions/mean_terminated_length": 6833.25634765625, + "completions/min_length": 922.0, + "completions/min_terminated_length": 922.0, + "entropy": 0.948130652308464, + "epoch": 0.42134314627414904, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0015492907259613276, + "learning_rate": 1e-05, + "loss": 0.0319, + "num_tokens": 383650426.0, + "reward": 0.421875, + "reward_std": 0.21040895581245422, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999640583992004, + "sampling/importance_sampling_ratio/min": 0.003965416923165321, + "sampling/sampling_logp_difference/max": 5.530144214630127, + "sampling/sampling_logp_difference/mean": 0.02065262943506241, + "step": 458 + }, + { + "clip_ratio/high_max": 8.952108146331739e-06, + "clip_ratio/high_mean": 2.2380270365829347e-06, + "clip_ratio/low_mean": 2.777617066840321e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.001419747761247e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15806.0, + "completions/mean_length": 7001.7578125, + "completions/mean_terminated_length": 6852.83349609375, + "completions/min_length": 1065.0, + "completions/min_terminated_length": 1065.0, + "entropy": 0.9631693065166473, + "epoch": 0.42226310947562096, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0013419219758361578, + "learning_rate": 1e-05, + "loss": 0.0705, + "num_tokens": 384565995.0, + "reward": 0.390625, + "reward_std": 0.18701860308647156, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999476671218872, + "sampling/importance_sampling_ratio/min": 0.0006672164890915155, + "sampling/sampling_logp_difference/max": 7.312396049499512, + "sampling/sampling_logp_difference/mean": 0.01975739374756813, + "step": 459 + }, + { + "clip_ratio/high_max": 1.215636098095274e-05, + "clip_ratio/high_mean": 3.039090245238185e-06, + "clip_ratio/low_mean": 4.157363855483709e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4612729197979206e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15727.0, + "completions/mean_length": 7282.875, + "completions/mean_terminated_length": 6912.91015625, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.9037974923849106, + "epoch": 0.42318307267709293, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021072588860988617, + "learning_rate": 1e-05, + "loss": 0.0866, + "num_tokens": 385516659.0, + "reward": 0.359375, + "reward_std": 0.3277292251586914, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999719858169556, + "sampling/importance_sampling_ratio/min": 0.0013449778780341148, + "sampling/sampling_logp_difference/max": 6.611377716064453, + "sampling/sampling_logp_difference/mean": 0.018494941294193268, + "step": 460 + }, + { + "clip_ratio/high_max": 1.669851098995423e-05, + "clip_ratio/high_mean": 4.174627747488557e-06, + "clip_ratio/low_mean": 2.594786496956658e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0122492944428814e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14783.0, + "completions/mean_length": 7063.6953125, + "completions/mean_terminated_length": 6840.00830078125, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.9738125056028366, + "epoch": 0.42410303587856485, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020963819697499275, + "learning_rate": 1e-05, + "loss": 0.0248, + "num_tokens": 386440556.0, + "reward": 0.4765625, + "reward_std": 0.28930407762527466, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999623894691467, + "sampling/importance_sampling_ratio/min": 7.853446390981844e-07, + "sampling/sampling_logp_difference/max": 14.057143211364746, + "sampling/sampling_logp_difference/mean": 0.0198366716504097, + "step": 461 + }, + { + "clip_ratio/high_max": 3.949322490370832e-06, + "clip_ratio/high_mean": 9.87330622592708e-07, + "clip_ratio/low_mean": 1.8185473095400084e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9172803717992792e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15651.0, + "completions/mean_length": 7672.7421875, + "completions/mean_terminated_length": 7262.0244140625, + "completions/min_length": 602.0, + "completions/min_terminated_length": 602.0, + "entropy": 1.0194172486662865, + "epoch": 0.4250229990800368, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014175203396007419, + "learning_rate": 1e-05, + "loss": 0.0232, + "num_tokens": 387450843.0, + "reward": 0.4609375, + "reward_std": 0.24541424214839935, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999865889549255, + "sampling/importance_sampling_ratio/min": 0.004501644056290388, + "sampling/sampling_logp_difference/max": 5.403312683105469, + "sampling/sampling_logp_difference/mean": 0.02058412693440914, + "step": 462 + }, + { + "clip_ratio/high_max": 2.1894326664551045e-05, + "clip_ratio/high_mean": 6.6363724613438535e-06, + "clip_ratio/low_mean": 8.431412652498693e-05, + "clip_ratio/low_min": 3.288245125077083e-05, + "clip_ratio/region_mean": 9.095049927054788e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15274.0, + "completions/mean_length": 6846.8828125, + "completions/mean_terminated_length": 6459.19482421875, + "completions/min_length": 627.0, + "completions/min_terminated_length": 627.0, + "entropy": 0.886472262442112, + "epoch": 0.42594296228150874, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002659202553331852, + "learning_rate": 1e-05, + "loss": 0.1199, + "num_tokens": 388344660.0, + "reward": 0.34375, + "reward_std": 0.40267258882522583, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000640153884888, + "sampling/importance_sampling_ratio/min": 0.00015848172188270837, + "sampling/sampling_logp_difference/max": 8.749871253967285, + "sampling/sampling_logp_difference/mean": 0.018909990787506104, + "step": 463 + }, + { + "clip_ratio/high_max": 1.3184767340135295e-05, + "clip_ratio/high_mean": 3.2961918350338237e-06, + "clip_ratio/low_mean": 4.2340758909631404e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.563695051729155e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16261.0, + "completions/mean_length": 6271.84375, + "completions/mean_terminated_length": 6029.15234375, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "entropy": 0.9538674280047417, + "epoch": 0.42686292548298066, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002775643253698945, + "learning_rate": 1e-05, + "loss": 0.062, + "num_tokens": 389167344.0, + "reward": 0.484375, + "reward_std": 0.29644322395324707, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000534057617188, + "sampling/importance_sampling_ratio/min": 0.0022844907362014055, + "sampling/sampling_logp_difference/max": 6.0816121101379395, + "sampling/sampling_logp_difference/mean": 0.020731300115585327, + "step": 464 + }, + { + "clip_ratio/high_max": 5.017863713874249e-06, + "clip_ratio/high_mean": 1.2544659284685622e-06, + "clip_ratio/low_mean": 3.720694280673342e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.846140884888882e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16305.0, + "completions/mean_length": 6312.9765625, + "completions/mean_terminated_length": 6233.67724609375, + "completions/min_length": 833.0, + "completions/min_terminated_length": 833.0, + "entropy": 0.937890075147152, + "epoch": 0.42778288868445263, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001834206865169108, + "learning_rate": 1e-05, + "loss": 0.0434, + "num_tokens": 389993613.0, + "reward": 0.484375, + "reward_std": 0.2987973093986511, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000054836273193, + "sampling/importance_sampling_ratio/min": 0.0004770693776663393, + "sampling/sampling_logp_difference/max": 7.647848606109619, + "sampling/sampling_logp_difference/mean": 0.020461473613977432, + "step": 465 + }, + { + "clip_ratio/high_max": 1.484874360357935e-05, + "clip_ratio/high_mean": 3.7121859008948377e-06, + "clip_ratio/low_mean": 3.374425170932227e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7456437212313176e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15638.0, + "completions/mean_length": 5643.125, + "completions/mean_terminated_length": 5385.34423828125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.9210820645093918, + "epoch": 0.42870285188592455, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0015243689995259047, + "learning_rate": 1e-05, + "loss": 0.0344, + "num_tokens": 390735629.0, + "reward": 0.4765625, + "reward_std": 0.31930169463157654, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998995661735535, + "sampling/importance_sampling_ratio/min": 1.4338597509322426e-07, + "sampling/sampling_logp_difference/max": 15.757725715637207, + "sampling/sampling_logp_difference/mean": 0.01841399073600769, + "step": 466 + }, + { + "clip_ratio/high_max": 5.748976491304347e-06, + "clip_ratio/high_mean": 1.4372441228260868e-06, + "clip_ratio/low_mean": 3.702218441503646e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.845942796942836e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16084.0, + "completions/mean_length": 8048.40625, + "completions/mean_terminated_length": 7848.3525390625, + "completions/min_length": 1236.0, + "completions/min_terminated_length": 1236.0, + "entropy": 1.048905499279499, + "epoch": 0.4296228150873965, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0026433062739670277, + "learning_rate": 1e-05, + "loss": 0.0548, + "num_tokens": 391786761.0, + "reward": 0.265625, + "reward_std": 0.22962789237499237, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000020980834961, + "sampling/importance_sampling_ratio/min": 0.0006000763387419283, + "sampling/sampling_logp_difference/max": 7.418453693389893, + "sampling/sampling_logp_difference/mean": 0.021647389978170395, + "step": 467 + }, + { + "clip_ratio/high_max": 2.0228523908372154e-05, + "clip_ratio/high_mean": 5.057130977093038e-06, + "clip_ratio/low_mean": 5.334191632755392e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.839904770255089e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16197.0, + "completions/mean_length": 7073.078125, + "completions/mean_terminated_length": 6772.7255859375, + "completions/min_length": 721.0, + "completions/min_terminated_length": 721.0, + "entropy": 1.0020805671811104, + "epoch": 0.43054277828886844, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019431376131251454, + "learning_rate": 1e-05, + "loss": 0.0792, + "num_tokens": 392709699.0, + "reward": 0.4140625, + "reward_std": 0.2914257347583771, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999632239341736, + "sampling/importance_sampling_ratio/min": 0.0003546403022482991, + "sampling/sampling_logp_difference/max": 7.944406509399414, + "sampling/sampling_logp_difference/mean": 0.020886382088065147, + "step": 468 + }, + { + "clip_ratio/high_max": 8.001388550837873e-06, + "clip_ratio/high_mean": 2.0003471377094684e-06, + "clip_ratio/low_mean": 5.976677766739158e-05, + "clip_ratio/low_min": 1.2241466720297467e-05, + "clip_ratio/region_mean": 6.176712395244977e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16249.0, + "completions/mean_length": 7128.5390625, + "completions/mean_terminated_length": 6981.62744140625, + "completions/min_length": 651.0, + "completions/min_terminated_length": 651.0, + "entropy": 0.9986839666962624, + "epoch": 0.43146274149034036, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002014609519392252, + "learning_rate": 1e-05, + "loss": 0.0787, + "num_tokens": 393643864.0, + "reward": 0.265625, + "reward_std": 0.3411741852760315, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000815391540527, + "sampling/importance_sampling_ratio/min": 0.0030073157977312803, + "sampling/sampling_logp_difference/max": 5.806707382202148, + "sampling/sampling_logp_difference/mean": 0.020323367789387703, + "step": 469 + }, + { + "clip_ratio/high_max": 1.0874447525566211e-05, + "clip_ratio/high_mean": 2.7186118813915527e-06, + "clip_ratio/low_mean": 3.265329507939896e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.537190696079051e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14445.0, + "completions/mean_length": 5538.625, + "completions/mean_terminated_length": 5366.4765625, + "completions/min_length": 1149.0, + "completions/min_terminated_length": 1149.0, + "entropy": 1.0297009721398354, + "epoch": 0.43238270469181234, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0019020825857296586, + "learning_rate": 1e-05, + "loss": 0.0277, + "num_tokens": 394371184.0, + "reward": 0.3515625, + "reward_std": 0.20699402689933777, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999639987945557, + "sampling/importance_sampling_ratio/min": 0.00010906249372055754, + "sampling/sampling_logp_difference/max": 9.123589515686035, + "sampling/sampling_logp_difference/mean": 0.01992623880505562, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.427005844969244e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.427005844969244e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16072.0, + "completions/mean_length": 7305.7109375, + "completions/mean_terminated_length": 7087.83251953125, + "completions/min_length": 1106.0, + "completions/min_terminated_length": 1106.0, + "entropy": 0.9444865211844444, + "epoch": 0.43330266789328425, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0037416366394609213, + "learning_rate": 1e-05, + "loss": 0.07, + "num_tokens": 395325427.0, + "reward": 0.375, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999666213989258, + "sampling/importance_sampling_ratio/min": 1.3054028613623814e-06, + "sampling/sampling_logp_difference/max": 13.548998832702637, + "sampling/sampling_logp_difference/mean": 0.02093587815761566, + "step": 471 + }, + { + "clip_ratio/high_max": 1.0206378192378907e-05, + "clip_ratio/high_mean": 2.5515945480947266e-06, + "clip_ratio/low_mean": 2.926629849753226e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.181789293194015e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16266.0, + "completions/mean_length": 6020.71875, + "completions/mean_terminated_length": 5686.4189453125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.9555193856358528, + "epoch": 0.43422263109475623, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003123396774753928, + "learning_rate": 1e-05, + "loss": 0.0906, + "num_tokens": 396118047.0, + "reward": 0.375, + "reward_std": 0.29143065214157104, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999966025352478, + "sampling/importance_sampling_ratio/min": 9.029568900587037e-05, + "sampling/sampling_logp_difference/max": 9.312420845031738, + "sampling/sampling_logp_difference/mean": 0.019349105656147003, + "step": 472 + }, + { + "clip_ratio/high_max": 7.391638519038679e-06, + "clip_ratio/high_mean": 1.8479096297596698e-06, + "clip_ratio/low_mean": 4.082024281615304e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.266815255959955e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16177.0, + "completions/mean_length": 6789.203125, + "completions/mean_terminated_length": 6149.55029296875, + "completions/min_length": 528.0, + "completions/min_terminated_length": 528.0, + "entropy": 0.8103364855051041, + "epoch": 0.43514259429622815, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017067189328372478, + "learning_rate": 1e-05, + "loss": 0.0618, + "num_tokens": 397008497.0, + "reward": 0.421875, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000635385513306, + "sampling/importance_sampling_ratio/min": 1.8778002868202748e-06, + "sampling/sampling_logp_difference/max": 13.185409545898438, + "sampling/sampling_logp_difference/mean": 0.01813405565917492, + "step": 473 + }, + { + "clip_ratio/high_max": 3.4544700611149892e-06, + "clip_ratio/high_mean": 1.6775043150119018e-06, + "clip_ratio/low_mean": 3.894365818268852e-05, + "clip_ratio/low_min": 3.4544700611149892e-06, + "clip_ratio/region_mean": 4.0621162042953074e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16205.0, + "completions/mean_length": 8000.53125, + "completions/mean_terminated_length": 7934.51953125, + "completions/min_length": 911.0, + "completions/min_terminated_length": 911.0, + "entropy": 1.0201406553387642, + "epoch": 0.43606255749770007, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001533582923002541, + "learning_rate": 1e-05, + "loss": 0.0826, + "num_tokens": 398052373.0, + "reward": 0.328125, + "reward_std": 0.26303553581237793, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000503063201904, + "sampling/importance_sampling_ratio/min": 3.783419288083678e-06, + "sampling/sampling_logp_difference/max": 12.484882354736328, + "sampling/sampling_logp_difference/mean": 0.02113974839448929, + "step": 474 + }, + { + "clip_ratio/high_max": 5.666878223564709e-06, + "clip_ratio/high_mean": 1.4167195558911772e-06, + "clip_ratio/low_mean": 1.8879915842262562e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0296635739214253e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15121.0, + "completions/max_terminated_length": 15121.0, + "completions/mean_length": 6122.6875, + "completions/mean_terminated_length": 6122.6875, + "completions/min_length": 512.0, + "completions/min_terminated_length": 512.0, + "entropy": 1.0430640205740929, + "epoch": 0.43698252069917204, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0025845973286777735, + "learning_rate": 1e-05, + "loss": 0.0193, + "num_tokens": 398855205.0, + "reward": 0.5, + "reward_std": 0.24777325987815857, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999109506607056, + "sampling/importance_sampling_ratio/min": 3.3893353247549385e-05, + "sampling/sampling_logp_difference/max": 10.292291641235352, + "sampling/sampling_logp_difference/mean": 0.020821597427129745, + "step": 475 + }, + { + "clip_ratio/high_max": 6.862502914373181e-06, + "clip_ratio/high_mean": 1.7156257285932952e-06, + "clip_ratio/low_mean": 3.732125173883105e-05, + "clip_ratio/low_min": 3.870448381348979e-06, + "clip_ratio/region_mean": 3.9036877069520415e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16308.0, + "completions/mean_length": 6895.4453125, + "completions/mean_terminated_length": 6820.732421875, + "completions/min_length": 758.0, + "completions/min_terminated_length": 758.0, + "entropy": 1.097649298608303, + "epoch": 0.43790248390064396, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00137829699087888, + "learning_rate": 1e-05, + "loss": 0.0647, + "num_tokens": 399758166.0, + "reward": 0.2890625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999674558639526, + "sampling/importance_sampling_ratio/min": 8.400417755183298e-06, + "sampling/sampling_logp_difference/max": 11.68722915649414, + "sampling/sampling_logp_difference/mean": 0.02135382406413555, + "step": 476 + }, + { + "clip_ratio/high_max": 8.859707577357767e-06, + "clip_ratio/high_mean": 2.2149268943394418e-06, + "clip_ratio/low_mean": 3.0371424600161845e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.258635138081445e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14854.0, + "completions/mean_length": 5552.265625, + "completions/mean_terminated_length": 5380.33349609375, + "completions/min_length": 1018.0, + "completions/min_terminated_length": 1018.0, + "entropy": 0.9384580478072166, + "epoch": 0.43882244710211593, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002469305880367756, + "learning_rate": 1e-05, + "loss": 0.0868, + "num_tokens": 400488560.0, + "reward": 0.515625, + "reward_std": 0.29826050996780396, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998993277549744, + "sampling/importance_sampling_ratio/min": 1.934680221893359e-05, + "sampling/sampling_logp_difference/max": 10.852983474731445, + "sampling/sampling_logp_difference/mean": 0.019046220928430557, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.751295116671827e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.751295116671827e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16118.0, + "completions/mean_length": 6492.8125, + "completions/mean_terminated_length": 6335.81005859375, + "completions/min_length": 1238.0, + "completions/min_terminated_length": 1238.0, + "entropy": 0.9447641968727112, + "epoch": 0.43974241030358785, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019261077977716923, + "learning_rate": 1e-05, + "loss": 0.0684, + "num_tokens": 401339544.0, + "reward": 0.359375, + "reward_std": 0.27221953868865967, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999949932098389, + "sampling/importance_sampling_ratio/min": 0.016565052792429924, + "sampling/sampling_logp_difference/max": 4.100460052490234, + "sampling/sampling_logp_difference/mean": 0.018938450142741203, + "step": 478 + }, + { + "clip_ratio/high_max": 1.0270573739035171e-05, + "clip_ratio/high_mean": 2.567643434758793e-06, + "clip_ratio/low_mean": 3.2130441354638606e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4698084505180304e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15458.0, + "completions/mean_length": 6688.5546875, + "completions/mean_terminated_length": 6211.72900390625, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "entropy": 0.9593756124377251, + "epoch": 0.4406623735050598, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027252996806055307, + "learning_rate": 1e-05, + "loss": 0.0449, + "num_tokens": 402213983.0, + "reward": 0.4375, + "reward_std": 0.24435339868068695, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999223947525024, + "sampling/importance_sampling_ratio/min": 0.09333998709917068, + "sampling/sampling_logp_difference/max": 2.371506690979004, + "sampling/sampling_logp_difference/mean": 0.020656028762459755, + "step": 479 + }, + { + "clip_ratio/high_max": 4.220976734359283e-06, + "clip_ratio/high_mean": 1.0552441835898208e-06, + "clip_ratio/low_mean": 2.7019574872610974e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.807481928357447e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15739.0, + "completions/mean_length": 6957.8828125, + "completions/mean_terminated_length": 6808.26220703125, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "entropy": 0.9458145052194595, + "epoch": 0.44158233670653174, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0021163993515074253, + "learning_rate": 1e-05, + "loss": -0.0054, + "num_tokens": 403124296.0, + "reward": 0.3125, + "reward_std": 0.19568344950675964, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000032186508179, + "sampling/importance_sampling_ratio/min": 5.414607926468307e-07, + "sampling/sampling_logp_difference/max": 14.428995132446289, + "sampling/sampling_logp_difference/mean": 0.019670519977808, + "step": 480 + }, + { + "clip_ratio/high_max": 1.4141203109829803e-05, + "clip_ratio/high_mean": 4.24627120310106e-06, + "clip_ratio/low_mean": 3.319961399483873e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7445884800035856e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16107.0, + "completions/mean_length": 7141.8359375, + "completions/mean_terminated_length": 6843.701171875, + "completions/min_length": 1005.0, + "completions/min_terminated_length": 1005.0, + "entropy": 0.9727424532175064, + "epoch": 0.44250229990800366, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0024569793604314327, + "learning_rate": 1e-05, + "loss": 0.0378, + "num_tokens": 404056571.0, + "reward": 0.421875, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999956488609314, + "sampling/importance_sampling_ratio/min": 8.950789379014168e-06, + "sampling/sampling_logp_difference/max": 11.62376880645752, + "sampling/sampling_logp_difference/mean": 0.020752113312482834, + "step": 481 + }, + { + "clip_ratio/high_max": 1.5587193956889678e-05, + "clip_ratio/high_mean": 4.596514145305264e-06, + "clip_ratio/low_mean": 6.96504166626255e-05, + "clip_ratio/low_min": 7.279775445567793e-06, + "clip_ratio/region_mean": 7.424693194479914e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16250.0, + "completions/mean_length": 7685.046875, + "completions/mean_terminated_length": 7476.2724609375, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "entropy": 0.9029846489429474, + "epoch": 0.44342226310947563, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0019990119617432356, + "learning_rate": 1e-05, + "loss": 0.1109, + "num_tokens": 405058705.0, + "reward": 0.421875, + "reward_std": 0.38375797867774963, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999930262565613, + "sampling/importance_sampling_ratio/min": 0.002107172505930066, + "sampling/sampling_logp_difference/max": 6.162408351898193, + "sampling/sampling_logp_difference/mean": 0.01937328279018402, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.7506703443359584e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7506703443359584e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16276.0, + "completions/mean_length": 7004.21875, + "completions/mean_terminated_length": 6779.1044921875, + "completions/min_length": 936.0, + "completions/min_terminated_length": 936.0, + "entropy": 0.9121566936373711, + "epoch": 0.44434222631094755, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029584914445877075, + "learning_rate": 1e-05, + "loss": 0.0327, + "num_tokens": 405974789.0, + "reward": 0.5234375, + "reward_std": 0.26826781034469604, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000674724578857, + "sampling/importance_sampling_ratio/min": 0.000792751437984407, + "sampling/sampling_logp_difference/max": 7.140000820159912, + "sampling/sampling_logp_difference/mean": 0.019368886947631836, + "step": 483 + }, + { + "clip_ratio/high_max": 1.2470530009522918e-05, + "clip_ratio/high_mean": 3.1176325023807294e-06, + "clip_ratio/low_mean": 3.606646794196422e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.918410050118837e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15946.0, + "completions/mean_length": 6294.90625, + "completions/mean_terminated_length": 6215.46435546875, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "entropy": 0.899978794157505, + "epoch": 0.4452621895124195, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001394490827806294, + "learning_rate": 1e-05, + "loss": 0.0376, + "num_tokens": 406798417.0, + "reward": 0.4296875, + "reward_std": 0.2577856183052063, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000015497207642, + "sampling/importance_sampling_ratio/min": 0.0007101757801137865, + "sampling/sampling_logp_difference/max": 7.249998092651367, + "sampling/sampling_logp_difference/mean": 0.018764980137348175, + "step": 484 + }, + { + "clip_ratio/high_max": 1.568959305586759e-05, + "clip_ratio/high_mean": 3.9223982639668975e-06, + "clip_ratio/low_mean": 3.593084011299652e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.985323814958974e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15685.0, + "completions/mean_length": 6940.046875, + "completions/mean_terminated_length": 6790.14306640625, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "entropy": 0.9006319642066956, + "epoch": 0.44618215271389144, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002361331367865205, + "learning_rate": 1e-05, + "loss": 0.0285, + "num_tokens": 407703351.0, + "reward": 0.4453125, + "reward_std": 0.35611939430236816, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999904036521912, + "sampling/importance_sampling_ratio/min": 4.8537625843891874e-05, + "sampling/sampling_logp_difference/max": 9.933171272277832, + "sampling/sampling_logp_difference/mean": 0.019578561186790466, + "step": 485 + }, + { + "clip_ratio/high_max": 5.896504717384232e-06, + "clip_ratio/high_mean": 1.474126179346058e-06, + "clip_ratio/low_mean": 4.614499187027832e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7619118163311214e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 6362.484375, + "completions/mean_terminated_length": 6283.57470703125, + "completions/min_length": 638.0, + "completions/min_terminated_length": 638.0, + "entropy": 0.9299133494496346, + "epoch": 0.44710211591536336, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027458088006824255, + "learning_rate": 1e-05, + "loss": 0.0537, + "num_tokens": 408537765.0, + "reward": 0.4296875, + "reward_std": 0.3595392107963562, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999920129776001, + "sampling/importance_sampling_ratio/min": 0.0007113060564734042, + "sampling/sampling_logp_difference/max": 7.24840784072876, + "sampling/sampling_logp_difference/mean": 0.019821636378765106, + "step": 486 + }, + { + "clip_ratio/high_max": 2.0891785879939562e-05, + "clip_ratio/high_mean": 7.879635973040422e-06, + "clip_ratio/low_mean": 2.6475246386326035e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.435488224567962e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15924.0, + "completions/max_terminated_length": 15924.0, + "completions/mean_length": 5226.765625, + "completions/mean_terminated_length": 5226.765625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 1.0277203470468521, + "epoch": 0.44802207911683534, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024366467259824276, + "learning_rate": 1e-05, + "loss": 0.0525, + "num_tokens": 409223903.0, + "reward": 0.546875, + "reward_std": 0.3006146252155304, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000044584274292, + "sampling/importance_sampling_ratio/min": 0.01590813137590885, + "sampling/sampling_logp_difference/max": 4.14092493057251, + "sampling/sampling_logp_difference/mean": 0.019991066306829453, + "step": 487 + }, + { + "clip_ratio/high_max": 9.688145382824587e-06, + "clip_ratio/high_mean": 2.4220363457061467e-06, + "clip_ratio/low_mean": 1.920005956890236e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.162209625566902e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12344.0, + "completions/max_terminated_length": 12344.0, + "completions/mean_length": 5051.0, + "completions/mean_terminated_length": 5051.0, + "completions/min_length": 954.0, + "completions/min_terminated_length": 954.0, + "entropy": 0.8572651967406273, + "epoch": 0.44894204231830726, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0027017516549676657, + "learning_rate": 1e-05, + "loss": -0.003, + "num_tokens": 409895199.0, + "reward": 0.6015625, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.6015625, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999940037727356, + "sampling/importance_sampling_ratio/min": 5.7065666624112055e-05, + "sampling/sampling_logp_difference/max": 9.771307945251465, + "sampling/sampling_logp_difference/mean": 0.01831716299057007, + "step": 488 + }, + { + "clip_ratio/high_max": 1.5306721707020188e-05, + "clip_ratio/high_mean": 3.826680426755047e-06, + "clip_ratio/low_mean": 3.0764163398089295e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4590844165904855e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13702.0, + "completions/mean_length": 6231.9765625, + "completions/mean_terminated_length": 6070.83349609375, + "completions/min_length": 488.0, + "completions/min_terminated_length": 488.0, + "entropy": 0.9115571528673172, + "epoch": 0.44986200551977923, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021461176220327616, + "learning_rate": 1e-05, + "loss": 0.0853, + "num_tokens": 410711300.0, + "reward": 0.4765625, + "reward_std": 0.2672119140625, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000066757202148, + "sampling/importance_sampling_ratio/min": 0.00019801831513177603, + "sampling/sampling_logp_difference/max": 8.527151107788086, + "sampling/sampling_logp_difference/mean": 0.019596103578805923, + "step": 489 + }, + { + "clip_ratio/high_max": 2.7797910661320202e-05, + "clip_ratio/high_mean": 9.322406867795507e-06, + "clip_ratio/low_mean": 6.275825364809862e-05, + "clip_ratio/low_min": 3.0194694318197435e-06, + "clip_ratio/region_mean": 7.208066119801515e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16059.0, + "completions/mean_length": 6766.4765625, + "completions/mean_terminated_length": 6375.52001953125, + "completions/min_length": 764.0, + "completions/min_terminated_length": 764.0, + "entropy": 0.8712737187743187, + "epoch": 0.45078196872125115, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0019740054849535227, + "learning_rate": 1e-05, + "loss": 0.091, + "num_tokens": 411597969.0, + "reward": 0.4609375, + "reward_std": 0.3521803915500641, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99997878074646, + "sampling/importance_sampling_ratio/min": 7.488903065677732e-05, + "sampling/sampling_logp_difference/max": 9.499503135681152, + "sampling/sampling_logp_difference/mean": 0.018991166725754738, + "step": 490 + }, + { + "clip_ratio/high_max": 4.992810318071861e-06, + "clip_ratio/high_mean": 1.2482025795179652e-06, + "clip_ratio/low_mean": 1.100720277236178e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.2255405295036326e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14776.0, + "completions/max_terminated_length": 14776.0, + "completions/mean_length": 6619.1171875, + "completions/mean_terminated_length": 6619.1171875, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "entropy": 1.1462209969758987, + "epoch": 0.45170193192272307, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.001665184274315834, + "learning_rate": 1e-05, + "loss": 0.0161, + "num_tokens": 412464384.0, + "reward": 0.3046875, + "reward_std": 0.17806214094161987, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999956488609314, + "sampling/importance_sampling_ratio/min": 0.009808298200368881, + "sampling/sampling_logp_difference/max": 4.624526500701904, + "sampling/sampling_logp_difference/mean": 0.02124062180519104, + "step": 491 + }, + { + "clip_ratio/high_max": 1.5520400665991474e-05, + "clip_ratio/high_mean": 3.8801001664978685e-06, + "clip_ratio/low_mean": 2.0763711063409573e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.464381134359428e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16219.0, + "completions/mean_length": 7035.25, + "completions/mean_terminated_length": 6886.857421875, + "completions/min_length": 821.0, + "completions/min_terminated_length": 821.0, + "entropy": 0.9810440614819527, + "epoch": 0.45262189512419504, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0015779118984937668, + "learning_rate": 1e-05, + "loss": 0.0582, + "num_tokens": 413383792.0, + "reward": 0.4453125, + "reward_std": 0.21436068415641785, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999344944953918, + "sampling/importance_sampling_ratio/min": 0.01566622592508793, + "sampling/sampling_logp_difference/max": 4.156248092651367, + "sampling/sampling_logp_difference/mean": 0.021432677283883095, + "step": 492 + }, + { + "clip_ratio/high_max": 4.644250566343544e-06, + "clip_ratio/high_mean": 1.161062641585886e-06, + "clip_ratio/low_mean": 3.4143843777201255e-05, + "clip_ratio/low_min": 3.276024699516711e-06, + "clip_ratio/region_mean": 3.530490653247398e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15869.0, + "completions/mean_length": 6945.9375, + "completions/mean_terminated_length": 6796.12744140625, + "completions/min_length": 1192.0, + "completions/min_terminated_length": 1192.0, + "entropy": 0.7932121306657791, + "epoch": 0.45354185832566696, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0013254050863906741, + "learning_rate": 1e-05, + "loss": 0.0357, + "num_tokens": 414290000.0, + "reward": 0.4921875, + "reward_std": 0.2767002284526825, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999009370803833, + "sampling/importance_sampling_ratio/min": 7.031726272543892e-05, + "sampling/sampling_logp_difference/max": 9.562493324279785, + "sampling/sampling_logp_difference/mean": 0.018196485936641693, + "step": 493 + }, + { + "clip_ratio/high_max": 1.8977402305608848e-05, + "clip_ratio/high_mean": 4.744350576402212e-06, + "clip_ratio/low_mean": 3.744401988114987e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.218837011649157e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14930.0, + "completions/mean_length": 7526.4375, + "completions/mean_terminated_length": 7313.8564453125, + "completions/min_length": 487.0, + "completions/min_terminated_length": 487.0, + "entropy": 0.9790460616350174, + "epoch": 0.45446182152713893, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001866620616056025, + "learning_rate": 1e-05, + "loss": 0.0707, + "num_tokens": 415272280.0, + "reward": 0.4140625, + "reward_std": 0.2517249584197998, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998944997787476, + "sampling/importance_sampling_ratio/min": 0.00020347593817859888, + "sampling/sampling_logp_difference/max": 8.49996280670166, + "sampling/sampling_logp_difference/mean": 0.020433884114027023, + "step": 494 + }, + { + "clip_ratio/high_max": 7.432954589603469e-06, + "clip_ratio/high_mean": 3.44574186783575e-06, + "clip_ratio/low_mean": 4.426451175731927e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7710253397781344e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15964.0, + "completions/mean_length": 6108.8671875, + "completions/mean_terminated_length": 5862.26416015625, + "completions/min_length": 527.0, + "completions/min_terminated_length": 527.0, + "entropy": 0.8818904608488083, + "epoch": 0.45538178472861085, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002431972650811076, + "learning_rate": 1e-05, + "loss": 0.0175, + "num_tokens": 416072591.0, + "reward": 0.59375, + "reward_std": 0.26720699667930603, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999450445175171, + "sampling/importance_sampling_ratio/min": 0.001706472015939653, + "sampling/sampling_logp_difference/max": 6.373327255249023, + "sampling/sampling_logp_difference/mean": 0.01932165026664734, + "step": 495 + }, + { + "clip_ratio/high_max": 9.704292551759863e-06, + "clip_ratio/high_mean": 2.426073137939966e-06, + "clip_ratio/low_mean": 1.47394894156605e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7165562553600466e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15239.0, + "completions/max_terminated_length": 15239.0, + "completions/mean_length": 6841.59375, + "completions/mean_terminated_length": 6841.59375, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 1.1732418313622475, + "epoch": 0.4563017479300828, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002203838201239705, + "learning_rate": 1e-05, + "loss": 0.0308, + "num_tokens": 416966187.0, + "reward": 0.4296875, + "reward_std": 0.2637920379638672, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998947381973267, + "sampling/importance_sampling_ratio/min": 0.0004944052780047059, + "sampling/sampling_logp_difference/max": 7.612154960632324, + "sampling/sampling_logp_difference/mean": 0.02160799130797386, + "step": 496 + }, + { + "clip_ratio/high_max": 2.328647701688169e-05, + "clip_ratio/high_mean": 5.821619254220423e-06, + "clip_ratio/low_mean": 5.462882245410583e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.0450441651482834e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13836.0, + "completions/max_terminated_length": 13836.0, + "completions/mean_length": 5898.7421875, + "completions/mean_terminated_length": 5898.7421875, + "completions/min_length": 675.0, + "completions/min_terminated_length": 675.0, + "entropy": 0.9141146093606949, + "epoch": 0.45722171113155474, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0028326623141765594, + "learning_rate": 1e-05, + "loss": 0.0662, + "num_tokens": 417740586.0, + "reward": 0.4453125, + "reward_std": 0.32984596490859985, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998853206634521, + "sampling/importance_sampling_ratio/min": 1.0281119102728553e-06, + "sampling/sampling_logp_difference/max": 13.787786483764648, + "sampling/sampling_logp_difference/mean": 0.01856965571641922, + "step": 497 + }, + { + "clip_ratio/high_max": 2.667783610377228e-05, + "clip_ratio/high_mean": 6.66945902594307e-06, + "clip_ratio/low_mean": 4.455613873233233e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.122559878145694e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16314.0, + "completions/mean_length": 6416.140625, + "completions/mean_terminated_length": 6176.912109375, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "entropy": 0.8854602724313736, + "epoch": 0.45814167433302666, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001950124162249267, + "learning_rate": 1e-05, + "loss": 0.0544, + "num_tokens": 418579788.0, + "reward": 0.5078125, + "reward_std": 0.25012245774269104, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998780488967896, + "sampling/importance_sampling_ratio/min": 2.6581541533232667e-05, + "sampling/sampling_logp_difference/max": 10.535293579101562, + "sampling/sampling_logp_difference/mean": 0.01931869424879551, + "step": 498 + }, + { + "clip_ratio/high_max": 3.6452713629842037e-06, + "clip_ratio/high_mean": 9.113178407460509e-07, + "clip_ratio/low_mean": 3.819847256636422e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.910979035026685e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15548.0, + "completions/mean_length": 7738.2578125, + "completions/mean_terminated_length": 7313.05712890625, + "completions/min_length": 1227.0, + "completions/min_terminated_length": 1227.0, + "entropy": 0.9239770472049713, + "epoch": 0.45906163753449863, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0016899642068892717, + "learning_rate": 1e-05, + "loss": 0.0844, + "num_tokens": 419589021.0, + "reward": 0.375, + "reward_std": 0.20069600641727448, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000193119049072, + "sampling/importance_sampling_ratio/min": 0.00016869053069967777, + "sampling/sampling_logp_difference/max": 8.687444686889648, + "sampling/sampling_logp_difference/mean": 0.01966589316725731, + "step": 499 + }, + { + "clip_ratio/high_max": 1.0700351140258135e-05, + "clip_ratio/high_mean": 2.675087785064534e-06, + "clip_ratio/low_mean": 3.456382330568886e-05, + "clip_ratio/low_min": 4.663483196054585e-06, + "clip_ratio/region_mean": 3.723891120444023e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16240.0, + "completions/mean_length": 7594.921875, + "completions/mean_terminated_length": 7383.984375, + "completions/min_length": 1049.0, + "completions/min_terminated_length": 1049.0, + "entropy": 0.9970445707440376, + "epoch": 0.45998160073597055, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0026633136440068483, + "learning_rate": 1e-05, + "loss": 0.0538, + "num_tokens": 420579459.0, + "reward": 0.40625, + "reward_std": 0.26827272772789, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000439882278442, + "sampling/importance_sampling_ratio/min": 0.000756366120185703, + "sampling/sampling_logp_difference/max": 7.186985015869141, + "sampling/sampling_logp_difference/mean": 0.020969431847333908, + "step": 500 + }, + { + "clip_ratio/high_max": 2.166650710933027e-05, + "clip_ratio/high_mean": 6.6261792426303145e-06, + "clip_ratio/low_mean": 5.730952580051962e-05, + "clip_ratio/low_min": 4.826068561669672e-06, + "clip_ratio/region_mean": 6.393570629370515e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14856.0, + "completions/max_terminated_length": 14856.0, + "completions/mean_length": 5897.2890625, + "completions/mean_terminated_length": 5897.2890625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "entropy": 0.9427390918135643, + "epoch": 0.4609015639374425, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0015446916222572327, + "learning_rate": 1e-05, + "loss": -0.0487, + "num_tokens": 421354536.0, + "reward": 0.40625, + "reward_std": 0.32325342297554016, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000121593475342, + "sampling/importance_sampling_ratio/min": 0.00037080893525853753, + "sampling/sampling_logp_difference/max": 7.8998236656188965, + "sampling/sampling_logp_difference/mean": 0.019464563578367233, + "step": 501 + }, + { + "clip_ratio/high_max": 3.1168960958893877e-06, + "clip_ratio/high_mean": 7.792240239723469e-07, + "clip_ratio/low_mean": 1.842527422013518e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9204498244107526e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16365.0, + "completions/mean_length": 7197.1875, + "completions/mean_terminated_length": 6900.83837890625, + "completions/min_length": 1181.0, + "completions/min_terminated_length": 1181.0, + "entropy": 0.9357216581702232, + "epoch": 0.46182152713891444, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0019460292533040047, + "learning_rate": 1e-05, + "loss": 0.0249, + "num_tokens": 422296632.0, + "reward": 0.4921875, + "reward_std": 0.20934812724590302, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999638795852661, + "sampling/importance_sampling_ratio/min": 0.0004937088815495372, + "sampling/sampling_logp_difference/max": 7.613564491271973, + "sampling/sampling_logp_difference/mean": 0.0199101734906435, + "step": 502 + }, + { + "clip_ratio/high_max": 3.01917771139415e-06, + "clip_ratio/high_mean": 7.547944278485375e-07, + "clip_ratio/low_mean": 2.4536840555811068e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5291634983659605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16092.0, + "completions/mean_length": 6675.8515625, + "completions/mean_terminated_length": 6599.40966796875, + "completions/min_length": 1369.0, + "completions/min_terminated_length": 1369.0, + "entropy": 0.8980752006173134, + "epoch": 0.46274149034038636, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017490689642727375, + "learning_rate": 1e-05, + "loss": 0.0634, + "num_tokens": 423170085.0, + "reward": 0.484375, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999966025352478, + "sampling/importance_sampling_ratio/min": 4.0153237932827324e-05, + "sampling/sampling_logp_difference/max": 10.122807502746582, + "sampling/sampling_logp_difference/mean": 0.01868046447634697, + "step": 503 + }, + { + "clip_ratio/high_max": 1.4156895304040518e-05, + "clip_ratio/high_mean": 4.290660626793397e-06, + "clip_ratio/low_mean": 4.468955739866942e-05, + "clip_ratio/low_min": 3.951194685214432e-06, + "clip_ratio/region_mean": 4.898021779808914e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16300.0, + "completions/mean_length": 7394.5546875, + "completions/mean_terminated_length": 6874.50390625, + "completions/min_length": 909.0, + "completions/min_terminated_length": 909.0, + "entropy": 0.891602098941803, + "epoch": 0.46366145354185834, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0026261890307068825, + "learning_rate": 1e-05, + "loss": 0.0981, + "num_tokens": 424134916.0, + "reward": 0.484375, + "reward_std": 0.32719242572784424, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999977707862854, + "sampling/importance_sampling_ratio/min": 0.0019415394635871053, + "sampling/sampling_logp_difference/max": 6.244274139404297, + "sampling/sampling_logp_difference/mean": 0.018863018602132797, + "step": 504 + }, + { + "clip_ratio/high_max": 4.867222287430195e-06, + "clip_ratio/high_mean": 1.2168055718575488e-06, + "clip_ratio/low_mean": 2.737805482411204e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8594860509656428e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16243.0, + "completions/mean_length": 5508.3359375, + "completions/mean_terminated_length": 5422.70068359375, + "completions/min_length": 500.0, + "completions/min_terminated_length": 500.0, + "entropy": 0.9608336761593819, + "epoch": 0.46458141674333026, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0030600661411881447, + "learning_rate": 1e-05, + "loss": 0.0369, + "num_tokens": 424860847.0, + "reward": 0.5625, + "reward_std": 0.21884137392044067, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999776482582092, + "sampling/importance_sampling_ratio/min": 4.006533345091157e-05, + "sampling/sampling_logp_difference/max": 10.124999046325684, + "sampling/sampling_logp_difference/mean": 0.018935665488243103, + "step": 505 + }, + { + "clip_ratio/high_max": 1.3109260635246756e-05, + "clip_ratio/high_mean": 3.277315158811689e-06, + "clip_ratio/low_mean": 3.854507008327346e-05, + "clip_ratio/low_min": 2.992077043018071e-06, + "clip_ratio/region_mean": 4.182238512839831e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16077.0, + "completions/mean_length": 7779.4765625, + "completions/mean_terminated_length": 7572.96826171875, + "completions/min_length": 612.0, + "completions/min_terminated_length": 612.0, + "entropy": 1.0322985425591469, + "epoch": 0.46550137994480223, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002075409982353449, + "learning_rate": 1e-05, + "loss": 0.0939, + "num_tokens": 425877532.0, + "reward": 0.421875, + "reward_std": 0.3337898254394531, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999785423278809, + "sampling/importance_sampling_ratio/min": 0.025282513350248337, + "sampling/sampling_logp_difference/max": 3.677642345428467, + "sampling/sampling_logp_difference/mean": 0.020769601687788963, + "step": 506 + }, + { + "clip_ratio/high_max": 1.4176180684444262e-05, + "clip_ratio/high_mean": 4.564619985103491e-06, + "clip_ratio/low_mean": 2.2551324207142898e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7115944419620064e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15292.0, + "completions/mean_length": 6004.21875, + "completions/mean_terminated_length": 5755.1044921875, + "completions/min_length": 992.0, + "completions/min_terminated_length": 992.0, + "entropy": 0.9162944257259369, + "epoch": 0.46642134314627415, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0039940495043993, + "learning_rate": 1e-05, + "loss": 0.0442, + "num_tokens": 426666008.0, + "reward": 0.6328125, + "reward_std": 0.31140607595443726, + "rewards/accuracy_reward/mean": 0.6328125, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000073909759521, + "sampling/importance_sampling_ratio/min": 5.144598981132731e-05, + "sampling/sampling_logp_difference/max": 9.874978065490723, + "sampling/sampling_logp_difference/mean": 0.01873711869120598, + "step": 507 + }, + { + "clip_ratio/high_max": 3.6937442473572446e-06, + "clip_ratio/high_mean": 9.234360618393112e-07, + "clip_ratio/low_mean": 3.4857803484555916e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.578123954639523e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14337.0, + "completions/mean_length": 6619.6015625, + "completions/mean_terminated_length": 6542.71630859375, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 1.1118961870670319, + "epoch": 0.46734130634774607, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002274538855999708, + "learning_rate": 1e-05, + "loss": 0.0259, + "num_tokens": 427535397.0, + "reward": 0.3125, + "reward_std": 0.2177756428718567, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000343322753906, + "sampling/importance_sampling_ratio/min": 2.4061378098849673e-06, + "sampling/sampling_logp_difference/max": 12.937487602233887, + "sampling/sampling_logp_difference/mean": 0.0214434452354908, + "step": 508 + }, + { + "clip_ratio/high_max": 7.764184829284204e-06, + "clip_ratio/high_mean": 1.941046207321051e-06, + "clip_ratio/low_mean": 2.4530202267669665e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6471248474990716e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15923.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6469.9765625, + "completions/mean_terminated_length": 6469.9765625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.8812271729111671, + "epoch": 0.46826126954921804, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020800349302589893, + "learning_rate": 1e-05, + "loss": 0.0592, + "num_tokens": 428379026.0, + "reward": 0.546875, + "reward_std": 0.2869548797607422, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999022483825684, + "sampling/importance_sampling_ratio/min": 9.611256973585114e-05, + "sampling/sampling_logp_difference/max": 9.249990463256836, + "sampling/sampling_logp_difference/mean": 0.01902790367603302, + "step": 509 + }, + { + "clip_ratio/high_max": 3.3670939956209622e-06, + "clip_ratio/high_mean": 8.417734989052406e-07, + "clip_ratio/low_mean": 3.1169882220183354e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.201165577593201e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16189.0, + "completions/mean_length": 7417.2421875, + "completions/mean_terminated_length": 7346.6376953125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 1.0124703496694565, + "epoch": 0.46918123275068996, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0013554802862927318, + "learning_rate": 1e-05, + "loss": 0.0499, + "num_tokens": 429347777.0, + "reward": 0.359375, + "reward_std": 0.24039676785469055, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999368190765381, + "sampling/importance_sampling_ratio/min": 1.086339216271881e-05, + "sampling/sampling_logp_difference/max": 11.4301118850708, + "sampling/sampling_logp_difference/mean": 0.02034895122051239, + "step": 510 + }, + { + "clip_ratio/high_max": 2.4966960609162925e-05, + "clip_ratio/high_mean": 6.241740152290731e-06, + "clip_ratio/low_mean": 2.400768698862521e-05, + "clip_ratio/low_min": 7.9038825333555e-06, + "clip_ratio/region_mean": 3.0249426572481752e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16108.0, + "completions/mean_length": 6539.7578125, + "completions/mean_terminated_length": 6383.50048828125, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "entropy": 0.9707148522138596, + "epoch": 0.47010119595216193, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0016008630627766252, + "learning_rate": 1e-05, + "loss": 0.0448, + "num_tokens": 430203402.0, + "reward": 0.5078125, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999600648880005, + "sampling/importance_sampling_ratio/min": 1.7258255269325673e-08, + "sampling/sampling_logp_difference/max": 17.874975204467773, + "sampling/sampling_logp_difference/mean": 0.01951115019619465, + "step": 511 + }, + { + "clip_ratio/high_max": 7.0406667873612605e-06, + "clip_ratio/high_mean": 1.7601666968403151e-06, + "clip_ratio/low_mean": 2.4132358305450907e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5892525002291222e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15933.0, + "completions/mean_length": 6722.53125, + "completions/mean_terminated_length": 6329.78857421875, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "entropy": 0.9293247908353806, + "epoch": 0.47102115915363385, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002203655894845724, + "learning_rate": 1e-05, + "loss": 0.0451, + "num_tokens": 431082350.0, + "reward": 0.46875, + "reward_std": 0.18543371558189392, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999996542930603, + "sampling/importance_sampling_ratio/min": 0.002989979926496744, + "sampling/sampling_logp_difference/max": 5.812488555908203, + "sampling/sampling_logp_difference/mean": 0.018750539049506187, + "step": 512 + }, + { + "clip_ratio/high_max": 5.424876235338161e-06, + "clip_ratio/high_mean": 1.3562190588345402e-06, + "clip_ratio/low_mean": 2.538728870149498e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.674350776032952e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15874.0, + "completions/mean_length": 6347.03125, + "completions/mean_terminated_length": 5766.3798828125, + "completions/min_length": 514.0, + "completions/min_terminated_length": 514.0, + "entropy": 0.9512053951621056, + "epoch": 0.47194112235510577, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002207641489803791, + "learning_rate": 1e-05, + "loss": 0.0261, + "num_tokens": 431914122.0, + "reward": 0.4765625, + "reward_std": 0.21648237109184265, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999993085861206, + "sampling/importance_sampling_ratio/min": 0.0011340104974806309, + "sampling/sampling_logp_difference/max": 6.781994819641113, + "sampling/sampling_logp_difference/mean": 0.01931341364979744, + "step": 513 + }, + { + "clip_ratio/high_max": 1.2328315506238141e-05, + "clip_ratio/high_mean": 3.0820788765595353e-06, + "clip_ratio/low_mean": 4.058695458297734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.366903374375397e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14757.0, + "completions/mean_length": 5719.8671875, + "completions/mean_terminated_length": 5635.8974609375, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 0.9754309803247452, + "epoch": 0.47286108555657774, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018057655543088913, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 432663249.0, + "reward": 0.4921875, + "reward_std": 0.32035762071609497, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999573230743408, + "sampling/importance_sampling_ratio/min": 0.00016155402408912778, + "sampling/sampling_logp_difference/max": 8.730670928955078, + "sampling/sampling_logp_difference/mean": 0.019999589771032333, + "step": 514 + }, + { + "clip_ratio/high_max": 3.34771721099969e-05, + "clip_ratio/high_mean": 8.369293027499225e-06, + "clip_ratio/low_mean": 3.319342158647487e-05, + "clip_ratio/low_min": 3.644846174211125e-06, + "clip_ratio/region_mean": 4.1562714159226744e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16206.0, + "completions/mean_length": 5969.1328125, + "completions/mean_terminated_length": 5803.81787109375, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "entropy": 0.9498241171240807, + "epoch": 0.47378104875804966, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002828414784744382, + "learning_rate": 1e-05, + "loss": 0.0843, + "num_tokens": 433448874.0, + "reward": 0.4375, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999672174453735, + "sampling/importance_sampling_ratio/min": 0.00043074661516584456, + "sampling/sampling_logp_difference/max": 7.749990463256836, + "sampling/sampling_logp_difference/mean": 0.019238140434026718, + "step": 515 + }, + { + "clip_ratio/high_max": 2.4458067855448462e-05, + "clip_ratio/high_mean": 7.50266553950496e-06, + "clip_ratio/low_mean": 4.7241341690096306e-05, + "clip_ratio/low_min": 4.075511242263019e-06, + "clip_ratio/region_mean": 5.4744006320106564e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14714.0, + "completions/max_terminated_length": 14714.0, + "completions/mean_length": 6808.3671875, + "completions/mean_terminated_length": 6808.3671875, + "completions/min_length": 857.0, + "completions/min_terminated_length": 857.0, + "entropy": 0.9247330650687218, + "epoch": 0.47470101195952163, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0019250004552304745, + "learning_rate": 1e-05, + "loss": 0.0535, + "num_tokens": 434338609.0, + "reward": 0.4921875, + "reward_std": 0.36007601022720337, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999670386314392, + "sampling/importance_sampling_ratio/min": 0.00025917106540873647, + "sampling/sampling_logp_difference/max": 8.25802230834961, + "sampling/sampling_logp_difference/mean": 0.01927364431321621, + "step": 516 + }, + { + "clip_ratio/high_max": 2.067027617158601e-05, + "clip_ratio/high_mean": 5.167569042896503e-06, + "clip_ratio/low_mean": 1.523887078747066e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0406439944054e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15500.0, + "completions/mean_length": 6119.921875, + "completions/mean_terminated_length": 6039.1025390625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.9210109040141106, + "epoch": 0.47562097516099355, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022343189921230078, + "learning_rate": 1e-05, + "loss": 0.0405, + "num_tokens": 435145247.0, + "reward": 0.5, + "reward_std": 0.2467075139284134, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998999834060669, + "sampling/importance_sampling_ratio/min": 0.00011216365965083241, + "sampling/sampling_logp_difference/max": 9.095551490783691, + "sampling/sampling_logp_difference/mean": 0.019618261605501175, + "step": 517 + }, + { + "clip_ratio/high_max": 1.9286600036139134e-05, + "clip_ratio/high_mean": 4.821650009034784e-06, + "clip_ratio/low_mean": 3.679497240227647e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1616622866058606e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16282.0, + "completions/mean_length": 6259.0625, + "completions/mean_terminated_length": 6179.33837890625, + "completions/min_length": 1087.0, + "completions/min_terminated_length": 1087.0, + "entropy": 0.9430939853191376, + "epoch": 0.4765409383624655, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00324260420165956, + "learning_rate": 1e-05, + "loss": 0.0634, + "num_tokens": 435964383.0, + "reward": 0.5, + "reward_std": 0.3424547016620636, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999647736549377, + "sampling/importance_sampling_ratio/min": 1.5690335203544237e-05, + "sampling/sampling_logp_difference/max": 11.06246566772461, + "sampling/sampling_logp_difference/mean": 0.019678015261888504, + "step": 518 + }, + { + "clip_ratio/high_max": 5.182851054996718e-06, + "clip_ratio/high_mean": 1.2957127637491794e-06, + "clip_ratio/low_mean": 3.5416796038134635e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6712508745040395e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14682.0, + "completions/mean_length": 6898.8671875, + "completions/mean_terminated_length": 6748.31005859375, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.9633238166570663, + "epoch": 0.47746090156393745, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0017788221593946218, + "learning_rate": 1e-05, + "loss": 0.085, + "num_tokens": 436866830.0, + "reward": 0.328125, + "reward_std": 0.26932865381240845, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000014305114746, + "sampling/importance_sampling_ratio/min": 0.007227231748402119, + "sampling/sampling_logp_difference/max": 4.929899215698242, + "sampling/sampling_logp_difference/mean": 0.019975006580352783, + "step": 519 + }, + { + "clip_ratio/high_max": 1.8337552319280803e-05, + "clip_ratio/high_mean": 4.584388079820201e-06, + "clip_ratio/low_mean": 3.3715954828039685e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8300342453112535e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16277.0, + "completions/mean_length": 6568.8359375, + "completions/mean_terminated_length": 6333.2724609375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.9648878574371338, + "epoch": 0.47838086476540936, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021614902652800083, + "learning_rate": 1e-05, + "loss": 0.079, + "num_tokens": 437728081.0, + "reward": 0.4140625, + "reward_std": 0.24487745761871338, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999655485153198, + "sampling/importance_sampling_ratio/min": 0.001384100178256631, + "sampling/sampling_logp_difference/max": 6.582705020904541, + "sampling/sampling_logp_difference/mean": 0.019699109718203545, + "step": 520 + }, + { + "clip_ratio/high_max": 1.9740967672987608e-05, + "clip_ratio/high_mean": 4.935241918246902e-06, + "clip_ratio/low_mean": 5.360748559724016e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.0295990477970918e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16261.0, + "completions/mean_length": 6709.7265625, + "completions/mean_terminated_length": 6233.9423828125, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "entropy": 0.791545994579792, + "epoch": 0.47930082796688134, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002030634554103017, + "learning_rate": 1e-05, + "loss": 0.0262, + "num_tokens": 438605294.0, + "reward": 0.5, + "reward_std": 0.2435920089483261, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999866485595703, + "sampling/importance_sampling_ratio/min": 0.00981139950454235, + "sampling/sampling_logp_difference/max": 4.624210357666016, + "sampling/sampling_logp_difference/mean": 0.01805954799056053, + "step": 521 + }, + { + "clip_ratio/high_max": 7.663652240808005e-06, + "clip_ratio/high_mean": 1.9159130602020014e-06, + "clip_ratio/low_mean": 2.266609857315416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4582011747042998e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 6556.9140625, + "completions/mean_terminated_length": 6400.9287109375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.886083297431469, + "epoch": 0.48022079116835326, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0014125843299552798, + "learning_rate": 1e-05, + "loss": 0.0634, + "num_tokens": 439462971.0, + "reward": 0.4921875, + "reward_std": 0.3158818185329437, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999947726726532, + "sampling/importance_sampling_ratio/min": 3.454853825246573e-08, + "sampling/sampling_logp_difference/max": 17.18090057373047, + "sampling/sampling_logp_difference/mean": 0.018355879932641983, + "step": 522 + }, + { + "clip_ratio/high_max": 9.186456281895516e-06, + "clip_ratio/high_mean": 2.296614070473879e-06, + "clip_ratio/low_mean": 3.2019113405112876e-05, + "clip_ratio/low_min": 4.055676527059404e-06, + "clip_ratio/region_mean": 3.431572758927359e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16161.0, + "completions/mean_length": 6152.4921875, + "completions/mean_terminated_length": 6071.92919921875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9536242336034775, + "epoch": 0.48114075436982523, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00171169254463166, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 440268882.0, + "reward": 0.484375, + "reward_std": 0.250127375125885, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99989914894104, + "sampling/importance_sampling_ratio/min": 0.03775034472346306, + "sampling/sampling_logp_difference/max": 3.2767605781555176, + "sampling/sampling_logp_difference/mean": 0.018800247460603714, + "step": 523 + }, + { + "clip_ratio/high_max": 8.734396942600142e-06, + "clip_ratio/high_mean": 2.1835992356500356e-06, + "clip_ratio/low_mean": 4.899439159089525e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.117799059917161e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15578.0, + "completions/mean_length": 5740.796875, + "completions/mean_terminated_length": 5656.9921875, + "completions/min_length": 731.0, + "completions/min_terminated_length": 731.0, + "entropy": 0.9311753436923027, + "epoch": 0.48206071757129715, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002236112719401717, + "learning_rate": 1e-05, + "loss": 0.1033, + "num_tokens": 441020904.0, + "reward": 0.5078125, + "reward_std": 0.34353315830230713, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999980628490448, + "sampling/importance_sampling_ratio/min": 0.09267321974039078, + "sampling/sampling_logp_difference/max": 2.378675699234009, + "sampling/sampling_logp_difference/mean": 0.018967337906360626, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.9387059296605003e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9387059296605003e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15338.0, + "completions/max_terminated_length": 15338.0, + "completions/mean_length": 7279.078125, + "completions/mean_terminated_length": 7279.078125, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "entropy": 1.170717716217041, + "epoch": 0.48298068077276907, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0011770959245041013, + "learning_rate": 1e-05, + "loss": 0.0173, + "num_tokens": 441970986.0, + "reward": 0.3515625, + "reward_std": 0.2382800281047821, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999333620071411, + "sampling/importance_sampling_ratio/min": 1.1565300155780278e-05, + "sampling/sampling_logp_difference/max": 11.367501258850098, + "sampling/sampling_logp_difference/mean": 0.02134837955236435, + "step": 525 + }, + { + "clip_ratio/high_max": 1.838239040807821e-05, + "clip_ratio/high_mean": 4.595597602019552e-06, + "clip_ratio/low_mean": 3.5013973274544696e-05, + "clip_ratio/low_min": 4.0234326661447994e-06, + "clip_ratio/region_mean": 3.960957087656425e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15485.0, + "completions/mean_length": 7376.796875, + "completions/mean_terminated_length": 7233.82568359375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 1.0409907028079033, + "epoch": 0.48390064397424104, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002001611515879631, + "learning_rate": 1e-05, + "loss": 0.0362, + "num_tokens": 442936808.0, + "reward": 0.4453125, + "reward_std": 0.33220988512039185, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999234676361084, + "sampling/importance_sampling_ratio/min": 0.003353495616465807, + "sampling/sampling_logp_difference/max": 5.697751998901367, + "sampling/sampling_logp_difference/mean": 0.02169732004404068, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.393580459487566e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.393580459487566e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15883.0, + "completions/mean_length": 6904.515625, + "completions/mean_terminated_length": 6829.8740234375, + "completions/min_length": 1159.0, + "completions/min_terminated_length": 1159.0, + "entropy": 0.9905650988221169, + "epoch": 0.48482060717571296, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0023104713764041662, + "learning_rate": 1e-05, + "loss": 0.021, + "num_tokens": 443843010.0, + "reward": 0.3515625, + "reward_std": 0.226732075214386, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999308586120605, + "sampling/importance_sampling_ratio/min": 0.0020711510442197323, + "sampling/sampling_logp_difference/max": 6.179650783538818, + "sampling/sampling_logp_difference/mean": 0.020169749855995178, + "step": 527 + }, + { + "clip_ratio/high_max": 3.274137043263181e-06, + "clip_ratio/high_mean": 8.185342608157953e-07, + "clip_ratio/low_mean": 3.806211361734313e-05, + "clip_ratio/low_min": 4.1808816604316235e-06, + "clip_ratio/region_mean": 3.8880647935002344e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15035.0, + "completions/max_terminated_length": 15035.0, + "completions/mean_length": 6611.21875, + "completions/mean_terminated_length": 6611.21875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.8890361413359642, + "epoch": 0.48574057037718493, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0032739758025854826, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 444709854.0, + "reward": 0.4140625, + "reward_std": 0.30327799916267395, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999891996383667, + "sampling/importance_sampling_ratio/min": 0.00029604812152683735, + "sampling/sampling_logp_difference/max": 8.124988555908203, + "sampling/sampling_logp_difference/mean": 0.018246350809931755, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.133989605430543e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.133989605430543e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15999.0, + "completions/mean_length": 6928.296875, + "completions/mean_terminated_length": 6853.84228515625, + "completions/min_length": 772.0, + "completions/min_terminated_length": 772.0, + "entropy": 0.9614408612251282, + "epoch": 0.48666053357865685, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018802061676979065, + "learning_rate": 1e-05, + "loss": 0.0528, + "num_tokens": 445614284.0, + "reward": 0.4921875, + "reward_std": 0.2619747221469879, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999129176139832, + "sampling/importance_sampling_ratio/min": 0.02033112570643425, + "sampling/sampling_logp_difference/max": 3.895602226257324, + "sampling/sampling_logp_difference/mean": 0.019618764519691467, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.9743174675568298e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9743174675568298e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16194.0, + "completions/mean_length": 7946.8671875, + "completions/mean_terminated_length": 7812.94482421875, + "completions/min_length": 540.0, + "completions/min_terminated_length": 540.0, + "entropy": 0.9987246319651604, + "epoch": 0.48758049678012877, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002164191100746393, + "learning_rate": 1e-05, + "loss": 0.0192, + "num_tokens": 446649731.0, + "reward": 0.453125, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999844431877136, + "sampling/importance_sampling_ratio/min": 0.0018519347067922354, + "sampling/sampling_logp_difference/max": 6.291524410247803, + "sampling/sampling_logp_difference/mean": 0.020579926669597626, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.4596658477094024e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4596658477094024e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14446.0, + "completions/mean_length": 6763.53125, + "completions/mean_terminated_length": 6532.64013671875, + "completions/min_length": 834.0, + "completions/min_terminated_length": 834.0, + "entropy": 0.9593042582273483, + "epoch": 0.48850045998160074, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002090689493343234, + "learning_rate": 1e-05, + "loss": 0.0375, + "num_tokens": 447536311.0, + "reward": 0.3515625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999126195907593, + "sampling/importance_sampling_ratio/min": 0.014640630222856998, + "sampling/sampling_logp_difference/max": 4.223954677581787, + "sampling/sampling_logp_difference/mean": 0.019683964550495148, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.527509309402376e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.527509309402376e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15805.0, + "completions/mean_length": 7394.40625, + "completions/mean_terminated_length": 7323.6220703125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 1.0184528306126595, + "epoch": 0.48942042318307266, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.002562359906733036, + "learning_rate": 1e-05, + "loss": 0.0288, + "num_tokens": 448505707.0, + "reward": 0.2578125, + "reward_std": 0.17123225331306458, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560713768005, + "sampling/importance_sampling_ratio/min": 0.0002687747764866799, + "sampling/sampling_logp_difference/max": 8.221636772155762, + "sampling/sampling_logp_difference/mean": 0.020989736542105675, + "step": 532 + }, + { + "clip_ratio/high_max": 4.772085048898589e-06, + "clip_ratio/high_mean": 1.1930212622246472e-06, + "clip_ratio/low_mean": 2.0207754744205886e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.140077623380421e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16360.0, + "completions/mean_length": 7196.328125, + "completions/mean_terminated_length": 6822.84521484375, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "entropy": 1.0106298848986626, + "epoch": 0.49034038638454464, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017445285338908434, + "learning_rate": 1e-05, + "loss": 0.0153, + "num_tokens": 449443709.0, + "reward": 0.296875, + "reward_std": 0.21436558663845062, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999077320098877, + "sampling/importance_sampling_ratio/min": 0.0012854337692260742, + "sampling/sampling_logp_difference/max": 6.656659126281738, + "sampling/sampling_logp_difference/mean": 0.021059826016426086, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.0835892605173285e-05, + "clip_ratio/low_min": 3.619411700128694e-06, + "clip_ratio/region_mean": 4.0835892605173285e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16118.0, + "completions/mean_length": 7418.3515625, + "completions/mean_terminated_length": 7203.17626953125, + "completions/min_length": 1445.0, + "completions/min_terminated_length": 1445.0, + "entropy": 1.002836562693119, + "epoch": 0.49126034958601655, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0015701872762292624, + "learning_rate": 1e-05, + "loss": 0.0602, + "num_tokens": 450412866.0, + "reward": 0.328125, + "reward_std": 0.2987973093986511, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999527335166931, + "sampling/importance_sampling_ratio/min": 8.191307279048488e-05, + "sampling/sampling_logp_difference/max": 9.409852027893066, + "sampling/sampling_logp_difference/mean": 0.020907817408442497, + "step": 534 + }, + { + "clip_ratio/high_max": 1.0691738907553372e-05, + "clip_ratio/high_mean": 4.761823504395579e-06, + "clip_ratio/low_mean": 9.472978547364619e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.949160914857202e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14089.0, + "completions/mean_length": 7007.109375, + "completions/mean_terminated_length": 6782.064453125, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "entropy": 0.9748141467571259, + "epoch": 0.4921803127874885, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003912154585123062, + "learning_rate": 1e-05, + "loss": 0.055, + "num_tokens": 451331560.0, + "reward": 0.453125, + "reward_std": 0.25354722142219543, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994460344314575, + "sampling/importance_sampling_ratio/min": 1.125945416902141e-07, + "sampling/sampling_logp_difference/max": 15.999472618103027, + "sampling/sampling_logp_difference/mean": 0.026503996923565865, + "step": 535 + }, + { + "clip_ratio/high_max": 1.5173390238487627e-05, + "clip_ratio/high_mean": 3.793347559621907e-06, + "clip_ratio/low_mean": 3.870478303724667e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.249813082424225e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15336.0, + "completions/mean_length": 6605.5, + "completions/mean_terminated_length": 6290.064453125, + "completions/min_length": 581.0, + "completions/min_terminated_length": 581.0, + "entropy": 0.9742915332317352, + "epoch": 0.49310027598896045, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0029959778767079115, + "learning_rate": 1e-05, + "loss": 0.0195, + "num_tokens": 452197568.0, + "reward": 0.46875, + "reward_std": 0.3180162310600281, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998653531074524, + "sampling/importance_sampling_ratio/min": 0.0005176665727049112, + "sampling/sampling_logp_difference/max": 7.566179275512695, + "sampling/sampling_logp_difference/mean": 0.019547434523701668, + "step": 536 + }, + { + "clip_ratio/high_max": 4.233987056068145e-06, + "clip_ratio/high_mean": 1.0584967640170362e-06, + "clip_ratio/low_mean": 3.348358245602867e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.454207922004571e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16158.0, + "completions/mean_length": 6091.828125, + "completions/mean_terminated_length": 6010.78759765625, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "entropy": 0.9893068373203278, + "epoch": 0.49402023919043236, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0027553467079997063, + "learning_rate": 1e-05, + "loss": 0.064, + "num_tokens": 452995762.0, + "reward": 0.3671875, + "reward_std": 0.22437798976898193, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000625848770142, + "sampling/importance_sampling_ratio/min": 1.8432530168865924e-08, + "sampling/sampling_logp_difference/max": 17.80914878845215, + "sampling/sampling_logp_difference/mean": 0.02093922719359398, + "step": 537 + }, + { + "clip_ratio/high_max": 2.9927550940556102e-05, + "clip_ratio/high_mean": 7.481887735139026e-06, + "clip_ratio/low_mean": 5.346296995867306e-05, + "clip_ratio/low_min": 5.110593065182911e-06, + "clip_ratio/region_mean": 6.094485820540285e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16080.0, + "completions/mean_length": 6864.578125, + "completions/mean_terminated_length": 6789.6220703125, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 1.005393773317337, + "epoch": 0.49494020239190434, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002985693048685789, + "learning_rate": 1e-05, + "loss": 0.0199, + "num_tokens": 453896300.0, + "reward": 0.3828125, + "reward_std": 0.2869499623775482, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999870777130127, + "sampling/importance_sampling_ratio/min": 1.8929262296296656e-05, + "sampling/sampling_logp_difference/max": 10.874801635742188, + "sampling/sampling_logp_difference/mean": 0.019800683483481407, + "step": 538 + }, + { + "clip_ratio/high_max": 1.2092638826288749e-05, + "clip_ratio/high_mean": 4.037869075546041e-06, + "clip_ratio/low_mean": 2.9533587621699553e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3571456697245594e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14651.0, + "completions/max_terminated_length": 14651.0, + "completions/mean_length": 5828.125, + "completions/mean_terminated_length": 5828.125, + "completions/min_length": 506.0, + "completions/min_terminated_length": 506.0, + "entropy": 0.909324087202549, + "epoch": 0.49586016559337626, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003145795315504074, + "learning_rate": 1e-05, + "loss": 0.0539, + "num_tokens": 454661564.0, + "reward": 0.359375, + "reward_std": 0.24670752882957458, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999277591705322, + "sampling/importance_sampling_ratio/min": 5.3384183047455736e-06, + "sampling/sampling_logp_difference/max": 12.140581130981445, + "sampling/sampling_logp_difference/mean": 0.019065624102950096, + "step": 539 + }, + { + "clip_ratio/high_max": 2.344680183341552e-05, + "clip_ratio/high_mean": 5.86170045835388e-06, + "clip_ratio/low_mean": 4.5576647153211525e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.143834823684301e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15274.0, + "completions/mean_length": 6213.4140625, + "completions/mean_terminated_length": 6051.9765625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.9570266529917717, + "epoch": 0.49678012879484823, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0026711132377386093, + "learning_rate": 1e-05, + "loss": 0.116, + "num_tokens": 455477577.0, + "reward": 0.4296875, + "reward_std": 0.28930407762527466, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000125169754028, + "sampling/importance_sampling_ratio/min": 0.00041241716826334596, + "sampling/sampling_logp_difference/max": 7.793475151062012, + "sampling/sampling_logp_difference/mean": 0.01995767280459404, + "step": 540 + }, + { + "clip_ratio/high_max": 1.5261470707628177e-05, + "clip_ratio/high_mean": 3.815367676907044e-06, + "clip_ratio/low_mean": 3.6731302770931507e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.054667033415171e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15132.0, + "completions/mean_length": 7125.4140625, + "completions/mean_terminated_length": 7052.51171875, + "completions/min_length": 1374.0, + "completions/min_terminated_length": 1374.0, + "entropy": 0.9259644895792007, + "epoch": 0.49770009199632015, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030442574061453342, + "learning_rate": 1e-05, + "loss": 0.1227, + "num_tokens": 456408966.0, + "reward": 0.484375, + "reward_std": 0.3816363215446472, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999846816062927, + "sampling/importance_sampling_ratio/min": 0.00023056140344124287, + "sampling/sampling_logp_difference/max": 8.374993324279785, + "sampling/sampling_logp_difference/mean": 0.020200349390506744, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.665321148422663e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.665321148422663e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15909.0, + "completions/mean_length": 6472.1640625, + "completions/mean_terminated_length": 6314.83349609375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.8606229647994041, + "epoch": 0.49862005519779207, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002203581389039755, + "learning_rate": 1e-05, + "loss": 0.0566, + "num_tokens": 457257011.0, + "reward": 0.453125, + "reward_std": 0.26303553581237793, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998785853385925, + "sampling/importance_sampling_ratio/min": 8.579161658417434e-05, + "sampling/sampling_logp_difference/max": 9.3635892868042, + "sampling/sampling_logp_difference/mean": 0.018575064837932587, + "step": 542 + }, + { + "clip_ratio/high_max": 1.1763763723138254e-05, + "clip_ratio/high_mean": 2.9409409307845635e-06, + "clip_ratio/low_mean": 2.8100045369683357e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.104098641415476e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16072.0, + "completions/max_terminated_length": 16072.0, + "completions/mean_length": 7154.0, + "completions/mean_terminated_length": 7154.0, + "completions/min_length": 920.0, + "completions/min_terminated_length": 920.0, + "entropy": 0.977513425052166, + "epoch": 0.49954001839926404, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.001689116470515728, + "learning_rate": 1e-05, + "loss": 0.0327, + "num_tokens": 458196355.0, + "reward": 0.40625, + "reward_std": 0.18543371558189392, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999965250492096, + "sampling/importance_sampling_ratio/min": 0.00029606535099446774, + "sampling/sampling_logp_difference/max": 8.124930381774902, + "sampling/sampling_logp_difference/mean": 0.0198836512863636, + "step": 543 + }, + { + "clip_ratio/high_max": 1.1758888149415725e-05, + "clip_ratio/high_mean": 2.9397220373539312e-06, + "clip_ratio/low_mean": 4.075526112501393e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.369498378764547e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16003.0, + "completions/mean_length": 6878.7265625, + "completions/mean_terminated_length": 6727.849609375, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "entropy": 0.9291028156876564, + "epoch": 0.500459981600736, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001968112075701356, + "learning_rate": 1e-05, + "loss": 0.0448, + "num_tokens": 459095320.0, + "reward": 0.4609375, + "reward_std": 0.30274122953414917, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999819993972778, + "sampling/importance_sampling_ratio/min": 0.00014571755309589207, + "sampling/sampling_logp_difference/max": 8.833840370178223, + "sampling/sampling_logp_difference/mean": 0.019927173852920532, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.1461796147123096e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.1461796147123096e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15599.0, + "completions/mean_length": 7187.96875, + "completions/mean_terminated_length": 7042.00048828125, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "entropy": 1.1720879971981049, + "epoch": 0.5013799448022079, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002588641829788685, + "learning_rate": 1e-05, + "loss": 0.0236, + "num_tokens": 460042660.0, + "reward": 0.2265625, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998769760131836, + "sampling/importance_sampling_ratio/min": 4.738242012081173e-07, + "sampling/sampling_logp_difference/max": 14.562429428100586, + "sampling/sampling_logp_difference/mean": 0.021826796233654022, + "step": 545 + }, + { + "clip_ratio/high_max": 1.55452166836767e-05, + "clip_ratio/high_mean": 3.886304170919175e-06, + "clip_ratio/low_mean": 4.735719005566352e-05, + "clip_ratio/low_min": 4.235134838381782e-06, + "clip_ratio/region_mean": 5.1243494908703724e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16019.0, + "completions/mean_length": 6278.078125, + "completions/mean_terminated_length": 6035.5361328125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.8143310993909836, + "epoch": 0.5022999080036799, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002047745743766427, + "learning_rate": 1e-05, + "loss": 0.064, + "num_tokens": 460864862.0, + "reward": 0.625, + "reward_std": 0.31694266200065613, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999461770057678, + "sampling/importance_sampling_ratio/min": 0.011549573391675949, + "sampling/sampling_logp_difference/max": 4.461106777191162, + "sampling/sampling_logp_difference/mean": 0.017143042758107185, + "step": 546 + }, + { + "clip_ratio/high_max": 2.9079910746077076e-06, + "clip_ratio/high_mean": 7.269977686519269e-07, + "clip_ratio/low_mean": 6.497366200619581e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.224363969271508e-06, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13575.0, + "completions/mean_length": 5664.8828125, + "completions/mean_terminated_length": 5494.73828125, + "completions/min_length": 777.0, + "completions/min_terminated_length": 777.0, + "entropy": 0.9489249512553215, + "epoch": 0.5032198712051518, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002950560301542282, + "learning_rate": 1e-05, + "loss": 0.0867, + "num_tokens": 461608471.0, + "reward": 0.625, + "reward_std": 0.2585597634315491, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999043345451355, + "sampling/importance_sampling_ratio/min": 1.6701715139788575e-05, + "sampling/sampling_logp_difference/max": 10.999999046325684, + "sampling/sampling_logp_difference/mean": 0.019181005656719208, + "step": 547 + }, + { + "clip_ratio/high_max": 1.2411757779773325e-05, + "clip_ratio/high_mean": 3.102939444943331e-06, + "clip_ratio/low_mean": 2.458288531670405e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7685824761647382e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16223.0, + "completions/mean_length": 6914.4375, + "completions/mean_terminated_length": 6839.8740234375, + "completions/min_length": 671.0, + "completions/min_terminated_length": 671.0, + "entropy": 0.9416745603084564, + "epoch": 0.5041398344066237, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0013792186509817839, + "learning_rate": 1e-05, + "loss": 0.0112, + "num_tokens": 462511519.0, + "reward": 0.3671875, + "reward_std": 0.19674429297447205, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999217391014099, + "sampling/importance_sampling_ratio/min": 4.006533345091157e-05, + "sampling/sampling_logp_difference/max": 10.124999046325684, + "sampling/sampling_logp_difference/mean": 0.01967109739780426, + "step": 548 + }, + { + "clip_ratio/high_max": 7.5066598128614714e-06, + "clip_ratio/high_mean": 1.8766649532153679e-06, + "clip_ratio/low_mean": 3.393825062403266e-05, + "clip_ratio/low_min": 3.3629271456447896e-06, + "clip_ratio/region_mean": 3.581491563409145e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16336.0, + "completions/mean_length": 7343.296875, + "completions/mean_terminated_length": 7051.6611328125, + "completions/min_length": 564.0, + "completions/min_terminated_length": 564.0, + "entropy": 0.845381110906601, + "epoch": 0.5050597976080957, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0028722358401864767, + "learning_rate": 1e-05, + "loss": 0.0562, + "num_tokens": 463472581.0, + "reward": 0.3984375, + "reward_std": 0.2880156934261322, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999880194664001, + "sampling/importance_sampling_ratio/min": 1.5694884496042505e-05, + "sampling/sampling_logp_difference/max": 11.062175750732422, + "sampling/sampling_logp_difference/mean": 0.018903033807873726, + "step": 549 + }, + { + "clip_ratio/high_max": 1.6802483287392533e-05, + "clip_ratio/high_mean": 5.505368051217374e-06, + "clip_ratio/low_mean": 2.8057194754183e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.356256252118328e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13716.0, + "completions/mean_length": 6022.4375, + "completions/mean_terminated_length": 5940.8505859375, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "entropy": 0.9279188066720963, + "epoch": 0.5059797608095676, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002812078921124339, + "learning_rate": 1e-05, + "loss": 0.0074, + "num_tokens": 464263709.0, + "reward": 0.421875, + "reward_std": 0.26120057702064514, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000264644622803, + "sampling/importance_sampling_ratio/min": 0.0008089813054539263, + "sampling/sampling_logp_difference/max": 7.119734764099121, + "sampling/sampling_logp_difference/mean": 0.01863965392112732, + "step": 550 + }, + { + "clip_ratio/high_max": 1.799457299966889e-05, + "clip_ratio/high_mean": 5.5325897960756265e-06, + "clip_ratio/low_mean": 3.587696073736879e-05, + "clip_ratio/low_min": 2.965106659758021e-06, + "clip_ratio/region_mean": 4.140955002185365e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16333.0, + "completions/mean_length": 6888.6328125, + "completions/mean_terminated_length": 6813.8662109375, + "completions/min_length": 647.0, + "completions/min_terminated_length": 647.0, + "entropy": 1.0720202773809433, + "epoch": 0.5068997240110396, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001776764984242618, + "learning_rate": 1e-05, + "loss": 0.0607, + "num_tokens": 465167502.0, + "reward": 0.3203125, + "reward_std": 0.2961437702178955, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999945342540741, + "sampling/importance_sampling_ratio/min": 0.0013267829781398177, + "sampling/sampling_logp_difference/max": 6.624998092651367, + "sampling/sampling_logp_difference/mean": 0.02100517973303795, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.568914848983695e-05, + "clip_ratio/low_min": 3.652834493550472e-06, + "clip_ratio/region_mean": 3.568914848983695e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14283.0, + "completions/mean_length": 6626.7578125, + "completions/mean_terminated_length": 6549.92919921875, + "completions/min_length": 940.0, + "completions/min_terminated_length": 940.0, + "entropy": 0.9632527679204941, + "epoch": 0.5078196872125115, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0016460138140246272, + "learning_rate": 1e-05, + "loss": 0.0554, + "num_tokens": 466034535.0, + "reward": 0.5, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000152587890625, + "sampling/importance_sampling_ratio/min": 0.0002774179738480598, + "sampling/sampling_logp_difference/max": 8.189985275268555, + "sampling/sampling_logp_difference/mean": 0.020494937896728516, + "step": 552 + }, + { + "clip_ratio/high_max": 9.810846677282825e-06, + "clip_ratio/high_mean": 2.4527116693207063e-06, + "clip_ratio/low_mean": 2.4154636378170835e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.660734804749154e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16169.0, + "completions/max_terminated_length": 16169.0, + "completions/mean_length": 6685.484375, + "completions/mean_terminated_length": 6685.484375, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "entropy": 0.9092860966920853, + "epoch": 0.5087396504139834, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0019802958704531193, + "learning_rate": 1e-05, + "loss": 0.0196, + "num_tokens": 466911965.0, + "reward": 0.4609375, + "reward_std": 0.2409384697675705, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999974250793457, + "sampling/importance_sampling_ratio/min": 0.0020434472244232893, + "sampling/sampling_logp_difference/max": 6.193117141723633, + "sampling/sampling_logp_difference/mean": 0.02000512182712555, + "step": 553 + }, + { + "clip_ratio/high_max": 3.24397274198418e-06, + "clip_ratio/high_mean": 8.10993185496045e-07, + "clip_ratio/low_mean": 2.4120176362885104e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.493116954838115e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + "completions/mean_length": 7105.1171875, + "completions/mean_terminated_length": 7032.05517578125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 1.046683594584465, + "epoch": 0.5096596136154554, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.002490658313035965, + "learning_rate": 1e-05, + "loss": 0.0077, + "num_tokens": 467844820.0, + "reward": 0.2578125, + "reward_std": 0.17123225331306458, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999809265136719, + "sampling/importance_sampling_ratio/min": 7.140394586713228e-07, + "sampling/sampling_logp_difference/max": 14.152327537536621, + "sampling/sampling_logp_difference/mean": 0.020726388320326805, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.0303147582344536e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0303147582344536e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15969.0, + "completions/max_terminated_length": 15969.0, + "completions/mean_length": 6806.5546875, + "completions/mean_terminated_length": 6806.5546875, + "completions/min_length": 605.0, + "completions/min_terminated_length": 605.0, + "entropy": 0.9514358267188072, + "epoch": 0.5105795768169273, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002886313945055008, + "learning_rate": 1e-05, + "loss": 0.0331, + "num_tokens": 468732451.0, + "reward": 0.3203125, + "reward_std": 0.23250603675842285, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999695420265198, + "sampling/importance_sampling_ratio/min": 3.148883251924417e-06, + "sampling/sampling_logp_difference/max": 12.668462753295898, + "sampling/sampling_logp_difference/mean": 0.019308820366859436, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.485187078742456e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.485187078742456e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16075.0, + "completions/mean_length": 6238.546875, + "completions/mean_terminated_length": 5995.05615234375, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "entropy": 0.9408878460526466, + "epoch": 0.5114995400183993, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002731110667809844, + "learning_rate": 1e-05, + "loss": 0.0622, + "num_tokens": 469551145.0, + "reward": 0.3671875, + "reward_std": 0.3237774670124054, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999334812164307, + "sampling/importance_sampling_ratio/min": 0.000488168589072302, + "sampling/sampling_logp_difference/max": 7.624849796295166, + "sampling/sampling_logp_difference/mean": 0.01883235014975071, + "step": 556 + }, + { + "clip_ratio/high_max": 3.5477096389513463e-06, + "clip_ratio/high_mean": 8.869274097378366e-07, + "clip_ratio/low_mean": 2.5422534008612274e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.630946141835011e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16342.0, + "completions/mean_length": 7354.5, + "completions/mean_terminated_length": 7283.4013671875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.9548593312501907, + "epoch": 0.5124195032198712, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022831051610410213, + "learning_rate": 1e-05, + "loss": 0.004, + "num_tokens": 470510305.0, + "reward": 0.4609375, + "reward_std": 0.28247418999671936, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999933123588562, + "sampling/importance_sampling_ratio/min": 0.00029948100564070046, + "sampling/sampling_logp_difference/max": 8.113459587097168, + "sampling/sampling_logp_difference/mean": 0.020626772195100784, + "step": 557 + }, + { + "clip_ratio/high_max": 1.0478707963557099e-05, + "clip_ratio/high_mean": 2.6196769908892747e-06, + "clip_ratio/low_mean": 4.646405352559668e-05, + "clip_ratio/low_min": 9.308073458669242e-06, + "clip_ratio/region_mean": 4.908373023226886e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16222.0, + "completions/mean_length": 7481.421875, + "completions/mean_terminated_length": 7119.5283203125, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 0.9302244186401367, + "epoch": 0.5133394664213431, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015396618982777, + "learning_rate": 1e-05, + "loss": 0.0944, + "num_tokens": 471486799.0, + "reward": 0.34375, + "reward_std": 0.26538968086242676, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999397993087769, + "sampling/importance_sampling_ratio/min": 0.0004175819631200284, + "sampling/sampling_logp_difference/max": 7.78102970123291, + "sampling/sampling_logp_difference/mean": 0.019920824095606804, + "step": 558 + }, + { + "clip_ratio/high_max": 1.2743131946990616e-05, + "clip_ratio/high_mean": 3.185782986747654e-06, + "clip_ratio/low_mean": 3.139938735330361e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.458517039689468e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14634.0, + "completions/mean_length": 7333.9375, + "completions/mean_terminated_length": 7042.0, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 1.0290198475122452, + "epoch": 0.5142594296228151, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002122553065419197, + "learning_rate": 1e-05, + "loss": 0.0653, + "num_tokens": 472443991.0, + "reward": 0.359375, + "reward_std": 0.23356688022613525, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000252723693848, + "sampling/importance_sampling_ratio/min": 0.00011467799777165055, + "sampling/sampling_logp_difference/max": 9.073382377624512, + "sampling/sampling_logp_difference/mean": 0.020558707416057587, + "step": 559 + }, + { + "clip_ratio/high_max": 2.856805417650321e-05, + "clip_ratio/high_mean": 7.142013544125803e-06, + "clip_ratio/low_mean": 4.716298451512557e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.430499885505924e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16248.0, + "completions/mean_length": 6908.953125, + "completions/mean_terminated_length": 6681.55224609375, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "entropy": 0.9942271336913109, + "epoch": 0.515179392824287, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017197602428495884, + "learning_rate": 1e-05, + "loss": 0.1309, + "num_tokens": 473346577.0, + "reward": 0.421875, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999131560325623, + "sampling/importance_sampling_ratio/min": 0.00016969948774203658, + "sampling/sampling_logp_difference/max": 8.68148136138916, + "sampling/sampling_logp_difference/mean": 0.019906114786863327, + "step": 560 + }, + { + "clip_ratio/high_max": 2.4387230496358825e-05, + "clip_ratio/high_mean": 7.2725478048596415e-06, + "clip_ratio/low_mean": 3.3024165190909116e-05, + "clip_ratio/low_min": 2.9529187486332376e-06, + "clip_ratio/region_mean": 4.029671254102141e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 7543.046875, + "completions/mean_terminated_length": 7183.658203125, + "completions/min_length": 894.0, + "completions/min_terminated_length": 894.0, + "entropy": 0.973315916955471, + "epoch": 0.516099356025759, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001618197187781334, + "learning_rate": 1e-05, + "loss": 0.0434, + "num_tokens": 474330663.0, + "reward": 0.28125, + "reward_std": 0.28353503346443176, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999313950538635, + "sampling/importance_sampling_ratio/min": 2.1410157557966158e-07, + "sampling/sampling_logp_difference/max": 15.356815338134766, + "sampling/sampling_logp_difference/mean": 0.019991599023342133, + "step": 561 + }, + { + "clip_ratio/high_max": 1.8185269482273725e-05, + "clip_ratio/high_mean": 4.546317370568431e-06, + "clip_ratio/low_mean": 5.2758662491214636e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.7304980941808026e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15773.0, + "completions/mean_length": 7136.375, + "completions/mean_terminated_length": 6838.064453125, + "completions/min_length": 829.0, + "completions/min_terminated_length": 829.0, + "entropy": 0.8573452606797218, + "epoch": 0.5170193192272309, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025291196070611477, + "learning_rate": 1e-05, + "loss": 0.0188, + "num_tokens": 475262071.0, + "reward": 0.453125, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999455213546753, + "sampling/importance_sampling_ratio/min": 5.8296889619668946e-05, + "sampling/sampling_logp_difference/max": 9.749961853027344, + "sampling/sampling_logp_difference/mean": 0.018726464360952377, + "step": 562 + }, + { + "clip_ratio/high_max": 1.9233400280427304e-05, + "clip_ratio/high_mean": 4.808350070106826e-06, + "clip_ratio/low_mean": 4.3801222432193754e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.860957244545716e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16283.0, + "completions/mean_length": 6538.765625, + "completions/mean_terminated_length": 6138.552734375, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.8312613591551781, + "epoch": 0.5179392824287029, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0018195402808487415, + "learning_rate": 1e-05, + "loss": 0.1266, + "num_tokens": 476119385.0, + "reward": 0.5078125, + "reward_std": 0.3674348294734955, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999343156814575, + "sampling/importance_sampling_ratio/min": 0.005875314120203257, + "sampling/sampling_logp_difference/max": 5.136995792388916, + "sampling/sampling_logp_difference/mean": 0.018957480788230896, + "step": 563 + }, + { + "clip_ratio/high_max": 1.4299099348136224e-05, + "clip_ratio/high_mean": 3.574774837034056e-06, + "clip_ratio/low_mean": 2.9377598366409075e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.295237320344313e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 6692.078125, + "completions/mean_terminated_length": 5870.72900390625, + "completions/min_length": 938.0, + "completions/min_terminated_length": 938.0, + "entropy": 0.943247564136982, + "epoch": 0.5188592456301748, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001623075339011848, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 476995139.0, + "reward": 0.53125, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999014139175415, + "sampling/importance_sampling_ratio/min": 0.0003255821648053825, + "sampling/sampling_logp_difference/max": 8.029895782470703, + "sampling/sampling_logp_difference/mean": 0.019327864050865173, + "step": 564 + }, + { + "clip_ratio/high_max": 2.547848680478637e-06, + "clip_ratio/high_mean": 6.369621701196593e-07, + "clip_ratio/low_mean": 5.479312403622316e-05, + "clip_ratio/low_min": 8.624037718618638e-06, + "clip_ratio/region_mean": 5.543008592212573e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15957.0, + "completions/mean_length": 7118.40625, + "completions/mean_terminated_length": 6896.0322265625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 1.051003873348236, + "epoch": 0.5197792088316467, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034032040275633335, + "learning_rate": 1e-05, + "loss": 0.0542, + "num_tokens": 477926583.0, + "reward": 0.359375, + "reward_std": 0.30115145444869995, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000249147415161, + "sampling/importance_sampling_ratio/min": 0.00037551531568169594, + "sampling/sampling_logp_difference/max": 7.887211322784424, + "sampling/sampling_logp_difference/mean": 0.021631836891174316, + "step": 565 + }, + { + "clip_ratio/high_max": 3.823331553576281e-06, + "clip_ratio/high_mean": 9.558328883940703e-07, + "clip_ratio/low_mean": 1.506989860899921e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.602573161108012e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15673.0, + "completions/mean_length": 7555.8515625, + "completions/mean_terminated_length": 7415.72265625, + "completions/min_length": 446.0, + "completions/min_terminated_length": 446.0, + "entropy": 0.9771487265825272, + "epoch": 0.5206991720331187, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0014035169733688235, + "learning_rate": 1e-05, + "loss": 0.0089, + "num_tokens": 478914724.0, + "reward": 0.1875, + "reward_std": 0.19673939049243927, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999145865440369, + "sampling/importance_sampling_ratio/min": 0.0017069041496142745, + "sampling/sampling_logp_difference/max": 6.373074054718018, + "sampling/sampling_logp_difference/mean": 0.020011281594634056, + "step": 566 + }, + { + "clip_ratio/high_max": 4.262138645572122e-06, + "clip_ratio/high_mean": 2.0894199224130716e-06, + "clip_ratio/low_mean": 2.9273888458192232e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1363308380605304e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15915.0, + "completions/mean_length": 6505.671875, + "completions/mean_terminated_length": 6019.85205078125, + "completions/min_length": 638.0, + "completions/min_terminated_length": 638.0, + "entropy": 0.9913810566067696, + "epoch": 0.5216191352345906, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0012457151897251606, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 479766874.0, + "reward": 0.3984375, + "reward_std": 0.16781240701675415, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999585151672363, + "sampling/importance_sampling_ratio/min": 5.239284206481898e-08, + "sampling/sampling_logp_difference/max": 16.764495849609375, + "sampling/sampling_logp_difference/mean": 0.01945749670267105, + "step": 567 + }, + { + "clip_ratio/high_max": 4.419772267283406e-06, + "clip_ratio/high_mean": 1.1049430668208515e-06, + "clip_ratio/low_mean": 3.3968740126510966e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.507368319333182e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15589.0, + "completions/max_terminated_length": 15589.0, + "completions/mean_length": 6709.96875, + "completions/mean_terminated_length": 6709.96875, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "entropy": 1.053658738732338, + "epoch": 0.5225390984360626, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002912909025326371, + "learning_rate": 1e-05, + "loss": 0.0336, + "num_tokens": 480644782.0, + "reward": 0.4140625, + "reward_std": 0.2041109800338745, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000216960906982, + "sampling/importance_sampling_ratio/min": 0.00010272916551912203, + "sampling/sampling_logp_difference/max": 9.183414459228516, + "sampling/sampling_logp_difference/mean": 0.020628605037927628, + "step": 568 + }, + { + "clip_ratio/high_max": 1.5635781892342493e-05, + "clip_ratio/high_mean": 5.148336185811786e-06, + "clip_ratio/low_mean": 7.926051148388069e-05, + "clip_ratio/low_min": 9.047379990079207e-06, + "clip_ratio/region_mean": 8.440884812443983e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15677.0, + "completions/max_terminated_length": 15677.0, + "completions/mean_length": 6712.8515625, + "completions/mean_terminated_length": 6712.8515625, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "entropy": 0.9288468211889267, + "epoch": 0.5234590616375345, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0028935675509274006, + "learning_rate": 1e-05, + "loss": 0.0293, + "num_tokens": 481525875.0, + "reward": 0.328125, + "reward_std": 0.33797892928123474, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999656677246094, + "sampling/importance_sampling_ratio/min": 0.0003157128521706909, + "sampling/sampling_logp_difference/max": 8.060677528381348, + "sampling/sampling_logp_difference/mean": 0.0201251357793808, + "step": 569 + }, + { + "clip_ratio/high_max": 1.1007121202055714e-05, + "clip_ratio/high_mean": 2.7517803005139285e-06, + "clip_ratio/low_mean": 4.98413718332813e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2593152645386e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16247.0, + "completions/mean_length": 7452.125, + "completions/mean_terminated_length": 7164.0, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "entropy": 0.8201636075973511, + "epoch": 0.5243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0014447550056502223, + "learning_rate": 1e-05, + "loss": 0.1068, + "num_tokens": 482498539.0, + "reward": 0.25, + "reward_std": 0.3145885467529297, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999352097511292, + "sampling/importance_sampling_ratio/min": 0.0008213221444748342, + "sampling/sampling_logp_difference/max": 7.104595184326172, + "sampling/sampling_logp_difference/mean": 0.018142810091376305, + "step": 570 + }, + { + "clip_ratio/high_max": 3.4893782867584378e-06, + "clip_ratio/high_mean": 8.723445716896094e-07, + "clip_ratio/low_mean": 2.5241818775612046e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6114163347301655e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16242.0, + "completions/mean_length": 5997.6484375, + "completions/mean_terminated_length": 5915.8662109375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "entropy": 0.9595593363046646, + "epoch": 0.5252989880404784, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0013929647393524647, + "learning_rate": 1e-05, + "loss": -0.0018, + "num_tokens": 483286590.0, + "reward": 0.421875, + "reward_std": 0.2959064245223999, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000566244125366, + "sampling/importance_sampling_ratio/min": 6.860717985546216e-05, + "sampling/sampling_logp_difference/max": 9.587113380432129, + "sampling/sampling_logp_difference/mean": 0.019294174388051033, + "step": 571 + }, + { + "clip_ratio/high_max": 1.2741817272399203e-05, + "clip_ratio/high_mean": 3.1854543180998007e-06, + "clip_ratio/low_mean": 3.2705364901630674e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.589081939026073e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15962.0, + "completions/mean_length": 6706.4140625, + "completions/mean_terminated_length": 6474.15234375, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.9320398196578026, + "epoch": 0.5262189512419503, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020693838596343994, + "learning_rate": 1e-05, + "loss": 0.0713, + "num_tokens": 484164003.0, + "reward": 0.4296875, + "reward_std": 0.30744946002960205, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999852180480957, + "sampling/importance_sampling_ratio/min": 0.011049352586269379, + "sampling/sampling_logp_difference/max": 4.505383491516113, + "sampling/sampling_logp_difference/mean": 0.01968679018318653, + "step": 572 + }, + { + "clip_ratio/high_max": 1.783004472599714e-05, + "clip_ratio/high_mean": 4.457511181499285e-06, + "clip_ratio/low_mean": 2.067615122314237e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5133662290954817e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15636.0, + "completions/mean_length": 5317.96875, + "completions/mean_terminated_length": 5230.83447265625, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "entropy": 0.891069769859314, + "epoch": 0.5271389144434223, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004261080641299486, + "learning_rate": 1e-05, + "loss": 0.0528, + "num_tokens": 484864799.0, + "reward": 0.5234375, + "reward_std": 0.20753079652786255, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999099969863892, + "sampling/importance_sampling_ratio/min": 0.00014285604993347079, + "sampling/sampling_logp_difference/max": 8.853672981262207, + "sampling/sampling_logp_difference/mean": 0.01876065693795681, + "step": 573 + }, + { + "clip_ratio/high_max": 6.954531272640452e-06, + "clip_ratio/high_mean": 1.738632818160113e-06, + "clip_ratio/low_mean": 4.1548010585756856e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.328664340391697e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16336.0, + "completions/mean_length": 6978.7890625, + "completions/mean_terminated_length": 6596.46337890625, + "completions/min_length": 710.0, + "completions/min_terminated_length": 710.0, + "entropy": 0.9322286397218704, + "epoch": 0.5280588776448942, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0013973438180983067, + "learning_rate": 1e-05, + "loss": 0.0396, + "num_tokens": 485779676.0, + "reward": 0.3125, + "reward_std": 0.2675113081932068, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999111890792847, + "sampling/importance_sampling_ratio/min": 0.00024690330610610545, + "sampling/sampling_logp_difference/max": 8.306513786315918, + "sampling/sampling_logp_difference/mean": 0.019345812499523163, + "step": 574 + }, + { + "clip_ratio/high_max": 1.4024310985405464e-05, + "clip_ratio/high_mean": 3.506077746351366e-06, + "clip_ratio/low_mean": 3.8480168882415455e-05, + "clip_ratio/low_min": 8.625057944300352e-06, + "clip_ratio/region_mean": 4.198624606033263e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16178.0, + "completions/mean_length": 6046.4921875, + "completions/mean_terminated_length": 5965.09423828125, + "completions/min_length": 997.0, + "completions/min_terminated_length": 997.0, + "entropy": 1.0245087146759033, + "epoch": 0.5289788408463661, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0015273626195266843, + "learning_rate": 1e-05, + "loss": 0.1013, + "num_tokens": 486574779.0, + "reward": 0.3046875, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998648166656494, + "sampling/importance_sampling_ratio/min": 0.00043810487841255963, + "sampling/sampling_logp_difference/max": 7.7330522537231445, + "sampling/sampling_logp_difference/mean": 0.01977401226758957, + "step": 575 + }, + { + "clip_ratio/high_max": 1.1012245522579178e-05, + "clip_ratio/high_mean": 2.7530613806447946e-06, + "clip_ratio/low_mean": 2.9637111538249883e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.239017382838938e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16086.0, + "completions/max_terminated_length": 16086.0, + "completions/mean_length": 5987.0859375, + "completions/mean_terminated_length": 5987.0859375, + "completions/min_length": 848.0, + "completions/min_terminated_length": 848.0, + "entropy": 0.9373713582754135, + "epoch": 0.5298988040478381, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003076995024457574, + "learning_rate": 1e-05, + "loss": 0.0426, + "num_tokens": 487366590.0, + "reward": 0.4453125, + "reward_std": 0.24830511212348938, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000369548797607, + "sampling/importance_sampling_ratio/min": 0.0004714882234111428, + "sampling/sampling_logp_difference/max": 7.659616470336914, + "sampling/sampling_logp_difference/mean": 0.018766682595014572, + "step": 576 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 487366590, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}