diff --git "a/dapo_milora_plus_20251201_131939/checkpoint-384/trainer_state.json" "b/dapo_milora_plus_20251201_131939/checkpoint-384/trainer_state.json" new file mode 100644--- /dev/null +++ "b/dapo_milora_plus_20251201_131939/checkpoint-384/trainer_state.json" @@ -0,0 +1,11938 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3532658693652254, + "eval_steps": 500, + "global_step": 384, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15689.0, + "completions/max_terminated_length": 15689.0, + "completions/mean_length": 6039.171875, + "completions/mean_terminated_length": 6039.171875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 1.19118632376194, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004114801995456219, + "learning_rate": 1e-05, + "loss": 0.0591, + "num_tokens": 792270.0, + "reward": 0.25, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940395355225, + "sampling/importance_sampling_ratio/min": 0.0002457273658365011, + "sampling/sampling_logp_difference/max": 8.311287879943848, + "sampling/sampling_logp_difference/mean": 0.021642697975039482, + "step": 1 + }, + { + "clip_ratio/high_max": 4.125957275391556e-06, + "clip_ratio/high_mean": 1.031489318847889e-06, + "clip_ratio/low_mean": 5.146006606082665e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.249155537967454e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15112.0, + "completions/max_terminated_length": 15112.0, + "completions/mean_length": 4978.265625, + "completions/mean_terminated_length": 4978.265625, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "entropy": 0.9862165078520775, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004017667844891548, + "learning_rate": 1e-05, + "loss": 0.0407, + "num_tokens": 1452816.0, + "reward": 0.3203125, + "reward_std": 0.30798622965812683, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999986290931702, + "sampling/importance_sampling_ratio/min": 0.00840891432017088, + "sampling/sampling_logp_difference/max": 4.778462886810303, + "sampling/sampling_logp_difference/mean": 0.01879144087433815, + "step": 2 + }, + { + "clip_ratio/high_max": 5.936832167208195e-06, + "clip_ratio/high_mean": 1.4842080418020487e-06, + "clip_ratio/low_mean": 5.7621912446848e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.910612048865005e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16118.0, + "completions/mean_length": 6664.3046875, + "completions/mean_terminated_length": 6587.771484375, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "entropy": 0.9934953600168228, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002359058242291212, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 2324415.0, + "reward": 0.2890625, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000684261322021, + "sampling/importance_sampling_ratio/min": 0.0018158734310418367, + "sampling/sampling_logp_difference/max": 6.311188697814941, + "sampling/sampling_logp_difference/mean": 0.02111843228340149, + "step": 3 + }, + { + "clip_ratio/high_max": 1.0255745564791141e-05, + "clip_ratio/high_mean": 2.5639363911977853e-06, + "clip_ratio/low_mean": 2.648322629283939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9047162797724013e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15915.0, + "completions/mean_length": 5801.203125, + "completions/mean_terminated_length": 5717.8740234375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 1.0870511680841446, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002563449554145336, + "learning_rate": 1e-05, + "loss": 0.0096, + "num_tokens": 3091369.0, + "reward": 0.2734375, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000073790550232, + "sampling/importance_sampling_ratio/min": 0.018811559304594994, + "sampling/sampling_logp_difference/max": 3.9732837677001953, + "sampling/sampling_logp_difference/mean": 0.021363306790590286, + "step": 4 + }, + { + "clip_ratio/high_max": 9.68160156844533e-06, + "clip_ratio/high_mean": 2.4204003921113326e-06, + "clip_ratio/low_mean": 4.577123684157414e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8191637006311794e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15767.0, + "completions/max_terminated_length": 15767.0, + "completions/mean_length": 5696.4140625, + "completions/mean_terminated_length": 5696.4140625, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "entropy": 1.1476548686623573, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025258746463805437, + "learning_rate": 1e-05, + "loss": -0.0344, + "num_tokens": 3841078.0, + "reward": 0.3046875, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999910295009613, + "sampling/importance_sampling_ratio/min": 1.4871986877551535e-06, + "sampling/sampling_logp_difference/max": 13.41861629486084, + "sampling/sampling_logp_difference/mean": 0.020693503320217133, + "step": 5 + }, + { + "clip_ratio/high_max": 3.660332322397153e-05, + "clip_ratio/high_mean": 1.029715701861278e-05, + "clip_ratio/low_mean": 3.895585894042597e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.925301630009926e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14489.0, + "completions/mean_length": 5280.890625, + "completions/mean_terminated_length": 5104.65087890625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 0.8976912423968315, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0050104837864637375, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 4535640.0, + "reward": 0.359375, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998958110809326, + "sampling/importance_sampling_ratio/min": 0.0007187551236711442, + "sampling/sampling_logp_difference/max": 7.237989902496338, + "sampling/sampling_logp_difference/mean": 0.018597707152366638, + "step": 6 + }, + { + "clip_ratio/high_max": 4.484465989662567e-06, + "clip_ratio/high_mean": 1.1211164974156418e-06, + "clip_ratio/low_mean": 2.823482634539687e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9355942729125672e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16166.0, + "completions/mean_length": 6874.9453125, + "completions/mean_terminated_length": 6568.20166015625, + "completions/min_length": 902.0, + "completions/min_terminated_length": 902.0, + "entropy": 1.0286128222942352, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018693821039050817, + "learning_rate": 1e-05, + "loss": 0.0003, + "num_tokens": 5434801.0, + "reward": 0.203125, + "reward_std": 0.21778544783592224, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999491572380066, + "sampling/importance_sampling_ratio/min": 5.279039783090411e-07, + "sampling/sampling_logp_difference/max": 14.454351425170898, + "sampling/sampling_logp_difference/mean": 0.020383886992931366, + "step": 7 + }, + { + "clip_ratio/high_max": 2.5703585606606794e-05, + "clip_ratio/high_mean": 7.537758676789963e-06, + "clip_ratio/low_mean": 5.802649661745818e-05, + "clip_ratio/low_min": 6.0229353948670905e-06, + "clip_ratio/region_mean": 6.556425523740472e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15910.0, + "completions/mean_length": 5270.5234375, + "completions/mean_terminated_length": 5094.119140625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 1.0461085885763168, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005809026304632425, + "learning_rate": 1e-05, + "loss": 0.0602, + "num_tokens": 6128708.0, + "reward": 0.3359375, + "reward_std": 0.37320882081985474, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 8.339863597939257e-06, + "sampling/sampling_logp_difference/max": 11.694463729858398, + "sampling/sampling_logp_difference/mean": 0.02038305625319481, + "step": 8 + }, + { + "clip_ratio/high_max": 3.965832502217381e-05, + "clip_ratio/high_mean": 1.2004183304270555e-05, + "clip_ratio/low_mean": 2.037043998370791e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.237462271954428e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14850.0, + "completions/mean_length": 4524.6796875, + "completions/mean_terminated_length": 4431.29931640625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.8275458142161369, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002717240946367383, + "learning_rate": 1e-05, + "loss": 0.1005, + "num_tokens": 6726587.0, + "reward": 0.4453125, + "reward_std": 0.32325831055641174, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.0002034705103142187, + "sampling/sampling_logp_difference/max": 8.49998950958252, + "sampling/sampling_logp_difference/mean": 0.017633724957704544, + "step": 9 + }, + { + "clip_ratio/high_max": 7.08802053850377e-06, + "clip_ratio/high_mean": 1.7720051346259424e-06, + "clip_ratio/low_mean": 3.394487077912345e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.571687602743623e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15243.0, + "completions/mean_length": 5129.171875, + "completions/mean_terminated_length": 4950.52392578125, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "entropy": 0.7103187441825867, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005626584868878126, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 7400273.0, + "reward": 0.6796875, + "reward_std": 0.379814088344574, + "rewards/accuracy_reward/mean": 0.6796875, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000073909759521, + "sampling/importance_sampling_ratio/min": 0.002478840760886669, + "sampling/sampling_logp_difference/max": 5.999964237213135, + "sampling/sampling_logp_difference/mean": 0.016138140112161636, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.9378599517949624e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9378599517949624e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15482.0, + "completions/max_terminated_length": 15482.0, + "completions/mean_length": 4741.296875, + "completions/mean_terminated_length": 4741.296875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.903806746006012, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0022279289551079273, + "learning_rate": 1e-05, + "loss": 0.0285, + "num_tokens": 8026991.0, + "reward": 0.3203125, + "reward_std": 0.1701665222644806, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + "sampling/importance_sampling_ratio/min": 0.00012343087291810662, + "sampling/sampling_logp_difference/max": 8.999829292297363, + "sampling/sampling_logp_difference/mean": 0.01844138652086258, + "step": 11 + }, + { + "clip_ratio/high_max": 4.8331594371120445e-06, + "clip_ratio/high_mean": 1.93793562175415e-06, + "clip_ratio/low_mean": 4.368338659332949e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.562132153296261e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15569.0, + "completions/mean_length": 5855.5546875, + "completions/mean_terminated_length": 5602.8720703125, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 1.047883652150631, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0038395742885768414, + "learning_rate": 1e-05, + "loss": 0.1298, + "num_tokens": 8797134.0, + "reward": 0.421875, + "reward_std": 0.3503503203392029, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999397397041321, + "sampling/importance_sampling_ratio/min": 0.0007607790757901967, + "sampling/sampling_logp_difference/max": 7.1811676025390625, + "sampling/sampling_logp_difference/mean": 0.02074582129716873, + "step": 12 + }, + { + "clip_ratio/high_max": 3.0723854251846205e-06, + "clip_ratio/high_mean": 7.680963562961551e-07, + "clip_ratio/low_mean": 1.2482652891776524e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.325074924807268e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15365.0, + "completions/mean_length": 6816.6953125, + "completions/mean_terminated_length": 6664.83349609375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 1.1763990670442581, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0011414350010454655, + "learning_rate": 1e-05, + "loss": 0.0197, + "num_tokens": 9691639.0, + "reward": 0.25, + "reward_std": 0.1354655921459198, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998582601547241, + "sampling/importance_sampling_ratio/min": 1.8868423268258994e-08, + "sampling/sampling_logp_difference/max": 17.785776138305664, + "sampling/sampling_logp_difference/mean": 0.021673155948519707, + "step": 13 + }, + { + "clip_ratio/high_max": 1.3825085034113727e-05, + "clip_ratio/high_mean": 3.4562712585284316e-06, + "clip_ratio/low_mean": 2.299899915669812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6455270244696294e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15459.0, + "completions/max_terminated_length": 15459.0, + "completions/mean_length": 5313.53125, + "completions/mean_terminated_length": 5313.53125, + "completions/min_length": 509.0, + "completions/min_terminated_length": 509.0, + "entropy": 1.0474217981100082, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004565369803458452, + "learning_rate": 1e-05, + "loss": 0.0163, + "num_tokens": 10391515.0, + "reward": 0.296875, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998889565467834, + "sampling/importance_sampling_ratio/min": 2.431661960144993e-05, + "sampling/sampling_logp_difference/max": 10.624350547790527, + "sampling/sampling_logp_difference/mean": 0.020862173289060593, + "step": 14 + }, + { + "clip_ratio/high_max": 1.1656098649837077e-05, + "clip_ratio/high_mean": 2.914024662459269e-06, + "clip_ratio/low_mean": 6.22073393969913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.512136405945057e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14724.0, + "completions/mean_length": 4732.578125, + "completions/mean_terminated_length": 4640.83447265625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 1.0815455242991447, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006024828180670738, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 11017781.0, + "reward": 0.25, + "reward_std": 0.2959064245223999, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999477863311768, + "sampling/importance_sampling_ratio/min": 0.00306904804892838, + "sampling/sampling_logp_difference/max": 5.786387920379639, + "sampling/sampling_logp_difference/mean": 0.020809629932045937, + "step": 15 + }, + { + "clip_ratio/high_max": 5.413130111264763e-06, + "clip_ratio/high_mean": 1.3532825278161908e-06, + "clip_ratio/low_mean": 2.816210690070875e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.951538942852494e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15673.0, + "completions/mean_length": 5931.4296875, + "completions/mean_terminated_length": 5849.1259765625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 1.0069087892770767, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036383175756782293, + "learning_rate": 1e-05, + "loss": -0.0001, + "num_tokens": 11794972.0, + "reward": 0.1875, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999954342842102, + "sampling/importance_sampling_ratio/min": 0.00028886934160254896, + "sampling/sampling_logp_difference/max": 8.1495361328125, + "sampling/sampling_logp_difference/mean": 0.019794823601841927, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.421858264118782e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.421858264118782e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14139.0, + "completions/mean_length": 5473.6171875, + "completions/mean_terminated_length": 5387.70849609375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 1.0765233263373375, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004312732256948948, + "learning_rate": 1e-05, + "loss": 0.0478, + "num_tokens": 12517443.0, + "reward": 0.2578125, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999938428401947, + "sampling/importance_sampling_ratio/min": 1.0231680391825648e-07, + "sampling/sampling_logp_difference/max": 16.095191955566406, + "sampling/sampling_logp_difference/mean": 0.020093362778425217, + "step": 17 + }, + { + "clip_ratio/high_max": 2.0872469121968606e-05, + "clip_ratio/high_mean": 5.218117280492152e-06, + "clip_ratio/low_mean": 4.733878370188904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.255690120975487e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 6617.7578125, + "completions/mean_terminated_length": 6137.45068359375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8550976514816284, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021260723005980253, + "learning_rate": 1e-05, + "loss": 0.1382, + "num_tokens": 13384420.0, + "reward": 0.3828125, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999901294708252, + "sampling/importance_sampling_ratio/min": 0.02929881028831005, + "sampling/sampling_logp_difference/max": 3.5302083492279053, + "sampling/sampling_logp_difference/mean": 0.01808803342282772, + "step": 18 + }, + { + "clip_ratio/high_max": 6.404673058568733e-06, + "clip_ratio/high_mean": 1.6011682646421832e-06, + "clip_ratio/low_mean": 3.2195434073400975e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.379660131486162e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14382.0, + "completions/mean_length": 5285.7578125, + "completions/mean_terminated_length": 5109.595703125, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "entropy": 0.8321448192000389, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003563448553904891, + "learning_rate": 1e-05, + "loss": 0.0001, + "num_tokens": 14081197.0, + "reward": 0.375, + "reward_std": 0.31116873025894165, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998770952224731, + "sampling/importance_sampling_ratio/min": 0.000519682711455971, + "sampling/sampling_logp_difference/max": 7.562292098999023, + "sampling/sampling_logp_difference/mean": 0.017500173300504684, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.128390534991922e-05, + "clip_ratio/low_min": 1.2459845038392814e-05, + "clip_ratio/region_mean": 5.128390534991922e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13991.0, + "completions/max_terminated_length": 13991.0, + "completions/mean_length": 4918.1953125, + "completions/mean_terminated_length": 4918.1953125, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "entropy": 0.9329824000597, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0048850164748728275, + "learning_rate": 1e-05, + "loss": 0.0265, + "num_tokens": 14727798.0, + "reward": 0.359375, + "reward_std": 0.37716054916381836, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999402165412903, + "sampling/importance_sampling_ratio/min": 0.00041761461761780083, + "sampling/sampling_logp_difference/max": 7.780951499938965, + "sampling/sampling_logp_difference/mean": 0.01855182647705078, + "step": 20 + }, + { + "clip_ratio/high_max": 1.0364761692471802e-05, + "clip_ratio/high_mean": 2.5911904231179506e-06, + "clip_ratio/low_mean": 3.091395433330035e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.350514430167095e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16202.0, + "completions/max_terminated_length": 16202.0, + "completions/mean_length": 5268.5234375, + "completions/mean_terminated_length": 5268.5234375, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 1.1676538437604904, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030562332831323147, + "learning_rate": 1e-05, + "loss": 0.0443, + "num_tokens": 15421937.0, + "reward": 0.28125, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000249147415161, + "sampling/importance_sampling_ratio/min": 0.0016021198825910687, + "sampling/sampling_logp_difference/max": 6.436427593231201, + "sampling/sampling_logp_difference/mean": 0.021109789609909058, + "step": 21 + }, + { + "clip_ratio/high_max": 1.6653621514706174e-05, + "clip_ratio/high_mean": 4.1634053786765435e-06, + "clip_ratio/low_mean": 3.064284169340681e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.480624718577019e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15940.0, + "completions/mean_length": 5361.0703125, + "completions/mean_terminated_length": 5186.103515625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.9569757729768753, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003777366131544113, + "learning_rate": 1e-05, + "loss": 0.0058, + "num_tokens": 16128698.0, + "reward": 0.3359375, + "reward_std": 0.26409637928009033, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999063611030579, + "sampling/importance_sampling_ratio/min": 0.004354433622211218, + "sampling/sampling_logp_difference/max": 5.43656063079834, + "sampling/sampling_logp_difference/mean": 0.01940997503697872, + "step": 22 + }, + { + "clip_ratio/high_max": 3.227977140340954e-05, + "clip_ratio/high_mean": 9.227950727108691e-06, + "clip_ratio/low_mean": 4.881033578385541e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.8038286169903586e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15623.0, + "completions/mean_length": 7428.3125, + "completions/mean_terminated_length": 6910.21435546875, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 1.0387683138251305, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005749945063143969, + "learning_rate": 1e-05, + "loss": 0.0466, + "num_tokens": 17101202.0, + "reward": 0.2734375, + "reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999775886535645, + "sampling/importance_sampling_ratio/min": 0.010671229101717472, + "sampling/sampling_logp_difference/max": 4.540204048156738, + "sampling/sampling_logp_difference/mean": 0.021208524703979492, + "step": 23 + }, + { + "clip_ratio/high_max": 2.544114977354184e-06, + "clip_ratio/high_mean": 6.36028744338546e-07, + "clip_ratio/low_mean": 4.543399086287536e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6070018697719206e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15659.0, + "completions/mean_length": 5462.203125, + "completions/mean_terminated_length": 5288.841796875, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 1.088257022202015, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005364824552088976, + "learning_rate": 1e-05, + "loss": -0.0106, + "num_tokens": 17820796.0, + "reward": 0.1953125, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999232292175293, + "sampling/importance_sampling_ratio/min": 0.0013558369828388095, + "sampling/sampling_logp_difference/max": 6.603336334228516, + "sampling/sampling_logp_difference/mean": 0.020104583352804184, + "step": 24 + }, + { + "clip_ratio/high_max": 1.132360557676293e-05, + "clip_ratio/high_mean": 2.8309013941907324e-06, + "clip_ratio/low_mean": 3.686837260374887e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.96992739979396e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16211.0, + "completions/mean_length": 5423.234375, + "completions/mean_terminated_length": 5249.25439453125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 0.9123491793870926, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002392752794548869, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 18538546.0, + "reward": 0.3125, + "reward_std": 0.2590789198875427, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999918341636658, + "sampling/importance_sampling_ratio/min": 1.657394705034676e-06, + "sampling/sampling_logp_difference/max": 13.310263633728027, + "sampling/sampling_logp_difference/mean": 0.02011517994105816, + "step": 25 + }, + { + "clip_ratio/high_max": 2.127026391463005e-05, + "clip_ratio/high_mean": 6.648429234701325e-06, + "clip_ratio/low_mean": 1.4927492088645522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.157592166440736e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13978.0, + "completions/mean_length": 5574.1640625, + "completions/mean_terminated_length": 5489.04736328125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 1.0090710371732712, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0036540210712701082, + "learning_rate": 1e-05, + "loss": 0.0036, + "num_tokens": 19270439.0, + "reward": 0.3515625, + "reward_std": 0.23646268248558044, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 0.00021918962011113763, + "sampling/sampling_logp_difference/max": 8.425573348999023, + "sampling/sampling_logp_difference/mean": 0.02006707340478897, + "step": 26 + }, + { + "clip_ratio/high_max": 1.1303152405162109e-05, + "clip_ratio/high_mean": 2.8257881012905273e-06, + "clip_ratio/low_mean": 2.827990363130084e-05, + "clip_ratio/low_min": 5.86744272368378e-06, + "clip_ratio/region_mean": 3.1105691391530854e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15061.0, + "completions/mean_length": 6204.75, + "completions/mean_terminated_length": 6124.5986328125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.957111045718193, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006005869247019291, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 20083655.0, + "reward": 0.3046875, + "reward_std": 0.31616854667663574, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549388885498, + "sampling/importance_sampling_ratio/min": 1.1864853988186042e-08, + "sampling/sampling_logp_difference/max": 18.249685287475586, + "sampling/sampling_logp_difference/mean": 0.0189923457801342, + "step": 27 + }, + { + "clip_ratio/high_max": 8.289213610623847e-06, + "clip_ratio/high_mean": 2.0723034026559617e-06, + "clip_ratio/low_mean": 3.4569659419503296e-05, + "clip_ratio/low_min": 3.6480373637459707e-06, + "clip_ratio/region_mean": 3.664196310637635e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15274.0, + "completions/mean_length": 5659.5703125, + "completions/mean_terminated_length": 5489.341796875, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.9482033550739288, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004129618871957064, + "learning_rate": 1e-05, + "loss": -0.0023, + "num_tokens": 20829064.0, + "reward": 0.3515625, + "reward_std": 0.3745690584182739, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999293088912964, + "sampling/importance_sampling_ratio/min": 4.007936149719171e-05, + "sampling/sampling_logp_difference/max": 10.124649047851562, + "sampling/sampling_logp_difference/mean": 0.019232336431741714, + "step": 28 + }, + { + "clip_ratio/high_max": 1.3534072877519066e-05, + "clip_ratio/high_mean": 3.3835182193797664e-06, + "clip_ratio/low_mean": 2.9090757720950933e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.247427605401754e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14558.0, + "completions/mean_length": 5122.9609375, + "completions/mean_terminated_length": 5034.29150390625, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 1.020588956773281, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004814058542251587, + "learning_rate": 1e-05, + "loss": 0.0994, + "num_tokens": 21505483.0, + "reward": 0.3359375, + "reward_std": 0.31930169463157654, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 0.0003798597026616335, + "sampling/sampling_logp_difference/max": 7.87570858001709, + "sampling/sampling_logp_difference/mean": 0.019156761467456818, + "step": 29 + }, + { + "clip_ratio/high_max": 2.0903964468743652e-05, + "clip_ratio/high_mean": 5.225991117185913e-06, + "clip_ratio/low_mean": 4.13707307416189e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.659672185880481e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15926.0, + "completions/max_terminated_length": 15926.0, + "completions/mean_length": 4833.734375, + "completions/mean_terminated_length": 4833.734375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "entropy": 1.0276868790388107, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006946730427443981, + "learning_rate": 1e-05, + "loss": 0.0428, + "num_tokens": 22142657.0, + "reward": 0.421875, + "reward_std": 0.250127375125885, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998937845230103, + "sampling/importance_sampling_ratio/min": 0.000452048028819263, + "sampling/sampling_logp_difference/max": 7.701722145080566, + "sampling/sampling_logp_difference/mean": 0.019841451197862625, + "step": 30 + }, + { + "clip_ratio/high_max": 8.514986802765634e-06, + "clip_ratio/high_mean": 2.1287467006914085e-06, + "clip_ratio/low_mean": 3.9484380408794095e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.161312688211183e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15122.0, + "completions/mean_length": 6895.390625, + "completions/mean_terminated_length": 6589.30615234375, + "completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 1.1640124469995499, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0040768519975245, + "learning_rate": 1e-05, + "loss": 0.0397, + "num_tokens": 23045931.0, + "reward": 0.1484375, + "reward_std": 0.20175683498382568, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999603629112244, + "sampling/importance_sampling_ratio/min": 0.0006075318087823689, + "sampling/sampling_logp_difference/max": 7.406105995178223, + "sampling/sampling_logp_difference/mean": 0.02265278436243534, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.95245172057912e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.95245172057912e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 6504.0625, + "completions/mean_terminated_length": 6347.23828125, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "entropy": 1.1040372923016548, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004854958038777113, + "learning_rate": 1e-05, + "loss": 0.0853, + "num_tokens": 23899259.0, + "reward": 0.2265625, + "reward_std": 0.2619747221469879, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999864935874939, + "sampling/importance_sampling_ratio/min": 3.380438373667971e-09, + "sampling/sampling_logp_difference/max": 19.505260467529297, + "sampling/sampling_logp_difference/mean": 0.020535167306661606, + "step": 32 + }, + { + "clip_ratio/high_max": 1.8890462797571672e-05, + "clip_ratio/high_mean": 4.722615699392918e-06, + "clip_ratio/low_mean": 4.095688700544997e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.567950259115605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14787.0, + "completions/mean_length": 5553.2578125, + "completions/mean_terminated_length": 5467.9765625, + "completions/min_length": 634.0, + "completions/min_terminated_length": 634.0, + "entropy": 1.0357396975159645, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005473555997014046, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 24631956.0, + "reward": 0.4296875, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000377893447876, + "sampling/importance_sampling_ratio/min": 0.004898479674011469, + "sampling/sampling_logp_difference/max": 5.318830490112305, + "sampling/sampling_logp_difference/mean": 0.019490022212266922, + "step": 33 + }, + { + "clip_ratio/high_max": 8.08538152341498e-06, + "clip_ratio/high_mean": 2.021345380853745e-06, + "clip_ratio/low_mean": 2.4400278334724135e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6421623601891042e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15753.0, + "completions/max_terminated_length": 15753.0, + "completions/mean_length": 5357.46875, + "completions/mean_terminated_length": 5357.46875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0332984924316406, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003410332603380084, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 25336544.0, + "reward": 0.4453125, + "reward_std": 0.32613158226013184, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999037384986877, + "sampling/importance_sampling_ratio/min": 0.00010891074634855613, + "sampling/sampling_logp_difference/max": 9.124981880187988, + "sampling/sampling_logp_difference/mean": 0.01885366439819336, + "step": 34 + }, + { + "clip_ratio/high_max": 1.2948894436703995e-05, + "clip_ratio/high_mean": 3.2372236091759987e-06, + "clip_ratio/low_mean": 3.931040214411041e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.25476254122259e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16210.0, + "completions/mean_length": 7812.8984375, + "completions/mean_terminated_length": 7745.4091796875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 1.031004011631012, + "epoch": 0.03219871205151794, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003122704103589058, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 26355691.0, + "reward": 0.2890625, + "reward_std": 0.27905434370040894, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999194145202637, + "sampling/importance_sampling_ratio/min": 0.002222655341029167, + "sampling/sampling_logp_difference/max": 6.109052658081055, + "sampling/sampling_logp_difference/mean": 0.022181488573551178, + "step": 35 + }, + { + "clip_ratio/high_max": 1.3199577551858965e-05, + "clip_ratio/high_mean": 3.2998943879647413e-06, + "clip_ratio/low_mean": 3.742906312709238e-05, + "clip_ratio/low_min": 3.3127500955743017e-06, + "clip_ratio/region_mean": 4.072895751505712e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16160.0, + "completions/mean_length": 6402.6875, + "completions/mean_terminated_length": 5825.255859375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.96993837505579, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003292364301159978, + "learning_rate": 1e-05, + "loss": 0.0211, + "num_tokens": 27193267.0, + "reward": 0.375, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000267028808594, + "sampling/importance_sampling_ratio/min": 4.0287636693392415e-07, + "sampling/sampling_logp_difference/max": 14.72463607788086, + "sampling/sampling_logp_difference/mean": 0.019621271640062332, + "step": 36 + }, + { + "clip_ratio/high_max": 9.08137690203148e-06, + "clip_ratio/high_mean": 2.27034422550787e-06, + "clip_ratio/low_mean": 4.5394222524919314e-05, + "clip_ratio/low_min": 4.49300887339632e-06, + "clip_ratio/region_mean": 4.766456731886137e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + "completions/mean_length": 7525.40625, + "completions/mean_terminated_length": 7165.30078125, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.9819100275635719, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004192501772195101, + "learning_rate": 1e-05, + "loss": 0.0476, + "num_tokens": 28181183.0, + "reward": 0.3125, + "reward_std": 0.33797892928123474, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999439716339111, + "sampling/importance_sampling_ratio/min": 1.3309776477399282e-05, + "sampling/sampling_logp_difference/max": 11.227011680603027, + "sampling/sampling_logp_difference/mean": 0.019877666607499123, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.638440969349176e-05, + "clip_ratio/low_min": 6.698462129861582e-06, + "clip_ratio/region_mean": 2.638440969349176e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15572.0, + "completions/mean_length": 6930.8828125, + "completions/mean_terminated_length": 6625.943359375, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "entropy": 0.9183463454246521, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029556062072515488, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 29087384.0, + "reward": 0.3359375, + "reward_std": 0.2740417718887329, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999445676803589, + "sampling/importance_sampling_ratio/min": 4.6953626764434375e-08, + "sampling/sampling_logp_difference/max": 16.87410545349121, + "sampling/sampling_logp_difference/mean": 0.0197360310703516, + "step": 38 + }, + { + "clip_ratio/high_max": 2.4210238279920304e-05, + "clip_ratio/high_mean": 6.052559569980076e-06, + "clip_ratio/low_mean": 3.344960384765727e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9502163645011024e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13891.0, + "completions/mean_length": 5893.1796875, + "completions/mean_terminated_length": 5726.6591796875, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "entropy": 1.05657509714365, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0044409241527318954, + "learning_rate": 1e-05, + "loss": 0.0042, + "num_tokens": 29860767.0, + "reward": 0.2890625, + "reward_std": 0.3435155153274536, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999690055847168, + "sampling/importance_sampling_ratio/min": 2.243226049358782e-07, + "sampling/sampling_logp_difference/max": 15.3101806640625, + "sampling/sampling_logp_difference/mean": 0.02058839052915573, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.4493159887460934e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4493159887460934e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13471.0, + "completions/max_terminated_length": 13471.0, + "completions/mean_length": 5779.4765625, + "completions/mean_terminated_length": 5779.4765625, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.0302623957395554, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004552105907350779, + "learning_rate": 1e-05, + "loss": -0.0198, + "num_tokens": 30620388.0, + "reward": 0.3203125, + "reward_std": 0.3295513987541199, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999610185623169, + "sampling/importance_sampling_ratio/min": 0.011830558069050312, + "sampling/sampling_logp_difference/max": 4.437069416046143, + "sampling/sampling_logp_difference/mean": 0.020457806065678596, + "step": 40 + }, + { + "clip_ratio/high_max": 9.270246664527804e-06, + "clip_ratio/high_mean": 2.317561666131951e-06, + "clip_ratio/low_mean": 3.615360617459373e-05, + "clip_ratio/low_min": 4.283315774955554e-06, + "clip_ratio/region_mean": 3.8471167840725684e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13927.0, + "completions/max_terminated_length": 13927.0, + "completions/mean_length": 5429.1328125, + "completions/mean_terminated_length": 5429.1328125, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "entropy": 0.9245247691869736, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003300054930150509, + "learning_rate": 1e-05, + "loss": 0.1138, + "num_tokens": 31334221.0, + "reward": 0.4765625, + "reward_std": 0.23592591285705566, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999089241027832, + "sampling/importance_sampling_ratio/min": 0.00017977353127207607, + "sampling/sampling_logp_difference/max": 8.623812675476074, + "sampling/sampling_logp_difference/mean": 0.01882476732134819, + "step": 41 + }, + { + "clip_ratio/high_max": 1.5042513723528828e-05, + "clip_ratio/high_mean": 3.760628430882207e-06, + "clip_ratio/low_mean": 3.780993347390904e-05, + "clip_ratio/low_min": 3.7437480386870448e-06, + "clip_ratio/region_mean": 4.157056224585176e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14875.0, + "completions/mean_length": 6073.6328125, + "completions/mean_terminated_length": 5909.9765625, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "entropy": 1.0127769336104393, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004679495934396982, + "learning_rate": 1e-05, + "loss": 0.0055, + "num_tokens": 32134854.0, + "reward": 0.359375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999961793422699, + "sampling/importance_sampling_ratio/min": 0.0006151580018922687, + "sampling/sampling_logp_difference/max": 7.393631458282471, + "sampling/sampling_logp_difference/mean": 0.02106339856982231, + "step": 42 + }, + { + "clip_ratio/high_max": 1.8307343452761415e-05, + "clip_ratio/high_mean": 4.576835863190354e-06, + "clip_ratio/low_mean": 5.7316304378218774e-05, + "clip_ratio/low_min": 1.412125402566744e-05, + "clip_ratio/region_mean": 6.189314035509597e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15052.0, + "completions/max_terminated_length": 15052.0, + "completions/mean_length": 5773.015625, + "completions/mean_terminated_length": 5773.015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0045431107282639, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00485749589279294, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 32897040.0, + "reward": 0.46875, + "reward_std": 0.3595343232154846, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000348091125488, + "sampling/importance_sampling_ratio/min": 4.862526111537591e-06, + "sampling/sampling_logp_difference/max": 12.233952522277832, + "sampling/sampling_logp_difference/mean": 0.01966444030404091, + "step": 43 + }, + { + "clip_ratio/high_max": 1.578610726937768e-05, + "clip_ratio/high_mean": 3.94652681734442e-06, + "clip_ratio/low_mean": 1.772546147549292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1671988179150503e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14572.0, + "completions/mean_length": 4731.3515625, + "completions/mean_terminated_length": 4639.5986328125, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "entropy": 1.0001292675733566, + "epoch": 0.040478380864765406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004726089537143707, + "learning_rate": 1e-05, + "loss": 0.0589, + "num_tokens": 33522133.0, + "reward": 0.390625, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999152421951294, + "sampling/importance_sampling_ratio/min": 0.0001548011932754889, + "sampling/sampling_logp_difference/max": 8.773368835449219, + "sampling/sampling_logp_difference/mean": 0.019276604056358337, + "step": 44 + }, + { + "clip_ratio/high_max": 7.944579010654707e-06, + "clip_ratio/high_mean": 1.9861447526636766e-06, + "clip_ratio/low_mean": 8.259907644969644e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.024605239763332e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15661.0, + "completions/mean_length": 6908.8984375, + "completions/mean_terminated_length": 6834.29150390625, + "completions/min_length": 631.0, + "completions/min_terminated_length": 631.0, + "entropy": 1.0723063945770264, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0011808272683992982, + "learning_rate": 1e-05, + "loss": 0.0202, + "num_tokens": 34429384.0, + "reward": 0.2421875, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999626278877258, + "sampling/importance_sampling_ratio/min": 0.0007662919815629721, + "sampling/sampling_logp_difference/max": 7.173947334289551, + "sampling/sampling_logp_difference/mean": 0.021076666191220284, + "step": 45 + }, + { + "clip_ratio/high_max": 8.888357569958316e-06, + "clip_ratio/high_mean": 2.222089392489579e-06, + "clip_ratio/low_mean": 2.6357692036071967e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8579780860127357e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16348.0, + "completions/mean_length": 6679.140625, + "completions/mean_terminated_length": 6446.22412109375, + "completions/min_length": 828.0, + "completions/min_terminated_length": 828.0, + "entropy": 0.9413202852010727, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003600373398512602, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 35302474.0, + "reward": 0.3203125, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998806715011597, + "sampling/importance_sampling_ratio/min": 9.02900064829737e-05, + "sampling/sampling_logp_difference/max": 9.312483787536621, + "sampling/sampling_logp_difference/mean": 0.019808633252978325, + "step": 46 + }, + { + "clip_ratio/high_max": 9.364646757603623e-06, + "clip_ratio/high_mean": 2.3411616894009057e-06, + "clip_ratio/low_mean": 1.6833528775350715e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9174690351064783e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16253.0, + "completions/mean_length": 5954.5859375, + "completions/mean_terminated_length": 5872.46435546875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 1.200403742492199, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003302425378933549, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 36093941.0, + "reward": 0.1640625, + "reward_std": 0.1990984082221985, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998957514762878, + "sampling/importance_sampling_ratio/min": 0.0026806045789271593, + "sampling/sampling_logp_difference/max": 5.921712875366211, + "sampling/sampling_logp_difference/mean": 0.022528307512402534, + "step": 47 + }, + { + "clip_ratio/high_max": 1.2503618108894443e-05, + "clip_ratio/high_mean": 3.944288664570195e-06, + "clip_ratio/low_mean": 4.7836430894676596e-05, + "clip_ratio/low_min": 6.161485543998424e-06, + "clip_ratio/region_mean": 5.1780719331873115e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15867.0, + "completions/mean_length": 6109.1953125, + "completions/mean_terminated_length": 5946.103515625, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.9069097489118576, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005992463324218988, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 36893486.0, + "reward": 0.4921875, + "reward_std": 0.40373340249061584, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576807022095, + "sampling/importance_sampling_ratio/min": 2.4301432858919725e-05, + "sampling/sampling_logp_difference/max": 10.624975204467773, + "sampling/sampling_logp_difference/mean": 0.018979201093316078, + "step": 48 + }, + { + "clip_ratio/high_max": 1.1075947440986056e-05, + "clip_ratio/high_mean": 2.768986860246514e-06, + "clip_ratio/low_mean": 2.73638818271138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.013286891473399e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15331.0, + "completions/mean_length": 6265.5390625, + "completions/mean_terminated_length": 6022.6962890625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9107594564557076, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005304713733494282, + "learning_rate": 1e-05, + "loss": 0.1217, + "num_tokens": 37716027.0, + "reward": 0.484375, + "reward_std": 0.29272884130477905, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998800754547119, + "sampling/importance_sampling_ratio/min": 0.0003461121814325452, + "sampling/sampling_logp_difference/max": 7.968747615814209, + "sampling/sampling_logp_difference/mean": 0.019227473065257072, + "step": 49 + }, + { + "clip_ratio/high_max": 1.0917767667706357e-05, + "clip_ratio/high_mean": 3.674950448839809e-06, + "clip_ratio/low_mean": 3.135283236588293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.50277827010359e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15860.0, + "completions/mean_length": 6143.1796875, + "completions/mean_terminated_length": 5897.400390625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.9168931543827057, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017410843865945935, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 38519738.0, + "reward": 0.3984375, + "reward_std": 0.2301519215106964, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998773336410522, + "sampling/importance_sampling_ratio/min": 0.0036513316445052624, + "sampling/sampling_logp_difference/max": 5.612663269042969, + "sampling/sampling_logp_difference/mean": 0.019512062892317772, + "step": 50 + }, + { + "clip_ratio/high_max": 5.4981305765977595e-06, + "clip_ratio/high_mean": 3.7445629459398333e-06, + "clip_ratio/low_mean": 2.6178069106208568e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.99226320521484e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15616.0, + "completions/mean_length": 7165.265625, + "completions/mean_terminated_length": 7092.67724609375, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.9690218195319176, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004612576216459274, + "learning_rate": 1e-05, + "loss": 0.0544, + "num_tokens": 39461012.0, + "reward": 0.3125, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000245571136475, + "sampling/importance_sampling_ratio/min": 4.5421067625284195e-05, + "sampling/sampling_logp_difference/max": 9.999534606933594, + "sampling/sampling_logp_difference/mean": 0.0201116893440485, + "step": 51 + }, + { + "clip_ratio/high_max": 1.2953334362464375e-05, + "clip_ratio/high_mean": 3.2383335906160937e-06, + "clip_ratio/low_mean": 2.1866131419301382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5104465066760895e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16007.0, + "completions/max_terminated_length": 16007.0, + "completions/mean_length": 5617.9296875, + "completions/mean_terminated_length": 5617.9296875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 1.0479632839560509, + "epoch": 0.04783808647654094, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003172830445691943, + "learning_rate": 1e-05, + "loss": -0.0235, + "num_tokens": 40202979.0, + "reward": 0.3828125, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560117721558, + "sampling/importance_sampling_ratio/min": 1.229221084031451e-06, + "sampling/sampling_logp_difference/max": 13.609129905700684, + "sampling/sampling_logp_difference/mean": 0.020904643461108208, + "step": 52 + }, + { + "clip_ratio/high_max": 1.4129082956060302e-05, + "clip_ratio/high_mean": 4.841006557398941e-06, + "clip_ratio/low_mean": 4.556761541607557e-05, + "clip_ratio/low_min": 8.631802302261349e-06, + "clip_ratio/region_mean": 5.040862197347451e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 5995.3203125, + "completions/mean_terminated_length": 5913.51953125, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.022934041917324, + "epoch": 0.04875804967801288, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003333345288410783, + "learning_rate": 1e-05, + "loss": 0.0336, + "num_tokens": 40989532.0, + "reward": 0.3046875, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999282360076904, + "sampling/importance_sampling_ratio/min": 8.228168007917702e-05, + "sampling/sampling_logp_difference/max": 9.405362129211426, + "sampling/sampling_logp_difference/mean": 0.021745413541793823, + "step": 53 + }, + { + "clip_ratio/high_max": 2.286436574649997e-05, + "clip_ratio/high_mean": 6.531613848892448e-06, + "clip_ratio/low_mean": 3.960530659696815e-05, + "clip_ratio/low_min": 3.4269107800355414e-06, + "clip_ratio/region_mean": 4.6136920445860596e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15648.0, + "completions/mean_length": 6297.859375, + "completions/mean_terminated_length": 6055.79248046875, + "completions/min_length": 1243.0, + "completions/min_terminated_length": 1243.0, + "entropy": 0.9511058703064919, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005538261961191893, + "learning_rate": 1e-05, + "loss": 0.0799, + "num_tokens": 41813914.0, + "reward": 0.3671875, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532699584961, + "sampling/importance_sampling_ratio/min": 0.00017344337538816035, + "sampling/sampling_logp_difference/max": 8.659659385681152, + "sampling/sampling_logp_difference/mean": 0.019708994776010513, + "step": 54 + }, + { + "clip_ratio/high_max": 4.575737420964288e-06, + "clip_ratio/high_mean": 1.143934355241072e-06, + "clip_ratio/low_mean": 2.561447990956367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6758414151117904e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14051.0, + "completions/max_terminated_length": 14051.0, + "completions/mean_length": 4765.046875, + "completions/mean_terminated_length": 4765.046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.9130316227674484, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024318129289895296, + "learning_rate": 1e-05, + "loss": -0.0177, + "num_tokens": 42443288.0, + "reward": 0.4453125, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999518394470215, + "sampling/importance_sampling_ratio/min": 0.0001035423920257017, + "sampling/sampling_logp_difference/max": 9.175529479980469, + "sampling/sampling_logp_difference/mean": 0.01920286938548088, + "step": 55 + }, + { + "clip_ratio/high_max": 7.084159733494744e-06, + "clip_ratio/high_mean": 1.771039933373686e-06, + "clip_ratio/low_mean": 4.221943618176738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3990476115141064e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15445.0, + "completions/mean_length": 6411.5, + "completions/mean_terminated_length": 5834.578125, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "entropy": 0.8110766112804413, + "epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018957280553877354, + "learning_rate": 1e-05, + "loss": -0.0036, + "num_tokens": 43287600.0, + "reward": 0.3984375, + "reward_std": 0.1990983933210373, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999212622642517, + "sampling/importance_sampling_ratio/min": 0.0021892013028264046, + "sampling/sampling_logp_difference/max": 6.124218463897705, + "sampling/sampling_logp_difference/mean": 0.018554572016000748, + "step": 56 + }, + { + "clip_ratio/high_max": 6.7589489844976924e-06, + "clip_ratio/high_mean": 1.6897372461244231e-06, + "clip_ratio/low_mean": 4.334260950145108e-05, + "clip_ratio/low_min": 8.570448699174449e-06, + "clip_ratio/region_mean": 4.503234697494918e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15508.0, + "completions/mean_length": 6552.40625, + "completions/mean_terminated_length": 6235.2578125, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "entropy": 1.0034996420145035, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002431448083370924, + "learning_rate": 1e-05, + "loss": 0.0274, + "num_tokens": 44145524.0, + "reward": 0.25, + "reward_std": 0.3114011883735657, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999876081943512, + "sampling/importance_sampling_ratio/min": 0.051090992987155914, + "sampling/sampling_logp_difference/max": 2.974147081375122, + "sampling/sampling_logp_difference/mean": 0.020403606817126274, + "step": 57 + }, + { + "clip_ratio/high_max": 2.1032463337178342e-05, + "clip_ratio/high_mean": 6.422987098630983e-06, + "clip_ratio/low_mean": 1.0045687076853937e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.646867417548492e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13257.0, + "completions/mean_length": 4688.7734375, + "completions/mean_terminated_length": 4408.08837890625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.9620971381664276, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004493447951972485, + "learning_rate": 1e-05, + "loss": 0.0344, + "num_tokens": 44763895.0, + "reward": 0.53125, + "reward_std": 0.26196980476379395, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999460577964783, + "sampling/importance_sampling_ratio/min": 3.1526888051303104e-05, + "sampling/sampling_logp_difference/max": 10.364669799804688, + "sampling/sampling_logp_difference/mean": 0.01916680857539177, + "step": 58 + }, + { + "clip_ratio/high_max": 3.076594612139161e-05, + "clip_ratio/high_mean": 7.691486530347902e-06, + "clip_ratio/low_mean": 2.8500278403953416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.619176493430132e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 5381.1875, + "completions/mean_terminated_length": 5294.55126953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 1.0265433564782143, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0049595762975513935, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 45470335.0, + "reward": 0.4296875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998573660850525, + "sampling/importance_sampling_ratio/min": 1.6373864752949885e-07, + "sampling/sampling_logp_difference/max": 15.624994277954102, + "sampling/sampling_logp_difference/mean": 0.020656142383813858, + "step": 59 + }, + { + "clip_ratio/high_max": 2.6326441002311185e-05, + "clip_ratio/high_mean": 6.581610250577796e-06, + "clip_ratio/low_mean": 3.143254116366734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8014151868992485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15006.0, + "completions/mean_length": 5613.84375, + "completions/mean_terminated_length": 5529.03955078125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 1.0289503335952759, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00655899103730917, + "learning_rate": 1e-05, + "loss": 0.068, + "num_tokens": 46206971.0, + "reward": 0.3671875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999952495098114, + "sampling/importance_sampling_ratio/min": 0.03283476456999779, + "sampling/sampling_logp_difference/max": 3.4162673950195312, + "sampling/sampling_logp_difference/mean": 0.020495962351560593, + "step": 60 + }, + { + "clip_ratio/high_max": 3.233557390558417e-05, + "clip_ratio/high_mean": 8.083893476396042e-06, + "clip_ratio/low_mean": 3.3687326776998816e-05, + "clip_ratio/low_min": 5.745277576352237e-06, + "clip_ratio/region_mean": 4.1771219912334345e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14763.0, + "completions/mean_length": 5577.2890625, + "completions/mean_terminated_length": 5492.19677734375, + "completions/min_length": 784.0, + "completions/min_terminated_length": 784.0, + "entropy": 0.9836367890238762, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.007459669373929501, + "learning_rate": 1e-05, + "loss": 0.0459, + "num_tokens": 46940112.0, + "reward": 0.4453125, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000065565109253, + "sampling/importance_sampling_ratio/min": 8.196697649509588e-07, + "sampling/sampling_logp_difference/max": 14.014364242553711, + "sampling/sampling_logp_difference/mean": 0.018994126468896866, + "step": 61 + }, + { + "clip_ratio/high_max": 1.720242698866059e-05, + "clip_ratio/high_mean": 4.300606747165148e-06, + "clip_ratio/low_mean": 3.032099141364597e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.462159838818479e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16273.0, + "completions/mean_length": 6547.140625, + "completions/mean_terminated_length": 6311.05615234375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.9028418883681297, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005132914055138826, + "learning_rate": 1e-05, + "loss": -0.0013, + "num_tokens": 47796514.0, + "reward": 0.46875, + "reward_std": 0.2751026153564453, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970018863678, + "sampling/importance_sampling_ratio/min": 0.0005014563794247806, + "sampling/sampling_logp_difference/max": 7.597993850708008, + "sampling/sampling_logp_difference/mean": 0.02021491341292858, + "step": 62 + }, + { + "clip_ratio/high_max": 1.078213176697318e-05, + "clip_ratio/high_mean": 2.695532941743295e-06, + "clip_ratio/low_mean": 2.838153790207798e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1077070843821275e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 7409.3125, + "completions/mean_terminated_length": 6811.00048828125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8614663332700729, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034147046972066164, + "learning_rate": 1e-05, + "loss": 0.0196, + "num_tokens": 48765386.0, + "reward": 0.3125, + "reward_std": 0.27198708057403564, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998997449874878, + "sampling/importance_sampling_ratio/min": 4.202586751489434e-06, + "sampling/sampling_logp_difference/max": 12.379810333251953, + "sampling/sampling_logp_difference/mean": 0.01943383738398552, + "step": 63 + }, + { + "clip_ratio/high_max": 1.153353150584735e-05, + "clip_ratio/high_mean": 2.8833828764618374e-06, + "clip_ratio/low_mean": 3.695166174111364e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.98350443902018e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 6420.859375, + "completions/mean_terminated_length": 6181.744140625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.9671022593975067, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004101228900253773, + "learning_rate": 1e-05, + "loss": 0.0371, + "num_tokens": 49606280.0, + "reward": 0.34375, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 0.000259009946603328, + "sampling/sampling_logp_difference/max": 8.258644104003906, + "sampling/sampling_logp_difference/mean": 0.01929381489753723, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.231768923546042e-05, + "clip_ratio/low_min": 5.164009053260088e-06, + "clip_ratio/region_mean": 4.231768923546042e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14971.0, + "completions/mean_length": 4852.7578125, + "completions/mean_terminated_length": 4761.96044921875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.9933939427137375, + "epoch": 0.05979760809567617, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0077895247377455235, + "learning_rate": 1e-05, + "loss": 0.0601, + "num_tokens": 50246457.0, + "reward": 0.40625, + "reward_std": 0.35400262475013733, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 2.4301109078805894e-05, + "sampling/sampling_logp_difference/max": 10.624988555908203, + "sampling/sampling_logp_difference/mean": 0.01895500347018242, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.009997408298659e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.009997408298659e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15610.0, + "completions/max_terminated_length": 15610.0, + "completions/mean_length": 6840.03125, + "completions/mean_terminated_length": 6840.03125, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "entropy": 0.9959733113646507, + "epoch": 0.06071757129714812, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00207411777228117, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 51141597.0, + "reward": 0.28125, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 2.0275774659239687e-07, + "sampling/sampling_logp_difference/max": 15.411253929138184, + "sampling/sampling_logp_difference/mean": 0.02091015875339508, + "step": 66 + }, + { + "clip_ratio/high_max": 2.297391938554938e-05, + "clip_ratio/high_mean": 6.853683203189576e-06, + "clip_ratio/low_mean": 4.6152885829542356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3006569942226633e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15249.0, + "completions/mean_length": 6567.3828125, + "completions/mean_terminated_length": 6331.7841796875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 1.0921807065606117, + "epoch": 0.061637534498620056, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.006496666464954615, + "learning_rate": 1e-05, + "loss": 0.0238, + "num_tokens": 52001758.0, + "reward": 0.296875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999523162841797, + "sampling/importance_sampling_ratio/min": 0.0026403397787362337, + "sampling/sampling_logp_difference/max": 5.936847686767578, + "sampling/sampling_logp_difference/mean": 0.021580250933766365, + "step": 67 + }, + { + "clip_ratio/high_max": 1.2290649465285242e-05, + "clip_ratio/high_mean": 3.0726623663213104e-06, + "clip_ratio/low_mean": 1.7558751551405294e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0631413917726604e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16336.0, + "completions/mean_length": 6927.265625, + "completions/mean_terminated_length": 6542.84521484375, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "entropy": 0.8170016556978226, + "epoch": 0.062557497700092, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002272722776979208, + "learning_rate": 1e-05, + "loss": 0.021, + "num_tokens": 52907256.0, + "reward": 0.28125, + "reward_std": 0.22673700749874115, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999938011169434, + "sampling/importance_sampling_ratio/min": 6.70690099013882e-08, + "sampling/sampling_logp_difference/max": 16.51754379272461, + "sampling/sampling_logp_difference/mean": 0.01844738982617855, + "step": 68 + }, + { + "clip_ratio/high_max": 1.016177520796191e-05, + "clip_ratio/high_mean": 4.526967131823767e-06, + "clip_ratio/low_mean": 5.522496246612718e-05, + "clip_ratio/low_min": 4.129910394112812e-06, + "clip_ratio/region_mean": 5.9751928688456246e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16228.0, + "completions/max_terminated_length": 16228.0, + "completions/mean_length": 5889.28125, + "completions/mean_terminated_length": 5889.28125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 1.0794919431209564, + "epoch": 0.06347746090156393, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005651532672345638, + "learning_rate": 1e-05, + "loss": 0.0382, + "num_tokens": 53682100.0, + "reward": 0.3046875, + "reward_std": 0.32613158226013184, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998855590820312, + "sampling/importance_sampling_ratio/min": 4.226289718189946e-07, + "sampling/sampling_logp_difference/max": 14.67677116394043, + "sampling/sampling_logp_difference/mean": 0.020069826394319534, + "step": 69 + }, + { + "clip_ratio/high_max": 5.796966888738098e-06, + "clip_ratio/high_mean": 1.4492417221845244e-06, + "clip_ratio/low_mean": 4.575056436806335e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.719980597656104e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 5909.3125, + "completions/mean_terminated_length": 5394.16357421875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.8462172821164131, + "epoch": 0.06439742410303588, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002985857194289565, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 54456508.0, + "reward": 0.4296875, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999995231628418, + "sampling/importance_sampling_ratio/min": 0.000453252432635054, + "sampling/sampling_logp_difference/max": 7.699061393737793, + "sampling/sampling_logp_difference/mean": 0.01927822455763817, + "step": 70 + }, + { + "clip_ratio/high_max": 1.8927265045931563e-05, + "clip_ratio/high_mean": 5.821615673085034e-06, + "clip_ratio/low_mean": 3.1553636290482245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.737525207725412e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15579.0, + "completions/mean_length": 7465.3984375, + "completions/mean_terminated_length": 7177.701171875, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.8792542889714241, + "epoch": 0.06531738730450783, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036494233645498753, + "learning_rate": 1e-05, + "loss": 0.0218, + "num_tokens": 55429663.0, + "reward": 0.3671875, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998987913131714, + "sampling/importance_sampling_ratio/min": 0.0017587440088391304, + "sampling/sampling_logp_difference/max": 6.343155384063721, + "sampling/sampling_logp_difference/mean": 0.01909823715686798, + "step": 71 + }, + { + "clip_ratio/high_max": 8.78609989740653e-06, + "clip_ratio/high_mean": 2.1965249743516324e-06, + "clip_ratio/low_mean": 3.611839565564878e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.831492040262674e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15670.0, + "completions/mean_length": 5674.9609375, + "completions/mean_terminated_length": 5590.6376953125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.9117730036377907, + "epoch": 0.06623735050597976, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003951186314225197, + "learning_rate": 1e-05, + "loss": 0.0295, + "num_tokens": 56173314.0, + "reward": 0.4140625, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999845623970032, + "sampling/importance_sampling_ratio/min": 0.003206930123269558, + "sampling/sampling_logp_difference/max": 5.742441177368164, + "sampling/sampling_logp_difference/mean": 0.01932360976934433, + "step": 72 + }, + { + "clip_ratio/high_max": 1.7587798083695816e-05, + "clip_ratio/high_mean": 5.872955512131739e-06, + "clip_ratio/low_mean": 4.657158876852918e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.244454393960041e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16325.0, + "completions/max_terminated_length": 16325.0, + "completions/mean_length": 4754.5390625, + "completions/mean_terminated_length": 4754.5390625, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "entropy": 0.8350499644875526, + "epoch": 0.0671573137074517, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.005329386796802282, + "learning_rate": 1e-05, + "loss": 0.0328, + "num_tokens": 56799911.0, + "reward": 0.515625, + "reward_std": 0.4111049771308899, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999337196350098, + "sampling/importance_sampling_ratio/min": 8.575750689487904e-05, + "sampling/sampling_logp_difference/max": 9.36398696899414, + "sampling/sampling_logp_difference/mean": 0.01792578026652336, + "step": 73 + }, + { + "clip_ratio/high_max": 1.2575374057632871e-05, + "clip_ratio/high_mean": 3.1438435144082177e-06, + "clip_ratio/low_mean": 1.8536085917730816e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1679929204765358e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16316.0, + "completions/mean_length": 5744.2734375, + "completions/mean_terminated_length": 5488.92041015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8065197095274925, + "epoch": 0.06807727690892364, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036763548851013184, + "learning_rate": 1e-05, + "loss": 0.082, + "num_tokens": 57553986.0, + "reward": 0.515625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999231696128845, + "sampling/importance_sampling_ratio/min": 0.00011362064105924219, + "sampling/sampling_logp_difference/max": 9.082645416259766, + "sampling/sampling_logp_difference/mean": 0.018098725005984306, + "step": 74 + }, + { + "clip_ratio/high_max": 1.877081149359583e-05, + "clip_ratio/high_mean": 6.101248914092139e-06, + "clip_ratio/low_mean": 2.6290458890798618e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.239170769120392e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16088.0, + "completions/mean_length": 6754.5234375, + "completions/mean_terminated_length": 6523.41650390625, + "completions/min_length": 638.0, + "completions/min_terminated_length": 638.0, + "entropy": 1.013127624988556, + "epoch": 0.06899724011039558, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038320303428918123, + "learning_rate": 1e-05, + "loss": 0.0139, + "num_tokens": 58438333.0, + "reward": 0.2890625, + "reward_std": 0.2369818389415741, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999582767486572, + "sampling/importance_sampling_ratio/min": 2.284922175022075e-06, + "sampling/sampling_logp_difference/max": 12.989178657531738, + "sampling/sampling_logp_difference/mean": 0.02173798717558384, + "step": 75 + }, + { + "clip_ratio/high_max": 1.9026635982299922e-05, + "clip_ratio/high_mean": 6.682960474790889e-06, + "clip_ratio/low_mean": 3.252214798976638e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.920510800980992e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12421.0, + "completions/mean_length": 6203.5390625, + "completions/mean_terminated_length": 6123.3779296875, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 1.0302691981196404, + "epoch": 0.06991720331186753, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004985450301319361, + "learning_rate": 1e-05, + "loss": 0.0483, + "num_tokens": 59249562.0, + "reward": 0.421875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999778270721436, + "sampling/importance_sampling_ratio/min": 0.004553908482193947, + "sampling/sampling_logp_difference/max": 5.3917694091796875, + "sampling/sampling_logp_difference/mean": 0.019999932497739792, + "step": 76 + }, + { + "clip_ratio/high_max": 1.3964342088002013e-05, + "clip_ratio/high_mean": 3.4910855220005033e-06, + "clip_ratio/low_mean": 3.63567767180939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.984786212640756e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 5727.796875, + "completions/mean_terminated_length": 5643.8896484375, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "entropy": 0.9781062752008438, + "epoch": 0.07083716651333946, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0037541294004768133, + "learning_rate": 1e-05, + "loss": 0.0236, + "num_tokens": 60001208.0, + "reward": 0.3828125, + "reward_std": 0.20753079652786255, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 0.00022466933296527714, + "sampling/sampling_logp_difference/max": 8.400880813598633, + "sampling/sampling_logp_difference/mean": 0.020555900409817696, + "step": 77 + }, + { + "clip_ratio/high_max": 2.7796927497547586e-06, + "clip_ratio/high_mean": 6.949231874386896e-07, + "clip_ratio/low_mean": 3.516969627526123e-05, + "clip_ratio/low_min": 4.025116595585132e-06, + "clip_ratio/region_mean": 3.586461934901308e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15720.0, + "completions/mean_length": 5923.8359375, + "completions/mean_terminated_length": 5409.4013671875, + "completions/min_length": 597.0, + "completions/min_terminated_length": 597.0, + "entropy": 0.9449758678674698, + "epoch": 0.07175712971481141, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.007178841158747673, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 60777899.0, + "reward": 0.3359375, + "reward_std": 0.2977364659309387, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 0.0004897661856375635, + "sampling/sampling_logp_difference/max": 7.621582508087158, + "sampling/sampling_logp_difference/mean": 0.019868161529302597, + "step": 78 + }, + { + "clip_ratio/high_max": 2.7642782697512303e-05, + "clip_ratio/high_mean": 9.016423746288638e-06, + "clip_ratio/low_mean": 4.3257180891487224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.227360486514954e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14267.0, + "completions/mean_length": 6171.640625, + "completions/mean_terminated_length": 5926.54443359375, + "completions/min_length": 721.0, + "completions/min_terminated_length": 721.0, + "entropy": 0.8597526922821999, + "epoch": 0.07267709291628335, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004776299465447664, + "learning_rate": 1e-05, + "loss": 0.0136, + "num_tokens": 61587141.0, + "reward": 0.46875, + "reward_std": 0.36113685369491577, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999945878982544, + "sampling/importance_sampling_ratio/min": 4.0065449866233394e-05, + "sampling/sampling_logp_difference/max": 10.124996185302734, + "sampling/sampling_logp_difference/mean": 0.019484341144561768, + "step": 79 + }, + { + "clip_ratio/high_max": 4.145968978264136e-05, + "clip_ratio/high_mean": 1.036492244566034e-05, + "clip_ratio/low_mean": 3.6077020070024446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.644194200409402e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15310.0, + "completions/mean_length": 5501.59375, + "completions/mean_terminated_length": 5415.9052734375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.9705724790692329, + "epoch": 0.07359705611775529, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.007431659381836653, + "learning_rate": 1e-05, + "loss": 0.0539, + "num_tokens": 62308321.0, + "reward": 0.453125, + "reward_std": 0.400318443775177, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000393390655518, + "sampling/importance_sampling_ratio/min": 4.54318942502141e-05, + "sampling/sampling_logp_difference/max": 9.999296188354492, + "sampling/sampling_logp_difference/mean": 0.019636545330286026, + "step": 80 + }, + { + "clip_ratio/high_max": 4.327206170273712e-06, + "clip_ratio/high_mean": 1.081801542568428e-06, + "clip_ratio/low_mean": 6.429905033655814e-05, + "clip_ratio/low_min": 6.3626184783061035e-06, + "clip_ratio/region_mean": 6.538085153806605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15781.0, + "completions/mean_length": 5908.125, + "completions/mean_terminated_length": 5825.6376953125, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "entropy": 0.8575867265462875, + "epoch": 0.07451701931922723, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005465450696647167, + "learning_rate": 1e-05, + "loss": 0.0797, + "num_tokens": 63084113.0, + "reward": 0.34375, + "reward_std": 0.39400771260261536, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576210975647, + "sampling/importance_sampling_ratio/min": 4.766937126987614e-05, + "sampling/sampling_logp_difference/max": 9.951221466064453, + "sampling/sampling_logp_difference/mean": 0.018073562532663345, + "step": 81 + }, + { + "clip_ratio/high_max": 6.7512828536564484e-06, + "clip_ratio/high_mean": 1.6878207134141121e-06, + "clip_ratio/low_mean": 3.040744320514932e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.209526391856343e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15981.0, + "completions/max_terminated_length": 15981.0, + "completions/mean_length": 4906.734375, + "completions/mean_terminated_length": 4906.734375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.9647495672106743, + "epoch": 0.07543698252069918, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003077819012105465, + "learning_rate": 1e-05, + "loss": -0.0104, + "num_tokens": 63740015.0, + "reward": 0.4375, + "reward_std": 0.2251344621181488, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000091791152954, + "sampling/importance_sampling_ratio/min": 5.1233790145488456e-05, + "sampling/sampling_logp_difference/max": 9.879111289978027, + "sampling/sampling_logp_difference/mean": 0.01949312724173069, + "step": 82 + }, + { + "clip_ratio/high_max": 7.262375220307149e-06, + "clip_ratio/high_mean": 1.8155938050767872e-06, + "clip_ratio/low_mean": 3.626802561029763e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8083618960627064e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15716.0, + "completions/max_terminated_length": 15716.0, + "completions/mean_length": 5402.78125, + "completions/mean_terminated_length": 5402.78125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "entropy": 0.9809223562479019, + "epoch": 0.07635694572217111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018245981773361564, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 64450515.0, + "reward": 0.265625, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999257922172546, + "sampling/importance_sampling_ratio/min": 0.0009712215978652239, + "sampling/sampling_logp_difference/max": 6.93695592880249, + "sampling/sampling_logp_difference/mean": 0.019615523517131805, + "step": 83 + }, + { + "clip_ratio/high_max": 1.1774221320592915e-05, + "clip_ratio/high_mean": 2.9435553301482287e-06, + "clip_ratio/low_mean": 4.734331901090627e-05, + "clip_ratio/low_min": 1.1585900665522786e-05, + "clip_ratio/region_mean": 5.0286874625271594e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16329.0, + "completions/mean_length": 6198.703125, + "completions/mean_terminated_length": 5870.14501953125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.8571672514081001, + "epoch": 0.07727690892364306, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.006053395569324493, + "learning_rate": 1e-05, + "loss": 0.0645, + "num_tokens": 65269285.0, + "reward": 0.421875, + "reward_std": 0.3464113473892212, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 0.0010333011159673333, + "sampling/sampling_logp_difference/max": 6.874996662139893, + "sampling/sampling_logp_difference/mean": 0.01869945600628853, + "step": 84 + }, + { + "clip_ratio/high_max": 2.7282983865006827e-05, + "clip_ratio/high_mean": 7.78695198278001e-06, + "clip_ratio/low_mean": 3.2358174394175876e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0145126376955886e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15972.0, + "completions/mean_length": 6016.09375, + "completions/mean_terminated_length": 5851.52392578125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "entropy": 0.9883866459131241, + "epoch": 0.078196872125115, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030623299535363913, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 66058473.0, + "reward": 0.3203125, + "reward_std": 0.24883407354354858, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999305009841919, + "sampling/importance_sampling_ratio/min": 0.0016286972677335143, + "sampling/sampling_logp_difference/max": 6.4199748039245605, + "sampling/sampling_logp_difference/mean": 0.02085939608514309, + "step": 85 + }, + { + "clip_ratio/high_max": 2.9797377010254422e-06, + "clip_ratio/high_mean": 7.449344252563606e-07, + "clip_ratio/low_mean": 3.9277208315979806e-05, + "clip_ratio/low_min": 4.51475443696836e-06, + "clip_ratio/region_mean": 4.002214268439275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 7072.53125, + "completions/mean_terminated_length": 6924.73046875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 1.0157204791903496, + "epoch": 0.07911683532658693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038264680188149214, + "learning_rate": 1e-05, + "loss": 0.0389, + "num_tokens": 66984285.0, + "reward": 0.2890625, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999492764472961, + "sampling/importance_sampling_ratio/min": 0.0020860559307038784, + "sampling/sampling_logp_difference/max": 6.17248010635376, + "sampling/sampling_logp_difference/mean": 0.021116644144058228, + "step": 86 + }, + { + "clip_ratio/high_max": 6.0717920860042796e-06, + "clip_ratio/high_mean": 1.5179480215010699e-06, + "clip_ratio/low_mean": 3.757404465432046e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.909199278950837e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6539.8203125, + "completions/mean_terminated_length": 6303.56005859375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.0071343630552292, + "epoch": 0.08003679852805888, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0044838739559054375, + "learning_rate": 1e-05, + "loss": 0.0583, + "num_tokens": 67840310.0, + "reward": 0.390625, + "reward_std": 0.2722293734550476, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 0.001703627873212099, + "sampling/sampling_logp_difference/max": 6.374995231628418, + "sampling/sampling_logp_difference/mean": 0.020990263670682907, + "step": 87 + }, + { + "clip_ratio/high_max": 2.859010169231624e-05, + "clip_ratio/high_mean": 7.14752542307906e-06, + "clip_ratio/low_mean": 3.50394579982094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.218698381919239e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16224.0, + "completions/mean_length": 7204.09375, + "completions/mean_terminated_length": 6907.9677734375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.9221752807497978, + "epoch": 0.08095676172953081, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034659637603908777, + "learning_rate": 1e-05, + "loss": -0.0057, + "num_tokens": 68782042.0, + "reward": 0.4140625, + "reward_std": 0.27958327531814575, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999324083328247, + "sampling/importance_sampling_ratio/min": 0.0003347320598550141, + "sampling/sampling_logp_difference/max": 8.002180099487305, + "sampling/sampling_logp_difference/mean": 0.02053149789571762, + "step": 88 + }, + { + "clip_ratio/high_max": 1.7420219137420645e-05, + "clip_ratio/high_mean": 4.355054784355161e-06, + "clip_ratio/low_mean": 2.086669928758056e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.522175350350153e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14764.0, + "completions/mean_length": 5662.1640625, + "completions/mean_terminated_length": 5577.740234375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.9678512960672379, + "epoch": 0.08187672493100276, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0024458845146000385, + "learning_rate": 1e-05, + "loss": 0.0584, + "num_tokens": 69526295.0, + "reward": 0.4375, + "reward_std": 0.18543373048305511, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999278783798218, + "sampling/importance_sampling_ratio/min": 0.0033961546141654253, + "sampling/sampling_logp_difference/max": 5.6851115226745605, + "sampling/sampling_logp_difference/mean": 0.018346723169088364, + "step": 89 + }, + { + "clip_ratio/high_max": 8.09375796961831e-06, + "clip_ratio/high_mean": 2.0234394924045773e-06, + "clip_ratio/low_mean": 1.8629728629093734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0653167894124635e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16219.0, + "completions/mean_length": 5590.71875, + "completions/mean_terminated_length": 5505.732421875, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.9286820441484451, + "epoch": 0.0827966881324747, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004733253736048937, + "learning_rate": 1e-05, + "loss": 0.0719, + "num_tokens": 70262771.0, + "reward": 0.4609375, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999312162399292, + "sampling/importance_sampling_ratio/min": 1.233097464137245e-05, + "sampling/sampling_logp_difference/max": 11.303396224975586, + "sampling/sampling_logp_difference/mean": 0.019460031762719154, + "step": 90 + }, + { + "clip_ratio/high_max": 1.8371122678217944e-05, + "clip_ratio/high_mean": 4.592780669554486e-06, + "clip_ratio/low_mean": 2.489819087259093e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.949097142845858e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15879.0, + "completions/mean_length": 6182.484375, + "completions/mean_terminated_length": 6102.1572265625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 1.0872880518436432, + "epoch": 0.08371665133394664, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00468763243407011, + "learning_rate": 1e-05, + "loss": 0.0223, + "num_tokens": 71079953.0, + "reward": 0.2578125, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 9.611623681848869e-05, + "sampling/sampling_logp_difference/max": 9.24995231628418, + "sampling/sampling_logp_difference/mean": 0.02069907821714878, + "step": 91 + }, + { + "clip_ratio/high_max": 1.579416039021453e-05, + "clip_ratio/high_mean": 4.633066396309005e-06, + "clip_ratio/low_mean": 2.6412633246764017e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1045699415699346e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16235.0, + "completions/mean_length": 5909.2265625, + "completions/mean_terminated_length": 5826.748046875, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.9488153457641602, + "epoch": 0.08463661453541858, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034273737110197544, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 71856574.0, + "reward": 0.4140625, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998411536216736, + "sampling/importance_sampling_ratio/min": 0.00016871529805939645, + "sampling/sampling_logp_difference/max": 8.687297821044922, + "sampling/sampling_logp_difference/mean": 0.019539739936590195, + "step": 92 + }, + { + "clip_ratio/high_max": 3.7449817682499997e-06, + "clip_ratio/high_mean": 9.362454420624999e-07, + "clip_ratio/low_mean": 4.2946558664880286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.388280387956911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15933.0, + "completions/max_terminated_length": 15933.0, + "completions/mean_length": 6381.3125, + "completions/mean_terminated_length": 6381.3125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.9708949401974678, + "epoch": 0.08555657773689053, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003149663796648383, + "learning_rate": 1e-05, + "loss": 0.0314, + "num_tokens": 72696806.0, + "reward": 0.3828125, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999916136264801, + "sampling/importance_sampling_ratio/min": 0.00033631984842941165, + "sampling/sampling_logp_difference/max": 7.997447967529297, + "sampling/sampling_logp_difference/mean": 0.021038895472884178, + "step": 93 + }, + { + "clip_ratio/high_max": 6.492157353932271e-06, + "clip_ratio/high_mean": 1.6230393384830677e-06, + "clip_ratio/low_mean": 4.956343445883249e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.118647413837607e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16180.0, + "completions/max_terminated_length": 16180.0, + "completions/mean_length": 5726.03125, + "completions/mean_terminated_length": 5726.03125, + "completions/min_length": 831.0, + "completions/min_terminated_length": 831.0, + "entropy": 0.9100239053368568, + "epoch": 0.08647654093836246, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029015145264565945, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 73449210.0, + "reward": 0.3125, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999687671661377, + "sampling/importance_sampling_ratio/min": 0.0001686852192506194, + "sampling/sampling_logp_difference/max": 8.68747615814209, + "sampling/sampling_logp_difference/mean": 0.020026210695505142, + "step": 94 + }, + { + "clip_ratio/high_max": 1.199616144731408e-05, + "clip_ratio/high_mean": 2.99904036182852e-06, + "clip_ratio/low_mean": 1.4287397789303213e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7286438151131733e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15624.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 5824.90625, + "completions/mean_terminated_length": 5824.90625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9244210943579674, + "epoch": 0.08739650413983441, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0045582144521176815, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 74212662.0, + "reward": 0.4375, + "reward_std": 0.24777324497699738, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000125169754028, + "sampling/importance_sampling_ratio/min": 0.0021414682269096375, + "sampling/sampling_logp_difference/max": 6.146263599395752, + "sampling/sampling_logp_difference/mean": 0.019039880484342575, + "step": 95 + }, + { + "clip_ratio/high_max": 1.010842470350326e-05, + "clip_ratio/high_mean": 2.527106175875815e-06, + "clip_ratio/low_mean": 4.0637585470904014e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.316469153309299e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15600.0, + "completions/mean_length": 6432.7265625, + "completions/mean_terminated_length": 6274.77001953125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.8756264597177505, + "epoch": 0.08831646734130635, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0040692174807190895, + "learning_rate": 1e-05, + "loss": 0.0776, + "num_tokens": 75054003.0, + "reward": 0.4609375, + "reward_std": 0.35506343841552734, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998587369918823, + "sampling/importance_sampling_ratio/min": 0.005546991713345051, + "sampling/sampling_logp_difference/max": 5.194499492645264, + "sampling/sampling_logp_difference/mean": 0.019711513072252274, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.6582903135240485e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6582903135240485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14589.0, + "completions/mean_length": 5474.6796875, + "completions/mean_terminated_length": 5388.779296875, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9279408678412437, + "epoch": 0.08923643054277829, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0035478502977639437, + "learning_rate": 1e-05, + "loss": 0.1137, + "num_tokens": 75773194.0, + "reward": 0.546875, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000132322311401, + "sampling/importance_sampling_ratio/min": 0.004276251420378685, + "sampling/sampling_logp_difference/max": 5.454678535461426, + "sampling/sampling_logp_difference/mean": 0.018789665773510933, + "step": 97 + }, + { + "clip_ratio/high_max": 8.227548732975265e-06, + "clip_ratio/high_mean": 2.0568871832438163e-06, + "clip_ratio/low_mean": 4.1461861655989196e-05, + "clip_ratio/low_min": 3.5008122267754516e-06, + "clip_ratio/region_mean": 4.351874804342515e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 6730.2734375, + "completions/mean_terminated_length": 6577.0400390625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 1.0115349367260933, + "epoch": 0.09015639374425023, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004816337022930384, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 76654837.0, + "reward": 0.40625, + "reward_std": 0.35325103998184204, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000003576278687, + "sampling/importance_sampling_ratio/min": 1.4203919818100985e-06, + "sampling/sampling_logp_difference/max": 13.464577674865723, + "sampling/sampling_logp_difference/mean": 0.021000642329454422, + "step": 98 + }, + { + "clip_ratio/high_max": 2.0833075723203365e-05, + "clip_ratio/high_mean": 5.208268930800841e-06, + "clip_ratio/low_mean": 2.399133984454238e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.919960945746425e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14350.0, + "completions/mean_length": 4804.5859375, + "completions/mean_terminated_length": 4620.7861328125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "entropy": 0.8622925356030464, + "epoch": 0.09107635694572216, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00695947976782918, + "learning_rate": 1e-05, + "loss": -0.0188, + "num_tokens": 77287704.0, + "reward": 0.5859375, + "reward_std": 0.2688046097755432, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000001072883606, + "sampling/importance_sampling_ratio/min": 0.051502522081136703, + "sampling/sampling_logp_difference/max": 2.9661245346069336, + "sampling/sampling_logp_difference/mean": 0.019261913374066353, + "step": 99 + }, + { + "clip_ratio/high_max": 1.2886264812550507e-05, + "clip_ratio/high_mean": 3.221566203137627e-06, + "clip_ratio/low_mean": 3.53349669239833e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8556532899747253e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15997.0, + "completions/mean_length": 5836.25, + "completions/mean_terminated_length": 5753.19677734375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 0.8808795213699341, + "epoch": 0.09199632014719411, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034830078948289156, + "learning_rate": 1e-05, + "loss": 0.1412, + "num_tokens": 78054048.0, + "reward": 0.484375, + "reward_std": 0.29036492109298706, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999365210533142, + "sampling/importance_sampling_ratio/min": 2.7803641842183424e-06, + "sampling/sampling_logp_difference/max": 12.792928695678711, + "sampling/sampling_logp_difference/mean": 0.01845550537109375, + "step": 100 + }, + { + "clip_ratio/high_max": 2.630969447636744e-05, + "clip_ratio/high_mean": 6.57742361909186e-06, + "clip_ratio/low_mean": 3.4728200375866436e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1305623994958296e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13964.0, + "completions/mean_length": 5407.5703125, + "completions/mean_terminated_length": 5233.341796875, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.9438152015209198, + "epoch": 0.09291628334866606, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028610217850655317, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 78765225.0, + "reward": 0.390625, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999171495437622, + "sampling/importance_sampling_ratio/min": 5.874436828889884e-05, + "sampling/sampling_logp_difference/max": 9.742315292358398, + "sampling/sampling_logp_difference/mean": 0.018839653581380844, + "step": 101 + }, + { + "clip_ratio/high_max": 1.2485550996643724e-05, + "clip_ratio/high_mean": 3.917444360013178e-06, + "clip_ratio/low_mean": 3.569766681721376e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.961511060879275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15780.0, + "completions/mean_length": 6591.765625, + "completions/mean_terminated_length": 6436.33349609375, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.9185260459780693, + "epoch": 0.09383624655013799, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004098972305655479, + "learning_rate": 1e-05, + "loss": 0.0626, + "num_tokens": 79628691.0, + "reward": 0.40625, + "reward_std": 0.26932865381240845, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697208404541, + "sampling/importance_sampling_ratio/min": 0.001684795250184834, + "sampling/sampling_logp_difference/max": 6.386111259460449, + "sampling/sampling_logp_difference/mean": 0.02011241763830185, + "step": 102 + }, + { + "clip_ratio/high_max": 1.017276917991694e-05, + "clip_ratio/high_mean": 2.543192294979235e-06, + "clip_ratio/low_mean": 2.3897301389297354e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.644049368427659e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16014.0, + "completions/mean_length": 6762.40625, + "completions/mean_terminated_length": 6371.2841796875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 1.0496173724532127, + "epoch": 0.09475620975160993, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003109709592536092, + "learning_rate": 1e-05, + "loss": 0.0695, + "num_tokens": 80513135.0, + "reward": 0.296875, + "reward_std": 0.27274850010871887, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999967098236084, + "sampling/importance_sampling_ratio/min": 0.0036795397754758596, + "sampling/sampling_logp_difference/max": 5.6049675941467285, + "sampling/sampling_logp_difference/mean": 0.021886618807911873, + "step": 103 + }, + { + "clip_ratio/high_max": 1.0623295338518801e-05, + "clip_ratio/high_mean": 2.6558238346297003e-06, + "clip_ratio/low_mean": 5.0279177912671e-05, + "clip_ratio/low_min": 6.849113788121031e-06, + "clip_ratio/region_mean": 5.29350020315178e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 8151.421875, + "completions/mean_terminated_length": 7528.79052734375, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.8989155367016792, + "epoch": 0.09567617295308188, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0050065224058926105, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 81579941.0, + "reward": 0.375, + "reward_std": 0.36691081523895264, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999760389328003, + "sampling/importance_sampling_ratio/min": 0.0007560441154055297, + "sampling/sampling_logp_difference/max": 7.187410831451416, + "sampling/sampling_logp_difference/mean": 0.02017449401319027, + "step": 104 + }, + { + "clip_ratio/high_max": 7.662745701964013e-06, + "clip_ratio/high_mean": 1.9156864254910033e-06, + "clip_ratio/low_mean": 4.2927287609018094e-05, + "clip_ratio/low_min": 4.201963292871369e-06, + "clip_ratio/region_mean": 4.484297357976175e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 6871.7265625, + "completions/mean_terminated_length": 6643.43212890625, + "completions/min_length": 1044.0, + "completions/min_terminated_length": 1044.0, + "entropy": 1.006680078804493, + "epoch": 0.09659613615455381, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00352756236679852, + "learning_rate": 1e-05, + "loss": 0.0927, + "num_tokens": 82479474.0, + "reward": 0.3984375, + "reward_std": 0.33296146988868713, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000028133392334, + "sampling/importance_sampling_ratio/min": 0.023528963327407837, + "sampling/sampling_logp_difference/max": 3.749523162841797, + "sampling/sampling_logp_difference/mean": 0.021244853734970093, + "step": 105 + }, + { + "clip_ratio/high_max": 1.6621729173493804e-05, + "clip_ratio/high_mean": 5.544197733797773e-06, + "clip_ratio/low_mean": 2.3860119426899473e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9404316592263058e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14462.0, + "completions/max_terminated_length": 14462.0, + "completions/mean_length": 5705.6015625, + "completions/mean_terminated_length": 5705.6015625, + "completions/min_length": 802.0, + "completions/min_terminated_length": 802.0, + "entropy": 0.9162084609270096, + "epoch": 0.09751609935602576, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002348776441067457, + "learning_rate": 1e-05, + "loss": -0.0169, + "num_tokens": 83229071.0, + "reward": 0.4453125, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999898672103882, + "sampling/importance_sampling_ratio/min": 1.176890145870857e-05, + "sampling/sampling_logp_difference/max": 11.35004997253418, + "sampling/sampling_logp_difference/mean": 0.01885361783206463, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.359476631383586e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.359476631383586e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16110.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 6823.90625, + "completions/mean_terminated_length": 6823.90625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 1.0139815732836723, + "epoch": 0.0984360625574977, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005057404283434153, + "learning_rate": 1e-05, + "loss": 0.038, + "num_tokens": 84119947.0, + "reward": 0.328125, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.014701711013913155, + "sampling/sampling_logp_difference/max": 4.219791412353516, + "sampling/sampling_logp_difference/mean": 0.021600374951958656, + "step": 107 + }, + { + "clip_ratio/high_max": 1.642513325350592e-05, + "clip_ratio/high_mean": 4.10628331337648e-06, + "clip_ratio/low_mean": 3.813199691649061e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2238279775119736e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15516.0, + "completions/max_terminated_length": 15516.0, + "completions/mean_length": 5786.859375, + "completions/mean_terminated_length": 5786.859375, + "completions/min_length": 643.0, + "completions/min_terminated_length": 643.0, + "entropy": 1.0515320897102356, + "epoch": 0.09935602575896964, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.008517255075275898, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 84879833.0, + "reward": 0.3671875, + "reward_std": 0.3311441242694855, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999799728393555, + "sampling/importance_sampling_ratio/min": 0.00010231334454147145, + "sampling/sampling_logp_difference/max": 9.187470436096191, + "sampling/sampling_logp_difference/mean": 0.01993538998067379, + "step": 108 + }, + { + "clip_ratio/high_max": 7.0043706728029065e-06, + "clip_ratio/high_mean": 1.7510926682007266e-06, + "clip_ratio/low_mean": 1.4313530300569255e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.606462308245682e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15836.0, + "completions/mean_length": 4726.2578125, + "completions/mean_terminated_length": 4634.46435546875, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "entropy": 0.795353539288044, + "epoch": 0.10027598896044158, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034334585070610046, + "learning_rate": 1e-05, + "loss": 0.0214, + "num_tokens": 85503162.0, + "reward": 0.6015625, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.6015625, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000437498092651, + "sampling/importance_sampling_ratio/min": 0.0026589478366076946, + "sampling/sampling_logp_difference/max": 5.9298248291015625, + "sampling/sampling_logp_difference/mean": 0.018191032111644745, + "step": 109 + }, + { + "clip_ratio/high_max": 4.149239885009592e-06, + "clip_ratio/high_mean": 1.037309971252398e-06, + "clip_ratio/low_mean": 3.989860044839588e-05, + "clip_ratio/low_min": 4.927079316985328e-06, + "clip_ratio/region_mean": 4.093591041964828e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14514.0, + "completions/mean_length": 6450.140625, + "completions/mean_terminated_length": 5787.8837890625, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.8920315206050873, + "epoch": 0.10119595216191353, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006242698058485985, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 86350364.0, + "reward": 0.359375, + "reward_std": 0.27540695667266846, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999819993972778, + "sampling/importance_sampling_ratio/min": 0.00015162504860199988, + "sampling/sampling_logp_difference/max": 8.794099807739258, + "sampling/sampling_logp_difference/mean": 0.01948007568717003, + "step": 110 + }, + { + "clip_ratio/high_max": 4.065173015987966e-06, + "clip_ratio/high_mean": 1.8426849237584975e-06, + "clip_ratio/low_mean": 2.8560575628944207e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0403260552702704e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15253.0, + "completions/mean_length": 6597.9453125, + "completions/mean_terminated_length": 6442.611328125, + "completions/min_length": 657.0, + "completions/min_terminated_length": 657.0, + "entropy": 0.9351271465420723, + "epoch": 0.10211591536338546, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002594202058389783, + "learning_rate": 1e-05, + "loss": 0.018, + "num_tokens": 87213277.0, + "reward": 0.34375, + "reward_std": 0.3061561584472656, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998740553855896, + "sampling/importance_sampling_ratio/min": 0.007402713876217604, + "sampling/sampling_logp_difference/max": 4.905908584594727, + "sampling/sampling_logp_difference/mean": 0.02082553133368492, + "step": 111 + }, + { + "clip_ratio/high_max": 3.7528520806517918e-06, + "clip_ratio/high_mean": 9.382130201629479e-07, + "clip_ratio/low_mean": 4.297400278119312e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.391221568766923e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15955.0, + "completions/mean_length": 7109.9140625, + "completions/mean_terminated_length": 7036.8896484375, + "completions/min_length": 881.0, + "completions/min_terminated_length": 881.0, + "entropy": 0.8797949478030205, + "epoch": 0.10303587856485741, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002718541072681546, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 88144530.0, + "reward": 0.3984375, + "reward_std": 0.26485776901245117, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999973714351654, + "sampling/importance_sampling_ratio/min": 0.0002329955023014918, + "sampling/sampling_logp_difference/max": 8.36449146270752, + "sampling/sampling_logp_difference/mean": 0.01960277371108532, + "step": 112 + }, + { + "clip_ratio/high_max": 1.1254821401962545e-05, + "clip_ratio/high_mean": 2.813705350490636e-06, + "clip_ratio/low_mean": 4.423825043886609e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7051955789356725e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 7361.6796875, + "completions/mean_terminated_length": 6513.427734375, + "completions/min_length": 624.0, + "completions/min_terminated_length": 624.0, + "entropy": 0.9020541086792946, + "epoch": 0.10395584176632934, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003097688313573599, + "learning_rate": 1e-05, + "loss": 0.0854, + "num_tokens": 89109897.0, + "reward": 0.359375, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998917579650879, + "sampling/importance_sampling_ratio/min": 0.0010758653515949845, + "sampling/sampling_logp_difference/max": 6.834630012512207, + "sampling/sampling_logp_difference/mean": 0.01997425965964794, + "step": 113 + }, + { + "clip_ratio/high_max": 1.8235970401292434e-05, + "clip_ratio/high_mean": 5.248351158115838e-06, + "clip_ratio/low_mean": 7.228819413285237e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.753654563202872e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15608.0, + "completions/mean_length": 5957.28125, + "completions/mean_terminated_length": 5620.935546875, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "entropy": 0.8262394368648529, + "epoch": 0.10487580496780129, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0023438548669219017, + "learning_rate": 1e-05, + "loss": 0.0869, + "num_tokens": 89891429.0, + "reward": 0.421875, + "reward_std": 0.3713865876197815, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998185634613037, + "sampling/importance_sampling_ratio/min": 8.2406731962692e-06, + "sampling/sampling_logp_difference/max": 11.706428527832031, + "sampling/sampling_logp_difference/mean": 0.018976174294948578, + "step": 114 + }, + { + "clip_ratio/high_max": 1.6280149793601595e-05, + "clip_ratio/high_mean": 5.4644419833493885e-06, + "clip_ratio/low_mean": 5.1420432782833814e-05, + "clip_ratio/low_min": 6.1973228184797335e-06, + "clip_ratio/region_mean": 5.688487522093055e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 5405.53125, + "completions/mean_terminated_length": 5142.04833984375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.9246686547994614, + "epoch": 0.10579576816927323, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005619170609861612, + "learning_rate": 1e-05, + "loss": 0.0411, + "num_tokens": 90600721.0, + "reward": 0.421875, + "reward_std": 0.40821409225463867, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999438524246216, + "sampling/importance_sampling_ratio/min": 7.91921266340978e-08, + "sampling/sampling_logp_difference/max": 16.351388931274414, + "sampling/sampling_logp_difference/mean": 0.01931554079055786, + "step": 115 + }, + { + "clip_ratio/high_max": 9.228460612575873e-06, + "clip_ratio/high_mean": 2.307115153143968e-06, + "clip_ratio/low_mean": 3.463903834699522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.694615350013919e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 6754.859375, + "completions/mean_terminated_length": 6363.4306640625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.952000230550766, + "epoch": 0.10671573137074516, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006043895613402128, + "learning_rate": 1e-05, + "loss": 0.0379, + "num_tokens": 91486063.0, + "reward": 0.3125, + "reward_std": 0.2527858018875122, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 2.5612887384340866e-06, + "sampling/sampling_logp_difference/max": 12.875, + "sampling/sampling_logp_difference/mean": 0.02107170596718788, + "step": 116 + }, + { + "clip_ratio/high_max": 1.460792736907024e-05, + "clip_ratio/high_mean": 3.65198184226756e-06, + "clip_ratio/low_mean": 3.14642731495951e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.511625499186266e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16377.0, + "completions/mean_length": 8135.8203125, + "completions/mean_terminated_length": 7869.75, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "entropy": 1.0832853615283966, + "epoch": 0.10763569457221711, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00392121123149991, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 92546920.0, + "reward": 0.28125, + "reward_std": 0.2977413833141327, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999874830245972, + "sampling/importance_sampling_ratio/min": 3.757069134735502e-05, + "sampling/sampling_logp_difference/max": 10.189286231994629, + "sampling/sampling_logp_difference/mean": 0.02211480587720871, + "step": 117 + }, + { + "clip_ratio/high_max": 3.585687591112219e-05, + "clip_ratio/high_mean": 8.964218977780547e-06, + "clip_ratio/low_mean": 3.652223790595599e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.548645733848389e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15497.0, + "completions/mean_length": 5476.53125, + "completions/mean_terminated_length": 5214.75244140625, + "completions/min_length": 666.0, + "completions/min_terminated_length": 666.0, + "entropy": 1.0261689275503159, + "epoch": 0.10855565777368906, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00480870483443141, + "learning_rate": 1e-05, + "loss": 0.0378, + "num_tokens": 93270524.0, + "reward": 0.46875, + "reward_std": 0.3243093490600586, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000068187713623, + "sampling/importance_sampling_ratio/min": 0.02749602682888508, + "sampling/sampling_logp_difference/max": 3.5937137603759766, + "sampling/sampling_logp_difference/mean": 0.01990744285285473, + "step": 118 + }, + { + "clip_ratio/high_max": 5.126732958160574e-06, + "clip_ratio/high_mean": 1.2816832395401434e-06, + "clip_ratio/low_mean": 3.6732255466631614e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8013938819858595e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16209.0, + "completions/mean_length": 7670.0625, + "completions/mean_terminated_length": 7165.9501953125, + "completions/min_length": 964.0, + "completions/min_terminated_length": 964.0, + "entropy": 0.8719229996204376, + "epoch": 0.10947562097516099, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003411791054531932, + "learning_rate": 1e-05, + "loss": 0.0792, + "num_tokens": 94271404.0, + "reward": 0.4140625, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999955415725708, + "sampling/importance_sampling_ratio/min": 2.125909531969228e-06, + "sampling/sampling_logp_difference/max": 13.061310768127441, + "sampling/sampling_logp_difference/mean": 0.01960139349102974, + "step": 119 + }, + { + "clip_ratio/high_max": 3.2011115308705484e-05, + "clip_ratio/high_mean": 1.0189622685174982e-05, + "clip_ratio/low_mean": 3.3884271260831156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4073893604945624e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15284.0, + "completions/mean_length": 5499.0859375, + "completions/mean_terminated_length": 5413.3779296875, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "entropy": 0.8891193494200706, + "epoch": 0.11039558417663294, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036615384742617607, + "learning_rate": 1e-05, + "loss": 0.0852, + "num_tokens": 94998263.0, + "reward": 0.4296875, + "reward_std": 0.27776598930358887, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999586939811707, + "sampling/importance_sampling_ratio/min": 0.00029556488152593374, + "sampling/sampling_logp_difference/max": 8.126622200012207, + "sampling/sampling_logp_difference/mean": 0.01831059902906418, + "step": 120 + }, + { + "clip_ratio/high_max": 1.0020951322076144e-05, + "clip_ratio/high_mean": 2.505237830519036e-06, + "clip_ratio/low_mean": 3.4662164466681133e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.716740218351333e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 7831.1015625, + "completions/mean_terminated_length": 7410.466796875, + "completions/min_length": 855.0, + "completions/min_terminated_length": 855.0, + "entropy": 0.9511109218001366, + "epoch": 0.11131554737810488, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003688640194013715, + "learning_rate": 1e-05, + "loss": 0.0704, + "num_tokens": 96020572.0, + "reward": 0.34375, + "reward_std": 0.266974538564682, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000081062316895, + "sampling/importance_sampling_ratio/min": 0.0008284422219730914, + "sampling/sampling_logp_difference/max": 7.095963478088379, + "sampling/sampling_logp_difference/mean": 0.020766064524650574, + "step": 121 + }, + { + "clip_ratio/high_max": 4.31883336204919e-06, + "clip_ratio/high_mean": 1.0797083405122976e-06, + "clip_ratio/low_mean": 4.2512260733929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.359196918812813e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16267.0, + "completions/mean_length": 7928.5, + "completions/mean_terminated_length": 7584.7802734375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 1.053833745419979, + "epoch": 0.11223551057957681, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002527788048610091, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 97055892.0, + "reward": 0.2734375, + "reward_std": 0.22567617893218994, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999517560005188, + "sampling/importance_sampling_ratio/min": 8.097423233266454e-06, + "sampling/sampling_logp_difference/max": 11.72396469116211, + "sampling/sampling_logp_difference/mean": 0.02571871504187584, + "step": 122 + }, + { + "clip_ratio/high_max": 2.1440137970785145e-05, + "clip_ratio/high_mean": 5.360034492696286e-06, + "clip_ratio/low_mean": 5.3688914704252966e-05, + "clip_ratio/low_min": 1.0726187383625074e-05, + "clip_ratio/region_mean": 5.904894931063609e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15282.0, + "completions/mean_length": 7433.0, + "completions/mean_terminated_length": 7218.17626953125, + "completions/min_length": 1112.0, + "completions/min_terminated_length": 1112.0, + "entropy": 1.0001763850450516, + "epoch": 0.11315547378104876, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004057250916957855, + "learning_rate": 1e-05, + "loss": 0.0524, + "num_tokens": 98026604.0, + "reward": 0.3046875, + "reward_std": 0.30274903774261475, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999719858169556, + "sampling/importance_sampling_ratio/min": 0.0026400478091090918, + "sampling/sampling_logp_difference/max": 5.936958312988281, + "sampling/sampling_logp_difference/mean": 0.020892417058348656, + "step": 123 + }, + { + "clip_ratio/high_max": 7.200895424830378e-06, + "clip_ratio/high_mean": 1.8002238562075945e-06, + "clip_ratio/low_mean": 3.0267089357494115e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.206731355476222e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15435.0, + "completions/mean_length": 6529.8046875, + "completions/mean_terminated_length": 6211.92724609375, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 1.0204281583428383, + "epoch": 0.1140754369825207, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004367270041257143, + "learning_rate": 1e-05, + "loss": 0.0561, + "num_tokens": 98882667.0, + "reward": 0.421875, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999692440032959, + "sampling/importance_sampling_ratio/min": 0.00013984176621306688, + "sampling/sampling_logp_difference/max": 8.874999046325684, + "sampling/sampling_logp_difference/mean": 0.020555414259433746, + "step": 124 + }, + { + "clip_ratio/high_max": 3.583304760468309e-06, + "clip_ratio/high_mean": 8.958261901170772e-07, + "clip_ratio/low_mean": 3.819216192368913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.908798782958911e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 7329.9140625, + "completions/mean_terminated_length": 6806.12353515625, + "completions/min_length": 525.0, + "completions/min_terminated_length": 525.0, + "entropy": 0.8461082950234413, + "epoch": 0.11499540018399264, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014496444491669536, + "learning_rate": 1e-05, + "loss": 0.027, + "num_tokens": 99847384.0, + "reward": 0.375, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999719262123108, + "sampling/importance_sampling_ratio/min": 2.1445715901791118e-05, + "sampling/sampling_logp_difference/max": 10.749985694885254, + "sampling/sampling_logp_difference/mean": 0.019216356799006462, + "step": 125 + }, + { + "clip_ratio/high_max": 1.0716735232563224e-05, + "clip_ratio/high_mean": 2.679183808140806e-06, + "clip_ratio/low_mean": 3.4717084645308205e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7396268680822686e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15200.0, + "completions/mean_length": 6518.4765625, + "completions/mean_terminated_length": 6200.23388671875, + "completions/min_length": 969.0, + "completions/min_terminated_length": 969.0, + "entropy": 0.880072832107544, + "epoch": 0.11591536338546458, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006009541917592287, + "learning_rate": 1e-05, + "loss": 0.0475, + "num_tokens": 100699437.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999701976776123, + "sampling/importance_sampling_ratio/min": 6.729899905622005e-05, + "sampling/sampling_logp_difference/max": 9.606365203857422, + "sampling/sampling_logp_difference/mean": 0.01985173299908638, + "step": 126 + }, + { + "clip_ratio/high_max": 7.563064400528674e-06, + "clip_ratio/high_mean": 1.8907661001321685e-06, + "clip_ratio/low_mean": 3.8401355027417594e-05, + "clip_ratio/low_min": 3.4494178180466406e-06, + "clip_ratio/region_mean": 4.029212129808002e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16379.0, + "completions/mean_length": 8421.9296875, + "completions/mean_terminated_length": 8030.35205078125, + "completions/min_length": 1180.0, + "completions/min_terminated_length": 1180.0, + "entropy": 0.929582305252552, + "epoch": 0.11683532658693652, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00254544778726995, + "learning_rate": 1e-05, + "loss": 0.0562, + "num_tokens": 101797124.0, + "reward": 0.2890625, + "reward_std": 0.2603819966316223, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999698400497437, + "sampling/importance_sampling_ratio/min": 2.139152456948068e-05, + "sampling/sampling_logp_difference/max": 10.75251579284668, + "sampling/sampling_logp_difference/mean": 0.020804740488529205, + "step": 127 + }, + { + "clip_ratio/high_max": 8.503243861923693e-06, + "clip_ratio/high_mean": 2.125810965480923e-06, + "clip_ratio/low_mean": 3.5734614471039094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7860425095459505e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14871.0, + "completions/mean_length": 6452.5859375, + "completions/mean_terminated_length": 6214.232421875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.9258717745542526, + "epoch": 0.11775528978840846, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030309113208204508, + "learning_rate": 1e-05, + "loss": -0.0048, + "num_tokens": 102643751.0, + "reward": 0.4296875, + "reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 2.0162780856480822e-05, + "sampling/sampling_logp_difference/max": 10.81167221069336, + "sampling/sampling_logp_difference/mean": 0.02046305686235428, + "step": 128 + }, + { + "clip_ratio/high_max": 1.4670421251139487e-05, + "clip_ratio/high_mean": 4.865382209118252e-06, + "clip_ratio/low_mean": 2.8848363626821083e-05, + "clip_ratio/low_min": 3.2798930078570265e-06, + "clip_ratio/region_mean": 3.371374566540908e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16323.0, + "completions/mean_length": 7655.140625, + "completions/mean_terminated_length": 7373.564453125, + "completions/min_length": 1095.0, + "completions/min_terminated_length": 1095.0, + "entropy": 1.1112212240695953, + "epoch": 0.11867525298988041, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028038588352501392, + "learning_rate": 1e-05, + "loss": 0.0525, + "num_tokens": 103645849.0, + "reward": 0.390625, + "reward_std": 0.24435339868068695, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999130964279175, + "sampling/importance_sampling_ratio/min": 0.022794192656874657, + "sampling/sampling_logp_difference/max": 3.781249523162842, + "sampling/sampling_logp_difference/mean": 0.022147968411445618, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.8828401809732895e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8828401809732895e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15694.0, + "completions/mean_length": 8166.765625, + "completions/mean_terminated_length": 7618.9501953125, + "completions/min_length": 838.0, + "completions/min_terminated_length": 838.0, + "entropy": 0.8589507639408112, + "epoch": 0.11959521619135234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003990175202488899, + "learning_rate": 1e-05, + "loss": 0.0942, + "num_tokens": 104712987.0, + "reward": 0.4765625, + "reward_std": 0.2680353820323944, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999572038650513, + "sampling/importance_sampling_ratio/min": 2.430162021482829e-05, + "sampling/sampling_logp_difference/max": 10.624967575073242, + "sampling/sampling_logp_difference/mean": 0.019254228100180626, + "step": 130 + }, + { + "clip_ratio/high_max": 7.719492032265407e-06, + "clip_ratio/high_mean": 1.9298730080663518e-06, + "clip_ratio/low_mean": 3.547307028384239e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7402943462439e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15444.0, + "completions/mean_length": 5872.40625, + "completions/mean_terminated_length": 5789.6376953125, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "entropy": 1.0606305003166199, + "epoch": 0.12051517939282429, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0038855294696986675, + "learning_rate": 1e-05, + "loss": 0.0481, + "num_tokens": 105481743.0, + "reward": 0.375, + "reward_std": 0.3527044355869293, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999517202377319, + "sampling/importance_sampling_ratio/min": 0.0018136304570361972, + "sampling/sampling_logp_difference/max": 6.312424659729004, + "sampling/sampling_logp_difference/mean": 0.021132031455636024, + "step": 131 + }, + { + "clip_ratio/high_max": 1.6221786609094124e-05, + "clip_ratio/high_mean": 5.614050223812228e-06, + "clip_ratio/low_mean": 4.114894863960217e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6762998408667045e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15742.0, + "completions/mean_length": 6474.9375, + "completions/mean_terminated_length": 6237.1201171875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.8699874132871628, + "epoch": 0.12143514259429623, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004712321795523167, + "learning_rate": 1e-05, + "loss": 0.0487, + "num_tokens": 106333695.0, + "reward": 0.53125, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999421834945679, + "sampling/importance_sampling_ratio/min": 4.115129559068009e-05, + "sampling/sampling_logp_difference/max": 10.098255157470703, + "sampling/sampling_logp_difference/mean": 0.019161570817232132, + "step": 132 + }, + { + "clip_ratio/high_max": 1.2752746897604084e-05, + "clip_ratio/high_mean": 3.188186724401021e-06, + "clip_ratio/low_mean": 2.881602637216929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.200421309657031e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15841.0, + "completions/mean_length": 6910.03125, + "completions/mean_terminated_length": 6604.4189453125, + "completions/min_length": 1212.0, + "completions/min_terminated_length": 1212.0, + "entropy": 0.8597542196512222, + "epoch": 0.12235510579576817, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0031310587655752897, + "learning_rate": 1e-05, + "loss": 0.0574, + "num_tokens": 107236363.0, + "reward": 0.4453125, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000447034835815, + "sampling/importance_sampling_ratio/min": 0.0012788315070793033, + "sampling/sampling_logp_difference/max": 6.661808490753174, + "sampling/sampling_logp_difference/mean": 0.019823957234621048, + "step": 133 + }, + { + "clip_ratio/high_max": 1.2087368986613e-05, + "clip_ratio/high_mean": 3.02184224665325e-06, + "clip_ratio/low_mean": 3.179941927555774e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.482126135168073e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15311.0, + "completions/mean_length": 6172.7421875, + "completions/mean_terminated_length": 5843.3466796875, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "entropy": 0.9560965895652771, + "epoch": 0.12327506899724011, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006233204621821642, + "learning_rate": 1e-05, + "loss": -0.0101, + "num_tokens": 108044714.0, + "reward": 0.4296875, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999743700027466, + "sampling/importance_sampling_ratio/min": 0.0012860872084274888, + "sampling/sampling_logp_difference/max": 6.656150817871094, + "sampling/sampling_logp_difference/mean": 0.020428352057933807, + "step": 134 + }, + { + "clip_ratio/high_max": 1.846628038038034e-05, + "clip_ratio/high_mean": 4.616570095095085e-06, + "clip_ratio/low_mean": 3.8776780229454744e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.339335077929718e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 6232.4609375, + "completions/mean_terminated_length": 5988.82421875, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "entropy": 0.792289063334465, + "epoch": 0.12419503219871206, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005762661807239056, + "learning_rate": 1e-05, + "loss": 0.1106, + "num_tokens": 108862901.0, + "reward": 0.53125, + "reward_std": 0.3748064339160919, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999505877494812, + "sampling/importance_sampling_ratio/min": 0.004497833084315062, + "sampling/sampling_logp_difference/max": 5.4041595458984375, + "sampling/sampling_logp_difference/mean": 0.01772497221827507, + "step": 135 + }, + { + "clip_ratio/high_max": 1.1131890460092109e-05, + "clip_ratio/high_mean": 2.782972615023027e-06, + "clip_ratio/low_mean": 3.377504378931917e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.65580164043422e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15440.0, + "completions/mean_length": 5181.1015625, + "completions/mean_terminated_length": 5003.27783203125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "entropy": 0.7691714614629745, + "epoch": 0.125114995400184, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002916123950853944, + "learning_rate": 1e-05, + "loss": 0.088, + "num_tokens": 109544058.0, + "reward": 0.5625, + "reward_std": 0.3327339291572571, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999925971031189, + "sampling/importance_sampling_ratio/min": 3.9317873756772315e-07, + "sampling/sampling_logp_difference/max": 14.749001502990723, + "sampling/sampling_logp_difference/mean": 0.017177307978272438, + "step": 136 + }, + { + "clip_ratio/high_max": 2.2183079636306502e-05, + "clip_ratio/high_mean": 5.5457699090766255e-06, + "clip_ratio/low_mean": 3.033036318811355e-05, + "clip_ratio/low_min": 3.5457974263408687e-06, + "clip_ratio/region_mean": 3.587613309719018e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15288.0, + "completions/mean_length": 5583.5859375, + "completions/mean_terminated_length": 5235.185546875, + "completions/min_length": 537.0, + "completions/min_terminated_length": 537.0, + "entropy": 0.922084204852581, + "epoch": 0.12603495860165592, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035183338914066553, + "learning_rate": 1e-05, + "loss": 0.0303, + "num_tokens": 110282853.0, + "reward": 0.484375, + "reward_std": 0.24381661415100098, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999736547470093, + "sampling/importance_sampling_ratio/min": 0.0021202145144343376, + "sampling/sampling_logp_difference/max": 6.156238079071045, + "sampling/sampling_logp_difference/mean": 0.01895858161151409, + "step": 137 + }, + { + "clip_ratio/high_max": 2.7135570235259365e-06, + "clip_ratio/high_mean": 6.783892558814841e-07, + "clip_ratio/low_mean": 2.520359919344628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.588198810826725e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16255.0, + "completions/mean_length": 7191.71875, + "completions/mean_terminated_length": 6659.93359375, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.8676051273941994, + "epoch": 0.12695492180312787, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002201368333771825, + "learning_rate": 1e-05, + "loss": 0.0418, + "num_tokens": 111228449.0, + "reward": 0.296875, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998859167098999, + "sampling/importance_sampling_ratio/min": 0.0047781821340322495, + "sampling/sampling_logp_difference/max": 5.343695163726807, + "sampling/sampling_logp_difference/mean": 0.01915489323437214, + "step": 138 + }, + { + "clip_ratio/high_max": 2.2828588043921627e-05, + "clip_ratio/high_mean": 7.982446049936698e-06, + "clip_ratio/low_mean": 4.164742210832628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.962986872669717e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16102.0, + "completions/mean_length": 5139.5859375, + "completions/mean_terminated_length": 4869.72021484375, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "entropy": 0.7077975794672966, + "epoch": 0.12787488500459981, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00720562506467104, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 111904700.0, + "reward": 0.5859375, + "reward_std": 0.3566659688949585, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999786615371704, + "sampling/importance_sampling_ratio/min": 9.015951036417391e-06, + "sampling/sampling_logp_difference/max": 11.616515159606934, + "sampling/sampling_logp_difference/mean": 0.016763046383857727, + "step": 139 + }, + { + "clip_ratio/high_max": 1.3030461104790447e-05, + "clip_ratio/high_mean": 3.257615276197612e-06, + "clip_ratio/low_mean": 5.0197708333143964e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.345532326828106e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15887.0, + "completions/mean_length": 7408.296875, + "completions/mean_terminated_length": 7118.7578125, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "entropy": 0.8338208198547363, + "epoch": 0.12879484820607176, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005594039335846901, + "learning_rate": 1e-05, + "loss": 0.0855, + "num_tokens": 112873218.0, + "reward": 0.3828125, + "reward_std": 0.2806568741798401, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697804450989, + "sampling/importance_sampling_ratio/min": 5.832135502714664e-05, + "sampling/sampling_logp_difference/max": 9.749542236328125, + "sampling/sampling_logp_difference/mean": 0.018874341621994972, + "step": 140 + }, + { + "clip_ratio/high_max": 2.6347406674176455e-06, + "clip_ratio/high_mean": 6.586851668544114e-07, + "clip_ratio/low_mean": 3.066379792926455e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.132248309611896e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16158.0, + "completions/mean_length": 7637.25, + "completions/mean_terminated_length": 7131.2392578125, + "completions/min_length": 943.0, + "completions/min_terminated_length": 943.0, + "entropy": 0.9943022206425667, + "epoch": 0.1297148114075437, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025491444393992424, + "learning_rate": 1e-05, + "loss": 0.089, + "num_tokens": 113869418.0, + "reward": 0.3046875, + "reward_std": 0.32641828060150146, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999139308929443, + "sampling/importance_sampling_ratio/min": 6.724766876686772e-07, + "sampling/sampling_logp_difference/max": 14.212298393249512, + "sampling/sampling_logp_difference/mean": 0.020018339157104492, + "step": 141 + }, + { + "clip_ratio/high_max": 1.7491673133918084e-05, + "clip_ratio/high_mean": 4.372918283479521e-06, + "clip_ratio/low_mean": 2.370427267806008e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8077190734165924e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16199.0, + "completions/mean_length": 6144.8671875, + "completions/mean_terminated_length": 6064.244140625, + "completions/min_length": 1000.0, + "completions/min_terminated_length": 1000.0, + "entropy": 0.9252935722470284, + "epoch": 0.13063477460901565, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003907687962055206, + "learning_rate": 1e-05, + "loss": 0.1115, + "num_tokens": 114674257.0, + "reward": 0.5078125, + "reward_std": 0.287486732006073, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999485611915588, + "sampling/importance_sampling_ratio/min": 0.003434742335230112, + "sampling/sampling_logp_difference/max": 5.673813343048096, + "sampling/sampling_logp_difference/mean": 0.018300339579582214, + "step": 142 + }, + { + "clip_ratio/high_max": 8.272644663520623e-06, + "clip_ratio/high_mean": 2.0681611658801557e-06, + "clip_ratio/low_mean": 2.688816772433711e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8956328833373846e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15768.0, + "completions/mean_length": 6266.6484375, + "completions/mean_terminated_length": 6186.984375, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "entropy": 1.0926234126091003, + "epoch": 0.13155473781048757, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0042014638893306255, + "learning_rate": 1e-05, + "loss": 0.023, + "num_tokens": 115496300.0, + "reward": 0.3671875, + "reward_std": 0.18884867429733276, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999801516532898, + "sampling/importance_sampling_ratio/min": 3.502686922729481e-06, + "sampling/sampling_logp_difference/max": 12.561980247497559, + "sampling/sampling_logp_difference/mean": 0.021998615935444832, + "step": 143 + }, + { + "clip_ratio/high_max": 2.7441840302344644e-05, + "clip_ratio/high_mean": 6.860460075586161e-06, + "clip_ratio/low_mean": 4.51459295618406e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.200638997848728e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15504.0, + "completions/mean_length": 6392.890625, + "completions/mean_terminated_length": 6234.3017578125, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "entropy": 0.9028401970863342, + "epoch": 0.13247470101195952, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028757627587765455, + "learning_rate": 1e-05, + "loss": 0.0404, + "num_tokens": 116333286.0, + "reward": 0.4453125, + "reward_std": 0.35665616393089294, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 2.327528392243039e-07, + "sampling/sampling_logp_difference/max": 15.27328872680664, + "sampling/sampling_logp_difference/mean": 0.019069479778409004, + "step": 144 + }, + { + "clip_ratio/high_max": 1.216986993313185e-05, + "clip_ratio/high_mean": 3.0424674832829623e-06, + "clip_ratio/low_mean": 3.626850991622632e-05, + "clip_ratio/low_min": 4.492201696848497e-06, + "clip_ratio/region_mean": 3.931097762688296e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16183.0, + "completions/mean_length": 6300.1640625, + "completions/mean_terminated_length": 6220.763671875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 1.110174722969532, + "epoch": 0.13339466421343146, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006776242982596159, + "learning_rate": 1e-05, + "loss": 0.0858, + "num_tokens": 117158619.0, + "reward": 0.3125, + "reward_std": 0.29826053977012634, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998651742935181, + "sampling/importance_sampling_ratio/min": 6.477496299339691e-07, + "sampling/sampling_logp_difference/max": 14.249761581420898, + "sampling/sampling_logp_difference/mean": 0.022119753062725067, + "step": 145 + }, + { + "clip_ratio/high_max": 7.707248187216464e-06, + "clip_ratio/high_mean": 1.926812046804116e-06, + "clip_ratio/low_mean": 1.452984838579141e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6456660432595527e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16283.0, + "completions/mean_length": 7482.25, + "completions/mean_terminated_length": 7340.95263671875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.9957183450460434, + "epoch": 0.1343146274149034, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003441061358898878, + "learning_rate": 1e-05, + "loss": 0.0041, + "num_tokens": 118140579.0, + "reward": 0.2109375, + "reward_std": 0.23250605165958405, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999172687530518, + "sampling/importance_sampling_ratio/min": 1.1689271559589542e-05, + "sampling/sampling_logp_difference/max": 11.356839179992676, + "sampling/sampling_logp_difference/mean": 0.020916422829031944, + "step": 146 + }, + { + "clip_ratio/high_max": 1.3650881555804517e-05, + "clip_ratio/high_mean": 3.4127203889511293e-06, + "clip_ratio/low_mean": 4.652173765862244e-05, + "clip_ratio/low_min": 8.251542112702737e-06, + "clip_ratio/region_mean": 4.993445759282622e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16193.0, + "completions/mean_length": 6411.125, + "completions/mean_terminated_length": 6252.82568359375, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "entropy": 0.9852773621678352, + "epoch": 0.13523459061637536, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0035624606534838676, + "learning_rate": 1e-05, + "loss": 0.0601, + "num_tokens": 118982515.0, + "reward": 0.3984375, + "reward_std": 0.3913620114326477, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999991655349731, + "sampling/importance_sampling_ratio/min": 0.018960632383823395, + "sampling/sampling_logp_difference/max": 3.96539044380188, + "sampling/sampling_logp_difference/mean": 0.020998675376176834, + "step": 147 + }, + { + "clip_ratio/high_max": 1.710706237645354e-05, + "clip_ratio/high_mean": 4.276765594113385e-06, + "clip_ratio/low_mean": 2.3662243620492518e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7939009100919066e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15146.0, + "completions/mean_length": 6640.75, + "completions/mean_terminated_length": 6326.45166015625, + "completions/min_length": 1204.0, + "completions/min_terminated_length": 1204.0, + "entropy": 0.8645239844918251, + "epoch": 0.13615455381784727, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004949269350618124, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 119851003.0, + "reward": 0.515625, + "reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 5.8315905334893614e-05, + "sampling/sampling_logp_difference/max": 9.749635696411133, + "sampling/sampling_logp_difference/mean": 0.01905224658548832, + "step": 148 + }, + { + "clip_ratio/high_max": 5.033624802308623e-06, + "clip_ratio/high_mean": 2.0922732346662087e-06, + "clip_ratio/low_mean": 5.667686264132499e-05, + "clip_ratio/low_min": 3.2221478249994107e-06, + "clip_ratio/region_mean": 5.876913564861752e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16061.0, + "completions/mean_length": 6987.953125, + "completions/mean_terminated_length": 6444.3798828125, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9469119384884834, + "epoch": 0.13707451701931922, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005622676108032465, + "learning_rate": 1e-05, + "loss": 0.1008, + "num_tokens": 120765165.0, + "reward": 0.421875, + "reward_std": 0.39796435832977295, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999727010726929, + "sampling/importance_sampling_ratio/min": 9.214873716700822e-05, + "sampling/sampling_logp_difference/max": 9.292106628417969, + "sampling/sampling_logp_difference/mean": 0.01969297230243683, + "step": 149 + }, + { + "clip_ratio/high_max": 4.223829364491394e-06, + "clip_ratio/high_mean": 1.8565209529697313e-06, + "clip_ratio/low_mean": 3.030186894648068e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.21583895583899e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16218.0, + "completions/mean_length": 7539.2265625, + "completions/mean_terminated_length": 6949.5751953125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.983614593744278, + "epoch": 0.13799448022079117, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035846447572112083, + "learning_rate": 1e-05, + "loss": -0.0093, + "num_tokens": 121749426.0, + "reward": 0.3828125, + "reward_std": 0.22461043298244476, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000232458114624, + "sampling/importance_sampling_ratio/min": 7.889377229730599e-06, + "sampling/sampling_logp_difference/max": 11.749993324279785, + "sampling/sampling_logp_difference/mean": 0.02050059661269188, + "step": 150 + }, + { + "clip_ratio/high_max": 1.0533551176195033e-05, + "clip_ratio/high_mean": 2.6333877940487582e-06, + "clip_ratio/low_mean": 4.44662659901951e-05, + "clip_ratio/low_min": 5.9182802942814305e-06, + "clip_ratio/region_mean": 4.7099654238991207e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15528.0, + "completions/mean_length": 6339.5390625, + "completions/mean_terminated_length": 5845.548828125, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "entropy": 0.9051830619573593, + "epoch": 0.1389144434222631, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005333681590855122, + "learning_rate": 1e-05, + "loss": 0.0479, + "num_tokens": 122579975.0, + "reward": 0.34375, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999289512634277, + "sampling/importance_sampling_ratio/min": 0.0022614477202296257, + "sampling/sampling_logp_difference/max": 6.091750144958496, + "sampling/sampling_logp_difference/mean": 0.019756250083446503, + "step": 151 + }, + { + "clip_ratio/high_max": 5.961464921711013e-06, + "clip_ratio/high_mean": 1.4903662304277532e-06, + "clip_ratio/low_mean": 5.054293433204293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2033300562470686e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15321.0, + "completions/mean_length": 6610.8046875, + "completions/mean_terminated_length": 6533.8505859375, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.9121239259839058, + "epoch": 0.13983440662373506, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005628545768558979, + "learning_rate": 1e-05, + "loss": 0.1029, + "num_tokens": 123444686.0, + "reward": 0.5, + "reward_std": 0.3498311936855316, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999935030937195, + "sampling/importance_sampling_ratio/min": 0.0027667356189340353, + "sampling/sampling_logp_difference/max": 5.890087127685547, + "sampling/sampling_logp_difference/mean": 0.019961554557085037, + "step": 152 + }, + { + "clip_ratio/high_max": 7.918152277852641e-06, + "clip_ratio/high_mean": 2.778689122351352e-06, + "clip_ratio/low_mean": 4.231535649523721e-05, + "clip_ratio/low_min": 3.3862490909086773e-06, + "clip_ratio/region_mean": 4.509404539021489e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15071.0, + "completions/mean_length": 7214.5546875, + "completions/mean_terminated_length": 6684.0908203125, + "completions/min_length": 493.0, + "completions/min_terminated_length": 493.0, + "entropy": 0.9393481463193893, + "epoch": 0.140754369825207, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00521192466840148, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 124389325.0, + "reward": 0.25, + "reward_std": 0.26538968086242676, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000994205474854, + "sampling/importance_sampling_ratio/min": 0.03890184313058853, + "sampling/sampling_logp_difference/max": 3.246713638305664, + "sampling/sampling_logp_difference/mean": 0.02030467614531517, + "step": 153 + }, + { + "clip_ratio/high_max": 1.3099364878144115e-05, + "clip_ratio/high_mean": 3.274841219536029e-06, + "clip_ratio/low_mean": 4.0359405488743505e-05, + "clip_ratio/low_min": 3.400342848181026e-06, + "clip_ratio/region_mean": 4.363424682196637e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15349.0, + "completions/mean_length": 7307.296875, + "completions/mean_terminated_length": 6938.32470703125, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.9287968128919601, + "epoch": 0.14167433302667892, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0034769594203680754, + "learning_rate": 1e-05, + "loss": 0.0557, + "num_tokens": 125344827.0, + "reward": 0.390625, + "reward_std": 0.35035035014152527, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999738335609436, + "sampling/importance_sampling_ratio/min": 5.1062532293144614e-05, + "sampling/sampling_logp_difference/max": 9.88245964050293, + "sampling/sampling_logp_difference/mean": 0.0197945274412632, + "step": 154 + }, + { + "clip_ratio/high_max": 6.428839697036892e-06, + "clip_ratio/high_mean": 1.607209924259223e-06, + "clip_ratio/low_mean": 3.123730675724801e-05, + "clip_ratio/low_min": 4.124868155486183e-06, + "clip_ratio/region_mean": 3.284451713625458e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14940.0, + "completions/mean_length": 7255.5703125, + "completions/mean_terminated_length": 7110.6748046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.9288185387849808, + "epoch": 0.14259429622815087, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005860861856490374, + "learning_rate": 1e-05, + "loss": 0.058, + "num_tokens": 126294060.0, + "reward": 0.3359375, + "reward_std": 0.29719966650009155, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999392032623291, + "sampling/importance_sampling_ratio/min": 0.0017037172801792622, + "sampling/sampling_logp_difference/max": 6.374942779541016, + "sampling/sampling_logp_difference/mean": 0.019849762320518494, + "step": 155 + }, + { + "clip_ratio/high_max": 1.148841965914471e-05, + "clip_ratio/high_mean": 2.8721049147861777e-06, + "clip_ratio/low_mean": 4.209472854199703e-05, + "clip_ratio/low_min": 3.21056154461985e-06, + "clip_ratio/region_mean": 4.496683322940953e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16174.0, + "completions/mean_length": 6662.796875, + "completions/mean_terminated_length": 6429.48828125, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.8072321340441704, + "epoch": 0.14351425942962281, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004109901376068592, + "learning_rate": 1e-05, + "loss": 0.0365, + "num_tokens": 127163746.0, + "reward": 0.4375, + "reward_std": 0.2782978415489197, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998985528945923, + "sampling/importance_sampling_ratio/min": 0.001930873841047287, + "sampling/sampling_logp_difference/max": 6.249782562255859, + "sampling/sampling_logp_difference/mean": 0.018542557954788208, + "step": 156 + }, + { + "clip_ratio/high_max": 1.4845912573946407e-05, + "clip_ratio/high_mean": 3.7114781434866018e-06, + "clip_ratio/low_mean": 3.845731936280572e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.216879796103967e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16168.0, + "completions/mean_length": 6638.5625, + "completions/mean_terminated_length": 6483.87353515625, + "completions/min_length": 730.0, + "completions/min_terminated_length": 730.0, + "entropy": 0.9228496253490448, + "epoch": 0.14443422263109476, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005437003914266825, + "learning_rate": 1e-05, + "loss": 0.1272, + "num_tokens": 128035690.0, + "reward": 0.4453125, + "reward_std": 0.325370192527771, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999808073043823, + "sampling/importance_sampling_ratio/min": 0.0007831641123630106, + "sampling/sampling_logp_difference/max": 7.152168273925781, + "sampling/sampling_logp_difference/mean": 0.019497953355312347, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.087737986286811e-05, + "clip_ratio/low_min": 1.7309419035882456e-05, + "clip_ratio/region_mean": 5.087737986286811e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16133.0, + "completions/mean_length": 8279.7890625, + "completions/mean_terminated_length": 7810.9501953125, + "completions/min_length": 1084.0, + "completions/min_terminated_length": 1084.0, + "entropy": 0.9365477114915848, + "epoch": 0.1453541858325667, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004606325179338455, + "learning_rate": 1e-05, + "loss": 0.0553, + "num_tokens": 129114487.0, + "reward": 0.3359375, + "reward_std": 0.2998581528663635, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999224543571472, + "sampling/importance_sampling_ratio/min": 6.793912234570598e-06, + "sampling/sampling_logp_difference/max": 11.899483680725098, + "sampling/sampling_logp_difference/mean": 0.02114839106798172, + "step": 158 + }, + { + "clip_ratio/high_max": 2.8393386855896097e-05, + "clip_ratio/high_mean": 7.731617188255768e-06, + "clip_ratio/low_mean": 4.6293902641991735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.402551937550015e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15157.0, + "completions/mean_length": 6874.5546875, + "completions/mean_terminated_length": 6406.87646484375, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "entropy": 0.8596161976456642, + "epoch": 0.14627414903403863, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032216343097388744, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 130011934.0, + "reward": 0.46875, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999432563781738, + "sampling/importance_sampling_ratio/min": 0.0028106109239161015, + "sampling/sampling_logp_difference/max": 5.874353408813477, + "sampling/sampling_logp_difference/mean": 0.01938377134501934, + "step": 159 + }, + { + "clip_ratio/high_max": 9.702946044853888e-06, + "clip_ratio/high_mean": 2.425736511213472e-06, + "clip_ratio/low_mean": 2.8597237701433187e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1022973985272984e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16193.0, + "completions/mean_length": 6554.3671875, + "completions/mean_terminated_length": 6154.78857421875, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.9097465947270393, + "epoch": 0.14719411223551057, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032756594009697437, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 130870045.0, + "reward": 0.453125, + "reward_std": 0.3006146550178528, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999619722366333, + "sampling/importance_sampling_ratio/min": 9.237467679668043e-09, + "sampling/sampling_logp_difference/max": 18.499998092651367, + "sampling/sampling_logp_difference/mean": 0.019287925213575363, + "step": 160 + }, + { + "clip_ratio/high_max": 2.387705990258837e-05, + "clip_ratio/high_mean": 5.969264975647093e-06, + "clip_ratio/low_mean": 4.071546266004589e-05, + "clip_ratio/low_min": 2.701884795897058e-06, + "clip_ratio/region_mean": 4.6684727863066655e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15371.0, + "completions/mean_length": 7199.9921875, + "completions/mean_terminated_length": 6903.73388671875, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "entropy": 0.9904173016548157, + "epoch": 0.14811407543698252, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003530750283971429, + "learning_rate": 1e-05, + "loss": 0.069, + "num_tokens": 131812236.0, + "reward": 0.3125, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999665021896362, + "sampling/importance_sampling_ratio/min": 2.5881658984872047e-06, + "sampling/sampling_logp_difference/max": 12.864561080932617, + "sampling/sampling_logp_difference/mean": 0.02212757244706154, + "step": 161 + }, + { + "clip_ratio/high_max": 1.924166053868248e-05, + "clip_ratio/high_mean": 4.81041513467062e-06, + "clip_ratio/low_mean": 4.526082898337336e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.007124354960979e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16281.0, + "completions/mean_length": 6822.59375, + "completions/mean_terminated_length": 6670.82568359375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 1.0052980855107307, + "epoch": 0.14903403863845446, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004500554408878088, + "learning_rate": 1e-05, + "loss": 0.0287, + "num_tokens": 132711448.0, + "reward": 0.3203125, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998499751091003, + "sampling/importance_sampling_ratio/min": 1.1307781022651398e-07, + "sampling/sampling_logp_difference/max": 15.995189666748047, + "sampling/sampling_logp_difference/mean": 0.02111719362437725, + "step": 162 + }, + { + "clip_ratio/high_max": 1.3326032785698771e-05, + "clip_ratio/high_mean": 3.331508196424693e-06, + "clip_ratio/low_mean": 1.9409651486057555e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.274115956879541e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16109.0, + "completions/mean_length": 7792.9375, + "completions/mean_terminated_length": 7515.80615234375, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.9114394783973694, + "epoch": 0.1499540018399264, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0020288117229938507, + "learning_rate": 1e-05, + "loss": 0.0782, + "num_tokens": 133729832.0, + "reward": 0.3671875, + "reward_std": 0.2501322627067566, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999821782112122, + "sampling/importance_sampling_ratio/min": 0.001612494932487607, + "sampling/sampling_logp_difference/max": 6.4299726486206055, + "sampling/sampling_logp_difference/mean": 0.020228523761034012, + "step": 163 + }, + { + "clip_ratio/high_max": 1.2359042557363864e-05, + "clip_ratio/high_mean": 3.089760639340966e-06, + "clip_ratio/low_mean": 2.9356229674704082e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.244599008667137e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15171.0, + "completions/mean_length": 5908.671875, + "completions/mean_terminated_length": 5826.18896484375, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 0.9869658201932907, + "epoch": 0.15087396504139836, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006685085594654083, + "learning_rate": 1e-05, + "loss": -0.0036, + "num_tokens": 134507182.0, + "reward": 0.4609375, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999651908874512, + "sampling/importance_sampling_ratio/min": 0.0008160656434483826, + "sampling/sampling_logp_difference/max": 7.111015796661377, + "sampling/sampling_logp_difference/mean": 0.01997402310371399, + "step": 164 + }, + { + "clip_ratio/high_max": 8.511433406965807e-06, + "clip_ratio/high_mean": 2.1278583517414518e-06, + "clip_ratio/low_mean": 3.215114134036412e-05, + "clip_ratio/low_min": 3.941849627153715e-06, + "clip_ratio/region_mean": 3.427900014685292e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 7188.0, + "completions/mean_terminated_length": 6735.7373046875, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "entropy": 0.9519504383206367, + "epoch": 0.15179392824287027, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003854887094348669, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 135446382.0, + "reward": 0.4609375, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998975992202759, + "sampling/importance_sampling_ratio/min": 0.0011354254093021154, + "sampling/sampling_logp_difference/max": 6.780747890472412, + "sampling/sampling_logp_difference/mean": 0.020226184278726578, + "step": 165 + }, + { + "clip_ratio/high_max": 7.114804702723632e-06, + "clip_ratio/high_mean": 1.778701175680908e-06, + "clip_ratio/low_mean": 1.9188738406228367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0967439695596113e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15712.0, + "completions/mean_length": 5843.5234375, + "completions/mean_terminated_length": 5676.21484375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.9677107483148575, + "epoch": 0.15271389144434222, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006265874952077866, + "learning_rate": 1e-05, + "loss": 0.0055, + "num_tokens": 136213233.0, + "reward": 0.296875, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999513626098633, + "sampling/importance_sampling_ratio/min": 0.002176719717681408, + "sampling/sampling_logp_difference/max": 6.129936218261719, + "sampling/sampling_logp_difference/mean": 0.021706756204366684, + "step": 166 + }, + { + "clip_ratio/high_max": 5.9216449699306395e-06, + "clip_ratio/high_mean": 1.4804112424826599e-06, + "clip_ratio/low_mean": 2.429895857858355e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.910307100341015e-06, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16355.0, + "completions/mean_length": 6942.15625, + "completions/mean_terminated_length": 6637.58056640625, + "completions/min_length": 517.0, + "completions/min_terminated_length": 517.0, + "entropy": 1.076062560081482, + "epoch": 0.15363385464581417, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0018056798726320267, + "learning_rate": 1e-05, + "loss": 0.0052, + "num_tokens": 137123405.0, + "reward": 0.2578125, + "reward_std": 0.172288179397583, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999101161956787, + "sampling/importance_sampling_ratio/min": 0.022795137017965317, + "sampling/sampling_logp_difference/max": 3.781208038330078, + "sampling/sampling_logp_difference/mean": 0.02278529666364193, + "step": 167 + }, + { + "clip_ratio/high_max": 2.8421666684153024e-05, + "clip_ratio/high_mean": 8.364482027900522e-06, + "clip_ratio/low_mean": 4.042915224999888e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8793634050525725e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 7368.4609375, + "completions/mean_terminated_length": 7001.9755859375, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "entropy": 0.9278362467885017, + "epoch": 0.1545538178472861, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002132089575752616, + "learning_rate": 1e-05, + "loss": 0.0702, + "num_tokens": 138084464.0, + "reward": 0.421875, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999951958656311, + "sampling/importance_sampling_ratio/min": 5.144971510162577e-05, + "sampling/sampling_logp_difference/max": 9.874905586242676, + "sampling/sampling_logp_difference/mean": 0.020028186962008476, + "step": 168 + }, + { + "clip_ratio/high_max": 6.84724363964051e-06, + "clip_ratio/high_mean": 1.7118109099101275e-06, + "clip_ratio/low_mean": 3.8177841361175524e-05, + "clip_ratio/low_min": 9.023873644764535e-06, + "clip_ratio/region_mean": 3.988965249845933e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16160.0, + "completions/mean_length": 8278.578125, + "completions/mean_terminated_length": 8017.11279296875, + "completions/min_length": 1203.0, + "completions/min_terminated_length": 1203.0, + "entropy": 0.9731236174702644, + "epoch": 0.15547378104875806, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003180777421221137, + "learning_rate": 1e-05, + "loss": 0.0708, + "num_tokens": 139164722.0, + "reward": 0.296875, + "reward_std": 0.29143065214157104, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999357461929321, + "sampling/importance_sampling_ratio/min": 4.579544565785909e-06, + "sampling/sampling_logp_difference/max": 12.29391098022461, + "sampling/sampling_logp_difference/mean": 0.020700933411717415, + "step": 169 + }, + { + "clip_ratio/high_max": 2.3081439849192975e-05, + "clip_ratio/high_mean": 7.712801448178652e-06, + "clip_ratio/low_mean": 4.41923687048984e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.190517117625859e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16136.0, + "completions/mean_length": 7237.2578125, + "completions/mean_terminated_length": 6865.43896484375, + "completions/min_length": 1078.0, + "completions/min_terminated_length": 1078.0, + "entropy": 0.7624354660511017, + "epoch": 0.15639374425023, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004162010736763477, + "learning_rate": 1e-05, + "loss": 0.0955, + "num_tokens": 140109163.0, + "reward": 0.5078125, + "reward_std": 0.33903974294662476, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999967813491821, + "sampling/importance_sampling_ratio/min": 2.4133163606165908e-05, + "sampling/sampling_logp_difference/max": 10.63192367553711, + "sampling/sampling_logp_difference/mean": 0.017928704619407654, + "step": 170 + }, + { + "clip_ratio/high_max": 1.8008577626460465e-05, + "clip_ratio/high_mean": 4.502144406615116e-06, + "clip_ratio/low_mean": 2.0606968291758676e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.510911281206063e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15857.0, + "completions/mean_length": 7307.4296875, + "completions/mean_terminated_length": 7089.59228515625, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 0.9450376927852631, + "epoch": 0.15731370745170192, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003527693450450897, + "learning_rate": 1e-05, + "loss": 0.0442, + "num_tokens": 141063738.0, + "reward": 0.2890625, + "reward_std": 0.22673209011554718, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998871088027954, + "sampling/importance_sampling_ratio/min": 2.8798374842153862e-05, + "sampling/sampling_logp_difference/max": 10.455191612243652, + "sampling/sampling_logp_difference/mean": 0.021664291620254517, + "step": 171 + }, + { + "clip_ratio/high_max": 1.9155178961227648e-05, + "clip_ratio/high_mean": 4.788794740306912e-06, + "clip_ratio/low_mean": 3.323748410366534e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.802627873028541e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16275.0, + "completions/mean_length": 5985.9765625, + "completions/mean_terminated_length": 5736.42431640625, + "completions/min_length": 714.0, + "completions/min_terminated_length": 714.0, + "entropy": 0.8568939119577408, + "epoch": 0.15823367065317387, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002400327706709504, + "learning_rate": 1e-05, + "loss": 0.0778, + "num_tokens": 141848599.0, + "reward": 0.4921875, + "reward_std": 0.1922685205936432, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999374151229858, + "sampling/importance_sampling_ratio/min": 1.3575387924902316e-08, + "sampling/sampling_logp_difference/max": 18.115007400512695, + "sampling/sampling_logp_difference/mean": 0.018963739275932312, + "step": 172 + }, + { + "clip_ratio/high_max": 1.6673273876222083e-05, + "clip_ratio/high_mean": 4.978134711564053e-06, + "clip_ratio/low_mean": 4.1565862602510606e-05, + "clip_ratio/low_min": 6.89249168317474e-06, + "clip_ratio/region_mean": 4.654399640457996e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15111.0, + "completions/mean_length": 8078.8359375, + "completions/mean_terminated_length": 7810.92724609375, + "completions/min_length": 594.0, + "completions/min_terminated_length": 594.0, + "entropy": 1.0634759217500687, + "epoch": 0.15915363385464582, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003575773909687996, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 142902666.0, + "reward": 0.3828125, + "reward_std": 0.3022122383117676, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999899864196777, + "sampling/importance_sampling_ratio/min": 1.8410922848488553e-06, + "sampling/sampling_logp_difference/max": 13.205151557922363, + "sampling/sampling_logp_difference/mean": 0.021685753017663956, + "step": 173 + }, + { + "clip_ratio/high_max": 1.2325835996307433e-05, + "clip_ratio/high_mean": 3.081458999076858e-06, + "clip_ratio/low_mean": 4.288118509521155e-05, + "clip_ratio/low_min": 7.69851726545312e-06, + "clip_ratio/region_mean": 4.596264443534892e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15876.0, + "completions/mean_length": 8138.515625, + "completions/mean_terminated_length": 7588.81689453125, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "entropy": 1.0329038575291634, + "epoch": 0.16007359705611776, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003307635197415948, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 143967484.0, + "reward": 0.3203125, + "reward_std": 0.31800350546836853, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000429153442383, + "sampling/importance_sampling_ratio/min": 0.07909657061100006, + "sampling/sampling_logp_difference/max": 2.537085771560669, + "sampling/sampling_logp_difference/mean": 0.02233392372727394, + "step": 174 + }, + { + "clip_ratio/high_max": 2.3158392650657333e-05, + "clip_ratio/high_mean": 5.789598162664333e-06, + "clip_ratio/low_mean": 3.4071419804604375e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.986101773989503e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16324.0, + "completions/mean_length": 8144.21875, + "completions/mean_terminated_length": 7878.4189453125, + "completions/min_length": 828.0, + "completions/min_terminated_length": 828.0, + "entropy": 0.9547601044178009, + "epoch": 0.1609935602575897, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022392498794943094, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 145028608.0, + "reward": 0.3515625, + "reward_std": 0.20411096513271332, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473094940186, + "sampling/importance_sampling_ratio/min": 2.9027246455370914e-06, + "sampling/sampling_logp_difference/max": 12.749860763549805, + "sampling/sampling_logp_difference/mean": 0.0203234925866127, + "step": 175 + }, + { + "clip_ratio/high_max": 1.330557415712974e-05, + "clip_ratio/high_mean": 3.326393539282435e-06, + "clip_ratio/low_mean": 3.57260964847228e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.905248979663156e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16327.0, + "completions/mean_length": 6289.40625, + "completions/mean_terminated_length": 6129.1748046875, + "completions/min_length": 1031.0, + "completions/min_terminated_length": 1031.0, + "entropy": 0.9483931511640549, + "epoch": 0.16191352345906163, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005551324691623449, + "learning_rate": 1e-05, + "loss": 0.085, + "num_tokens": 145851292.0, + "reward": 0.484375, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999369382858276, + "sampling/importance_sampling_ratio/min": 0.0024864254519343376, + "sampling/sampling_logp_difference/max": 5.996909141540527, + "sampling/sampling_logp_difference/mean": 0.020259611308574677, + "step": 176 + }, + { + "clip_ratio/high_max": 8.344215302713565e-06, + "clip_ratio/high_mean": 2.086053825678391e-06, + "clip_ratio/low_mean": 5.073524926046957e-05, + "clip_ratio/low_min": 2.859953838196816e-06, + "clip_ratio/region_mean": 5.282130268824403e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16176.0, + "completions/mean_length": 8855.9296875, + "completions/mean_terminated_length": 8354.05859375, + "completions/min_length": 1004.0, + "completions/min_terminated_length": 1004.0, + "entropy": 1.003264345228672, + "epoch": 0.16283348666053357, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038497373461723328, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 147004723.0, + "reward": 0.2890625, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000008344650269, + "sampling/importance_sampling_ratio/min": 0.0003718819934874773, + "sampling/sampling_logp_difference/max": 7.8969340324401855, + "sampling/sampling_logp_difference/mean": 0.02178027108311653, + "step": 177 + }, + { + "clip_ratio/high_max": 1.2368503575999057e-05, + "clip_ratio/high_mean": 3.0921258939997642e-06, + "clip_ratio/low_mean": 4.947490833728807e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.256703434497467e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16199.0, + "completions/mean_length": 7574.3359375, + "completions/mean_terminated_length": 7434.50048828125, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.9448538422584534, + "epoch": 0.16375344986200552, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005027150269597769, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 147996190.0, + "reward": 0.359375, + "reward_std": 0.3316858410835266, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 2.846284814950195e-06, + "sampling/sampling_logp_difference/max": 12.769495964050293, + "sampling/sampling_logp_difference/mean": 0.020686112344264984, + "step": 178 + }, + { + "clip_ratio/high_max": 1.6756753666413715e-05, + "clip_ratio/high_mean": 4.189188416603429e-06, + "clip_ratio/low_mean": 3.363430948866153e-05, + "clip_ratio/low_min": 3.5745945297094295e-06, + "clip_ratio/region_mean": 3.7823498018951796e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15821.0, + "completions/mean_length": 7162.5625, + "completions/mean_terminated_length": 6787.70703125, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 0.8928515017032623, + "epoch": 0.16467341306347746, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00325182662345469, + "learning_rate": 1e-05, + "loss": 0.0743, + "num_tokens": 148931006.0, + "reward": 0.4609375, + "reward_std": 0.3492894768714905, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999456405639648, + "sampling/importance_sampling_ratio/min": 1.7868870827442151e-07, + "sampling/sampling_logp_difference/max": 15.537620544433594, + "sampling/sampling_logp_difference/mean": 0.02043815702199936, + "step": 179 + }, + { + "clip_ratio/high_max": 2.08163191928179e-05, + "clip_ratio/high_mean": 5.204079798204475e-06, + "clip_ratio/low_mean": 2.8009484594804235e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3213564165635034e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16163.0, + "completions/mean_length": 7958.2109375, + "completions/mean_terminated_length": 7396.4921875, + "completions/min_length": 809.0, + "completions/min_terminated_length": 809.0, + "entropy": 0.8763524517416954, + "epoch": 0.1655933762649494, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003250610316172242, + "learning_rate": 1e-05, + "loss": 0.0388, + "num_tokens": 149968481.0, + "reward": 0.3046875, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999974250793457, + "sampling/importance_sampling_ratio/min": 1.370981294712692e-06, + "sampling/sampling_logp_difference/max": 13.499983787536621, + "sampling/sampling_logp_difference/mean": 0.020478684455156326, + "step": 180 + }, + { + "clip_ratio/high_max": 1.4398233361134771e-05, + "clip_ratio/high_mean": 4.918068043480162e-06, + "clip_ratio/low_mean": 1.937760777082076e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4295676269048272e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15728.0, + "completions/mean_length": 6120.296875, + "completions/mean_terminated_length": 5789.20947265625, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.7507334873080254, + "epoch": 0.16651333946642136, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004865634720772505, + "learning_rate": 1e-05, + "loss": 0.019, + "num_tokens": 150768791.0, + "reward": 0.5703125, + "reward_std": 0.24671241641044617, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999062418937683, + "sampling/importance_sampling_ratio/min": 3.535915311658755e-05, + "sampling/sampling_logp_difference/max": 10.249953269958496, + "sampling/sampling_logp_difference/mean": 0.01739395596086979, + "step": 181 + }, + { + "clip_ratio/high_max": 1.170663267657801e-05, + "clip_ratio/high_mean": 2.9266581691445026e-06, + "clip_ratio/low_mean": 5.480891331899329e-05, + "clip_ratio/low_min": 9.078275525098434e-06, + "clip_ratio/region_mean": 5.773557131760754e-05, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 8231.671875, + "completions/mean_terminated_length": 7230.5087890625, + "completions/min_length": 1231.0, + "completions/min_terminated_length": 1231.0, + "entropy": 0.8613645136356354, + "epoch": 0.16743330266789327, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0027805580757558346, + "learning_rate": 1e-05, + "loss": 0.0609, + "num_tokens": 151844301.0, + "reward": 0.34375, + "reward_std": 0.35088711977005005, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999957799911499, + "sampling/importance_sampling_ratio/min": 0.0015732402680441737, + "sampling/sampling_logp_difference/max": 6.454617977142334, + "sampling/sampling_logp_difference/mean": 0.019971080124378204, + "step": 182 + }, + { + "clip_ratio/high_max": 1.0858868336072192e-05, + "clip_ratio/high_mean": 2.714717084018048e-06, + "clip_ratio/low_mean": 4.333486742780224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.60495848528808e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15682.0, + "completions/max_terminated_length": 15682.0, + "completions/mean_length": 6038.4921875, + "completions/mean_terminated_length": 6038.4921875, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + "entropy": 0.8801494240760803, + "epoch": 0.16835326586936522, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0028903940692543983, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 152638356.0, + "reward": 0.5234375, + "reward_std": 0.3022122383117676, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999019503593445, + "sampling/importance_sampling_ratio/min": 4.2232054511259776e-06, + "sampling/sampling_logp_difference/max": 12.374916076660156, + "sampling/sampling_logp_difference/mean": 0.019382324069738388, + "step": 183 + }, + { + "clip_ratio/high_max": 7.320573104152572e-06, + "clip_ratio/high_mean": 1.830143276038143e-06, + "clip_ratio/low_mean": 4.994629193788569e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.177643492970674e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16235.0, + "completions/mean_length": 7506.921875, + "completions/mean_terminated_length": 7070.34375, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "entropy": 0.8713229671120644, + "epoch": 0.16927322907083717, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0029546513687819242, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 153618418.0, + "reward": 0.3828125, + "reward_std": 0.3448137044906616, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000388622283936, + "sampling/importance_sampling_ratio/min": 4.4900667717229226e-07, + "sampling/sampling_logp_difference/max": 14.616228103637695, + "sampling/sampling_logp_difference/mean": 0.01928526908159256, + "step": 184 + }, + { + "clip_ratio/high_max": 1.2992590200155973e-05, + "clip_ratio/high_mean": 3.2481475500389934e-06, + "clip_ratio/low_mean": 2.8494011758084525e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.174215930812352e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13999.0, + "completions/mean_length": 6725.921875, + "completions/mean_terminated_length": 6649.8740234375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.9011344686150551, + "epoch": 0.1701931922723091, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002384800696745515, + "learning_rate": 1e-05, + "loss": 0.0837, + "num_tokens": 154502440.0, + "reward": 0.46875, + "reward_std": 0.2585597634315491, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340772628784, + "sampling/importance_sampling_ratio/min": 0.0008398547652177513, + "sampling/sampling_logp_difference/max": 7.082281589508057, + "sampling/sampling_logp_difference/mean": 0.020737573504447937, + "step": 185 + }, + { + "clip_ratio/high_max": 2.686360085135675e-05, + "clip_ratio/high_mean": 7.414224342028319e-06, + "clip_ratio/low_mean": 3.7723172567893926e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5137397364669596e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15862.0, + "completions/mean_length": 7285.78125, + "completions/mean_terminated_length": 6992.2900390625, + "completions/min_length": 1176.0, + "completions/min_terminated_length": 1176.0, + "entropy": 1.028538629412651, + "epoch": 0.17111315547378106, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0033664393704384565, + "learning_rate": 1e-05, + "loss": 0.0678, + "num_tokens": 155454988.0, + "reward": 0.296875, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999931275844574, + "sampling/importance_sampling_ratio/min": 0.0003808041801676154, + "sampling/sampling_logp_difference/max": 7.873225212097168, + "sampling/sampling_logp_difference/mean": 0.022076331079006195, + "step": 186 + }, + { + "clip_ratio/high_max": 1.1637549050647067e-05, + "clip_ratio/high_mean": 2.9093872626617667e-06, + "clip_ratio/low_mean": 3.613749231590191e-05, + "clip_ratio/low_min": 6.27866324975912e-06, + "clip_ratio/region_mean": 3.904687946487684e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16380.0, + "completions/mean_length": 7546.1015625, + "completions/mean_terminated_length": 6956.90869140625, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "entropy": 0.9216663613915443, + "epoch": 0.17203311867525298, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029569920152425766, + "learning_rate": 1e-05, + "loss": 0.0995, + "num_tokens": 156439609.0, + "reward": 0.390625, + "reward_std": 0.305637001991272, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999749660491943, + "sampling/importance_sampling_ratio/min": 0.009956372901797295, + "sampling/sampling_logp_difference/max": 4.609542369842529, + "sampling/sampling_logp_difference/mean": 0.021088771522045135, + "step": 187 + }, + { + "clip_ratio/high_max": 6.485023732238915e-06, + "clip_ratio/high_mean": 1.6212559330597287e-06, + "clip_ratio/low_mean": 1.9624552805908024e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1245808738967753e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16271.0, + "completions/mean_length": 6866.6015625, + "completions/mean_terminated_length": 6791.66162109375, + "completions/min_length": 866.0, + "completions/min_terminated_length": 866.0, + "entropy": 0.9553637430071831, + "epoch": 0.17295308187672492, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023973705247044563, + "learning_rate": 1e-05, + "loss": 0.0039, + "num_tokens": 157343374.0, + "reward": 0.2890625, + "reward_std": 0.2511882185935974, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999228715896606, + "sampling/importance_sampling_ratio/min": 7.46340665500611e-05, + "sampling/sampling_logp_difference/max": 9.502913475036621, + "sampling/sampling_logp_difference/mean": 0.021616388112306595, + "step": 188 + }, + { + "clip_ratio/high_max": 9.11087408894673e-06, + "clip_ratio/high_mean": 2.2777185222366825e-06, + "clip_ratio/low_mean": 3.832016966498486e-05, + "clip_ratio/low_min": 5.240211066848133e-06, + "clip_ratio/region_mean": 4.059788818722154e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14284.0, + "completions/mean_length": 6335.9453125, + "completions/mean_terminated_length": 5754.65283203125, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "entropy": 0.8574290797114372, + "epoch": 0.17387304507819687, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023072708863765, + "learning_rate": 1e-05, + "loss": 0.0056, + "num_tokens": 158173719.0, + "reward": 0.4140625, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998940229415894, + "sampling/importance_sampling_ratio/min": 0.0001612449559615925, + "sampling/sampling_logp_difference/max": 8.732585906982422, + "sampling/sampling_logp_difference/mean": 0.018506702035665512, + "step": 189 + }, + { + "clip_ratio/high_max": 3.0578403084291494e-05, + "clip_ratio/high_mean": 9.993626633786334e-06, + "clip_ratio/low_mean": 5.610333710137638e-05, + "clip_ratio/low_min": 1.3168393707019277e-05, + "clip_ratio/region_mean": 6.609696265513776e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15951.0, + "completions/mean_length": 8239.8984375, + "completions/mean_terminated_length": 7768.751953125, + "completions/min_length": 1080.0, + "completions/min_terminated_length": 1080.0, + "entropy": 0.8983379155397415, + "epoch": 0.17479300827966882, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004400993697345257, + "learning_rate": 1e-05, + "loss": 0.041, + "num_tokens": 159248410.0, + "reward": 0.3125, + "reward_std": 0.32325342297554016, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998342990875244, + "sampling/importance_sampling_ratio/min": 1.1379369198039058e-06, + "sampling/sampling_logp_difference/max": 13.686293601989746, + "sampling/sampling_logp_difference/mean": 0.02096184343099594, + "step": 190 + }, + { + "clip_ratio/high_max": 9.026573934534099e-06, + "clip_ratio/high_mean": 2.2566434836335247e-06, + "clip_ratio/low_mean": 6.66748674120754e-05, + "clip_ratio/low_min": 1.5295650428015506e-05, + "clip_ratio/region_mean": 6.89315111230826e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13741.0, + "completions/mean_length": 6580.921875, + "completions/mean_terminated_length": 5659.26513671875, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "entropy": 0.8277688398957253, + "epoch": 0.17571297148114076, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00661451555788517, + "learning_rate": 1e-05, + "loss": 0.0723, + "num_tokens": 160109904.0, + "reward": 0.484375, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 0.00036075623938813806, + "sampling/sampling_logp_difference/max": 7.927308082580566, + "sampling/sampling_logp_difference/mean": 0.017984790727496147, + "step": 191 + }, + { + "clip_ratio/high_max": 7.435806082867202e-06, + "clip_ratio/high_mean": 1.8589515207168006e-06, + "clip_ratio/low_mean": 4.045673085784074e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2315682549087796e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16027.0, + "completions/mean_length": 7627.0, + "completions/mean_terminated_length": 7416.83251953125, + "completions/min_length": 1916.0, + "completions/min_terminated_length": 1916.0, + "entropy": 0.8832443356513977, + "epoch": 0.1766329346826127, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004417019430547953, + "learning_rate": 1e-05, + "loss": 0.0368, + "num_tokens": 161103384.0, + "reward": 0.40625, + "reward_std": 0.3634909689426422, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998986721038818, + "sampling/importance_sampling_ratio/min": 4.833659477299079e-05, + "sampling/sampling_logp_difference/max": 9.937321662902832, + "sampling/sampling_logp_difference/mean": 0.01947963796555996, + "step": 192 + }, + { + "clip_ratio/high_max": 9.941184316630824e-06, + "clip_ratio/high_mean": 2.485296079157706e-06, + "clip_ratio/low_mean": 2.6134909091979353e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8620205910101504e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16073.0, + "completions/mean_length": 8426.1015625, + "completions/mean_terminated_length": 7965.72705078125, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 0.8188603445887566, + "epoch": 0.17755289788408463, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0030983765609562397, + "learning_rate": 1e-05, + "loss": 0.0426, + "num_tokens": 162199765.0, + "reward": 0.25, + "reward_std": 0.2540663480758667, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999411106109619, + "sampling/importance_sampling_ratio/min": 0.0009119694004766643, + "sampling/sampling_logp_difference/max": 6.999904155731201, + "sampling/sampling_logp_difference/mean": 0.02070600539445877, + "step": 193 + }, + { + "clip_ratio/high_max": 2.612139087432297e-05, + "clip_ratio/high_mean": 6.530347718580742e-06, + "clip_ratio/low_mean": 3.7853451885894174e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.438379949078808e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15904.0, + "completions/mean_length": 7154.2109375, + "completions/mean_terminated_length": 6856.4755859375, + "completions/min_length": 1387.0, + "completions/min_terminated_length": 1387.0, + "entropy": 0.9913735538721085, + "epoch": 0.17847286108555657, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003430198412388563, + "learning_rate": 1e-05, + "loss": 0.052, + "num_tokens": 163133232.0, + "reward": 0.4296875, + "reward_std": 0.2120065689086914, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000275373458862, + "sampling/importance_sampling_ratio/min": 0.00042929715709760785, + "sampling/sampling_logp_difference/max": 7.753361225128174, + "sampling/sampling_logp_difference/mean": 0.02190260961651802, + "step": 194 + }, + { + "clip_ratio/high_max": 3.1841454983805306e-06, + "clip_ratio/high_mean": 7.960363745951327e-07, + "clip_ratio/low_mean": 3.384581600585079e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4641852380445926e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16252.0, + "completions/mean_length": 7693.1328125, + "completions/mean_terminated_length": 7412.7822265625, + "completions/min_length": 1077.0, + "completions/min_terminated_length": 1077.0, + "entropy": 0.9887127950787544, + "epoch": 0.17939282428702852, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002780586015433073, + "learning_rate": 1e-05, + "loss": 0.0449, + "num_tokens": 164134393.0, + "reward": 0.3515625, + "reward_std": 0.20411095023155212, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999028444290161, + "sampling/importance_sampling_ratio/min": 3.559096626304381e-07, + "sampling/sampling_logp_difference/max": 14.848588943481445, + "sampling/sampling_logp_difference/mean": 0.021110571920871735, + "step": 195 + }, + { + "clip_ratio/high_max": 9.770586984814145e-06, + "clip_ratio/high_mean": 5.008155312680174e-06, + "clip_ratio/low_mean": 5.182203130971175e-05, + "clip_ratio/low_min": 1.5574546068819473e-05, + "clip_ratio/region_mean": 5.683018616764457e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16129.0, + "completions/mean_length": 7072.1484375, + "completions/mean_terminated_length": 6771.76611328125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "entropy": 0.861792616546154, + "epoch": 0.18031278748850046, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030156150460243225, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 165063412.0, + "reward": 0.4296875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998926520347595, + "sampling/importance_sampling_ratio/min": 2.2620308754994767e-06, + "sampling/sampling_logp_difference/max": 12.999247550964355, + "sampling/sampling_logp_difference/mean": 0.019325289875268936, + "step": 196 + }, + { + "clip_ratio/high_max": 2.2510209873871645e-05, + "clip_ratio/high_mean": 6.455301331698138e-06, + "clip_ratio/low_mean": 6.156819108582567e-05, + "clip_ratio/low_min": 5.763157332694391e-06, + "clip_ratio/region_mean": 6.802349253121065e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15062.0, + "completions/mean_length": 7353.421875, + "completions/mean_terminated_length": 7062.11279296875, + "completions/min_length": 916.0, + "completions/min_terminated_length": 916.0, + "entropy": 0.8961873054504395, + "epoch": 0.1812327506899724, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034921523183584213, + "learning_rate": 1e-05, + "loss": 0.0161, + "num_tokens": 166024306.0, + "reward": 0.4609375, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999784231185913, + "sampling/importance_sampling_ratio/min": 0.0005124400486238301, + "sampling/sampling_logp_difference/max": 7.576326847076416, + "sampling/sampling_logp_difference/mean": 0.019593238830566406, + "step": 197 + }, + { + "clip_ratio/high_max": 1.3040991007073899e-05, + "clip_ratio/high_mean": 4.292725350296678e-06, + "clip_ratio/low_mean": 5.347559840629401e-05, + "clip_ratio/low_min": 6.613406640099129e-06, + "clip_ratio/region_mean": 5.776832381343411e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15604.0, + "completions/mean_length": 7348.03125, + "completions/mean_terminated_length": 6903.63916015625, + "completions/min_length": 1619.0, + "completions/min_terminated_length": 1619.0, + "entropy": 0.824029266834259, + "epoch": 0.18215271389144433, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027784397825598717, + "learning_rate": 1e-05, + "loss": 0.0635, + "num_tokens": 166984982.0, + "reward": 0.40625, + "reward_std": 0.3437528908252716, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 0.0010020677000284195, + "sampling/sampling_logp_difference/max": 6.905689716339111, + "sampling/sampling_logp_difference/mean": 0.01857386901974678, + "step": 198 + }, + { + "clip_ratio/high_max": 3.330808067403268e-05, + "clip_ratio/high_mean": 1.0969530649163062e-05, + "clip_ratio/low_mean": 3.2080681648949394e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3050211388617754e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16358.0, + "completions/mean_length": 7290.4765625, + "completions/mean_terminated_length": 6920.82080078125, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 0.8884479627013206, + "epoch": 0.18307267709291627, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004110465291887522, + "learning_rate": 1e-05, + "loss": 0.0165, + "num_tokens": 167936971.0, + "reward": 0.4375, + "reward_std": 0.35901516675949097, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999493360519409, + "sampling/importance_sampling_ratio/min": 1.8145670992453233e-06, + "sampling/sampling_logp_difference/max": 13.219663619995117, + "sampling/sampling_logp_difference/mean": 0.019696572795510292, + "step": 199 + }, + { + "clip_ratio/high_max": 9.77357763076725e-06, + "clip_ratio/high_mean": 2.4433944076918124e-06, + "clip_ratio/low_mean": 3.466498992565903e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.710838473125477e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15824.0, + "completions/mean_length": 7803.625, + "completions/mean_terminated_length": 6833.66943359375, + "completions/min_length": 929.0, + "completions/min_terminated_length": 929.0, + "entropy": 0.8326860442757607, + "epoch": 0.18399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002410614863038063, + "learning_rate": 1e-05, + "loss": 0.1147, + "num_tokens": 168955683.0, + "reward": 0.3828125, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999977707862854, + "sampling/importance_sampling_ratio/min": 0.0008801451185718179, + "sampling/sampling_logp_difference/max": 7.035423755645752, + "sampling/sampling_logp_difference/mean": 0.018545793369412422, + "step": 200 + }, + { + "clip_ratio/high_max": 1.4602125929741305e-05, + "clip_ratio/high_mean": 3.6505314824353263e-06, + "clip_ratio/low_mean": 3.4781527119776e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8432058772741584e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16272.0, + "completions/mean_length": 6804.34375, + "completions/mean_terminated_length": 6495.322265625, + "completions/min_length": 645.0, + "completions/min_terminated_length": 645.0, + "entropy": 0.9669496119022369, + "epoch": 0.18491260349586017, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034376555122435093, + "learning_rate": 1e-05, + "loss": 0.0499, + "num_tokens": 169845823.0, + "reward": 0.3828125, + "reward_std": 0.31534504890441895, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000131130218506, + "sampling/importance_sampling_ratio/min": 1.767780588579626e-08, + "sampling/sampling_logp_difference/max": 17.850955963134766, + "sampling/sampling_logp_difference/mean": 0.020515555515885353, + "step": 201 + }, + { + "clip_ratio/high_max": 1.5814722473805887e-05, + "clip_ratio/high_mean": 3.953680618451472e-06, + "clip_ratio/low_mean": 3.574208744794305e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9695768407455034e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16350.0, + "completions/mean_length": 6827.9609375, + "completions/mean_terminated_length": 6105.23583984375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.8833946585655212, + "epoch": 0.1858325666973321, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0026675171684473753, + "learning_rate": 1e-05, + "loss": 0.1046, + "num_tokens": 170738210.0, + "reward": 0.421875, + "reward_std": 0.2698654532432556, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000019907951355, + "sampling/importance_sampling_ratio/min": 0.002906275913119316, + "sampling/sampling_logp_difference/max": 5.840882778167725, + "sampling/sampling_logp_difference/mean": 0.019948139786720276, + "step": 202 + }, + { + "clip_ratio/high_max": 1.6623121837255894e-05, + "clip_ratio/high_mean": 4.1557804593139736e-06, + "clip_ratio/low_mean": 6.462372630267055e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.877950727357529e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15725.0, + "completions/mean_length": 7377.984375, + "completions/mean_terminated_length": 7307.07080078125, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.8881714344024658, + "epoch": 0.18675252989880406, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0039620306342840195, + "learning_rate": 1e-05, + "loss": 0.034, + "num_tokens": 171705152.0, + "reward": 0.3359375, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999995231628418, + "sampling/importance_sampling_ratio/min": 2.4554079573135823e-05, + "sampling/sampling_logp_difference/max": 10.614632606506348, + "sampling/sampling_logp_difference/mean": 0.01964445412158966, + "step": 203 + }, + { + "clip_ratio/high_max": 9.639111340220552e-06, + "clip_ratio/high_mean": 2.409777835055138e-06, + "clip_ratio/low_mean": 2.775239624952519e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0162174198267167e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15265.0, + "completions/mean_length": 6051.8828125, + "completions/mean_terminated_length": 5543.74560546875, + "completions/min_length": 819.0, + "completions/min_terminated_length": 819.0, + "entropy": 0.8851477280259132, + "epoch": 0.18767249310027598, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0040458571165800095, + "learning_rate": 1e-05, + "loss": 0.0268, + "num_tokens": 172501881.0, + "reward": 0.4296875, + "reward_std": 0.16781240701675415, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999410510063171, + "sampling/importance_sampling_ratio/min": 0.0021976607386022806, + "sampling/sampling_logp_difference/max": 6.120361804962158, + "sampling/sampling_logp_difference/mean": 0.01957303285598755, + "step": 204 + }, + { + "clip_ratio/high_max": 9.72708312474424e-06, + "clip_ratio/high_mean": 3.529455852913088e-06, + "clip_ratio/low_mean": 5.158422732165491e-05, + "clip_ratio/low_min": 1.1939961495954776e-05, + "clip_ratio/region_mean": 5.5113683174567996e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16034.0, + "completions/mean_length": 7830.171875, + "completions/mean_terminated_length": 7409.4912109375, + "completions/min_length": 954.0, + "completions/min_terminated_length": 954.0, + "entropy": 0.9070459827780724, + "epoch": 0.18859245630174792, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005941574461758137, + "learning_rate": 1e-05, + "loss": 0.0427, + "num_tokens": 173522391.0, + "reward": 0.34375, + "reward_std": 0.33668074011802673, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000017881393433, + "sampling/importance_sampling_ratio/min": 0.00011712420382536948, + "sampling/sampling_logp_difference/max": 9.052275657653809, + "sampling/sampling_logp_difference/mean": 0.021295130252838135, + "step": 205 + }, + { + "clip_ratio/high_max": 5.5543214330100454e-06, + "clip_ratio/high_mean": 1.3885803582525114e-06, + "clip_ratio/low_mean": 1.718775109793569e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8576331683561875e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15443.0, + "completions/mean_length": 7520.6796875, + "completions/mean_terminated_length": 6769.55078125, + "completions/min_length": 1321.0, + "completions/min_terminated_length": 1321.0, + "entropy": 0.8843575045466423, + "epoch": 0.18951241950321987, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0025851845275610685, + "learning_rate": 1e-05, + "loss": 0.0273, + "num_tokens": 174504534.0, + "reward": 0.4765625, + "reward_std": 0.2188364714384079, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999911367893219, + "sampling/importance_sampling_ratio/min": 0.00039556476986035705, + "sampling/sampling_logp_difference/max": 7.835196018218994, + "sampling/sampling_logp_difference/mean": 0.02016005665063858, + "step": 206 + }, + { + "clip_ratio/high_max": 1.0145481155632297e-05, + "clip_ratio/high_mean": 2.536370288908074e-06, + "clip_ratio/low_mean": 3.617897255026037e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.871534295285528e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16173.0, + "completions/mean_length": 7382.1875, + "completions/mean_terminated_length": 6861.42138671875, + "completions/min_length": 934.0, + "completions/min_terminated_length": 934.0, + "entropy": 0.916313610970974, + "epoch": 0.19043238270469182, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004170550964772701, + "learning_rate": 1e-05, + "loss": 0.047, + "num_tokens": 175472574.0, + "reward": 0.46875, + "reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999932646751404, + "sampling/importance_sampling_ratio/min": 2.8054744689143263e-05, + "sampling/sampling_logp_difference/max": 10.481352806091309, + "sampling/sampling_logp_difference/mean": 0.020749717950820923, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.83663013963087e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.83663013963087e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13891.0, + "completions/mean_length": 6122.453125, + "completions/mean_terminated_length": 6041.6533203125, + "completions/min_length": 1192.0, + "completions/min_terminated_length": 1192.0, + "entropy": 0.8984386026859283, + "epoch": 0.19135234590616376, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004663965664803982, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 176275568.0, + "reward": 0.4765625, + "reward_std": 0.3284856975078583, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999363422393799, + "sampling/importance_sampling_ratio/min": 7.88934721640544e-06, + "sampling/sampling_logp_difference/max": 11.74999713897705, + "sampling/sampling_logp_difference/mean": 0.020278753712773323, + "step": 208 + }, + { + "clip_ratio/high_max": 1.4535152331518475e-05, + "clip_ratio/high_mean": 3.6337880828796187e-06, + "clip_ratio/low_mean": 4.3961883989140915e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7595671958333696e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15547.0, + "completions/mean_length": 4983.2890625, + "completions/mean_terminated_length": 4709.67236328125, + "completions/min_length": 589.0, + "completions/min_terminated_length": 589.0, + "entropy": 0.825260303914547, + "epoch": 0.1922723091076357, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004848882555961609, + "learning_rate": 1e-05, + "loss": 0.1066, + "num_tokens": 176932549.0, + "reward": 0.6484375, + "reward_std": 0.2772369980812073, + "rewards/accuracy_reward/mean": 0.6484375, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999616146087646, + "sampling/importance_sampling_ratio/min": 1.626804078114219e-05, + "sampling/sampling_logp_difference/max": 11.026308059692383, + "sampling/sampling_logp_difference/mean": 0.017959970980882645, + "step": 209 + }, + { + "clip_ratio/high_max": 1.1141860795760294e-05, + "clip_ratio/high_mean": 2.7854651989400736e-06, + "clip_ratio/low_mean": 4.2418692146384274e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5204157913758536e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15415.0, + "completions/mean_length": 5766.5234375, + "completions/mean_terminated_length": 5511.7041015625, + "completions/min_length": 700.0, + "completions/min_terminated_length": 700.0, + "entropy": 0.9016259610652924, + "epoch": 0.19319227230910763, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004749474115669727, + "learning_rate": 1e-05, + "loss": 0.0977, + "num_tokens": 177691752.0, + "reward": 0.5, + "reward_std": 0.2738044261932373, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000141859054565, + "sampling/importance_sampling_ratio/min": 8.927558155846782e-06, + "sampling/sampling_logp_difference/max": 11.626367568969727, + "sampling/sampling_logp_difference/mean": 0.019118282943964005, + "step": 210 + }, + { + "clip_ratio/high_max": 5.5243735914700665e-06, + "clip_ratio/high_mean": 2.1587275114143267e-06, + "clip_ratio/low_mean": 4.609663824339805e-05, + "clip_ratio/low_min": 3.983555870945565e-06, + "clip_ratio/region_mean": 4.8255366664307076e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15696.0, + "completions/mean_length": 6993.671875, + "completions/mean_terminated_length": 6768.30419921875, + "completions/min_length": 889.0, + "completions/min_terminated_length": 889.0, + "entropy": 0.9074988812208176, + "epoch": 0.19411223551057957, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004418120253831148, + "learning_rate": 1e-05, + "loss": 0.1135, + "num_tokens": 178603454.0, + "reward": 0.5390625, + "reward_std": 0.3469353914260864, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000037670135498, + "sampling/importance_sampling_ratio/min": 0.0018135923892259598, + "sampling/sampling_logp_difference/max": 6.312445640563965, + "sampling/sampling_logp_difference/mean": 0.01957814022898674, + "step": 211 + }, + { + "clip_ratio/high_max": 5.126943051436683e-06, + "clip_ratio/high_mean": 1.2817357628591708e-06, + "clip_ratio/low_mean": 2.7488794444252562e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.877053032079857e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15316.0, + "completions/mean_length": 7445.1328125, + "completions/mean_terminated_length": 6849.20849609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 0.9255013465881348, + "epoch": 0.19503219871205152, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00237120408564806, + "learning_rate": 1e-05, + "loss": 0.0172, + "num_tokens": 179577063.0, + "reward": 0.40625, + "reward_std": 0.21040897071361542, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999725818634033, + "sampling/importance_sampling_ratio/min": 9.651589061832055e-05, + "sampling/sampling_logp_difference/max": 9.245802879333496, + "sampling/sampling_logp_difference/mean": 0.02165937051177025, + "step": 212 + }, + { + "clip_ratio/high_max": 1.8956294752570102e-05, + "clip_ratio/high_mean": 4.7390736881425255e-06, + "clip_ratio/low_mean": 2.6486316301088664e-05, + "clip_ratio/low_min": 3.516273409331916e-06, + "clip_ratio/region_mean": 3.122539010291803e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 6120.5546875, + "completions/mean_terminated_length": 5703.34130859375, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 0.8181199952960014, + "epoch": 0.19595216191352346, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004715202376246452, + "learning_rate": 1e-05, + "loss": 0.1291, + "num_tokens": 180380422.0, + "reward": 0.5, + "reward_std": 0.29355230927467346, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999874472618103, + "sampling/importance_sampling_ratio/min": 0.004350374918431044, + "sampling/sampling_logp_difference/max": 5.437493324279785, + "sampling/sampling_logp_difference/mean": 0.018377620726823807, + "step": 213 + }, + { + "clip_ratio/high_max": 5.594843969447538e-06, + "clip_ratio/high_mean": 2.376495558564784e-06, + "clip_ratio/low_mean": 3.4097628713425365e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6474124044616474e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16005.0, + "completions/mean_length": 6351.203125, + "completions/mean_terminated_length": 5857.78662109375, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 0.8798654451966286, + "epoch": 0.1968721251149954, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003063712501898408, + "learning_rate": 1e-05, + "loss": 0.0311, + "num_tokens": 181212776.0, + "reward": 0.453125, + "reward_std": 0.3048579692840576, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999946355819702, + "sampling/importance_sampling_ratio/min": 7.891544555604924e-06, + "sampling/sampling_logp_difference/max": 11.74971866607666, + "sampling/sampling_logp_difference/mean": 0.019523698836565018, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.544438988001275e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.544438988001275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14180.0, + "completions/mean_length": 6330.046875, + "completions/mean_terminated_length": 6170.46044921875, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.8319354206323624, + "epoch": 0.19779208831646733, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033194730058312416, + "learning_rate": 1e-05, + "loss": 0.0924, + "num_tokens": 182041910.0, + "reward": 0.453125, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998994469642639, + "sampling/importance_sampling_ratio/min": 0.00010535263572819531, + "sampling/sampling_logp_difference/max": 9.158197402954102, + "sampling/sampling_logp_difference/mean": 0.018981872126460075, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.7156292415165808e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7156292415165808e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15982.0, + "completions/mean_length": 6665.2890625, + "completions/mean_terminated_length": 6351.7822265625, + "completions/min_length": 722.0, + "completions/min_terminated_length": 722.0, + "entropy": 0.9336326420307159, + "epoch": 0.19871205151793928, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.004492956213653088, + "learning_rate": 1e-05, + "loss": 0.0288, + "num_tokens": 182914843.0, + "reward": 0.3828125, + "reward_std": 0.14807432889938354, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000030279159546, + "sampling/importance_sampling_ratio/min": 0.011399568989872932, + "sampling/sampling_logp_difference/max": 4.474179744720459, + "sampling/sampling_logp_difference/mean": 0.02088768407702446, + "step": 216 + }, + { + "clip_ratio/high_max": 3.2495465802639956e-05, + "clip_ratio/high_mean": 9.084843100026774e-06, + "clip_ratio/low_mean": 5.4809036328151706e-05, + "clip_ratio/low_min": 8.953898031904828e-06, + "clip_ratio/region_mean": 6.389387954186532e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16064.0, + "completions/mean_length": 5393.9140625, + "completions/mean_terminated_length": 5039.39501953125, + "completions/min_length": 628.0, + "completions/min_terminated_length": 628.0, + "entropy": 0.7864786610007286, + "epoch": 0.19963201471941122, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003816079581156373, + "learning_rate": 1e-05, + "loss": -0.004, + "num_tokens": 183628152.0, + "reward": 0.546875, + "reward_std": 0.31694266200065613, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998779892921448, + "sampling/importance_sampling_ratio/min": 0.003246711567044258, + "sampling/sampling_logp_difference/max": 5.730112552642822, + "sampling/sampling_logp_difference/mean": 0.018448319286108017, + "step": 217 + }, + { + "clip_ratio/high_max": 8.638648068881594e-06, + "clip_ratio/high_mean": 2.1596620172203984e-06, + "clip_ratio/low_mean": 1.6896704778446292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9056366909353528e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15453.0, + "completions/mean_length": 7161.5, + "completions/mean_terminated_length": 7015.111328125, + "completions/min_length": 681.0, + "completions/min_terminated_length": 681.0, + "entropy": 0.915394201874733, + "epoch": 0.20055197792088317, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003666195785626769, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 184562352.0, + "reward": 0.3671875, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294281005859, + "sampling/importance_sampling_ratio/min": 0.00025550799909979105, + "sampling/sampling_logp_difference/max": 8.272256851196289, + "sampling/sampling_logp_difference/mean": 0.019755780696868896, + "step": 218 + }, + { + "clip_ratio/high_max": 6.424931598303374e-06, + "clip_ratio/high_mean": 1.6062328995758435e-06, + "clip_ratio/low_mean": 2.49038239417132e-05, + "clip_ratio/low_min": 4.00025601265952e-06, + "clip_ratio/region_mean": 2.651005689813246e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15408.0, + "completions/mean_length": 7957.671875, + "completions/mean_terminated_length": 7685.8544921875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 1.1176252663135529, + "epoch": 0.2014719411223551, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0025940234772861004, + "learning_rate": 1e-05, + "loss": 0.032, + "num_tokens": 185606670.0, + "reward": 0.1171875, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999893844127655, + "sampling/importance_sampling_ratio/min": 0.0007622809498570859, + "sampling/sampling_logp_difference/max": 7.179195404052734, + "sampling/sampling_logp_difference/mean": 0.02338646724820137, + "step": 219 + }, + { + "clip_ratio/high_max": 1.9903963220713194e-05, + "clip_ratio/high_mean": 5.829163114867697e-06, + "clip_ratio/low_mean": 4.4742550926457625e-05, + "clip_ratio/low_min": 3.5803282116830815e-06, + "clip_ratio/region_mean": 5.057171370026481e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16146.0, + "completions/mean_length": 7060.6640625, + "completions/mean_terminated_length": 6759.9111328125, + "completions/min_length": 1460.0, + "completions/min_terminated_length": 1460.0, + "entropy": 0.9148540124297142, + "epoch": 0.20239190432382706, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004315398633480072, + "learning_rate": 1e-05, + "loss": 0.0607, + "num_tokens": 186526883.0, + "reward": 0.5078125, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 0.0004585353017318994, + "sampling/sampling_logp_difference/max": 7.687473297119141, + "sampling/sampling_logp_difference/mean": 0.01967843994498253, + "step": 220 + }, + { + "clip_ratio/high_max": 1.147099328591139e-05, + "clip_ratio/high_mean": 2.8677483214778476e-06, + "clip_ratio/low_mean": 2.8967988555450574e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1835736763241584e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15596.0, + "completions/mean_length": 6649.6640625, + "completions/mean_terminated_length": 6416.04052734375, + "completions/min_length": 780.0, + "completions/min_terminated_length": 780.0, + "entropy": 0.9298559054732323, + "epoch": 0.20331186752529898, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030786178540438414, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 187397536.0, + "reward": 0.4453125, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000005841255188, + "sampling/importance_sampling_ratio/min": 3.2821125728332845e-07, + "sampling/sampling_logp_difference/max": 14.929608345031738, + "sampling/sampling_logp_difference/mean": 0.020215414464473724, + "step": 221 + }, + { + "clip_ratio/high_max": 2.2768570943298982e-05, + "clip_ratio/high_mean": 5.692142735824746e-06, + "clip_ratio/low_mean": 3.249637484259438e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8188517464732286e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16210.0, + "completions/mean_length": 8292.015625, + "completions/mean_terminated_length": 7823.8837890625, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "entropy": 0.8232023045420647, + "epoch": 0.20423183072677092, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002438523108139634, + "learning_rate": 1e-05, + "loss": 0.044, + "num_tokens": 188477778.0, + "reward": 0.328125, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000240802764893, + "sampling/importance_sampling_ratio/min": 0.005636279005557299, + "sampling/sampling_logp_difference/max": 5.178531169891357, + "sampling/sampling_logp_difference/mean": 0.018984414637088776, + "step": 222 + }, + { + "clip_ratio/high_max": 2.0840709566982696e-05, + "clip_ratio/high_mean": 6.135253556749376e-06, + "clip_ratio/low_mean": 2.255633432923787e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.869158777230041e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15991.0, + "completions/mean_length": 7600.9765625, + "completions/mean_terminated_length": 6936.71484375, + "completions/min_length": 995.0, + "completions/min_terminated_length": 995.0, + "entropy": 0.8689917623996735, + "epoch": 0.20515179392824287, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004773247055709362, + "learning_rate": 1e-05, + "loss": 0.0486, + "num_tokens": 189470655.0, + "reward": 0.40625, + "reward_std": 0.3079911172389984, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999308586120605, + "sampling/importance_sampling_ratio/min": 0.001327168894931674, + "sampling/sampling_logp_difference/max": 6.624707221984863, + "sampling/sampling_logp_difference/mean": 0.018666012212634087, + "step": 223 + }, + { + "clip_ratio/high_max": 9.837458947004052e-06, + "clip_ratio/high_mean": 2.459364736751013e-06, + "clip_ratio/low_mean": 6.463955219260242e-05, + "clip_ratio/low_min": 1.0895145351241808e-05, + "clip_ratio/region_mean": 6.70989177251613e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16215.0, + "completions/mean_length": 7600.34375, + "completions/mean_terminated_length": 6855.96630859375, + "completions/min_length": 1335.0, + "completions/min_terminated_length": 1335.0, + "entropy": 0.7636929750442505, + "epoch": 0.20607175712971482, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004298723768442869, + "learning_rate": 1e-05, + "loss": 0.145, + "num_tokens": 190462227.0, + "reward": 0.515625, + "reward_std": 0.2919674217700958, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999310374259949, + "sampling/importance_sampling_ratio/min": 4.5565320760942996e-05, + "sampling/sampling_logp_difference/max": 9.996363639831543, + "sampling/sampling_logp_difference/mean": 0.018035393208265305, + "step": 224 + }, + { + "clip_ratio/high_max": 1.4060602325116633e-05, + "clip_ratio/high_mean": 3.5151505812791584e-06, + "clip_ratio/low_mean": 2.6516039497437305e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.003119024924672e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15151.0, + "completions/mean_length": 6512.0, + "completions/mean_terminated_length": 6434.267578125, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + "entropy": 0.9043584689497948, + "epoch": 0.20699172033118676, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006741553544998169, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 191312483.0, + "reward": 0.484375, + "reward_std": 0.2290911078453064, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000028610229492, + "sampling/importance_sampling_ratio/min": 1.778468504198827e-05, + "sampling/sampling_logp_difference/max": 10.937172889709473, + "sampling/sampling_logp_difference/mean": 0.020878732204437256, + "step": 225 + }, + { + "clip_ratio/high_max": 1.7356085209030425e-05, + "clip_ratio/high_mean": 4.339021302257606e-06, + "clip_ratio/low_mean": 2.8831826739406097e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.317084781429003e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16339.0, + "completions/mean_length": 7178.6875, + "completions/mean_terminated_length": 6565.00048828125, + "completions/min_length": 847.0, + "completions/min_terminated_length": 847.0, + "entropy": 0.8899475410580635, + "epoch": 0.20791168353265868, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00281486171297729, + "learning_rate": 1e-05, + "loss": 0.0515, + "num_tokens": 192251235.0, + "reward": 0.3984375, + "reward_std": 0.2240736484527588, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999714493751526, + "sampling/importance_sampling_ratio/min": 9.012543159769848e-05, + "sampling/sampling_logp_difference/max": 9.314308166503906, + "sampling/sampling_logp_difference/mean": 0.020196784287691116, + "step": 226 + }, + { + "clip_ratio/high_max": 1.5558084214717383e-05, + "clip_ratio/high_mean": 3.889521053679346e-06, + "clip_ratio/low_mean": 3.0248688972278615e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.413820991227112e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15501.0, + "completions/max_terminated_length": 15501.0, + "completions/mean_length": 6602.5625, + "completions/mean_terminated_length": 6602.5625, + "completions/min_length": 927.0, + "completions/min_terminated_length": 927.0, + "entropy": 0.9266818463802338, + "epoch": 0.20883164673413063, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005070593673735857, + "learning_rate": 1e-05, + "loss": 0.0781, + "num_tokens": 193116763.0, + "reward": 0.53125, + "reward_std": 0.304571270942688, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999746680259705, + "sampling/importance_sampling_ratio/min": 2.726537559283315e-06, + "sampling/sampling_logp_difference/max": 12.812478065490723, + "sampling/sampling_logp_difference/mean": 0.020026464015245438, + "step": 227 + }, + { + "clip_ratio/high_max": 4.188727416476468e-06, + "clip_ratio/high_mean": 1.047181854119117e-06, + "clip_ratio/low_mean": 2.959152834591805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.063871008635033e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16046.0, + "completions/mean_length": 6818.8828125, + "completions/mean_terminated_length": 6430.056640625, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "entropy": 0.874519519507885, + "epoch": 0.20975160993560257, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006362155079841614, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 194007868.0, + "reward": 0.4765625, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000009298324585, + "sampling/importance_sampling_ratio/min": 0.0005216691642999649, + "sampling/sampling_logp_difference/max": 7.55847692489624, + "sampling/sampling_logp_difference/mean": 0.01943325623869896, + "step": 228 + }, + { + "clip_ratio/high_max": 9.645911177358357e-06, + "clip_ratio/high_mean": 2.4114777943395893e-06, + "clip_ratio/low_mean": 6.821557258263056e-05, + "clip_ratio/low_min": 1.7265090718865395e-05, + "clip_ratio/region_mean": 7.062705049065698e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14536.0, + "completions/mean_length": 5515.625, + "completions/mean_terminated_length": 5343.111328125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 1.0683523043990135, + "epoch": 0.21067157313707452, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003797185141593218, + "learning_rate": 1e-05, + "loss": 0.1061, + "num_tokens": 194735980.0, + "reward": 0.421875, + "reward_std": 0.34010058641433716, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999911367893219, + "sampling/importance_sampling_ratio/min": 1.137102216830499e-07, + "sampling/sampling_logp_difference/max": 15.989612579345703, + "sampling/sampling_logp_difference/mean": 0.02120930328965187, + "step": 229 + }, + { + "clip_ratio/high_max": 2.1971412252241862e-05, + "clip_ratio/high_mean": 5.4928530630604655e-06, + "clip_ratio/low_mean": 4.9151800567415194e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4644653801005916e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + "completions/mean_length": 5853.546875, + "completions/mean_terminated_length": 5770.6298828125, + "completions/min_length": 615.0, + "completions/min_terminated_length": 615.0, + "entropy": 0.7975900694727898, + "epoch": 0.21159153633854647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004124365746974945, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 195504882.0, + "reward": 0.5859375, + "reward_std": 0.3243142366409302, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000672340393066, + "sampling/importance_sampling_ratio/min": 0.0032877910416573286, + "sampling/sampling_logp_difference/max": 5.717539310455322, + "sampling/sampling_logp_difference/mean": 0.017819223925471306, + "step": 230 + }, + { + "clip_ratio/high_max": 7.066538728395244e-06, + "clip_ratio/high_mean": 2.843255515472265e-06, + "clip_ratio/low_mean": 5.1467116236381116e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.431037175185338e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15503.0, + "completions/mean_length": 6686.25, + "completions/mean_terminated_length": 6532.31787109375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.9018580466508865, + "epoch": 0.2125114995400184, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024995009880512953, + "learning_rate": 1e-05, + "loss": 0.0947, + "num_tokens": 196379306.0, + "reward": 0.421875, + "reward_std": 0.35824593901634216, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999300837516785, + "sampling/importance_sampling_ratio/min": 2.0017207134515047e-05, + "sampling/sampling_logp_difference/max": 10.818918228149414, + "sampling/sampling_logp_difference/mean": 0.018989525735378265, + "step": 231 + }, + { + "clip_ratio/high_max": 6.652828687947476e-06, + "clip_ratio/high_mean": 2.5722979444253724e-06, + "clip_ratio/low_mean": 3.699686294567073e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.95691608900961e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16347.0, + "completions/mean_length": 7487.3359375, + "completions/mean_terminated_length": 7200.3466796875, + "completions/min_length": 1222.0, + "completions/min_terminated_length": 1222.0, + "entropy": 0.9890001565217972, + "epoch": 0.21343146274149033, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004295211285352707, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 197357397.0, + "reward": 0.40625, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000002384185791, + "sampling/importance_sampling_ratio/min": 0.0006548459641635418, + "sampling/sampling_logp_difference/max": 7.33111047744751, + "sampling/sampling_logp_difference/mean": 0.02209121733903885, + "step": 232 + }, + { + "clip_ratio/high_max": 6.0850939007650595e-06, + "clip_ratio/high_mean": 1.5212734751912649e-06, + "clip_ratio/low_mean": 2.9443070673096372e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0964344205131056e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15825.0, + "completions/mean_length": 7233.484375, + "completions/mean_terminated_length": 6938.30615234375, + "completions/min_length": 706.0, + "completions/min_terminated_length": 706.0, + "entropy": 0.9683803990483284, + "epoch": 0.21435142594296228, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003119673579931259, + "learning_rate": 1e-05, + "loss": 0.0914, + "num_tokens": 198303795.0, + "reward": 0.328125, + "reward_std": 0.23014704883098602, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000243186950684, + "sampling/importance_sampling_ratio/min": 0.020358745008707047, + "sampling/sampling_logp_difference/max": 3.89424467086792, + "sampling/sampling_logp_difference/mean": 0.021085180342197418, + "step": 233 + }, + { + "clip_ratio/high_max": 7.963812095113099e-06, + "clip_ratio/high_mean": 1.9909530237782747e-06, + "clip_ratio/low_mean": 4.031422963635123e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.23051826601295e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15733.0, + "completions/mean_length": 6457.78125, + "completions/mean_terminated_length": 6300.22265625, + "completions/min_length": 850.0, + "completions/min_terminated_length": 850.0, + "entropy": 0.8881053999066353, + "epoch": 0.21527138914443422, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033790848683565855, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 199154735.0, + "reward": 0.3828125, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998799562454224, + "sampling/importance_sampling_ratio/min": 2.872048128210736e-07, + "sampling/sampling_logp_difference/max": 15.063070297241211, + "sampling/sampling_logp_difference/mean": 0.01950821653008461, + "step": 234 + }, + { + "clip_ratio/high_max": 9.059622016138746e-06, + "clip_ratio/high_mean": 3.3430123380639998e-06, + "clip_ratio/low_mean": 2.2856192117615137e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6199204512522556e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16318.0, + "completions/mean_length": 7904.40625, + "completions/mean_terminated_length": 7769.81005859375, + "completions/min_length": 1128.0, + "completions/min_terminated_length": 1128.0, + "entropy": 0.9881557524204254, + "epoch": 0.21619135234590617, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0021492803934961557, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 200185643.0, + "reward": 0.359375, + "reward_std": 0.1820138692855835, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001094341278076, + "sampling/importance_sampling_ratio/min": 0.001458622980862856, + "sampling/sampling_logp_difference/max": 6.530262470245361, + "sampling/sampling_logp_difference/mean": 0.021201875060796738, + "step": 235 + }, + { + "clip_ratio/high_max": 6.9962839006620925e-06, + "clip_ratio/high_mean": 1.7490709751655231e-06, + "clip_ratio/low_mean": 3.018811844412994e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.193718976035598e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15328.0, + "completions/max_terminated_length": 15328.0, + "completions/mean_length": 7414.4921875, + "completions/mean_terminated_length": 7414.4921875, + "completions/min_length": 949.0, + "completions/min_terminated_length": 949.0, + "entropy": 0.9571134969592094, + "epoch": 0.21711131554737811, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0037221095990389585, + "learning_rate": 1e-05, + "loss": 0.0327, + "num_tokens": 201153114.0, + "reward": 0.4375, + "reward_std": 0.248829185962677, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999958872795105, + "sampling/importance_sampling_ratio/min": 0.0009130563121289015, + "sampling/sampling_logp_difference/max": 6.99871301651001, + "sampling/sampling_logp_difference/mean": 0.021356744691729546, + "step": 236 + }, + { + "clip_ratio/high_max": 1.1248092050664127e-05, + "clip_ratio/high_mean": 2.8120230126660317e-06, + "clip_ratio/low_mean": 5.4354991334548686e-05, + "clip_ratio/low_min": 6.868132004456129e-06, + "clip_ratio/region_mean": 5.716701480196207e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15835.0, + "completions/max_terminated_length": 15835.0, + "completions/mean_length": 5955.953125, + "completions/mean_terminated_length": 5955.953125, + "completions/min_length": 1394.0, + "completions/min_terminated_length": 1394.0, + "entropy": 0.730999618768692, + "epoch": 0.21803127874885003, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.006285305600613356, + "learning_rate": 1e-05, + "loss": 0.0641, + "num_tokens": 201933044.0, + "reward": 0.59375, + "reward_std": 0.31011277437210083, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999420642852783, + "sampling/importance_sampling_ratio/min": 0.007535050623118877, + "sampling/sampling_logp_difference/max": 4.888189792633057, + "sampling/sampling_logp_difference/mean": 0.016975615173578262, + "step": 237 + }, + { + "clip_ratio/high_max": 7.226686648209579e-06, + "clip_ratio/high_mean": 3.094216481258627e-06, + "clip_ratio/low_mean": 4.66828214484849e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.977703792974353e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15959.0, + "completions/mean_length": 6923.3515625, + "completions/mean_terminated_length": 6458.0732421875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.9938417226076126, + "epoch": 0.21895124195032198, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005667983554303646, + "learning_rate": 1e-05, + "loss": 0.0793, + "num_tokens": 202837281.0, + "reward": 0.2578125, + "reward_std": 0.26826781034469604, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999980092048645, + "sampling/importance_sampling_ratio/min": 3.0342773243319243e-05, + "sampling/sampling_logp_difference/max": 10.402952194213867, + "sampling/sampling_logp_difference/mean": 0.022059854120016098, + "step": 238 + }, + { + "clip_ratio/high_max": 5.2318769121484365e-06, + "clip_ratio/high_mean": 1.3079692280371091e-06, + "clip_ratio/low_mean": 4.239228087499214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3700250216716086e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14726.0, + "completions/max_terminated_length": 14726.0, + "completions/mean_length": 5930.9296875, + "completions/mean_terminated_length": 5930.9296875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "entropy": 0.8100385963916779, + "epoch": 0.21987120515179392, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004052883945405483, + "learning_rate": 1e-05, + "loss": 0.0299, + "num_tokens": 203614448.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999989926815033, + "sampling/importance_sampling_ratio/min": 0.00015170808183029294, + "sampling/sampling_logp_difference/max": 8.79355239868164, + "sampling/sampling_logp_difference/mean": 0.018519222736358643, + "step": 239 + }, + { + "clip_ratio/high_max": 4.905230980511988e-06, + "clip_ratio/high_mean": 1.226307745127997e-06, + "clip_ratio/low_mean": 5.500513248080097e-05, + "clip_ratio/low_min": 7.924934834591113e-06, + "clip_ratio/region_mean": 5.6231440112242126e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14996.0, + "completions/mean_length": 6911.1015625, + "completions/mean_terminated_length": 6108.3134765625, + "completions/min_length": 862.0, + "completions/min_terminated_length": 862.0, + "entropy": 0.9260227829217911, + "epoch": 0.22079116835326587, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004494607914239168, + "learning_rate": 1e-05, + "loss": 0.0269, + "num_tokens": 204518261.0, + "reward": 0.4140625, + "reward_std": 0.34033796191215515, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998886585235596, + "sampling/importance_sampling_ratio/min": 0.0015266009140759706, + "sampling/sampling_logp_difference/max": 6.484711647033691, + "sampling/sampling_logp_difference/mean": 0.020527629181742668, + "step": 240 + }, + { + "clip_ratio/high_max": 8.293764039990492e-06, + "clip_ratio/high_mean": 2.073441009997623e-06, + "clip_ratio/low_mean": 4.75325257411896e-05, + "clip_ratio/low_min": 3.599504680096288e-06, + "clip_ratio/region_mean": 4.960596663750039e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14637.0, + "completions/mean_length": 6972.921875, + "completions/mean_terminated_length": 6823.5400390625, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "entropy": 1.0095533654093742, + "epoch": 0.22171113155473782, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029451537411659956, + "learning_rate": 1e-05, + "loss": 0.0108, + "num_tokens": 205433843.0, + "reward": 0.3515625, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000104904174805, + "sampling/importance_sampling_ratio/min": 2.6675223125494085e-05, + "sampling/sampling_logp_difference/max": 10.53177547454834, + "sampling/sampling_logp_difference/mean": 0.02013089321553707, + "step": 241 + }, + { + "clip_ratio/high_max": 4.163383164268453e-05, + "clip_ratio/high_mean": 1.382379150527413e-05, + "clip_ratio/low_mean": 3.86000854177837e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2423876240936806e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16103.0, + "completions/mean_length": 6706.6640625, + "completions/mean_terminated_length": 6313.2763671875, + "completions/min_length": 892.0, + "completions/min_terminated_length": 892.0, + "entropy": 0.8647518903017044, + "epoch": 0.22263109475620976, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003371767932549119, + "learning_rate": 1e-05, + "loss": 0.073, + "num_tokens": 206310296.0, + "reward": 0.5703125, + "reward_std": 0.3537652790546417, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999367594718933, + "sampling/importance_sampling_ratio/min": 2.948181463580113e-05, + "sampling/sampling_logp_difference/max": 10.431736946105957, + "sampling/sampling_logp_difference/mean": 0.019770190119743347, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.4946740381892596e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4946740381892596e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16136.0, + "completions/mean_length": 6882.609375, + "completions/mean_terminated_length": 6415.32763671875, + "completions/min_length": 1119.0, + "completions/min_terminated_length": 1119.0, + "entropy": 1.013342760503292, + "epoch": 0.22355105795768168, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0016336971893906593, + "learning_rate": 1e-05, + "loss": 0.0281, + "num_tokens": 207210974.0, + "reward": 0.359375, + "reward_std": 0.15650184452533722, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999210834503174, + "sampling/importance_sampling_ratio/min": 0.0013267879839986563, + "sampling/sampling_logp_difference/max": 6.624994277954102, + "sampling/sampling_logp_difference/mean": 0.02139991894364357, + "step": 243 + }, + { + "clip_ratio/high_max": 1.4866403944324702e-05, + "clip_ratio/high_mean": 3.7166009860811755e-06, + "clip_ratio/low_mean": 3.938925010515959e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.310585177336179e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15203.0, + "completions/max_terminated_length": 15203.0, + "completions/mean_length": 6195.7421875, + "completions/mean_terminated_length": 6195.7421875, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "entropy": 0.8448907434940338, + "epoch": 0.22447102115915363, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005036406684666872, + "learning_rate": 1e-05, + "loss": 0.0542, + "num_tokens": 208021893.0, + "reward": 0.5234375, + "reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999955892562866, + "sampling/importance_sampling_ratio/min": 0.0040348549373447895, + "sampling/sampling_logp_difference/max": 5.512784957885742, + "sampling/sampling_logp_difference/mean": 0.018679853528738022, + "step": 244 + }, + { + "clip_ratio/high_max": 1.1244883353356272e-05, + "clip_ratio/high_mean": 2.811220838339068e-06, + "clip_ratio/low_mean": 3.422392001084518e-05, + "clip_ratio/low_min": 6.451612989621935e-06, + "clip_ratio/region_mean": 3.703514119024476e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16305.0, + "completions/mean_length": 6829.609375, + "completions/mean_terminated_length": 6521.40283203125, + "completions/min_length": 735.0, + "completions/min_terminated_length": 735.0, + "entropy": 0.8679579794406891, + "epoch": 0.22539098436062557, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029643685556948185, + "learning_rate": 1e-05, + "loss": 0.0907, + "num_tokens": 208912059.0, + "reward": 0.46875, + "reward_std": 0.3079911172389984, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999761581420898, + "sampling/importance_sampling_ratio/min": 0.00038063788088038564, + "sampling/sampling_logp_difference/max": 7.873661994934082, + "sampling/sampling_logp_difference/mean": 0.018488366156816483, + "step": 245 + }, + { + "clip_ratio/high_max": 2.2700600311509334e-05, + "clip_ratio/high_mean": 5.675150077877333e-06, + "clip_ratio/low_mean": 3.138338854569156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.705853873725573e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14503.0, + "completions/max_terminated_length": 14503.0, + "completions/mean_length": 5444.4453125, + "completions/mean_terminated_length": 5444.4453125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 1.0460086688399315, + "epoch": 0.22631094756209752, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035942886024713516, + "learning_rate": 1e-05, + "loss": 0.0932, + "num_tokens": 209627804.0, + "reward": 0.484375, + "reward_std": 0.338498055934906, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99997478723526, + "sampling/importance_sampling_ratio/min": 0.03179635480046272, + "sampling/sampling_logp_difference/max": 3.4484035968780518, + "sampling/sampling_logp_difference/mean": 0.020146891474723816, + "step": 246 + }, + { + "clip_ratio/high_max": 1.477029400120955e-05, + "clip_ratio/high_mean": 4.552578502625693e-06, + "clip_ratio/low_mean": 5.265122354103369e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.720380158891203e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16244.0, + "completions/mean_length": 7657.390625, + "completions/mean_terminated_length": 7152.544921875, + "completions/min_length": 1048.0, + "completions/min_terminated_length": 1048.0, + "entropy": 0.9528728649020195, + "epoch": 0.22723091076356947, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0044983453117311, + "learning_rate": 1e-05, + "loss": 0.0699, + "num_tokens": 210630150.0, + "reward": 0.4375, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000007152557373, + "sampling/importance_sampling_ratio/min": 3.8753667467972264e-05, + "sampling/sampling_logp_difference/max": 10.158285140991211, + "sampling/sampling_logp_difference/mean": 0.02131088823080063, + "step": 247 + }, + { + "clip_ratio/high_max": 8.607642712377128e-06, + "clip_ratio/high_mean": 2.151910678094282e-06, + "clip_ratio/low_mean": 2.2759413695894182e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.491132454451872e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16284.0, + "completions/mean_length": 7574.3515625, + "completions/mean_terminated_length": 7504.984375, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 1.0009776800870895, + "epoch": 0.2281508739650414, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006095650140196085, + "learning_rate": 1e-05, + "loss": 0.0566, + "num_tokens": 211620355.0, + "reward": 0.3515625, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000433921813965, + "sampling/importance_sampling_ratio/min": 0.0013946897815912962, + "sampling/sampling_logp_difference/max": 6.575083255767822, + "sampling/sampling_logp_difference/mean": 0.021727774292230606, + "step": 248 + }, + { + "clip_ratio/high_max": 1.764823082339717e-05, + "clip_ratio/high_mean": 5.141430960975413e-06, + "clip_ratio/low_mean": 5.936152001595474e-05, + "clip_ratio/low_min": 9.155588486464694e-06, + "clip_ratio/region_mean": 6.450295177273802e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14915.0, + "completions/mean_length": 7919.6875, + "completions/mean_terminated_length": 7716.54443359375, + "completions/min_length": 1517.0, + "completions/min_terminated_length": 1517.0, + "entropy": 1.0405654236674309, + "epoch": 0.22907083716651333, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037038614973425865, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 212654747.0, + "reward": 0.3125, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999381899833679, + "sampling/importance_sampling_ratio/min": 0.0057550109922885895, + "sampling/sampling_logp_difference/max": 5.157684326171875, + "sampling/sampling_logp_difference/mean": 0.022051017731428146, + "step": 249 + }, + { + "clip_ratio/high_max": 1.265254240934155e-05, + "clip_ratio/high_mean": 3.1631356023353874e-06, + "clip_ratio/low_mean": 4.716233138424286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.032546687289141e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16122.0, + "completions/mean_length": 8613.4765625, + "completions/mean_terminated_length": 7735.0693359375, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 0.890489287674427, + "epoch": 0.22999080036798528, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00325607368722558, + "learning_rate": 1e-05, + "loss": 0.0571, + "num_tokens": 213774584.0, + "reward": 0.40625, + "reward_std": 0.33668074011802673, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000060796737671, + "sampling/importance_sampling_ratio/min": 1.670176425250247e-05, + "sampling/sampling_logp_difference/max": 10.999996185302734, + "sampling/sampling_logp_difference/mean": 0.020002499222755432, + "step": 250 + }, + { + "clip_ratio/high_max": 1.6404605503339553e-05, + "clip_ratio/high_mean": 4.101151375834888e-06, + "clip_ratio/low_mean": 3.880500707964529e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2906158682853857e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16022.0, + "completions/mean_length": 7324.8984375, + "completions/mean_terminated_length": 6473.1884765625, + "completions/min_length": 704.0, + "completions/min_terminated_length": 704.0, + "entropy": 0.761004202067852, + "epoch": 0.23091076356945722, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0038265211042016745, + "learning_rate": 1e-05, + "loss": 0.0717, + "num_tokens": 214728371.0, + "reward": 0.515625, + "reward_std": 0.32719239592552185, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000168085098267, + "sampling/importance_sampling_ratio/min": 0.0003049026126973331, + "sampling/sampling_logp_difference/max": 8.095518112182617, + "sampling/sampling_logp_difference/mean": 0.018367979675531387, + "step": 251 + }, + { + "clip_ratio/high_max": 5.624549885396846e-06, + "clip_ratio/high_mean": 1.4061374713492114e-06, + "clip_ratio/low_mean": 3.6433707123251224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7839844594600436e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14167.0, + "completions/max_terminated_length": 14167.0, + "completions/mean_length": 6422.0859375, + "completions/mean_terminated_length": 6422.0859375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.9946094751358032, + "epoch": 0.23183072677092917, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002729539293795824, + "learning_rate": 1e-05, + "loss": 0.0158, + "num_tokens": 215570806.0, + "reward": 0.3515625, + "reward_std": 0.25620076060295105, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999935030937195, + "sampling/importance_sampling_ratio/min": 0.026308411732316017, + "sampling/sampling_logp_difference/max": 3.637866497039795, + "sampling/sampling_logp_difference/mean": 0.021903935819864273, + "step": 252 + }, + { + "clip_ratio/high_max": 7.2379848461423535e-06, + "clip_ratio/high_mean": 1.8094962115355884e-06, + "clip_ratio/low_mean": 3.17277934982485e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.353728982347093e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15585.0, + "completions/mean_length": 6845.2890625, + "completions/mean_terminated_length": 6693.88134765625, + "completions/min_length": 1252.0, + "completions/min_terminated_length": 1252.0, + "entropy": 0.8822609707713127, + "epoch": 0.23275068997240111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004974282346665859, + "learning_rate": 1e-05, + "loss": 0.1011, + "num_tokens": 216465635.0, + "reward": 0.5390625, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 8.749838889343664e-05, + "sampling/sampling_logp_difference/max": 9.343890190124512, + "sampling/sampling_logp_difference/mean": 0.019389234483242035, + "step": 253 + }, + { + "clip_ratio/high_max": 1.58592818024772e-05, + "clip_ratio/high_mean": 3.9648204506193e-06, + "clip_ratio/low_mean": 4.096964960353944e-05, + "clip_ratio/low_min": 1.7403560605089297e-05, + "clip_ratio/region_mean": 4.49344687467601e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 7805.484375, + "completions/mean_terminated_length": 7528.7578125, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "entropy": 0.9977599084377289, + "epoch": 0.23367065317387303, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0033159854356199503, + "learning_rate": 1e-05, + "loss": 0.0538, + "num_tokens": 217485089.0, + "reward": 0.421875, + "reward_std": 0.33114904165267944, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999412298202515, + "sampling/importance_sampling_ratio/min": 7.967943383846432e-05, + "sampling/sampling_logp_difference/max": 9.437499046325684, + "sampling/sampling_logp_difference/mean": 0.021925684064626694, + "step": 254 + }, + { + "clip_ratio/high_max": 1.8265397557115648e-05, + "clip_ratio/high_mean": 4.566349389278912e-06, + "clip_ratio/low_mean": 4.044636898470344e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5012717691861326e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15681.0, + "completions/mean_length": 7737.5546875, + "completions/mean_terminated_length": 7530.04052734375, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.8667014688253403, + "epoch": 0.23459061637534498, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034952745772898197, + "learning_rate": 1e-05, + "loss": 0.0775, + "num_tokens": 218496040.0, + "reward": 0.453125, + "reward_std": 0.3085102438926697, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999128580093384, + "sampling/importance_sampling_ratio/min": 6.726370338583365e-05, + "sampling/sampling_logp_difference/max": 9.606889724731445, + "sampling/sampling_logp_difference/mean": 0.019742710515856743, + "step": 255 + }, + { + "clip_ratio/high_max": 8.244294804171659e-06, + "clip_ratio/high_mean": 2.0610737010429148e-06, + "clip_ratio/low_mean": 3.204250072030845e-05, + "clip_ratio/low_min": 3.323495775475749e-06, + "clip_ratio/region_mean": 3.410357436450795e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15858.0, + "completions/mean_length": 7365.84375, + "completions/mean_terminated_length": 6601.59326171875, + "completions/min_length": 744.0, + "completions/min_terminated_length": 744.0, + "entropy": 0.8151945173740387, + "epoch": 0.23551057957681693, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0038676802068948746, + "learning_rate": 1e-05, + "loss": 0.0667, + "num_tokens": 219459140.0, + "reward": 0.46875, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999598264694214, + "sampling/importance_sampling_ratio/min": 0.00023387260443996638, + "sampling/sampling_logp_difference/max": 8.360733985900879, + "sampling/sampling_logp_difference/mean": 0.018882082775235176, + "step": 256 + }, + { + "clip_ratio/high_max": 6.87833608026267e-06, + "clip_ratio/high_mean": 2.9462287329806713e-06, + "clip_ratio/low_mean": 5.435333650893881e-05, + "clip_ratio/low_min": 5.33937054569833e-06, + "clip_ratio/region_mean": 5.729956546929316e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14556.0, + "completions/mean_length": 6448.0078125, + "completions/mean_terminated_length": 6369.771484375, + "completions/min_length": 1128.0, + "completions/min_terminated_length": 1128.0, + "entropy": 0.9546648040413857, + "epoch": 0.23643054277828887, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004310046322643757, + "learning_rate": 1e-05, + "loss": 0.1082, + "num_tokens": 220304605.0, + "reward": 0.5703125, + "reward_std": 0.35611939430236816, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999396800994873, + "sampling/importance_sampling_ratio/min": 0.0001234127557836473, + "sampling/sampling_logp_difference/max": 8.99997615814209, + "sampling/sampling_logp_difference/mean": 0.020253397524356842, + "step": 257 + }, + { + "clip_ratio/high_max": 6.196094091137638e-06, + "clip_ratio/high_mean": 1.5490235227844096e-06, + "clip_ratio/low_mean": 2.5416685957679874e-05, + "clip_ratio/low_min": 5.5736391004757024e-06, + "clip_ratio/region_mean": 2.696570959415112e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16037.0, + "completions/mean_length": 7457.6484375, + "completions/mean_terminated_length": 6941.24755859375, + "completions/min_length": 604.0, + "completions/min_terminated_length": 604.0, + "entropy": 0.8182889074087143, + "epoch": 0.23735050597976082, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0026646999176591635, + "learning_rate": 1e-05, + "loss": 0.042, + "num_tokens": 221281968.0, + "reward": 0.4453125, + "reward_std": 0.2012200653553009, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173283576965, + "sampling/importance_sampling_ratio/min": 2.902353571698768e-06, + "sampling/sampling_logp_difference/max": 12.749988555908203, + "sampling/sampling_logp_difference/mean": 0.019208962097764015, + "step": 258 + }, + { + "clip_ratio/high_max": 1.6189535017474554e-05, + "clip_ratio/high_mean": 4.047383754368639e-06, + "clip_ratio/low_mean": 3.127787306311802e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.532525670379982e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16279.0, + "completions/mean_length": 8561.109375, + "completions/mean_terminated_length": 7969.79052734375, + "completions/min_length": 558.0, + "completions/min_terminated_length": 558.0, + "entropy": 0.9581378549337387, + "epoch": 0.23827046918123276, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016026750672608614, + "learning_rate": 1e-05, + "loss": 0.0131, + "num_tokens": 222399046.0, + "reward": 0.34375, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 1.653693971093162e-06, + "sampling/sampling_logp_difference/max": 13.312499046325684, + "sampling/sampling_logp_difference/mean": 0.02173236384987831, + "step": 259 + }, + { + "clip_ratio/high_max": 1.4200771602190798e-05, + "clip_ratio/high_mean": 4.3255887476334465e-06, + "clip_ratio/low_mean": 5.2955770115659107e-05, + "clip_ratio/low_min": 3.402656830076012e-06, + "clip_ratio/region_mean": 5.7281358749605715e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16239.0, + "completions/mean_length": 7152.34375, + "completions/mean_terminated_length": 7079.6533203125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.9052041247487068, + "epoch": 0.23919043238270468, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005460259038954973, + "learning_rate": 1e-05, + "loss": 0.0845, + "num_tokens": 223335010.0, + "reward": 0.4296875, + "reward_std": 0.3356297016143799, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999966621398926, + "sampling/importance_sampling_ratio/min": 0.010161337442696095, + "sampling/sampling_logp_difference/max": 4.589165210723877, + "sampling/sampling_logp_difference/mean": 0.01986619457602501, + "step": 260 + }, + { + "clip_ratio/high_max": 1.4350314813782461e-05, + "clip_ratio/high_mean": 3.5875787034456152e-06, + "clip_ratio/low_mean": 3.81288905373367e-05, + "clip_ratio/low_min": 8.099272235995159e-06, + "clip_ratio/region_mean": 4.1716469809216505e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15631.0, + "completions/mean_length": 6678.65625, + "completions/mean_terminated_length": 6524.603515625, + "completions/min_length": 963.0, + "completions/min_terminated_length": 963.0, + "entropy": 0.9043187350034714, + "epoch": 0.24011039558417663, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005933742038905621, + "learning_rate": 1e-05, + "loss": 0.0966, + "num_tokens": 224207006.0, + "reward": 0.484375, + "reward_std": 0.3316681981086731, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000031590461731, + "sampling/importance_sampling_ratio/min": 0.0011734943836927414, + "sampling/sampling_logp_difference/max": 6.747769355773926, + "sampling/sampling_logp_difference/mean": 0.019827336072921753, + "step": 261 + }, + { + "clip_ratio/high_max": 1.6498819377375185e-05, + "clip_ratio/high_mean": 4.124704844343796e-06, + "clip_ratio/low_mean": 3.601791678420341e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.014262168539062e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15972.0, + "completions/mean_length": 6999.0390625, + "completions/mean_terminated_length": 6850.07177734375, + "completions/min_length": 990.0, + "completions/min_terminated_length": 990.0, + "entropy": 0.8109970837831497, + "epoch": 0.24103035878564857, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003635740838944912, + "learning_rate": 1e-05, + "loss": 0.104, + "num_tokens": 225122891.0, + "reward": 0.4921875, + "reward_std": 0.323777437210083, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999303817749023, + "sampling/importance_sampling_ratio/min": 1.6911570128286257e-05, + "sampling/sampling_logp_difference/max": 10.987512588500977, + "sampling/sampling_logp_difference/mean": 0.018912551924586296, + "step": 262 + }, + { + "clip_ratio/high_max": 9.527577958579059e-06, + "clip_ratio/high_mean": 2.3818944896447647e-06, + "clip_ratio/low_mean": 3.766565987461945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.004755419373396e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15713.0, + "completions/mean_length": 7483.7109375, + "completions/mean_terminated_length": 7045.9912109375, + "completions/min_length": 1153.0, + "completions/min_terminated_length": 1153.0, + "entropy": 0.9473970532417297, + "epoch": 0.24195032198712052, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003405241761356592, + "learning_rate": 1e-05, + "loss": 0.0458, + "num_tokens": 226102462.0, + "reward": 0.4453125, + "reward_std": 0.3022220730781555, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00002920627594, + "sampling/importance_sampling_ratio/min": 0.00525119062513113, + "sampling/sampling_logp_difference/max": 5.249300479888916, + "sampling/sampling_logp_difference/mean": 0.021076779812574387, + "step": 263 + }, + { + "clip_ratio/high_max": 1.5867321963014547e-05, + "clip_ratio/high_mean": 3.966830490753637e-06, + "clip_ratio/low_mean": 3.8259706570897833e-05, + "clip_ratio/low_min": 3.549019083948224e-06, + "clip_ratio/region_mean": 4.2226537743772496e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16256.0, + "completions/mean_length": 7569.03125, + "completions/mean_terminated_length": 7357.47216796875, + "completions/min_length": 893.0, + "completions/min_terminated_length": 893.0, + "entropy": 0.9231455475091934, + "epoch": 0.24287028518859247, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0025927501264959574, + "learning_rate": 1e-05, + "loss": 0.0801, + "num_tokens": 227093562.0, + "reward": 0.3984375, + "reward_std": 0.19097033143043518, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999816417694092, + "sampling/importance_sampling_ratio/min": 0.0052477638237178326, + "sampling/sampling_logp_difference/max": 5.249953269958496, + "sampling/sampling_logp_difference/mean": 0.020578444004058838, + "step": 264 + }, + { + "clip_ratio/high_max": 1.344091060673236e-05, + "clip_ratio/high_mean": 3.36022765168309e-06, + "clip_ratio/low_mean": 4.253613235505327e-05, + "clip_ratio/low_min": 3.5579084851633525e-06, + "clip_ratio/region_mean": 4.5896360120423196e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15819.0, + "completions/mean_length": 7589.2734375, + "completions/mean_terminated_length": 7378.2001953125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.9265239909291267, + "epoch": 0.24379024839006438, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030512227676808834, + "learning_rate": 1e-05, + "loss": 0.04, + "num_tokens": 228086405.0, + "reward": 0.4296875, + "reward_std": 0.27905434370040894, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 0.0002165911573683843, + "sampling/sampling_logp_difference/max": 8.437499046325684, + "sampling/sampling_logp_difference/mean": 0.020208362489938736, + "step": 265 + }, + { + "clip_ratio/high_max": 1.9613525410022703e-05, + "clip_ratio/high_mean": 4.903381352505676e-06, + "clip_ratio/low_mean": 3.184792547017423e-05, + "clip_ratio/low_min": 7.29296516510658e-06, + "clip_ratio/region_mean": 3.675130722058384e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16275.0, + "completions/mean_length": 8420.6875, + "completions/mean_terminated_length": 8096.97509765625, + "completions/min_length": 1114.0, + "completions/min_terminated_length": 1114.0, + "entropy": 0.9572964608669281, + "epoch": 0.24471021159153633, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022430522367358208, + "learning_rate": 1e-05, + "loss": 0.0444, + "num_tokens": 229183765.0, + "reward": 0.34375, + "reward_std": 0.309583842754364, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999421834945679, + "sampling/importance_sampling_ratio/min": 0.00029693738906644285, + "sampling/sampling_logp_difference/max": 8.121989250183105, + "sampling/sampling_logp_difference/mean": 0.021570362150669098, + "step": 266 + }, + { + "clip_ratio/high_max": 6.728750577167375e-06, + "clip_ratio/high_mean": 1.6821876442918438e-06, + "clip_ratio/low_mean": 2.1682553096979973e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.336474062758498e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15736.0, + "completions/mean_length": 6809.765625, + "completions/mean_terminated_length": 6579.984375, + "completions/min_length": 860.0, + "completions/min_terminated_length": 860.0, + "entropy": 0.884086549282074, + "epoch": 0.24563017479300828, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004295065999031067, + "learning_rate": 1e-05, + "loss": 0.1058, + "num_tokens": 230077607.0, + "reward": 0.484375, + "reward_std": 0.20251333713531494, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294281005859, + "sampling/importance_sampling_ratio/min": 0.00754612497985363, + "sampling/sampling_logp_difference/max": 4.886721134185791, + "sampling/sampling_logp_difference/mean": 0.019895706325769424, + "step": 267 + }, + { + "clip_ratio/high_max": 2.8609347509700456e-05, + "clip_ratio/high_mean": 7.152336877425114e-06, + "clip_ratio/low_mean": 5.158006410965754e-05, + "clip_ratio/low_min": 5.210069957684027e-06, + "clip_ratio/region_mean": 5.873240070286556e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15080.0, + "completions/mean_length": 7340.6953125, + "completions/mean_terminated_length": 6973.0810546875, + "completions/min_length": 1616.0, + "completions/min_terminated_length": 1616.0, + "entropy": 0.9920620769262314, + "epoch": 0.24655013799448022, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004631794057786465, + "learning_rate": 1e-05, + "loss": 0.0096, + "num_tokens": 231035616.0, + "reward": 0.4375, + "reward_std": 0.3235401213169098, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999337792396545, + "sampling/importance_sampling_ratio/min": 0.0002508950710762292, + "sampling/sampling_logp_difference/max": 8.290475845336914, + "sampling/sampling_logp_difference/mean": 0.020591016858816147, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.3085940774290066e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3085940774290066e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14120.0, + "completions/mean_length": 6748.875, + "completions/mean_terminated_length": 6595.93701171875, + "completions/min_length": 1169.0, + "completions/min_terminated_length": 1169.0, + "entropy": 0.9867061004042625, + "epoch": 0.24747010119595217, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035752104595303535, + "learning_rate": 1e-05, + "loss": 0.0455, + "num_tokens": 231920056.0, + "reward": 0.40625, + "reward_std": 0.21884137392044067, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999653100967407, + "sampling/importance_sampling_ratio/min": 0.0003869794018100947, + "sampling/sampling_logp_difference/max": 7.8571391105651855, + "sampling/sampling_logp_difference/mean": 0.02061416581273079, + "step": 269 + }, + { + "clip_ratio/high_max": 1.2506750408647349e-05, + "clip_ratio/high_mean": 3.1266876021618373e-06, + "clip_ratio/low_mean": 3.10397430212106e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.416643085074611e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15706.0, + "completions/mean_length": 7260.3046875, + "completions/mean_terminated_length": 7188.46435546875, + "completions/min_length": 1384.0, + "completions/min_terminated_length": 1384.0, + "entropy": 1.0388494208455086, + "epoch": 0.24839006439742412, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036644963547587395, + "learning_rate": 1e-05, + "loss": 0.0711, + "num_tokens": 232869159.0, + "reward": 0.390625, + "reward_std": 0.2359209954738617, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999546408653259, + "sampling/importance_sampling_ratio/min": 0.0008660226594656706, + "sampling/sampling_logp_difference/max": 7.051599502563477, + "sampling/sampling_logp_difference/mean": 0.02120530977845192, + "step": 270 + }, + { + "clip_ratio/high_max": 2.704355301830219e-05, + "clip_ratio/high_mean": 6.760888254575548e-06, + "clip_ratio/low_mean": 3.1861192269388994e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.862208097871189e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16073.0, + "completions/max_terminated_length": 16073.0, + "completions/mean_length": 6354.4609375, + "completions/mean_terminated_length": 6354.4609375, + "completions/min_length": 1035.0, + "completions/min_terminated_length": 1035.0, + "entropy": 0.8405331820249557, + "epoch": 0.24931002759889603, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004709267523139715, + "learning_rate": 1e-05, + "loss": 0.0039, + "num_tokens": 233702842.0, + "reward": 0.546875, + "reward_std": 0.3214184641838074, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999738931655884, + "sampling/importance_sampling_ratio/min": 0.0046309432946145535, + "sampling/sampling_logp_difference/max": 5.37499475479126, + "sampling/sampling_logp_difference/mean": 0.019126038998365402, + "step": 271 + }, + { + "clip_ratio/high_max": 9.749228638611385e-06, + "clip_ratio/high_mean": 2.437307159652846e-06, + "clip_ratio/low_mean": 3.855073941849696e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.098804652130639e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16026.0, + "completions/mean_length": 6514.578125, + "completions/mean_terminated_length": 6357.9208984375, + "completions/min_length": 982.0, + "completions/min_terminated_length": 982.0, + "entropy": 1.0254098922014236, + "epoch": 0.250229990800368, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003066045930609107, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 234556348.0, + "reward": 0.4375, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999805092811584, + "sampling/importance_sampling_ratio/min": 0.005210204049944878, + "sampling/sampling_logp_difference/max": 5.257136344909668, + "sampling/sampling_logp_difference/mean": 0.019960148259997368, + "step": 272 + }, + { + "clip_ratio/high_max": 1.0475813724042382e-05, + "clip_ratio/high_mean": 2.6189534310105955e-06, + "clip_ratio/low_mean": 3.487835761006863e-05, + "clip_ratio/low_min": 2.9392399483185727e-06, + "clip_ratio/region_mean": 3.749731081370555e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15894.0, + "completions/mean_length": 7379.5546875, + "completions/mean_terminated_length": 7236.62744140625, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 1.0397320613265038, + "epoch": 0.2511499540018399, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005132520105689764, + "learning_rate": 1e-05, + "loss": 0.062, + "num_tokens": 235521091.0, + "reward": 0.2890625, + "reward_std": 0.2301519364118576, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999256134033203, + "sampling/importance_sampling_ratio/min": 0.00016659013635944575, + "sampling/sampling_logp_difference/max": 8.699974060058594, + "sampling/sampling_logp_difference/mean": 0.021417103707790375, + "step": 273 + }, + { + "clip_ratio/high_max": 1.9904123973901733e-05, + "clip_ratio/high_mean": 5.776861314643611e-06, + "clip_ratio/low_mean": 2.6659268655748747e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2436129686175263e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14565.0, + "completions/mean_length": 7837.1640625, + "completions/mean_terminated_length": 7632.04052734375, + "completions/min_length": 1346.0, + "completions/min_terminated_length": 1346.0, + "entropy": 0.8400963917374611, + "epoch": 0.25206991720331184, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0028969801496714354, + "learning_rate": 1e-05, + "loss": 0.0143, + "num_tokens": 236544160.0, + "reward": 0.3828125, + "reward_std": 0.29378965497016907, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999887943267822, + "sampling/importance_sampling_ratio/min": 2.883308241052873e-07, + "sampling/sampling_logp_difference/max": 15.059157371520996, + "sampling/sampling_logp_difference/mean": 0.019267702475190163, + "step": 274 + }, + { + "clip_ratio/high_max": 8.562770290154731e-06, + "clip_ratio/high_mean": 2.1406925725386827e-06, + "clip_ratio/low_mean": 4.060094340729847e-05, + "clip_ratio/low_min": 3.8700886761944275e-06, + "clip_ratio/region_mean": 4.2741635979837156e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15350.0, + "completions/mean_length": 6696.3515625, + "completions/mean_terminated_length": 6542.57958984375, + "completions/min_length": 1239.0, + "completions/min_terminated_length": 1239.0, + "entropy": 0.8495818004012108, + "epoch": 0.2529898804047838, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003412836929783225, + "learning_rate": 1e-05, + "loss": 0.0803, + "num_tokens": 237423101.0, + "reward": 0.515625, + "reward_std": 0.37981897592544556, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000269412994385, + "sampling/importance_sampling_ratio/min": 0.012152798473834991, + "sampling/sampling_logp_difference/max": 4.410195827484131, + "sampling/sampling_logp_difference/mean": 0.018458625301718712, + "step": 275 + }, + { + "clip_ratio/high_max": 1.1463653436294408e-05, + "clip_ratio/high_mean": 3.646129641765583e-06, + "clip_ratio/low_mean": 6.144847083078275e-05, + "clip_ratio/low_min": 1.110105540647055e-05, + "clip_ratio/region_mean": 6.509460160941671e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15666.0, + "completions/mean_length": 7700.3671875, + "completions/mean_terminated_length": 7121.45849609375, + "completions/min_length": 844.0, + "completions/min_terminated_length": 844.0, + "entropy": 0.8258870914578438, + "epoch": 0.25390984360625574, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024443145375698805, + "learning_rate": 1e-05, + "loss": 0.0604, + "num_tokens": 238429956.0, + "reward": 0.375, + "reward_std": 0.2872493863105774, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999113082885742, + "sampling/importance_sampling_ratio/min": 0.00026112530031241477, + "sampling/sampling_logp_difference/max": 8.250510215759277, + "sampling/sampling_logp_difference/mean": 0.019427984952926636, + "step": 276 + }, + { + "clip_ratio/high_max": 4.218127742205979e-06, + "clip_ratio/high_mean": 1.0545319355514948e-06, + "clip_ratio/low_mean": 1.7289162997258245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.834369493280974e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16112.0, + "completions/mean_length": 6255.21875, + "completions/mean_terminated_length": 6094.44482421875, + "completions/min_length": 793.0, + "completions/min_terminated_length": 793.0, + "entropy": 0.8179014846682549, + "epoch": 0.2548298068077277, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022747826296836138, + "learning_rate": 1e-05, + "loss": 0.0222, + "num_tokens": 239250160.0, + "reward": 0.5234375, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999998807907104, + "sampling/importance_sampling_ratio/min": 0.0002633975527714938, + "sampling/sampling_logp_difference/max": 8.241846084594727, + "sampling/sampling_logp_difference/mean": 0.018723051995038986, + "step": 277 + }, + { + "clip_ratio/high_max": 1.698448841125355e-05, + "clip_ratio/high_mean": 5.369374321162468e-06, + "clip_ratio/low_mean": 6.14647315160255e-05, + "clip_ratio/low_min": 5.043576493335422e-06, + "clip_ratio/region_mean": 6.683410583718796e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15321.0, + "completions/max_terminated_length": 15321.0, + "completions/mean_length": 6914.9609375, + "completions/mean_terminated_length": 6914.9609375, + "completions/min_length": 730.0, + "completions/min_terminated_length": 730.0, + "entropy": 0.9700981751084328, + "epoch": 0.25574977000919963, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005685295443981886, + "learning_rate": 1e-05, + "loss": -0.0056, + "num_tokens": 240156211.0, + "reward": 0.4765625, + "reward_std": 0.2767002284526825, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998887777328491, + "sampling/importance_sampling_ratio/min": 4.5509867049986497e-05, + "sampling/sampling_logp_difference/max": 9.997581481933594, + "sampling/sampling_logp_difference/mean": 0.021195171400904655, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.9186837764427764e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9186837764427764e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15469.0, + "completions/mean_length": 5227.53125, + "completions/mean_terminated_length": 5139.68505859375, + "completions/min_length": 647.0, + "completions/min_terminated_length": 647.0, + "entropy": 0.9116031974554062, + "epoch": 0.25666973321067155, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003880272386595607, + "learning_rate": 1e-05, + "loss": 0.1246, + "num_tokens": 240845295.0, + "reward": 0.6328125, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.6328125, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000362396240234, + "sampling/importance_sampling_ratio/min": 0.00012422871077433228, + "sampling/sampling_logp_difference/max": 8.993386268615723, + "sampling/sampling_logp_difference/mean": 0.018801718950271606, + "step": 279 + }, + { + "clip_ratio/high_max": 2.5015486926349695e-05, + "clip_ratio/high_mean": 8.084949570275057e-06, + "clip_ratio/low_mean": 5.524710468307603e-05, + "clip_ratio/low_min": 3.776891389861703e-06, + "clip_ratio/region_mean": 6.333205465125502e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 8065.4765625, + "completions/mean_terminated_length": 7510.90869140625, + "completions/min_length": 1055.0, + "completions/min_terminated_length": 1055.0, + "entropy": 0.7446574792265892, + "epoch": 0.2575896964121435, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028986844699829817, + "learning_rate": 1e-05, + "loss": 0.0947, + "num_tokens": 241895676.0, + "reward": 0.4921875, + "reward_std": 0.3474721610546112, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999842643737793, + "sampling/importance_sampling_ratio/min": 0.0017039099475368857, + "sampling/sampling_logp_difference/max": 6.3748297691345215, + "sampling/sampling_logp_difference/mean": 0.01853121444582939, + "step": 280 + }, + { + "clip_ratio/high_max": 9.486341014053323e-06, + "clip_ratio/high_mean": 2.371585253513331e-06, + "clip_ratio/low_mean": 2.896106741445692e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.133265261112683e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15534.0, + "completions/max_terminated_length": 15534.0, + "completions/mean_length": 6127.359375, + "completions/mean_terminated_length": 6127.359375, + "completions/min_length": 848.0, + "completions/min_terminated_length": 848.0, + "entropy": 0.8569132760167122, + "epoch": 0.25850965961361544, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003845847910270095, + "learning_rate": 1e-05, + "loss": 0.0368, + "num_tokens": 242698258.0, + "reward": 0.53125, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000942945480347, + "sampling/importance_sampling_ratio/min": 0.00043231461313553154, + "sampling/sampling_logp_difference/max": 7.746356964111328, + "sampling/sampling_logp_difference/mean": 0.01856958493590355, + "step": 281 + }, + { + "clip_ratio/high_max": 2.9848330086679198e-05, + "clip_ratio/high_mean": 7.4620825216697995e-06, + "clip_ratio/low_mean": 4.3558867673709756e-05, + "clip_ratio/low_min": 4.417741820361698e-06, + "clip_ratio/region_mean": 5.1020949285884853e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15192.0, + "completions/mean_length": 6600.1484375, + "completions/mean_terminated_length": 6365.33642578125, + "completions/min_length": 1252.0, + "completions/min_terminated_length": 1252.0, + "entropy": 0.78924310952425, + "epoch": 0.2594296228150874, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003953634761273861, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 243560957.0, + "reward": 0.5546875, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999537467956543, + "sampling/importance_sampling_ratio/min": 0.0006525487406179309, + "sampling/sampling_logp_difference/max": 7.334624767303467, + "sampling/sampling_logp_difference/mean": 0.018097909167408943, + "step": 282 + }, + { + "clip_ratio/high_max": 6.635561703660642e-06, + "clip_ratio/high_mean": 1.6588904259151604e-06, + "clip_ratio/low_mean": 2.737523408313791e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9034124281679397e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15755.0, + "completions/max_terminated_length": 15755.0, + "completions/mean_length": 7852.171875, + "completions/mean_terminated_length": 7852.171875, + "completions/min_length": 1276.0, + "completions/min_terminated_length": 1276.0, + "entropy": 1.0598893761634827, + "epoch": 0.26034958601655933, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00360781978815794, + "learning_rate": 1e-05, + "loss": 0.0265, + "num_tokens": 244585923.0, + "reward": 0.3125, + "reward_std": 0.19438527524471283, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999812841415405, + "sampling/importance_sampling_ratio/min": 4.2073770600836724e-05, + "sampling/sampling_logp_difference/max": 10.076086044311523, + "sampling/sampling_logp_difference/mean": 0.022330068051815033, + "step": 283 + }, + { + "clip_ratio/high_max": 3.1540168947685743e-06, + "clip_ratio/high_mean": 7.885042236921436e-07, + "clip_ratio/low_mean": 4.7973388973332476e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.876189268543385e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16373.0, + "completions/mean_length": 7972.2265625, + "completions/mean_terminated_length": 7700.87890625, + "completions/min_length": 610.0, + "completions/min_terminated_length": 610.0, + "entropy": 0.933217465877533, + "epoch": 0.2612695492180313, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0027661293279379606, + "learning_rate": 1e-05, + "loss": 0.0587, + "num_tokens": 245628064.0, + "reward": 0.28125, + "reward_std": 0.1872510462999344, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999428987503052, + "sampling/importance_sampling_ratio/min": 3.1466843211092055e-05, + "sampling/sampling_logp_difference/max": 10.366576194763184, + "sampling/sampling_logp_difference/mean": 0.021125148981809616, + "step": 284 + }, + { + "clip_ratio/high_max": 1.2965969062861404e-05, + "clip_ratio/high_mean": 3.241492265715351e-06, + "clip_ratio/low_mean": 4.6317693090713874e-05, + "clip_ratio/low_min": 3.820877282123547e-06, + "clip_ratio/region_mean": 4.955918507221213e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15744.0, + "completions/mean_length": 7135.6953125, + "completions/mean_terminated_length": 6913.736328125, + "completions/min_length": 640.0, + "completions/min_terminated_length": 640.0, + "entropy": 0.7786942347884178, + "epoch": 0.2621895124195032, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005680318456143141, + "learning_rate": 1e-05, + "loss": 0.0786, + "num_tokens": 246561329.0, + "reward": 0.4296875, + "reward_std": 0.3077537715435028, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999462366104126, + "sampling/importance_sampling_ratio/min": 5.9032357967225835e-05, + "sampling/sampling_logp_difference/max": 9.737424850463867, + "sampling/sampling_logp_difference/mean": 0.018504241481423378, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.22437145175536e-05, + "clip_ratio/low_min": 1.4025082009538892e-05, + "clip_ratio/region_mean": 4.22437145175536e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16161.0, + "completions/mean_length": 6704.046875, + "completions/mean_terminated_length": 6627.82666015625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 1.0435140281915665, + "epoch": 0.26310947562097514, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026402862276881933, + "learning_rate": 1e-05, + "loss": 0.1072, + "num_tokens": 247437415.0, + "reward": 0.3828125, + "reward_std": 0.31276631355285645, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998904466629028, + "sampling/importance_sampling_ratio/min": 0.0007800163584761322, + "sampling/sampling_logp_difference/max": 7.156195640563965, + "sampling/sampling_logp_difference/mean": 0.02134273201227188, + "step": 286 + }, + { + "clip_ratio/high_max": 2.223430897174694e-05, + "clip_ratio/high_mean": 6.8746438159905665e-06, + "clip_ratio/low_mean": 4.7084630978133646e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3959275192028144e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15708.0, + "completions/mean_length": 5892.5078125, + "completions/mean_terminated_length": 5725.9765625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.8004944771528244, + "epoch": 0.2640294388224471, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003993614576756954, + "learning_rate": 1e-05, + "loss": 0.0565, + "num_tokens": 248211112.0, + "reward": 0.453125, + "reward_std": 0.3322049677371979, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000004768371582, + "sampling/importance_sampling_ratio/min": 0.0024652592837810516, + "sampling/sampling_logp_difference/max": 6.005458354949951, + "sampling/sampling_logp_difference/mean": 0.01924925297498703, + "step": 287 + }, + { + "clip_ratio/high_max": 2.1833082200828358e-05, + "clip_ratio/high_mean": 5.458270550207089e-06, + "clip_ratio/low_mean": 3.415995615796419e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.961822596920683e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15892.0, + "completions/mean_length": 7812.140625, + "completions/mean_terminated_length": 7316.24755859375, + "completions/min_length": 1515.0, + "completions/min_terminated_length": 1515.0, + "entropy": 0.8841542899608612, + "epoch": 0.26494940202391903, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001573400106281042, + "learning_rate": 1e-05, + "loss": 0.0823, + "num_tokens": 249228106.0, + "reward": 0.4765625, + "reward_std": 0.2767002284526825, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998818635940552, + "sampling/importance_sampling_ratio/min": 0.001001527882181108, + "sampling/sampling_logp_difference/max": 6.906228542327881, + "sampling/sampling_logp_difference/mean": 0.01956877112388611, + "step": 288 + }, + { + "clip_ratio/high_max": 1.014439021673752e-05, + "clip_ratio/high_mean": 2.53609755418438e-06, + "clip_ratio/low_mean": 3.068193461785995e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.321803217204433e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16250.0, + "completions/mean_length": 6372.953125, + "completions/mean_terminated_length": 6132.6884765625, + "completions/min_length": 686.0, + "completions/min_terminated_length": 686.0, + "entropy": 0.8228401988744736, + "epoch": 0.265869365225391, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021125099156051874, + "learning_rate": 1e-05, + "loss": 0.0438, + "num_tokens": 250063284.0, + "reward": 0.5, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999308586120605, + "sampling/importance_sampling_ratio/min": 4.8329173296224326e-05, + "sampling/sampling_logp_difference/max": 9.937475204467773, + "sampling/sampling_logp_difference/mean": 0.01943521574139595, + "step": 289 + }, + { + "clip_ratio/high_max": 7.023906164249638e-06, + "clip_ratio/high_mean": 1.7559765410624095e-06, + "clip_ratio/low_mean": 2.526416994896863e-05, + "clip_ratio/low_min": 6.7760895490209805e-06, + "clip_ratio/region_mean": 2.7020146660561295e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16270.0, + "completions/mean_length": 7817.8671875, + "completions/mean_terminated_length": 7396.58154296875, + "completions/min_length": 1568.0, + "completions/min_terminated_length": 1568.0, + "entropy": 0.9454319775104523, + "epoch": 0.2667893284268629, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022315154783427715, + "learning_rate": 1e-05, + "loss": 0.0565, + "num_tokens": 251085123.0, + "reward": 0.40625, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99993896484375, + "sampling/importance_sampling_ratio/min": 2.8720330647047376e-06, + "sampling/sampling_logp_difference/max": 12.760490417480469, + "sampling/sampling_logp_difference/mean": 0.021764669567346573, + "step": 290 + }, + { + "clip_ratio/high_max": 1.4797966287005693e-05, + "clip_ratio/high_mean": 3.699491571751423e-06, + "clip_ratio/low_mean": 4.36271948274225e-05, + "clip_ratio/low_min": 3.6957101201551268e-06, + "clip_ratio/region_mean": 4.732668639917392e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16352.0, + "completions/mean_length": 7168.4921875, + "completions/mean_terminated_length": 6635.36328125, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 0.8433891162276268, + "epoch": 0.26770929162833484, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004663965664803982, + "learning_rate": 1e-05, + "loss": 0.0808, + "num_tokens": 252020906.0, + "reward": 0.5546875, + "reward_std": 0.2927239239215851, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999589920043945, + "sampling/importance_sampling_ratio/min": 0.0003851866349577904, + "sampling/sampling_logp_difference/max": 7.861782550811768, + "sampling/sampling_logp_difference/mean": 0.01929781585931778, + "step": 291 + }, + { + "clip_ratio/high_max": 1.996871560550062e-05, + "clip_ratio/high_mean": 6.089093403716106e-06, + "clip_ratio/low_mean": 4.2792244585143635e-05, + "clip_ratio/low_min": 1.0337215371691855e-05, + "clip_ratio/region_mean": 4.8881338216233416e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16301.0, + "completions/mean_length": 7322.5078125, + "completions/mean_terminated_length": 6876.8603515625, + "completions/min_length": 1196.0, + "completions/min_terminated_length": 1196.0, + "entropy": 0.9157031401991844, + "epoch": 0.2686292548298068, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036942458245903254, + "learning_rate": 1e-05, + "loss": 0.079, + "num_tokens": 252977435.0, + "reward": 0.3359375, + "reward_std": 0.24275577068328857, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999804496765137, + "sampling/importance_sampling_ratio/min": 0.00029605376766994596, + "sampling/sampling_logp_difference/max": 8.124969482421875, + "sampling/sampling_logp_difference/mean": 0.0205365102738142, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.631919460327481e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.631919460327481e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16078.0, + "completions/mean_length": 7025.484375, + "completions/mean_terminated_length": 6723.5966796875, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "entropy": 1.1329731941223145, + "epoch": 0.26954921803127874, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034127074759453535, + "learning_rate": 1e-05, + "loss": 0.0227, + "num_tokens": 253896161.0, + "reward": 0.25, + "reward_std": 0.27722424268722534, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999400973320007, + "sampling/importance_sampling_ratio/min": 0.0005197672289796174, + "sampling/sampling_logp_difference/max": 7.562129497528076, + "sampling/sampling_logp_difference/mean": 0.023741140961647034, + "step": 293 + }, + { + "clip_ratio/high_max": 4.368643658381188e-06, + "clip_ratio/high_mean": 1.092160914595297e-06, + "clip_ratio/low_mean": 2.4661783299961826e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5753944555617636e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13776.0, + "completions/mean_length": 5996.1796875, + "completions/mean_terminated_length": 5661.08837890625, + "completions/min_length": 882.0, + "completions/min_terminated_length": 882.0, + "entropy": 0.8773328885436058, + "epoch": 0.2704691812327507, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003959407564252615, + "learning_rate": 1e-05, + "loss": 0.0156, + "num_tokens": 254690264.0, + "reward": 0.53125, + "reward_std": 0.26645541191101074, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999563694000244, + "sampling/importance_sampling_ratio/min": 1.4735347519945208e-07, + "sampling/sampling_logp_difference/max": 15.73043155670166, + "sampling/sampling_logp_difference/mean": 0.018407585099339485, + "step": 294 + }, + { + "clip_ratio/high_max": 1.616483677935321e-05, + "clip_ratio/high_mean": 4.041209194838302e-06, + "clip_ratio/low_mean": 3.736187466074625e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.140308453770558e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16383.0, + "completions/mean_length": 7165.328125, + "completions/mean_terminated_length": 6867.951171875, + "completions/min_length": 1115.0, + "completions/min_terminated_length": 1115.0, + "entropy": 0.9502597972750664, + "epoch": 0.27138914443422263, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030910037457942963, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 255626394.0, + "reward": 0.5390625, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000731945037842, + "sampling/importance_sampling_ratio/min": 0.00022311302018351853, + "sampling/sampling_logp_difference/max": 8.407832145690918, + "sampling/sampling_logp_difference/mean": 0.020668907091021538, + "step": 295 + }, + { + "clip_ratio/high_max": 1.1702686606440693e-05, + "clip_ratio/high_mean": 2.9256716516101733e-06, + "clip_ratio/low_mean": 5.5247357522603124e-05, + "clip_ratio/low_min": 3.6811261452385224e-06, + "clip_ratio/region_mean": 5.8173028264718596e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15375.0, + "completions/mean_length": 8001.9296875, + "completions/mean_terminated_length": 7661.34912109375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.8591345250606537, + "epoch": 0.27230910763569455, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0037233952898532152, + "learning_rate": 1e-05, + "loss": 0.0463, + "num_tokens": 256673457.0, + "reward": 0.421875, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999151229858398, + "sampling/importance_sampling_ratio/min": 0.0021876997780054808, + "sampling/sampling_logp_difference/max": 6.124904632568359, + "sampling/sampling_logp_difference/mean": 0.020540472120046616, + "step": 296 + }, + { + "clip_ratio/high_max": 3.721341136042611e-05, + "clip_ratio/high_mean": 1.2759249216287571e-05, + "clip_ratio/low_mean": 3.570647322703735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.846572301175911e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16373.0, + "completions/mean_length": 6924.84375, + "completions/mean_terminated_length": 6697.82421875, + "completions/min_length": 803.0, + "completions/min_terminated_length": 803.0, + "entropy": 0.7969356626272202, + "epoch": 0.2732290708371665, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006054217461496592, + "learning_rate": 1e-05, + "loss": 0.0669, + "num_tokens": 257578501.0, + "reward": 0.5078125, + "reward_std": 0.2927239239215851, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 0.007889713160693645, + "sampling/sampling_logp_difference/max": 4.842195510864258, + "sampling/sampling_logp_difference/mean": 0.019306108355522156, + "step": 297 + }, + { + "clip_ratio/high_max": 1.0211543894911301e-05, + "clip_ratio/high_mean": 2.5528859737278253e-06, + "clip_ratio/low_mean": 5.2388056587915344e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4940942732173426e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14439.0, + "completions/mean_length": 6203.03125, + "completions/mean_terminated_length": 5958.6884765625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.8734413683414459, + "epoch": 0.27414903403863844, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004903806839138269, + "learning_rate": 1e-05, + "loss": 0.0689, + "num_tokens": 258392625.0, + "reward": 0.4453125, + "reward_std": 0.29196250438690186, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999826550483704, + "sampling/importance_sampling_ratio/min": 0.00020370795391499996, + "sampling/sampling_logp_difference/max": 8.498823165893555, + "sampling/sampling_logp_difference/mean": 0.01909301057457924, + "step": 298 + }, + { + "clip_ratio/high_max": 1.5135058674786706e-05, + "clip_ratio/high_mean": 4.64845766146027e-06, + "clip_ratio/low_mean": 4.373456977191381e-05, + "clip_ratio/low_min": 3.670856358439778e-06, + "clip_ratio/region_mean": 4.8383026296505705e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15554.0, + "completions/mean_length": 7982.5390625, + "completions/mean_terminated_length": 7641.01611328125, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "entropy": 1.0091779381036758, + "epoch": 0.2750689972401104, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0033637424930930138, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 259435270.0, + "reward": 0.359375, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999765753746033, + "sampling/importance_sampling_ratio/min": 0.0016514655435457826, + "sampling/sampling_logp_difference/max": 6.406092166900635, + "sampling/sampling_logp_difference/mean": 0.02182736061513424, + "step": 299 + }, + { + "clip_ratio/high_max": 2.3964702677403693e-05, + "clip_ratio/high_mean": 5.991175669350923e-06, + "clip_ratio/low_mean": 5.2442986770984135e-05, + "clip_ratio/low_min": 8.75736759553547e-06, + "clip_ratio/region_mean": 5.843416238349164e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16265.0, + "completions/mean_length": 6915.3125, + "completions/mean_terminated_length": 6688.064453125, + "completions/min_length": 778.0, + "completions/min_terminated_length": 778.0, + "entropy": 0.7964543774724007, + "epoch": 0.27598896044158233, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0052203768864274025, + "learning_rate": 1e-05, + "loss": 0.144, + "num_tokens": 260337614.0, + "reward": 0.46875, + "reward_std": 0.37928223609924316, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999016523361206, + "sampling/importance_sampling_ratio/min": 7.032832218101248e-05, + "sampling/sampling_logp_difference/max": 9.562335968017578, + "sampling/sampling_logp_difference/mean": 0.017896221950650215, + "step": 300 + }, + { + "clip_ratio/high_max": 4.458271632756805e-05, + "clip_ratio/high_mean": 1.1145679081892013e-05, + "clip_ratio/low_mean": 6.243192206056847e-05, + "clip_ratio/low_min": 1.2397775662975619e-05, + "clip_ratio/region_mean": 7.357759886872373e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16198.0, + "completions/mean_length": 7029.4375, + "completions/mean_terminated_length": 6880.95263671875, + "completions/min_length": 679.0, + "completions/min_terminated_length": 679.0, + "entropy": 0.8605096861720085, + "epoch": 0.2769089236430543, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005570738110691309, + "learning_rate": 1e-05, + "loss": 0.0984, + "num_tokens": 261254070.0, + "reward": 0.4765625, + "reward_std": 0.3327290117740631, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999494552612305, + "sampling/importance_sampling_ratio/min": 0.0009070249507203698, + "sampling/sampling_logp_difference/max": 7.005340576171875, + "sampling/sampling_logp_difference/mean": 0.01905740052461624, + "step": 301 + }, + { + "clip_ratio/high_max": 3.390461233720998e-05, + "clip_ratio/high_mean": 1.1191766247975465e-05, + "clip_ratio/low_mean": 7.46641262594494e-05, + "clip_ratio/low_min": 5.041745680500753e-06, + "clip_ratio/region_mean": 8.585589102949598e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15987.0, + "completions/mean_length": 5858.84375, + "completions/mean_terminated_length": 5606.240234375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.8430554121732712, + "epoch": 0.2778288868445262, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004496110137552023, + "learning_rate": 1e-05, + "loss": 0.062, + "num_tokens": 262024906.0, + "reward": 0.4453125, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294877052307, + "sampling/importance_sampling_ratio/min": 0.00040469475788995624, + "sampling/sampling_logp_difference/max": 7.812377452850342, + "sampling/sampling_logp_difference/mean": 0.019225869327783585, + "step": 302 + }, + { + "clip_ratio/high_max": 3.2563955301156966e-06, + "clip_ratio/high_mean": 8.140988825289242e-07, + "clip_ratio/low_mean": 3.7080020149460324e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.789411886145899e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15976.0, + "completions/mean_length": 8337.328125, + "completions/mean_terminated_length": 7728.7568359375, + "completions/min_length": 837.0, + "completions/min_terminated_length": 837.0, + "entropy": 0.901745393872261, + "epoch": 0.27874885004599814, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00348713924176991, + "learning_rate": 1e-05, + "loss": -0.0002, + "num_tokens": 263110844.0, + "reward": 0.296875, + "reward_std": 0.20805485546588898, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998900890350342, + "sampling/importance_sampling_ratio/min": 0.0022652465850114822, + "sampling/sampling_logp_difference/max": 6.090071678161621, + "sampling/sampling_logp_difference/mean": 0.02157524600625038, + "step": 303 + }, + { + "clip_ratio/high_max": 2.3739744847262045e-05, + "clip_ratio/high_mean": 5.934936211815511e-06, + "clip_ratio/low_mean": 2.823553325015382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.417046866616147e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 7084.7265625, + "completions/mean_terminated_length": 6381.42041015625, + "completions/min_length": 783.0, + "completions/min_terminated_length": 783.0, + "entropy": 0.8265534415841103, + "epoch": 0.2796688132474701, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003980033565312624, + "learning_rate": 1e-05, + "loss": 0.0551, + "num_tokens": 264036169.0, + "reward": 0.3984375, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999673366546631, + "sampling/importance_sampling_ratio/min": 0.00012345099821686745, + "sampling/sampling_logp_difference/max": 8.999666213989258, + "sampling/sampling_logp_difference/mean": 0.018782664090394974, + "step": 304 + }, + { + "clip_ratio/high_max": 1.1745505617000163e-05, + "clip_ratio/high_mean": 3.771558226617344e-06, + "clip_ratio/low_mean": 6.913120819262986e-05, + "clip_ratio/low_min": 2.494283216947224e-05, + "clip_ratio/region_mean": 7.290276607818669e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16292.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 6543.796875, + "completions/mean_terminated_length": 6543.796875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.8899869695305824, + "epoch": 0.28058877644894203, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.006467343773692846, + "learning_rate": 1e-05, + "loss": 0.1139, + "num_tokens": 264892767.0, + "reward": 0.484375, + "reward_std": 0.3934885561466217, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000489950180054, + "sampling/importance_sampling_ratio/min": 9.891482477542013e-05, + "sampling/sampling_logp_difference/max": 9.221251487731934, + "sampling/sampling_logp_difference/mean": 0.02032080665230751, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.395576979732141e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.395576979732141e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16307.0, + "completions/mean_length": 8483.390625, + "completions/mean_terminated_length": 7813.84765625, + "completions/min_length": 1342.0, + "completions/min_terminated_length": 1342.0, + "entropy": 0.9621479511260986, + "epoch": 0.281508739650414, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003174177836626768, + "learning_rate": 1e-05, + "loss": 0.0948, + "num_tokens": 265995697.0, + "reward": 0.3359375, + "reward_std": 0.28513264656066895, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000269412994385, + "sampling/importance_sampling_ratio/min": 0.0005628522485494614, + "sampling/sampling_logp_difference/max": 7.4824934005737305, + "sampling/sampling_logp_difference/mean": 0.02145479805767536, + "step": 306 + }, + { + "clip_ratio/high_max": 1.2596524811669951e-05, + "clip_ratio/high_mean": 3.149131202917488e-06, + "clip_ratio/low_mean": 3.7911659774181317e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.106079018129094e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14985.0, + "completions/mean_length": 7184.578125, + "completions/mean_terminated_length": 6963.79248046875, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "entropy": 0.9993807673454285, + "epoch": 0.2824287028518859, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003356153378263116, + "learning_rate": 1e-05, + "loss": 0.0887, + "num_tokens": 266937707.0, + "reward": 0.3828125, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000238418579102, + "sampling/importance_sampling_ratio/min": 0.0017036627978086472, + "sampling/sampling_logp_difference/max": 6.374974727630615, + "sampling/sampling_logp_difference/mean": 0.02204768732190132, + "step": 307 + }, + { + "clip_ratio/high_max": 1.9245163684900035e-05, + "clip_ratio/high_mean": 4.811290921225009e-06, + "clip_ratio/low_mean": 4.8845648166206956e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.365693925796222e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16216.0, + "completions/mean_length": 7029.2265625, + "completions/mean_terminated_length": 6727.45947265625, + "completions/min_length": 851.0, + "completions/min_terminated_length": 851.0, + "entropy": 0.9139953926205635, + "epoch": 0.28334866605335784, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006375293247401714, + "learning_rate": 1e-05, + "loss": 0.0519, + "num_tokens": 267853880.0, + "reward": 0.4765625, + "reward_std": 0.27328038215637207, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000023365020752, + "sampling/importance_sampling_ratio/min": 0.010649868287146091, + "sampling/sampling_logp_difference/max": 4.542207717895508, + "sampling/sampling_logp_difference/mean": 0.020365029573440552, + "step": 308 + }, + { + "clip_ratio/high_max": 4.812504812434781e-06, + "clip_ratio/high_mean": 1.2031262031086953e-06, + "clip_ratio/low_mean": 2.5999243803198624e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.720237000630732e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 6188.0078125, + "completions/mean_terminated_length": 5943.30419921875, + "completions/min_length": 612.0, + "completions/min_terminated_length": 612.0, + "entropy": 0.7640773430466652, + "epoch": 0.2842686292548298, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003697809297591448, + "learning_rate": 1e-05, + "loss": 0.0733, + "num_tokens": 268665721.0, + "reward": 0.5078125, + "reward_std": 0.20699402689933777, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999372363090515, + "sampling/importance_sampling_ratio/min": 0.02927250787615776, + "sampling/sampling_logp_difference/max": 3.531106472015381, + "sampling/sampling_logp_difference/mean": 0.016581017524003983, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.1358927824621787e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1358927824621787e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 8128.21875, + "completions/mean_terminated_length": 7861.90283203125, + "completions/min_length": 1227.0, + "completions/min_terminated_length": 1227.0, + "entropy": 0.8218234181404114, + "epoch": 0.28518859245630174, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002286596456542611, + "learning_rate": 1e-05, + "loss": 0.0763, + "num_tokens": 269726181.0, + "reward": 0.375, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999798536300659, + "sampling/importance_sampling_ratio/min": 2.4969556307041785e-06, + "sampling/sampling_logp_difference/max": 12.90043830871582, + "sampling/sampling_logp_difference/mean": 0.019403984770178795, + "step": 310 + }, + { + "clip_ratio/high_max": 1.4808477317274082e-05, + "clip_ratio/high_mean": 3.7021193293185206e-06, + "clip_ratio/low_mean": 3.0363167581981543e-05, + "clip_ratio/low_min": 6.364238288369961e-06, + "clip_ratio/region_mean": 3.4065286854456645e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16256.0, + "completions/mean_length": 5673.3359375, + "completions/mean_terminated_length": 5503.32568359375, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "entropy": 0.9275510385632515, + "epoch": 0.2861085556577737, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00485506234690547, + "learning_rate": 1e-05, + "loss": 0.0193, + "num_tokens": 270470616.0, + "reward": 0.4921875, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 0.0009123464697040617, + "sampling/sampling_logp_difference/max": 6.999490737915039, + "sampling/sampling_logp_difference/mean": 0.01881871558725834, + "step": 311 + }, + { + "clip_ratio/high_max": 1.1274602456978755e-05, + "clip_ratio/high_mean": 3.6739949109687586e-06, + "clip_ratio/low_mean": 3.968570712231667e-05, + "clip_ratio/low_min": 3.4213767321489286e-06, + "clip_ratio/region_mean": 4.335970191959859e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16382.0, + "completions/mean_length": 6944.8984375, + "completions/mean_terminated_length": 6795.07177734375, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "entropy": 0.9335741624236107, + "epoch": 0.28702851885924563, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005874342750757933, + "learning_rate": 1e-05, + "loss": 0.032, + "num_tokens": 271377723.0, + "reward": 0.390625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000594854354858, + "sampling/importance_sampling_ratio/min": 4.3191710574319586e-05, + "sampling/sampling_logp_difference/max": 10.049861907958984, + "sampling/sampling_logp_difference/mean": 0.020590776577591896, + "step": 312 + }, + { + "clip_ratio/high_max": 1.264126694877632e-05, + "clip_ratio/high_mean": 3.16031673719408e-06, + "clip_ratio/low_mean": 3.206376845810155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.522408474054828e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15806.0, + "completions/mean_length": 7705.625, + "completions/mean_terminated_length": 7278.8193359375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.8491624072194099, + "epoch": 0.28794848206071755, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001684082904830575, + "learning_rate": 1e-05, + "loss": 0.06, + "num_tokens": 272384891.0, + "reward": 0.390625, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999479055404663, + "sampling/importance_sampling_ratio/min": 6.605865200981498e-05, + "sampling/sampling_logp_difference/max": 9.624967575073242, + "sampling/sampling_logp_difference/mean": 0.020136822015047073, + "step": 313 + }, + { + "clip_ratio/high_max": 9.772357770998497e-06, + "clip_ratio/high_mean": 2.443089442749624e-06, + "clip_ratio/low_mean": 3.8573590472879005e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.101667946088128e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6611.1484375, + "completions/mean_terminated_length": 6534.19677734375, + "completions/min_length": 1116.0, + "completions/min_terminated_length": 1116.0, + "entropy": 0.8867302760481834, + "epoch": 0.2888684452621895, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003692191792652011, + "learning_rate": 1e-05, + "loss": 0.1233, + "num_tokens": 273251630.0, + "reward": 0.3984375, + "reward_std": 0.27564430236816406, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999606609344482, + "sampling/importance_sampling_ratio/min": 0.0031062732450664043, + "sampling/sampling_logp_difference/max": 5.774331569671631, + "sampling/sampling_logp_difference/mean": 0.019237037748098373, + "step": 314 + }, + { + "clip_ratio/high_max": 3.0103737344688852e-05, + "clip_ratio/high_mean": 9.664363972206047e-06, + "clip_ratio/low_mean": 1.7575501146893657e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.723986426644842e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15786.0, + "completions/max_terminated_length": 15786.0, + "completions/mean_length": 6770.46875, + "completions/mean_terminated_length": 6770.46875, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.8252957463264465, + "epoch": 0.28978840846366144, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004167635925114155, + "learning_rate": 1e-05, + "loss": -0.0072, + "num_tokens": 274146482.0, + "reward": 0.5703125, + "reward_std": 0.23486016690731049, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000013828277588, + "sampling/importance_sampling_ratio/min": 0.00010247006866848096, + "sampling/sampling_logp_difference/max": 9.18593978881836, + "sampling/sampling_logp_difference/mean": 0.019684650003910065, + "step": 315 + }, + { + "clip_ratio/high_max": 6.529460733872838e-06, + "clip_ratio/high_mean": 1.6323651834682096e-06, + "clip_ratio/low_mean": 3.877351048231503e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.040587566578324e-05, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15827.0, + "completions/mean_length": 8210.859375, + "completions/mean_terminated_length": 7365.36181640625, + "completions/min_length": 891.0, + "completions/min_terminated_length": 891.0, + "entropy": 0.8118235394358635, + "epoch": 0.2907083716651334, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030363225378096104, + "learning_rate": 1e-05, + "loss": 0.0531, + "num_tokens": 275214040.0, + "reward": 0.3515625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998943209648132, + "sampling/importance_sampling_ratio/min": 0.002854935359209776, + "sampling/sampling_logp_difference/max": 5.858705997467041, + "sampling/sampling_logp_difference/mean": 0.019275270402431488, + "step": 316 + }, + { + "clip_ratio/high_max": 7.0800629146106075e-06, + "clip_ratio/high_mean": 1.7700157286526519e-06, + "clip_ratio/low_mean": 2.3981688286767167e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5751703674359305e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14900.0, + "completions/mean_length": 7072.8828125, + "completions/mean_terminated_length": 6849.41650390625, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.8018335327506065, + "epoch": 0.29162833486660533, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004777858033776283, + "learning_rate": 1e-05, + "loss": 0.0404, + "num_tokens": 276138049.0, + "reward": 0.453125, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999368190765381, + "sampling/importance_sampling_ratio/min": 0.0028502768836915493, + "sampling/sampling_logp_difference/max": 5.860339164733887, + "sampling/sampling_logp_difference/mean": 0.01849908009171486, + "step": 317 + }, + { + "clip_ratio/high_max": 2.259368602608447e-05, + "clip_ratio/high_mean": 5.648421506521117e-06, + "clip_ratio/low_mean": 4.28424866640853e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.849090737479855e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14447.0, + "completions/mean_length": 5889.8359375, + "completions/mean_terminated_length": 5723.26220703125, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "entropy": 0.7976400703191757, + "epoch": 0.29254829806807725, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030593445990234613, + "learning_rate": 1e-05, + "loss": 0.1331, + "num_tokens": 276910124.0, + "reward": 0.5859375, + "reward_std": 0.3621976971626282, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999091029167175, + "sampling/importance_sampling_ratio/min": 0.000139843366923742, + "sampling/sampling_logp_difference/max": 8.874987602233887, + "sampling/sampling_logp_difference/mean": 0.01834402233362198, + "step": 318 + }, + { + "clip_ratio/high_max": 1.4654247024736833e-05, + "clip_ratio/high_mean": 3.663561756184208e-06, + "clip_ratio/low_mean": 2.377464920755301e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7438210736363544e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16324.0, + "completions/mean_length": 7144.265625, + "completions/mean_terminated_length": 6689.85205078125, + "completions/min_length": 1200.0, + "completions/min_terminated_length": 1200.0, + "entropy": 0.8309404999017715, + "epoch": 0.2934682612695492, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004245694726705551, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 277843542.0, + "reward": 0.4453125, + "reward_std": 0.24541422724723816, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998534321784973, + "sampling/importance_sampling_ratio/min": 1.0131127055501565e-05, + "sampling/sampling_logp_difference/max": 11.499897956848145, + "sampling/sampling_logp_difference/mean": 0.01875344291329384, + "step": 319 + }, + { + "clip_ratio/high_max": 6.252500952541595e-06, + "clip_ratio/high_mean": 2.241558604509919e-06, + "clip_ratio/low_mean": 4.735765514851664e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9599213525652885e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15722.0, + "completions/mean_length": 6779.5234375, + "completions/mean_terminated_length": 6703.8974609375, + "completions/min_length": 767.0, + "completions/min_terminated_length": 767.0, + "entropy": 0.9584890529513359, + "epoch": 0.29438822447102114, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035574575886130333, + "learning_rate": 1e-05, + "loss": 0.0723, + "num_tokens": 278730129.0, + "reward": 0.3984375, + "reward_std": 0.32825323939323425, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999299645423889, + "sampling/importance_sampling_ratio/min": 0.005792221520096064, + "sampling/sampling_logp_difference/max": 5.151239395141602, + "sampling/sampling_logp_difference/mean": 0.02137477695941925, + "step": 320 + }, + { + "clip_ratio/high_max": 3.2948471016425174e-05, + "clip_ratio/high_mean": 9.518853403278627e-06, + "clip_ratio/low_mean": 2.195712454522436e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.14759782895635e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15892.0, + "completions/max_terminated_length": 15892.0, + "completions/mean_length": 5582.9765625, + "completions/mean_terminated_length": 5582.9765625, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "entropy": 0.8629376217722893, + "epoch": 0.2953081876724931, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037982752546668053, + "learning_rate": 1e-05, + "loss": 0.0331, + "num_tokens": 279462542.0, + "reward": 0.5546875, + "reward_std": 0.3164186477661133, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999780058860779, + "sampling/importance_sampling_ratio/min": 0.0021874974481761456, + "sampling/sampling_logp_difference/max": 6.124997138977051, + "sampling/sampling_logp_difference/mean": 0.01906203106045723, + "step": 321 + }, + { + "clip_ratio/high_max": 1.1029473625967512e-05, + "clip_ratio/high_mean": 2.757368406491878e-06, + "clip_ratio/low_mean": 5.367386921761863e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.6431237737797346e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16204.0, + "completions/mean_length": 6942.2578125, + "completions/mean_terminated_length": 6477.90966796875, + "completions/min_length": 1156.0, + "completions/min_terminated_length": 1156.0, + "entropy": 0.8147861957550049, + "epoch": 0.29622815087396503, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0027678858023136854, + "learning_rate": 1e-05, + "loss": 0.0585, + "num_tokens": 280370207.0, + "reward": 0.4375, + "reward_std": 0.3066929280757904, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998471736907959, + "sampling/importance_sampling_ratio/min": 0.00023058800434228033, + "sampling/sampling_logp_difference/max": 8.3748779296875, + "sampling/sampling_logp_difference/mean": 0.01940828748047352, + "step": 322 + }, + { + "clip_ratio/high_max": 2.6367894406575942e-05, + "clip_ratio/high_mean": 8.765707434577052e-06, + "clip_ratio/low_mean": 3.232976985145797e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.109547796815605e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15782.0, + "completions/mean_length": 6242.53125, + "completions/mean_terminated_length": 5915.38671875, + "completions/min_length": 1220.0, + "completions/min_terminated_length": 1220.0, + "entropy": 0.878915011882782, + "epoch": 0.297148114075437, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00577945914119482, + "learning_rate": 1e-05, + "loss": 0.0839, + "num_tokens": 281189491.0, + "reward": 0.515625, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999679327011108, + "sampling/importance_sampling_ratio/min": 9.611724817659706e-05, + "sampling/sampling_logp_difference/max": 9.2499418258667, + "sampling/sampling_logp_difference/mean": 0.01948760263621807, + "step": 323 + }, + { + "clip_ratio/high_max": 3.50839609382092e-05, + "clip_ratio/high_mean": 1.1664920634757436e-05, + "clip_ratio/low_mean": 1.833109013205103e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9996010880495305e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16310.0, + "completions/mean_length": 7004.015625, + "completions/mean_terminated_length": 6622.71533203125, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "entropy": 0.7964659407734871, + "epoch": 0.2980680772769089, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0014128695474937558, + "learning_rate": 1e-05, + "loss": 0.0648, + "num_tokens": 282103997.0, + "reward": 0.4140625, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999747276306152, + "sampling/importance_sampling_ratio/min": 0.0024504722096025944, + "sampling/sampling_logp_difference/max": 6.011474609375, + "sampling/sampling_logp_difference/mean": 0.019019678235054016, + "step": 324 + }, + { + "clip_ratio/high_max": 1.832260545597819e-05, + "clip_ratio/high_mean": 4.580651363994548e-06, + "clip_ratio/low_mean": 5.309064226821647e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.767129368905444e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15318.0, + "completions/mean_length": 7822.6953125, + "completions/mean_terminated_length": 7546.52392578125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 0.8571138679981232, + "epoch": 0.29898804047838085, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002476039342582226, + "learning_rate": 1e-05, + "loss": 0.0515, + "num_tokens": 283122382.0, + "reward": 0.4609375, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999314546585083, + "sampling/importance_sampling_ratio/min": 0.0009774373611435294, + "sampling/sampling_logp_difference/max": 6.930576324462891, + "sampling/sampling_logp_difference/mean": 0.020557202398777008, + "step": 325 + }, + { + "clip_ratio/high_max": 5.738419986300869e-06, + "clip_ratio/high_mean": 1.4346049965752172e-06, + "clip_ratio/low_mean": 4.19679121819172e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3402517292179255e-05, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16319.0, + "completions/mean_length": 7738.8984375, + "completions/mean_terminated_length": 6844.57763671875, + "completions/min_length": 897.0, + "completions/min_terminated_length": 897.0, + "entropy": 0.7839021533727646, + "epoch": 0.2999080036798528, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005309853237122297, + "learning_rate": 1e-05, + "loss": 0.043, + "num_tokens": 284130081.0, + "reward": 0.5234375, + "reward_std": 0.26539456844329834, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998971223831177, + "sampling/importance_sampling_ratio/min": 0.0001319014554610476, + "sampling/sampling_logp_difference/max": 8.933455467224121, + "sampling/sampling_logp_difference/mean": 0.01873316988348961, + "step": 326 + }, + { + "clip_ratio/high_max": 1.007085802484653e-05, + "clip_ratio/high_mean": 2.5177145062116324e-06, + "clip_ratio/low_mean": 4.043528815600439e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.295300277590286e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15952.0, + "completions/mean_length": 7102.2421875, + "completions/mean_terminated_length": 6954.9130859375, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "entropy": 0.8530801385641098, + "epoch": 0.30082796688132474, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004228116944432259, + "learning_rate": 1e-05, + "loss": 0.0574, + "num_tokens": 285058720.0, + "reward": 0.5078125, + "reward_std": 0.27092626690864563, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999712705612183, + "sampling/importance_sampling_ratio/min": 0.00012956927821505815, + "sampling/sampling_logp_difference/max": 8.951294898986816, + "sampling/sampling_logp_difference/mean": 0.019325006753206253, + "step": 327 + }, + { + "clip_ratio/high_max": 4.06874551117653e-06, + "clip_ratio/high_mean": 1.0171863777941326e-06, + "clip_ratio/low_mean": 3.661125703047219e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.762844340826632e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15594.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 6583.4765625, + "completions/mean_terminated_length": 6583.4765625, + "completions/min_length": 718.0, + "completions/min_terminated_length": 718.0, + "entropy": 1.021921381354332, + "epoch": 0.3017479300827967, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004967439454048872, + "learning_rate": 1e-05, + "loss": 0.0374, + "num_tokens": 285919765.0, + "reward": 0.328125, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00004243850708, + "sampling/importance_sampling_ratio/min": 0.016675354912877083, + "sampling/sampling_logp_difference/max": 4.093823432922363, + "sampling/sampling_logp_difference/mean": 0.021393200382590294, + "step": 328 + }, + { + "clip_ratio/high_max": 1.2215251445013564e-05, + "clip_ratio/high_mean": 3.053812861253391e-06, + "clip_ratio/low_mean": 4.05305947879242e-05, + "clip_ratio/low_min": 4.215567059873138e-06, + "clip_ratio/region_mean": 4.358440742180392e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16299.0, + "completions/mean_length": 7770.5859375, + "completions/mean_terminated_length": 7346.97509765625, + "completions/min_length": 1040.0, + "completions/min_terminated_length": 1040.0, + "entropy": 1.0466903448104858, + "epoch": 0.30266789328426863, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004189736675471067, + "learning_rate": 1e-05, + "loss": 0.0805, + "num_tokens": 286935512.0, + "reward": 0.3828125, + "reward_std": 0.2369818240404129, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999797344207764, + "sampling/importance_sampling_ratio/min": 0.011683559976518154, + "sampling/sampling_logp_difference/max": 4.449572563171387, + "sampling/sampling_logp_difference/mean": 0.021805983036756516, + "step": 329 + }, + { + "clip_ratio/high_max": 2.0567378214764176e-05, + "clip_ratio/high_mean": 5.141844553691044e-06, + "clip_ratio/low_mean": 1.8177100628236076e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3318944840866607e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15758.0, + "completions/mean_length": 5689.2421875, + "completions/mean_terminated_length": 5432.568359375, + "completions/min_length": 1194.0, + "completions/min_terminated_length": 1194.0, + "entropy": 0.7778806164860725, + "epoch": 0.30358785648574055, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0032866497058421373, + "learning_rate": 1e-05, + "loss": 0.0635, + "num_tokens": 287681943.0, + "reward": 0.640625, + "reward_std": 0.22567126154899597, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999940812587738, + "sampling/importance_sampling_ratio/min": 0.00038077132194302976, + "sampling/sampling_logp_difference/max": 7.873311519622803, + "sampling/sampling_logp_difference/mean": 0.01789461076259613, + "step": 330 + }, + { + "clip_ratio/high_max": 3.109086901531555e-05, + "clip_ratio/high_mean": 7.772717253828887e-06, + "clip_ratio/low_mean": 3.1423560130861006e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.919627738468989e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13820.0, + "completions/mean_length": 6288.1875, + "completions/mean_terminated_length": 6127.93701171875, + "completions/min_length": 751.0, + "completions/min_terminated_length": 751.0, + "entropy": 0.7709921672940254, + "epoch": 0.3045078196872125, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023572889622300863, + "learning_rate": 1e-05, + "loss": 0.0746, + "num_tokens": 288506735.0, + "reward": 0.484375, + "reward_std": 0.3066929280757904, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999474287033081, + "sampling/importance_sampling_ratio/min": 0.000430915504693985, + "sampling/sampling_logp_difference/max": 7.749598503112793, + "sampling/sampling_logp_difference/mean": 0.017407266423106194, + "step": 331 + }, + { + "clip_ratio/high_max": 3.4638953366084024e-05, + "clip_ratio/high_mean": 9.51674803673086e-06, + "clip_ratio/low_mean": 6.26047980176736e-05, + "clip_ratio/low_min": 5.51267930859467e-06, + "clip_ratio/region_mean": 7.212154741864651e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16318.0, + "completions/mean_length": 6775.0234375, + "completions/mean_terminated_length": 6465.05615234375, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "entropy": 0.9338318258523941, + "epoch": 0.30542778288868444, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034220058005303144, + "learning_rate": 1e-05, + "loss": 0.0986, + "num_tokens": 289395498.0, + "reward": 0.390625, + "reward_std": 0.34533774852752686, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999603033065796, + "sampling/importance_sampling_ratio/min": 0.0317598432302475, + "sampling/sampling_logp_difference/max": 3.449552536010742, + "sampling/sampling_logp_difference/mean": 0.019930530339479446, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 7.159989991123439e-05, + "clip_ratio/low_min": 1.5592839645250933e-05, + "clip_ratio/region_mean": 7.159989991123439e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15295.0, + "completions/mean_length": 7142.9375, + "completions/mean_terminated_length": 6844.83837890625, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "entropy": 0.971405878663063, + "epoch": 0.3063477460901564, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002513247774913907, + "learning_rate": 1e-05, + "loss": 0.0262, + "num_tokens": 290329082.0, + "reward": 0.328125, + "reward_std": 0.28930896520614624, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999737739562988, + "sampling/importance_sampling_ratio/min": 3.152207455059397e-07, + "sampling/sampling_logp_difference/max": 14.969992637634277, + "sampling/sampling_logp_difference/mean": 0.022366533055901527, + "step": 333 + }, + { + "clip_ratio/high_max": 1.6507752206962323e-05, + "clip_ratio/high_mean": 4.126938051740581e-06, + "clip_ratio/low_mean": 1.7493430505055585e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1620368215735652e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15581.0, + "completions/mean_length": 6412.2109375, + "completions/mean_terminated_length": 6333.69287109375, + "completions/min_length": 544.0, + "completions/min_terminated_length": 544.0, + "entropy": 0.9136044681072235, + "epoch": 0.30726770929162833, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0056767817586660385, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 291170133.0, + "reward": 0.421875, + "reward_std": 0.15650184452533722, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999720454216003, + "sampling/importance_sampling_ratio/min": 0.000458698661532253, + "sampling/sampling_logp_difference/max": 7.687117099761963, + "sampling/sampling_logp_difference/mean": 0.020012658089399338, + "step": 334 + }, + { + "clip_ratio/high_max": 8.26085442895419e-06, + "clip_ratio/high_mean": 2.0652136072385474e-06, + "clip_ratio/low_mean": 3.6938338666914206e-05, + "clip_ratio/low_min": 5.699044777429663e-06, + "clip_ratio/region_mean": 3.900355193309224e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16111.0, + "completions/mean_length": 8066.1015625, + "completions/mean_terminated_length": 7797.7822265625, + "completions/min_length": 497.0, + "completions/min_terminated_length": 497.0, + "entropy": 1.0789504647254944, + "epoch": 0.30818767249310025, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00243841833434999, + "learning_rate": 1e-05, + "loss": 0.0432, + "num_tokens": 292222082.0, + "reward": 0.3046875, + "reward_std": 0.2688046097755432, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999664425849915, + "sampling/importance_sampling_ratio/min": 8.481895929435268e-05, + "sampling/sampling_logp_difference/max": 9.374991416931152, + "sampling/sampling_logp_difference/mean": 0.023650091141462326, + "step": 335 + }, + { + "clip_ratio/high_max": 5.320054697222076e-06, + "clip_ratio/high_mean": 1.330013674305519e-06, + "clip_ratio/low_mean": 1.9117383317279746e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0447396991585265e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15176.0, + "completions/mean_length": 6836.046875, + "completions/mean_terminated_length": 6606.896484375, + "completions/min_length": 785.0, + "completions/min_terminated_length": 785.0, + "entropy": 1.218759760260582, + "epoch": 0.3091076356945722, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0020856577903032303, + "learning_rate": 1e-05, + "loss": 0.0372, + "num_tokens": 293115984.0, + "reward": 0.21875, + "reward_std": 0.18990948796272278, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999911785125732, + "sampling/importance_sampling_ratio/min": 2.784526441246271e-05, + "sampling/sampling_logp_difference/max": 10.488847732543945, + "sampling/sampling_logp_difference/mean": 0.022012067958712578, + "step": 336 + }, + { + "clip_ratio/high_max": 2.5695502699818462e-05, + "clip_ratio/high_mean": 7.549717793153832e-06, + "clip_ratio/low_mean": 4.6741323160404136e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.429104089671455e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15796.0, + "completions/mean_length": 7501.9921875, + "completions/mean_terminated_length": 7140.9345703125, + "completions/min_length": 1237.0, + "completions/min_terminated_length": 1237.0, + "entropy": 0.8940394818782806, + "epoch": 0.31002759889604414, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005163854919373989, + "learning_rate": 1e-05, + "loss": 0.0354, + "num_tokens": 294099503.0, + "reward": 0.328125, + "reward_std": 0.30904707312583923, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999276399612427, + "sampling/importance_sampling_ratio/min": 0.0006545600481331348, + "sampling/sampling_logp_difference/max": 7.331547260284424, + "sampling/sampling_logp_difference/mean": 0.020813245326280594, + "step": 337 + }, + { + "clip_ratio/high_max": 3.1606674838258186e-05, + "clip_ratio/high_mean": 9.45794374729303e-06, + "clip_ratio/low_mean": 4.5567895540443715e-05, + "clip_ratio/low_min": 4.458871444512624e-06, + "clip_ratio/region_mean": 5.502583962879726e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16034.0, + "completions/mean_length": 7204.828125, + "completions/mean_terminated_length": 6908.7255859375, + "completions/min_length": 846.0, + "completions/min_terminated_length": 846.0, + "entropy": 0.9961872175335884, + "epoch": 0.3109475620975161, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029277894645929337, + "learning_rate": 1e-05, + "loss": 0.0963, + "num_tokens": 295042105.0, + "reward": 0.390625, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000677108764648, + "sampling/importance_sampling_ratio/min": 1.8970265955431387e-05, + "sampling/sampling_logp_difference/max": 10.872637748718262, + "sampling/sampling_logp_difference/mean": 0.020187582820653915, + "step": 338 + }, + { + "clip_ratio/high_max": 1.7963964182854397e-05, + "clip_ratio/high_mean": 5.194059781388205e-06, + "clip_ratio/low_mean": 1.8380221035840805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.357428081722901e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15856.0, + "completions/mean_length": 6256.859375, + "completions/mean_terminated_length": 6013.80810546875, + "completions/min_length": 1006.0, + "completions/min_terminated_length": 1006.0, + "entropy": 0.9293600022792816, + "epoch": 0.31186752529898804, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0032952844630926847, + "learning_rate": 1e-05, + "loss": 0.0473, + "num_tokens": 295867039.0, + "reward": 0.46875, + "reward_std": 0.24670752882957458, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999649524688721, + "sampling/importance_sampling_ratio/min": 7.995560008566827e-05, + "sampling/sampling_logp_difference/max": 9.434039115905762, + "sampling/sampling_logp_difference/mean": 0.019491540268063545, + "step": 339 + }, + { + "clip_ratio/high_max": 7.577551059512189e-06, + "clip_ratio/high_mean": 1.8943877648780472e-06, + "clip_ratio/low_mean": 2.7479814093567256e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9374201631071628e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15412.0, + "completions/mean_length": 7397.84375, + "completions/mean_terminated_length": 7032.552734375, + "completions/min_length": 923.0, + "completions/min_terminated_length": 923.0, + "entropy": 0.8508890569210052, + "epoch": 0.31278748850046, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029417150653898716, + "learning_rate": 1e-05, + "loss": 0.0621, + "num_tokens": 296832843.0, + "reward": 0.375, + "reward_std": 0.2867125868797302, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000183582305908, + "sampling/importance_sampling_ratio/min": 1.7783446310204454e-05, + "sampling/sampling_logp_difference/max": 10.93724250793457, + "sampling/sampling_logp_difference/mean": 0.01975393109023571, + "step": 340 + }, + { + "clip_ratio/high_max": 3.281225508544594e-05, + "clip_ratio/high_mean": 1.3302957199812226e-05, + "clip_ratio/low_mean": 5.109179869577929e-05, + "clip_ratio/low_min": 6.657612175331451e-06, + "clip_ratio/region_mean": 6.439475532715733e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14983.0, + "completions/mean_length": 6897.765625, + "completions/mean_terminated_length": 6823.07080078125, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "entropy": 0.9046694040298462, + "epoch": 0.3137074517019319, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026788609102368355, + "learning_rate": 1e-05, + "loss": 0.0664, + "num_tokens": 297735285.0, + "reward": 0.421875, + "reward_std": 0.3266732692718506, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999909520149231, + "sampling/importance_sampling_ratio/min": 0.001710799871943891, + "sampling/sampling_logp_difference/max": 6.370794296264648, + "sampling/sampling_logp_difference/mean": 0.020578179508447647, + "step": 341 + }, + { + "clip_ratio/high_max": 1.7319889593636617e-05, + "clip_ratio/high_mean": 5.168538336874917e-06, + "clip_ratio/low_mean": 7.019768918326008e-05, + "clip_ratio/low_min": 2.541147478041239e-05, + "clip_ratio/region_mean": 7.53662266106403e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15525.0, + "completions/mean_length": 6971.9921875, + "completions/mean_terminated_length": 6509.10595703125, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "entropy": 0.8658201694488525, + "epoch": 0.31462741490340385, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005915141198784113, + "learning_rate": 1e-05, + "loss": 0.0923, + "num_tokens": 298645124.0, + "reward": 0.3984375, + "reward_std": 0.3742823898792267, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999268651008606, + "sampling/importance_sampling_ratio/min": 0.000970841443631798, + "sampling/sampling_logp_difference/max": 6.937347412109375, + "sampling/sampling_logp_difference/mean": 0.01906151883304119, + "step": 342 + }, + { + "clip_ratio/high_max": 1.8332865238335216e-05, + "clip_ratio/high_mean": 4.583216309583804e-06, + "clip_ratio/low_mean": 6.167940273371642e-05, + "clip_ratio/low_min": 5.969151516183047e-06, + "clip_ratio/region_mean": 6.626261847486603e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15054.0, + "completions/mean_length": 6545.6953125, + "completions/mean_terminated_length": 5889.80859375, + "completions/min_length": 800.0, + "completions/min_terminated_length": 800.0, + "entropy": 0.779609851539135, + "epoch": 0.3155473781048758, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0032792428974062204, + "learning_rate": 1e-05, + "loss": 0.097, + "num_tokens": 299503781.0, + "reward": 0.609375, + "reward_std": 0.38293448090553284, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999361634254456, + "sampling/importance_sampling_ratio/min": 0.002187495119869709, + "sampling/sampling_logp_difference/max": 6.124998092651367, + "sampling/sampling_logp_difference/mean": 0.017413027584552765, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.46246323235755e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.46246323235755e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15318.0, + "completions/mean_length": 7226.515625, + "completions/mean_terminated_length": 7006.736328125, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "entropy": 0.9573849961161613, + "epoch": 0.31646734130634774, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005092279519885778, + "learning_rate": 1e-05, + "loss": 0.1102, + "num_tokens": 300447903.0, + "reward": 0.5390625, + "reward_std": 0.2998581528663635, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999373555183411, + "sampling/importance_sampling_ratio/min": 0.000627054600045085, + "sampling/sampling_logp_difference/max": 7.374476909637451, + "sampling/sampling_logp_difference/mean": 0.021570835262537003, + "step": 344 + }, + { + "clip_ratio/high_max": 5.487269390869187e-06, + "clip_ratio/high_mean": 1.3718173477172968e-06, + "clip_ratio/low_mean": 4.7280102080549113e-05, + "clip_ratio/low_min": 1.0166083029616857e-05, + "clip_ratio/region_mean": 4.865191931457957e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14967.0, + "completions/mean_length": 5755.171875, + "completions/mean_terminated_length": 5323.10546875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8482184633612633, + "epoch": 0.3173873045078197, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005033228080719709, + "learning_rate": 1e-05, + "loss": 0.0655, + "num_tokens": 301206021.0, + "reward": 0.390625, + "reward_std": 0.3424547016620636, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999947547912598, + "sampling/importance_sampling_ratio/min": 0.0014573346124961972, + "sampling/sampling_logp_difference/max": 6.531146049499512, + "sampling/sampling_logp_difference/mean": 0.018870476633310318, + "step": 345 + }, + { + "clip_ratio/high_max": 5.421346941147931e-06, + "clip_ratio/high_mean": 1.3553367352869827e-06, + "clip_ratio/low_mean": 1.6510994441887306e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.786633117717429e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 7098.7265625, + "completions/mean_terminated_length": 6875.88037109375, + "completions/min_length": 947.0, + "completions/min_terminated_length": 947.0, + "entropy": 0.87320177257061, + "epoch": 0.31830726770929163, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.007659573573619127, + "learning_rate": 1e-05, + "loss": 0.0707, + "num_tokens": 302133890.0, + "reward": 0.421875, + "reward_std": 0.23410367965698242, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000004768371582, + "sampling/importance_sampling_ratio/min": 0.0012466582702472806, + "sampling/sampling_logp_difference/max": 6.687288761138916, + "sampling/sampling_logp_difference/mean": 0.019994346424937248, + "step": 346 + }, + { + "clip_ratio/high_max": 1.1556229310372146e-05, + "clip_ratio/high_mean": 2.8890573275930365e-06, + "clip_ratio/low_mean": 3.8744643916288624e-05, + "clip_ratio/low_min": 6.108287834649673e-06, + "clip_ratio/region_mean": 4.1633702039689524e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16139.0, + "completions/mean_length": 6399.96875, + "completions/mean_terminated_length": 6077.90283203125, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9481896534562111, + "epoch": 0.31922723091076355, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0014135175151750445, + "learning_rate": 1e-05, + "loss": 0.0487, + "num_tokens": 302972566.0, + "reward": 0.4140625, + "reward_std": 0.19674429297447205, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999452829360962, + "sampling/importance_sampling_ratio/min": 0.0025698256213217974, + "sampling/sampling_logp_difference/max": 5.963917255401611, + "sampling/sampling_logp_difference/mean": 0.02073008380830288, + "step": 347 + }, + { + "clip_ratio/high_max": 6.59491388432798e-06, + "clip_ratio/high_mean": 2.545892130001448e-06, + "clip_ratio/low_mean": 4.620846755187813e-05, + "clip_ratio/low_min": 6.243132702365983e-06, + "clip_ratio/region_mean": 4.875435956819274e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16095.0, + "completions/mean_length": 7298.078125, + "completions/mean_terminated_length": 7226.53564453125, + "completions/min_length": 1009.0, + "completions/min_terminated_length": 1009.0, + "entropy": 0.8719206526875496, + "epoch": 0.3201471941122355, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027898226398974657, + "learning_rate": 1e-05, + "loss": 0.0126, + "num_tokens": 303925976.0, + "reward": 0.484375, + "reward_std": 0.2522490322589874, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999772310256958, + "sampling/importance_sampling_ratio/min": 0.005236432887613773, + "sampling/sampling_logp_difference/max": 5.252114772796631, + "sampling/sampling_logp_difference/mean": 0.020944103598594666, + "step": 348 + }, + { + "clip_ratio/high_max": 1.052124343914329e-05, + "clip_ratio/high_mean": 2.6303108597858227e-06, + "clip_ratio/low_mean": 2.010384196182713e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.273415248055244e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14980.0, + "completions/mean_length": 5667.0390625, + "completions/mean_terminated_length": 5496.9287109375, + "completions/min_length": 974.0, + "completions/min_terminated_length": 974.0, + "entropy": 0.8791451379656792, + "epoch": 0.32106715731370744, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0012764945859089494, + "learning_rate": 1e-05, + "loss": 0.0248, + "num_tokens": 304675157.0, + "reward": 0.390625, + "reward_std": 0.17965976893901825, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000383853912354, + "sampling/importance_sampling_ratio/min": 5.054428584116977e-06, + "sampling/sampling_logp_difference/max": 12.195245742797852, + "sampling/sampling_logp_difference/mean": 0.018928447738289833, + "step": 349 + }, + { + "clip_ratio/high_max": 9.578045592206763e-06, + "clip_ratio/high_mean": 2.3945113980516908e-06, + "clip_ratio/low_mean": 3.1114799753595435e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.350931149270764e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15354.0, + "completions/max_terminated_length": 15354.0, + "completions/mean_length": 5874.4453125, + "completions/mean_terminated_length": 5874.4453125, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.9577538818120956, + "epoch": 0.3219871205151794, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00509974779561162, + "learning_rate": 1e-05, + "loss": 0.0221, + "num_tokens": 305447038.0, + "reward": 0.515625, + "reward_std": 0.24777325987815857, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999423027038574, + "sampling/importance_sampling_ratio/min": 0.004791648127138615, + "sampling/sampling_logp_difference/max": 5.340880870819092, + "sampling/sampling_logp_difference/mean": 0.02114470861852169, + "step": 350 + }, + { + "clip_ratio/high_max": 1.0903062275247066e-05, + "clip_ratio/high_mean": 2.7257655688117666e-06, + "clip_ratio/low_mean": 4.784364205079328e-05, + "clip_ratio/low_min": 3.861600362142781e-06, + "clip_ratio/region_mean": 5.056940744907479e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15670.0, + "completions/mean_length": 6197.5703125, + "completions/mean_terminated_length": 6035.88134765625, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "entropy": 0.8665244281291962, + "epoch": 0.32290708371665133, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030849494505673647, + "learning_rate": 1e-05, + "loss": 0.06, + "num_tokens": 306258023.0, + "reward": 0.515625, + "reward_std": 0.3748064339160919, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998056888580322, + "sampling/importance_sampling_ratio/min": 0.000830297009088099, + "sampling/sampling_logp_difference/max": 7.093727111816406, + "sampling/sampling_logp_difference/mean": 0.021017421036958694, + "step": 351 + }, + { + "clip_ratio/high_max": 1.4299712574938894e-05, + "clip_ratio/high_mean": 4.3520980170796975e-06, + "clip_ratio/low_mean": 6.213493452378316e-05, + "clip_ratio/low_min": 1.0056635801447555e-05, + "clip_ratio/region_mean": 6.648703174505499e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16276.0, + "completions/mean_length": 7522.578125, + "completions/mean_terminated_length": 7381.9208984375, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "entropy": 0.8185881152749062, + "epoch": 0.32382704691812325, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002946985885500908, + "learning_rate": 1e-05, + "loss": 0.076, + "num_tokens": 307240305.0, + "reward": 0.3125, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + "sampling/importance_sampling_ratio/min": 0.005127199459820986, + "sampling/sampling_logp_difference/max": 5.273195743560791, + "sampling/sampling_logp_difference/mean": 0.01965932548046112, + "step": 352 + }, + { + "clip_ratio/high_max": 1.693051035545068e-05, + "clip_ratio/high_mean": 5.08456730585749e-06, + "clip_ratio/low_mean": 4.2052345861520735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.713691282631771e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14090.0, + "completions/mean_length": 6403.2265625, + "completions/mean_terminated_length": 6163.6884765625, + "completions/min_length": 552.0, + "completions/min_terminated_length": 552.0, + "entropy": 0.8359840363264084, + "epoch": 0.3247470101195952, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0031181599479168653, + "learning_rate": 1e-05, + "loss": 0.072, + "num_tokens": 308079318.0, + "reward": 0.5, + "reward_std": 0.27145031094551086, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999215602874756, + "sampling/importance_sampling_ratio/min": 6.73715621815063e-05, + "sampling/sampling_logp_difference/max": 9.605287551879883, + "sampling/sampling_logp_difference/mean": 0.01963040418922901, + "step": 353 + }, + { + "clip_ratio/high_max": 1.3988919135954347e-05, + "clip_ratio/high_mean": 3.497229783988587e-06, + "clip_ratio/low_mean": 6.722658486069122e-05, + "clip_ratio/low_min": 1.858519090092159e-05, + "clip_ratio/region_mean": 7.072381458783639e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16148.0, + "completions/mean_length": 7954.03125, + "completions/mean_terminated_length": 7751.71240234375, + "completions/min_length": 632.0, + "completions/min_terminated_length": 632.0, + "entropy": 0.905990719795227, + "epoch": 0.32566697332106714, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002656223252415657, + "learning_rate": 1e-05, + "loss": 0.1022, + "num_tokens": 309117770.0, + "reward": 0.3828125, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999536275863647, + "sampling/importance_sampling_ratio/min": 0.0003354826185386628, + "sampling/sampling_logp_difference/max": 7.999940395355225, + "sampling/sampling_logp_difference/mean": 0.020741507411003113, + "step": 354 + }, + { + "clip_ratio/high_max": 1.7610595023143105e-05, + "clip_ratio/high_mean": 4.402648755785776e-06, + "clip_ratio/low_mean": 4.337988764291367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.778253651238629e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16272.0, + "completions/mean_length": 6630.09375, + "completions/mean_terminated_length": 6315.45166015625, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "entropy": 0.870736837387085, + "epoch": 0.3265869365225391, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0060529084876179695, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 309988894.0, + "reward": 0.515625, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998822212219238, + "sampling/importance_sampling_ratio/min": 2.2177453502081335e-05, + "sampling/sampling_logp_difference/max": 10.716434478759766, + "sampling/sampling_logp_difference/mean": 0.02060208097100258, + "step": 355 + }, + { + "clip_ratio/high_max": 1.0448093235027045e-05, + "clip_ratio/high_mean": 2.6120233087567613e-06, + "clip_ratio/low_mean": 3.1030769946482906e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.364279325523967e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15920.0, + "completions/max_terminated_length": 15920.0, + "completions/mean_length": 6679.6171875, + "completions/mean_terminated_length": 6679.6171875, + "completions/min_length": 611.0, + "completions/min_terminated_length": 611.0, + "entropy": 0.9812518879771233, + "epoch": 0.32750689972401104, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00400698184967041, + "learning_rate": 1e-05, + "loss": 0.0605, + "num_tokens": 310864013.0, + "reward": 0.421875, + "reward_std": 0.3295465111732483, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999049305915833, + "sampling/importance_sampling_ratio/min": 0.0020593837834894657, + "sampling/sampling_logp_difference/max": 6.1853485107421875, + "sampling/sampling_logp_difference/mean": 0.02098071575164795, + "step": 356 + }, + { + "clip_ratio/high_max": 2.124982574969181e-05, + "clip_ratio/high_mean": 7.736592579021817e-06, + "clip_ratio/low_mean": 2.900951585615985e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.674610888992902e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14541.0, + "completions/mean_length": 5523.796875, + "completions/mean_terminated_length": 5173.4677734375, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "entropy": 0.9120645374059677, + "epoch": 0.32842686292548295, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005929585546255112, + "learning_rate": 1e-05, + "loss": 0.0362, + "num_tokens": 311589987.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998446702957153, + "sampling/importance_sampling_ratio/min": 0.0010661041596904397, + "sampling/sampling_logp_difference/max": 6.843744277954102, + "sampling/sampling_logp_difference/mean": 0.019948206841945648, + "step": 357 + }, + { + "clip_ratio/high_max": 2.4486997745043482e-05, + "clip_ratio/high_mean": 8.219769085826556e-06, + "clip_ratio/low_mean": 5.346400575945154e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.168377467474784e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15401.0, + "completions/mean_length": 6361.3671875, + "completions/mean_terminated_length": 6282.44873046875, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "entropy": 0.8044678047299385, + "epoch": 0.32934682612695493, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.006622390355914831, + "learning_rate": 1e-05, + "loss": 0.1023, + "num_tokens": 312424034.0, + "reward": 0.5078125, + "reward_std": 0.3724474310874939, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000219345092773, + "sampling/importance_sampling_ratio/min": 0.0003157092141918838, + "sampling/sampling_logp_difference/max": 8.060688972473145, + "sampling/sampling_logp_difference/mean": 0.018907658755779266, + "step": 358 + }, + { + "clip_ratio/high_max": 1.0407376748844399e-05, + "clip_ratio/high_mean": 2.6018441872110998e-06, + "clip_ratio/low_mean": 5.925514369664597e-05, + "clip_ratio/low_min": 1.3324347946763737e-05, + "clip_ratio/region_mean": 6.185698703120579e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15883.0, + "completions/mean_length": 7109.0, + "completions/mean_terminated_length": 7035.96826171875, + "completions/min_length": 611.0, + "completions/min_terminated_length": 611.0, + "entropy": 0.9167275875806808, + "epoch": 0.33026678932842685, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004639944992959499, + "learning_rate": 1e-05, + "loss": 0.0861, + "num_tokens": 313353346.0, + "reward": 0.4140625, + "reward_std": 0.3826971650123596, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999389052391052, + "sampling/importance_sampling_ratio/min": 0.0019070414127781987, + "sampling/sampling_logp_difference/max": 6.262202262878418, + "sampling/sampling_logp_difference/mean": 0.02155841514468193, + "step": 359 + }, + { + "clip_ratio/high_max": 3.959046694035351e-05, + "clip_ratio/high_mean": 1.0912523691786191e-05, + "clip_ratio/low_mean": 3.3944450819944905e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.485697365907981e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15795.0, + "completions/mean_length": 6314.2734375, + "completions/mean_terminated_length": 6072.60009765625, + "completions/min_length": 665.0, + "completions/min_terminated_length": 665.0, + "entropy": 0.8780038207769394, + "epoch": 0.3311867525298988, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.007643720600754023, + "learning_rate": 1e-05, + "loss": 0.0873, + "num_tokens": 314180717.0, + "reward": 0.4609375, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999802112579346, + "sampling/importance_sampling_ratio/min": 0.021285315975546837, + "sampling/sampling_logp_difference/max": 3.8497378826141357, + "sampling/sampling_logp_difference/mean": 0.01964358240365982, + "step": 360 + }, + { + "clip_ratio/high_max": 3.065382111344661e-05, + "clip_ratio/high_mean": 9.187473835936544e-06, + "clip_ratio/low_mean": 4.137891801292426e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.056639065514901e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16167.0, + "completions/mean_length": 6718.2265625, + "completions/mean_terminated_length": 6486.24853515625, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "entropy": 0.8326799497008324, + "epoch": 0.33210671573137074, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0050973957404494286, + "learning_rate": 1e-05, + "loss": 0.0109, + "num_tokens": 315060842.0, + "reward": 0.5078125, + "reward_std": 0.3521803915500641, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000014066696167, + "sampling/importance_sampling_ratio/min": 0.0009130688849836588, + "sampling/sampling_logp_difference/max": 6.998699188232422, + "sampling/sampling_logp_difference/mean": 0.019501537084579468, + "step": 361 + }, + { + "clip_ratio/high_max": 8.624853762739804e-06, + "clip_ratio/high_mean": 2.156213440684951e-06, + "clip_ratio/low_mean": 1.8797969062234188e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0954182048171788e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16128.0, + "completions/mean_length": 8666.8359375, + "completions/mean_terminated_length": 7941.291015625, + "completions/min_length": 565.0, + "completions/min_terminated_length": 565.0, + "entropy": 0.9526705741882324, + "epoch": 0.3330266789328427, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0019092690199613571, + "learning_rate": 1e-05, + "loss": 0.036, + "num_tokens": 316190325.0, + "reward": 0.234375, + "reward_std": 0.2022808939218521, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999814629554749, + "sampling/importance_sampling_ratio/min": 3.5357668821234256e-05, + "sampling/sampling_logp_difference/max": 10.249995231628418, + "sampling/sampling_logp_difference/mean": 0.02051631174981594, + "step": 362 + }, + { + "clip_ratio/high_max": 2.147400391550036e-05, + "clip_ratio/high_mean": 6.434908300434472e-06, + "clip_ratio/low_mean": 3.521234066283796e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.164724816746457e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15164.0, + "completions/mean_length": 7661.8203125, + "completions/mean_terminated_length": 7002.16015625, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "entropy": 0.8322782590985298, + "epoch": 0.33394664213431463, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0019530428107827902, + "learning_rate": 1e-05, + "loss": 0.0729, + "num_tokens": 317191878.0, + "reward": 0.4609375, + "reward_std": 0.21382391452789307, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 8.546619210392237e-05, + "sampling/sampling_logp_difference/max": 9.367389678955078, + "sampling/sampling_logp_difference/mean": 0.019894573837518692, + "step": 363 + }, + { + "clip_ratio/high_max": 1.9436202364886412e-05, + "clip_ratio/high_mean": 6.089704697842535e-06, + "clip_ratio/low_mean": 4.2698405422925134e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.878810955233348e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15934.0, + "completions/mean_length": 7024.859375, + "completions/mean_terminated_length": 6800.240234375, + "completions/min_length": 693.0, + "completions/min_terminated_length": 693.0, + "entropy": 0.794853538274765, + "epoch": 0.33486660533578655, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0031784537713974714, + "learning_rate": 1e-05, + "loss": 0.0391, + "num_tokens": 318109004.0, + "reward": 0.4921875, + "reward_std": 0.31800347566604614, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999352693557739, + "sampling/importance_sampling_ratio/min": 0.0002962362195830792, + "sampling/sampling_logp_difference/max": 8.124353408813477, + "sampling/sampling_logp_difference/mean": 0.018519200384616852, + "step": 364 + }, + { + "clip_ratio/high_max": 4.127455667912727e-06, + "clip_ratio/high_mean": 1.0318639169781818e-06, + "clip_ratio/low_mean": 4.342453667049995e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.445640047379129e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 7282.1796875, + "completions/mean_terminated_length": 6912.1865234375, + "completions/min_length": 870.0, + "completions/min_terminated_length": 870.0, + "entropy": 0.904067650437355, + "epoch": 0.3357865685372585, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005080109462141991, + "learning_rate": 1e-05, + "loss": 0.041, + "num_tokens": 319059075.0, + "reward": 0.4140625, + "reward_std": 0.26539456844329834, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000062108039856, + "sampling/importance_sampling_ratio/min": 0.1194523349404335, + "sampling/sampling_logp_difference/max": 6.136754989624023, + "sampling/sampling_logp_difference/mean": 0.019978653639554977, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.608940076243016e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.608940076243016e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15625.0, + "completions/mean_length": 7131.5234375, + "completions/mean_terminated_length": 6596.255859375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.8849587142467499, + "epoch": 0.33670653173873044, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022667953744530678, + "learning_rate": 1e-05, + "loss": 0.0699, + "num_tokens": 319990046.0, + "reward": 0.46875, + "reward_std": 0.30221715569496155, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999264478683472, + "sampling/importance_sampling_ratio/min": 0.0370909757912159, + "sampling/sampling_logp_difference/max": 3.294381618499756, + "sampling/sampling_logp_difference/mean": 0.02037571743130684, + "step": 366 + }, + { + "clip_ratio/high_max": 1.5356635913121863e-05, + "clip_ratio/high_mean": 3.839158978280466e-06, + "clip_ratio/low_mean": 3.4950805911648786e-05, + "clip_ratio/low_min": 4.876336333836662e-06, + "clip_ratio/region_mean": 3.8789965287833184e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16205.0, + "completions/mean_length": 6655.4453125, + "completions/mean_terminated_length": 6578.84228515625, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "entropy": 0.7417122721672058, + "epoch": 0.3376264949402024, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00216497085057199, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 320860135.0, + "reward": 0.5625, + "reward_std": 0.3369230031967163, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999319314956665, + "sampling/importance_sampling_ratio/min": 0.0005190494703128934, + "sampling/sampling_logp_difference/max": 7.563511371612549, + "sampling/sampling_logp_difference/mean": 0.01771342009305954, + "step": 367 + }, + { + "clip_ratio/high_max": 1.7605634639039636e-05, + "clip_ratio/high_mean": 5.297029474604642e-06, + "clip_ratio/low_mean": 5.688933060810086e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.218636053745286e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15849.0, + "completions/mean_length": 7077.1640625, + "completions/mean_terminated_length": 6619.45068359375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.8749325424432755, + "epoch": 0.33854645814167433, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0028338562697172165, + "learning_rate": 1e-05, + "loss": 0.0643, + "num_tokens": 321783852.0, + "reward": 0.3828125, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998220205307007, + "sampling/importance_sampling_ratio/min": 7.83290306571871e-06, + "sampling/sampling_logp_difference/max": 11.757177352905273, + "sampling/sampling_logp_difference/mean": 0.020299233496189117, + "step": 368 + }, + { + "clip_ratio/high_max": 7.301828190975357e-06, + "clip_ratio/high_mean": 1.8254570477438392e-06, + "clip_ratio/low_mean": 5.158197632226802e-05, + "clip_ratio/low_min": 3.735804057214409e-06, + "clip_ratio/region_mean": 5.340743223314348e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15329.0, + "completions/mean_length": 6034.296875, + "completions/mean_terminated_length": 5525.294921875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.80014718323946, + "epoch": 0.33946642134314625, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022897711023688316, + "learning_rate": 1e-05, + "loss": 0.0275, + "num_tokens": 322572882.0, + "reward": 0.40625, + "reward_std": 0.2756394147872925, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999347925186157, + "sampling/importance_sampling_ratio/min": 0.0004105660773348063, + "sampling/sampling_logp_difference/max": 7.7979736328125, + "sampling/sampling_logp_difference/mean": 0.01858348958194256, + "step": 369 + }, + { + "clip_ratio/high_max": 9.364057859784225e-06, + "clip_ratio/high_mean": 3.351393047523743e-06, + "clip_ratio/low_mean": 4.186752630630508e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5218919240141986e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15948.0, + "completions/mean_length": 8172.109375, + "completions/mean_terminated_length": 7838.29248046875, + "completions/min_length": 733.0, + "completions/min_terminated_length": 733.0, + "entropy": 0.8732693120837212, + "epoch": 0.3403863845446182, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003263789461925626, + "learning_rate": 1e-05, + "loss": 0.0356, + "num_tokens": 323640904.0, + "reward": 0.2890625, + "reward_std": 0.3237774670124054, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999354481697083, + "sampling/importance_sampling_ratio/min": 9.27252222027164e-06, + "sampling/sampling_logp_difference/max": 11.588455200195312, + "sampling/sampling_logp_difference/mean": 0.0208889190107584, + "step": 370 + }, + { + "clip_ratio/high_max": 2.0998899799451465e-05, + "clip_ratio/high_mean": 6.692962131182867e-06, + "clip_ratio/low_mean": 4.261424010110204e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.930720297124935e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16228.0, + "completions/mean_length": 7699.203125, + "completions/mean_terminated_length": 7419.04833984375, + "completions/min_length": 1225.0, + "completions/min_terminated_length": 1225.0, + "entropy": 0.8296505436301231, + "epoch": 0.34130634774609014, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0042716520838439465, + "learning_rate": 1e-05, + "loss": 0.0937, + "num_tokens": 324643858.0, + "reward": 0.4921875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999874234199524, + "sampling/importance_sampling_ratio/min": 0.00022192654432728887, + "sampling/sampling_logp_difference/max": 8.413164138793945, + "sampling/sampling_logp_difference/mean": 0.018926654011011124, + "step": 371 + }, + { + "clip_ratio/high_max": 7.061349151626928e-06, + "clip_ratio/high_mean": 1.765337287906732e-06, + "clip_ratio/low_mean": 4.5005243464402156e-05, + "clip_ratio/low_min": 3.861838649754645e-06, + "clip_ratio/region_mean": 4.6770580411248375e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16364.0, + "completions/max_terminated_length": 16364.0, + "completions/mean_length": 7450.1640625, + "completions/mean_terminated_length": 7450.1640625, + "completions/min_length": 910.0, + "completions/min_terminated_length": 910.0, + "entropy": 1.0400195196270943, + "epoch": 0.3422263109475621, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033558050636202097, + "learning_rate": 1e-05, + "loss": 0.0424, + "num_tokens": 325617687.0, + "reward": 0.2578125, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999459385871887, + "sampling/importance_sampling_ratio/min": 0.039920732378959656, + "sampling/sampling_logp_difference/max": 3.2208595275878906, + "sampling/sampling_logp_difference/mean": 0.02249298244714737, + "step": 372 + }, + { + "clip_ratio/high_max": 1.3147802746971138e-05, + "clip_ratio/high_mean": 3.2869506867427845e-06, + "clip_ratio/low_mean": 2.4451034505545977e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7737984851228248e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15342.0, + "completions/mean_length": 6799.0703125, + "completions/mean_terminated_length": 6723.5986328125, + "completions/min_length": 1708.0, + "completions/min_terminated_length": 1708.0, + "entropy": 0.9737623482942581, + "epoch": 0.34314627414903404, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005797459278255701, + "learning_rate": 1e-05, + "loss": 0.0476, + "num_tokens": 326508384.0, + "reward": 0.3125, + "reward_std": 0.21542152762413025, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999321699142456, + "sampling/importance_sampling_ratio/min": 7.535634836131067e-07, + "sampling/sampling_logp_difference/max": 14.0984525680542, + "sampling/sampling_logp_difference/mean": 0.021543748676776886, + "step": 373 + }, + { + "clip_ratio/high_max": 3.3594023989280686e-06, + "clip_ratio/high_mean": 8.398505997320171e-07, + "clip_ratio/low_mean": 2.3457610382138228e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4297460981870245e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16102.0, + "completions/mean_length": 7034.3671875, + "completions/mean_terminated_length": 6654.30078125, + "completions/min_length": 737.0, + "completions/min_terminated_length": 737.0, + "entropy": 0.8749603256583214, + "epoch": 0.34406623735050595, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002258980879560113, + "learning_rate": 1e-05, + "loss": 0.0389, + "num_tokens": 327426407.0, + "reward": 0.4609375, + "reward_std": 0.19674429297447205, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999661445617676, + "sampling/importance_sampling_ratio/min": 0.008719252422451973, + "sampling/sampling_logp_difference/max": 4.742221832275391, + "sampling/sampling_logp_difference/mean": 0.01997346058487892, + "step": 374 + }, + { + "clip_ratio/high_max": 2.823375348270929e-05, + "clip_ratio/high_mean": 7.058438370677322e-06, + "clip_ratio/low_mean": 4.9395109726901865e-05, + "clip_ratio/low_min": 1.636556044104509e-05, + "clip_ratio/region_mean": 5.6453548268109444e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15240.0, + "completions/mean_length": 6623.078125, + "completions/mean_terminated_length": 6388.81640625, + "completions/min_length": 569.0, + "completions/min_terminated_length": 569.0, + "entropy": 0.858784057199955, + "epoch": 0.34498620055197793, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002420129720121622, + "learning_rate": 1e-05, + "loss": 0.076, + "num_tokens": 328292985.0, + "reward": 0.4140625, + "reward_std": 0.3077537417411804, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998596906661987, + "sampling/importance_sampling_ratio/min": 0.00014900295354891568, + "sampling/sampling_logp_difference/max": 8.811544418334961, + "sampling/sampling_logp_difference/mean": 0.019645996391773224, + "step": 375 + }, + { + "clip_ratio/high_max": 1.8078507309837732e-05, + "clip_ratio/high_mean": 6.468551191574079e-06, + "clip_ratio/low_mean": 4.051302585139638e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.698157727034413e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15229.0, + "completions/mean_length": 5902.4765625, + "completions/mean_terminated_length": 5564.36279296875, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + "entropy": 0.904740035533905, + "epoch": 0.34590616375344985, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004107976797968149, + "learning_rate": 1e-05, + "loss": 0.0824, + "num_tokens": 329067006.0, + "reward": 0.5546875, + "reward_std": 0.3945493996143341, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999526143074036, + "sampling/importance_sampling_ratio/min": 1.1485875802463852e-05, + "sampling/sampling_logp_difference/max": 11.37439250946045, + "sampling/sampling_logp_difference/mean": 0.019582755863666534, + "step": 376 + }, + { + "clip_ratio/high_max": 2.553658168835682e-05, + "clip_ratio/high_mean": 7.276365181496658e-06, + "clip_ratio/low_mean": 1.7552573126522475e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.482893796695862e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14514.0, + "completions/mean_length": 6425.6015625, + "completions/mean_terminated_length": 6267.5322265625, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "entropy": 0.964553713798523, + "epoch": 0.3468261269549218, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003208522219210863, + "learning_rate": 1e-05, + "loss": 0.0164, + "num_tokens": 329910691.0, + "reward": 0.359375, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999419450759888, + "sampling/importance_sampling_ratio/min": 0.00137569778598845, + "sampling/sampling_logp_difference/max": 6.588794231414795, + "sampling/sampling_logp_difference/mean": 0.021154657006263733, + "step": 377 + }, + { + "clip_ratio/high_max": 6.8712420215888415e-06, + "clip_ratio/high_mean": 1.7178105053972104e-06, + "clip_ratio/low_mean": 4.0991827404468495e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2709637853022286e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15797.0, + "completions/mean_length": 8006.4453125, + "completions/mean_terminated_length": 7594.43408203125, + "completions/min_length": 1235.0, + "completions/min_terminated_length": 1235.0, + "entropy": 0.8980336412787437, + "epoch": 0.34774609015639374, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002898421371355653, + "learning_rate": 1e-05, + "loss": 0.0815, + "num_tokens": 330956332.0, + "reward": 0.4296875, + "reward_std": 0.20175684988498688, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998818635940552, + "sampling/importance_sampling_ratio/min": 9.378339746035635e-05, + "sampling/sampling_logp_difference/max": 9.27452278137207, + "sampling/sampling_logp_difference/mean": 0.021021340042352676, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.2689344689297286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2689344689297286e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15484.0, + "completions/max_terminated_length": 15484.0, + "completions/mean_length": 7068.828125, + "completions/mean_terminated_length": 7068.828125, + "completions/min_length": 791.0, + "completions/min_terminated_length": 791.0, + "entropy": 0.9865007549524307, + "epoch": 0.3486660533578657, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0037063576746731997, + "learning_rate": 1e-05, + "loss": 0.0313, + "num_tokens": 331880918.0, + "reward": 0.3203125, + "reward_std": 0.17859892547130585, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999452829360962, + "sampling/importance_sampling_ratio/min": 0.0001819290773710236, + "sampling/sampling_logp_difference/max": 8.611893653869629, + "sampling/sampling_logp_difference/mean": 0.02072504535317421, + "step": 379 + }, + { + "clip_ratio/high_max": 5.845633268108941e-06, + "clip_ratio/high_mean": 1.4614083170272352e-06, + "clip_ratio/low_mean": 3.207486906831036e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.353627721480734e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16281.0, + "completions/mean_length": 7379.390625, + "completions/mean_terminated_length": 7236.4609375, + "completions/min_length": 738.0, + "completions/min_terminated_length": 738.0, + "entropy": 0.8977236375212669, + "epoch": 0.34958601655933763, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001972826896235347, + "learning_rate": 1e-05, + "loss": 0.0228, + "num_tokens": 332849112.0, + "reward": 0.4140625, + "reward_std": 0.28247418999671936, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999925971031189, + "sampling/importance_sampling_ratio/min": 2.820451663865242e-05, + "sampling/sampling_logp_difference/max": 10.476028442382812, + "sampling/sampling_logp_difference/mean": 0.019411223009228706, + "step": 380 + }, + { + "clip_ratio/high_max": 4.875385002378607e-06, + "clip_ratio/high_mean": 1.2188462505946518e-06, + "clip_ratio/low_mean": 2.3530714997832547e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.47495612484272e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15517.0, + "completions/mean_length": 6867.9609375, + "completions/mean_terminated_length": 6793.03125, + "completions/min_length": 760.0, + "completions/min_terminated_length": 760.0, + "entropy": 0.9244343340396881, + "epoch": 0.35050597976080955, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.006926023401319981, + "learning_rate": 1e-05, + "loss": 0.0495, + "num_tokens": 333746179.0, + "reward": 0.4140625, + "reward_std": 0.1433562934398651, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999299645423889, + "sampling/importance_sampling_ratio/min": 0.0003875594411510974, + "sampling/sampling_logp_difference/max": 7.8556413650512695, + "sampling/sampling_logp_difference/mean": 0.020311862230300903, + "step": 381 + }, + { + "clip_ratio/high_max": 1.5651628245905158e-05, + "clip_ratio/high_mean": 4.836261211949022e-06, + "clip_ratio/low_mean": 5.268017821435933e-05, + "clip_ratio/low_min": 3.950945028918795e-06, + "clip_ratio/region_mean": 5.751643902840442e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15900.0, + "completions/mean_length": 7525.375, + "completions/mean_terminated_length": 6855.3955078125, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "entropy": 0.9207312315702438, + "epoch": 0.3514259429622815, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0047226278111338615, + "learning_rate": 1e-05, + "loss": 0.0808, + "num_tokens": 334731027.0, + "reward": 0.3359375, + "reward_std": 0.3353874683380127, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999615550041199, + "sampling/importance_sampling_ratio/min": 0.00029753465787507594, + "sampling/sampling_logp_difference/max": 8.119979858398438, + "sampling/sampling_logp_difference/mean": 0.021496692672371864, + "step": 382 + }, + { + "clip_ratio/high_max": 3.815379886873416e-05, + "clip_ratio/high_mean": 9.53844971718354e-06, + "clip_ratio/low_mean": 4.519663821156428e-05, + "clip_ratio/low_min": 2.775434040813707e-06, + "clip_ratio/region_mean": 5.473508826980833e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16251.0, + "completions/mean_length": 6841.0625, + "completions/mean_terminated_length": 6453.13818359375, + "completions/min_length": 689.0, + "completions/min_terminated_length": 689.0, + "entropy": 0.8979457840323448, + "epoch": 0.35234590616375344, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004971448332071304, + "learning_rate": 1e-05, + "loss": 0.0126, + "num_tokens": 335631243.0, + "reward": 0.390625, + "reward_std": 0.2596156895160675, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999934196472168, + "sampling/importance_sampling_ratio/min": 9.655764188210014e-06, + "sampling/sampling_logp_difference/max": 11.547955513000488, + "sampling/sampling_logp_difference/mean": 0.020256079733371735, + "step": 383 + }, + { + "clip_ratio/high_max": 4.162365712545579e-06, + "clip_ratio/high_mean": 1.0405914281363948e-06, + "clip_ratio/low_mean": 3.1563491688757495e-05, + "clip_ratio/low_min": 3.1228139505401487e-06, + "clip_ratio/region_mean": 3.260408311689389e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15060.0, + "completions/mean_length": 6919.8046875, + "completions/mean_terminated_length": 6454.35205078125, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "entropy": 0.9241961911320686, + "epoch": 0.3532658693652254, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038604787550866604, + "learning_rate": 1e-05, + "loss": 0.0262, + "num_tokens": 336537162.0, + "reward": 0.375, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998080730438232, + "sampling/importance_sampling_ratio/min": 0.0009118975722230971, + "sampling/sampling_logp_difference/max": 6.999982833862305, + "sampling/sampling_logp_difference/mean": 0.02030865103006363, + "step": 384 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 336537162, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}