diff --git "a/dapo_milora_plus_20251201_131939/checkpoint-128/trainer_state.json" "b/dapo_milora_plus_20251201_131939/checkpoint-128/trainer_state.json" new file mode 100644--- /dev/null +++ "b/dapo_milora_plus_20251201_131939/checkpoint-128/trainer_state.json" @@ -0,0 +1,4002 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.11775528978840846, + "eval_steps": 500, + "global_step": 128, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15689.0, + "completions/max_terminated_length": 15689.0, + "completions/mean_length": 6039.171875, + "completions/mean_terminated_length": 6039.171875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 1.19118632376194, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004114801995456219, + "learning_rate": 1e-05, + "loss": 0.0591, + "num_tokens": 792270.0, + "reward": 0.25, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940395355225, + "sampling/importance_sampling_ratio/min": 0.0002457273658365011, + "sampling/sampling_logp_difference/max": 8.311287879943848, + "sampling/sampling_logp_difference/mean": 0.021642697975039482, + "step": 1 + }, + { + "clip_ratio/high_max": 4.125957275391556e-06, + "clip_ratio/high_mean": 1.031489318847889e-06, + "clip_ratio/low_mean": 5.146006606082665e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.249155537967454e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15112.0, + "completions/max_terminated_length": 15112.0, + "completions/mean_length": 4978.265625, + "completions/mean_terminated_length": 4978.265625, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "entropy": 0.9862165078520775, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004017667844891548, + "learning_rate": 1e-05, + "loss": 0.0407, + "num_tokens": 1452816.0, + "reward": 0.3203125, + "reward_std": 0.30798622965812683, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999986290931702, + "sampling/importance_sampling_ratio/min": 0.00840891432017088, + "sampling/sampling_logp_difference/max": 4.778462886810303, + "sampling/sampling_logp_difference/mean": 0.01879144087433815, + "step": 2 + }, + { + "clip_ratio/high_max": 5.936832167208195e-06, + "clip_ratio/high_mean": 1.4842080418020487e-06, + "clip_ratio/low_mean": 5.7621912446848e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.910612048865005e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16118.0, + "completions/mean_length": 6664.3046875, + "completions/mean_terminated_length": 6587.771484375, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "entropy": 0.9934953600168228, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002359058242291212, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 2324415.0, + "reward": 0.2890625, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000684261322021, + "sampling/importance_sampling_ratio/min": 0.0018158734310418367, + "sampling/sampling_logp_difference/max": 6.311188697814941, + "sampling/sampling_logp_difference/mean": 0.02111843228340149, + "step": 3 + }, + { + "clip_ratio/high_max": 1.0255745564791141e-05, + "clip_ratio/high_mean": 2.5639363911977853e-06, + "clip_ratio/low_mean": 2.648322629283939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9047162797724013e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15915.0, + "completions/mean_length": 5801.203125, + "completions/mean_terminated_length": 5717.8740234375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 1.0870511680841446, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002563449554145336, + "learning_rate": 1e-05, + "loss": 0.0096, + "num_tokens": 3091369.0, + "reward": 0.2734375, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000073790550232, + "sampling/importance_sampling_ratio/min": 0.018811559304594994, + "sampling/sampling_logp_difference/max": 3.9732837677001953, + "sampling/sampling_logp_difference/mean": 0.021363306790590286, + "step": 4 + }, + { + "clip_ratio/high_max": 9.68160156844533e-06, + "clip_ratio/high_mean": 2.4204003921113326e-06, + "clip_ratio/low_mean": 4.577123684157414e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8191637006311794e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15767.0, + "completions/max_terminated_length": 15767.0, + "completions/mean_length": 5696.4140625, + "completions/mean_terminated_length": 5696.4140625, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "entropy": 1.1476548686623573, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025258746463805437, + "learning_rate": 1e-05, + "loss": -0.0344, + "num_tokens": 3841078.0, + "reward": 0.3046875, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999910295009613, + "sampling/importance_sampling_ratio/min": 1.4871986877551535e-06, + "sampling/sampling_logp_difference/max": 13.41861629486084, + "sampling/sampling_logp_difference/mean": 0.020693503320217133, + "step": 5 + }, + { + "clip_ratio/high_max": 3.660332322397153e-05, + "clip_ratio/high_mean": 1.029715701861278e-05, + "clip_ratio/low_mean": 3.895585894042597e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.925301630009926e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14489.0, + "completions/mean_length": 5280.890625, + "completions/mean_terminated_length": 5104.65087890625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 0.8976912423968315, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0050104837864637375, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 4535640.0, + "reward": 0.359375, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998958110809326, + "sampling/importance_sampling_ratio/min": 0.0007187551236711442, + "sampling/sampling_logp_difference/max": 7.237989902496338, + "sampling/sampling_logp_difference/mean": 0.018597707152366638, + "step": 6 + }, + { + "clip_ratio/high_max": 4.484465989662567e-06, + "clip_ratio/high_mean": 1.1211164974156418e-06, + "clip_ratio/low_mean": 2.823482634539687e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9355942729125672e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16166.0, + "completions/mean_length": 6874.9453125, + "completions/mean_terminated_length": 6568.20166015625, + "completions/min_length": 902.0, + "completions/min_terminated_length": 902.0, + "entropy": 1.0286128222942352, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018693821039050817, + "learning_rate": 1e-05, + "loss": 0.0003, + "num_tokens": 5434801.0, + "reward": 0.203125, + "reward_std": 0.21778544783592224, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999491572380066, + "sampling/importance_sampling_ratio/min": 5.279039783090411e-07, + "sampling/sampling_logp_difference/max": 14.454351425170898, + "sampling/sampling_logp_difference/mean": 0.020383886992931366, + "step": 7 + }, + { + "clip_ratio/high_max": 2.5703585606606794e-05, + "clip_ratio/high_mean": 7.537758676789963e-06, + "clip_ratio/low_mean": 5.802649661745818e-05, + "clip_ratio/low_min": 6.0229353948670905e-06, + "clip_ratio/region_mean": 6.556425523740472e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15910.0, + "completions/mean_length": 5270.5234375, + "completions/mean_terminated_length": 5094.119140625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 1.0461085885763168, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005809026304632425, + "learning_rate": 1e-05, + "loss": 0.0602, + "num_tokens": 6128708.0, + "reward": 0.3359375, + "reward_std": 0.37320882081985474, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 8.339863597939257e-06, + "sampling/sampling_logp_difference/max": 11.694463729858398, + "sampling/sampling_logp_difference/mean": 0.02038305625319481, + "step": 8 + }, + { + "clip_ratio/high_max": 3.965832502217381e-05, + "clip_ratio/high_mean": 1.2004183304270555e-05, + "clip_ratio/low_mean": 2.037043998370791e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.237462271954428e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14850.0, + "completions/mean_length": 4524.6796875, + "completions/mean_terminated_length": 4431.29931640625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.8275458142161369, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002717240946367383, + "learning_rate": 1e-05, + "loss": 0.1005, + "num_tokens": 6726587.0, + "reward": 0.4453125, + "reward_std": 0.32325831055641174, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.0002034705103142187, + "sampling/sampling_logp_difference/max": 8.49998950958252, + "sampling/sampling_logp_difference/mean": 0.017633724957704544, + "step": 9 + }, + { + "clip_ratio/high_max": 7.08802053850377e-06, + "clip_ratio/high_mean": 1.7720051346259424e-06, + "clip_ratio/low_mean": 3.394487077912345e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.571687602743623e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15243.0, + "completions/mean_length": 5129.171875, + "completions/mean_terminated_length": 4950.52392578125, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "entropy": 0.7103187441825867, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005626584868878126, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 7400273.0, + "reward": 0.6796875, + "reward_std": 0.379814088344574, + "rewards/accuracy_reward/mean": 0.6796875, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000073909759521, + "sampling/importance_sampling_ratio/min": 0.002478840760886669, + "sampling/sampling_logp_difference/max": 5.999964237213135, + "sampling/sampling_logp_difference/mean": 0.016138140112161636, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.9378599517949624e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9378599517949624e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15482.0, + "completions/max_terminated_length": 15482.0, + "completions/mean_length": 4741.296875, + "completions/mean_terminated_length": 4741.296875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.903806746006012, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0022279289551079273, + "learning_rate": 1e-05, + "loss": 0.0285, + "num_tokens": 8026991.0, + "reward": 0.3203125, + "reward_std": 0.1701665222644806, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + "sampling/importance_sampling_ratio/min": 0.00012343087291810662, + "sampling/sampling_logp_difference/max": 8.999829292297363, + "sampling/sampling_logp_difference/mean": 0.01844138652086258, + "step": 11 + }, + { + "clip_ratio/high_max": 4.8331594371120445e-06, + "clip_ratio/high_mean": 1.93793562175415e-06, + "clip_ratio/low_mean": 4.368338659332949e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.562132153296261e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15569.0, + "completions/mean_length": 5855.5546875, + "completions/mean_terminated_length": 5602.8720703125, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 1.047883652150631, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0038395742885768414, + "learning_rate": 1e-05, + "loss": 0.1298, + "num_tokens": 8797134.0, + "reward": 0.421875, + "reward_std": 0.3503503203392029, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999397397041321, + "sampling/importance_sampling_ratio/min": 0.0007607790757901967, + "sampling/sampling_logp_difference/max": 7.1811676025390625, + "sampling/sampling_logp_difference/mean": 0.02074582129716873, + "step": 12 + }, + { + "clip_ratio/high_max": 3.0723854251846205e-06, + "clip_ratio/high_mean": 7.680963562961551e-07, + "clip_ratio/low_mean": 1.2482652891776524e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.325074924807268e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15365.0, + "completions/mean_length": 6816.6953125, + "completions/mean_terminated_length": 6664.83349609375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 1.1763990670442581, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0011414350010454655, + "learning_rate": 1e-05, + "loss": 0.0197, + "num_tokens": 9691639.0, + "reward": 0.25, + "reward_std": 0.1354655921459198, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998582601547241, + "sampling/importance_sampling_ratio/min": 1.8868423268258994e-08, + "sampling/sampling_logp_difference/max": 17.785776138305664, + "sampling/sampling_logp_difference/mean": 0.021673155948519707, + "step": 13 + }, + { + "clip_ratio/high_max": 1.3825085034113727e-05, + "clip_ratio/high_mean": 3.4562712585284316e-06, + "clip_ratio/low_mean": 2.299899915669812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6455270244696294e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15459.0, + "completions/max_terminated_length": 15459.0, + "completions/mean_length": 5313.53125, + "completions/mean_terminated_length": 5313.53125, + "completions/min_length": 509.0, + "completions/min_terminated_length": 509.0, + "entropy": 1.0474217981100082, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004565369803458452, + "learning_rate": 1e-05, + "loss": 0.0163, + "num_tokens": 10391515.0, + "reward": 0.296875, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998889565467834, + "sampling/importance_sampling_ratio/min": 2.431661960144993e-05, + "sampling/sampling_logp_difference/max": 10.624350547790527, + "sampling/sampling_logp_difference/mean": 0.020862173289060593, + "step": 14 + }, + { + "clip_ratio/high_max": 1.1656098649837077e-05, + "clip_ratio/high_mean": 2.914024662459269e-06, + "clip_ratio/low_mean": 6.22073393969913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.512136405945057e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14724.0, + "completions/mean_length": 4732.578125, + "completions/mean_terminated_length": 4640.83447265625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 1.0815455242991447, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006024828180670738, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 11017781.0, + "reward": 0.25, + "reward_std": 0.2959064245223999, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999477863311768, + "sampling/importance_sampling_ratio/min": 0.00306904804892838, + "sampling/sampling_logp_difference/max": 5.786387920379639, + "sampling/sampling_logp_difference/mean": 0.020809629932045937, + "step": 15 + }, + { + "clip_ratio/high_max": 5.413130111264763e-06, + "clip_ratio/high_mean": 1.3532825278161908e-06, + "clip_ratio/low_mean": 2.816210690070875e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.951538942852494e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15673.0, + "completions/mean_length": 5931.4296875, + "completions/mean_terminated_length": 5849.1259765625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 1.0069087892770767, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036383175756782293, + "learning_rate": 1e-05, + "loss": -0.0001, + "num_tokens": 11794972.0, + "reward": 0.1875, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999954342842102, + "sampling/importance_sampling_ratio/min": 0.00028886934160254896, + "sampling/sampling_logp_difference/max": 8.1495361328125, + "sampling/sampling_logp_difference/mean": 0.019794823601841927, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.421858264118782e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.421858264118782e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14139.0, + "completions/mean_length": 5473.6171875, + "completions/mean_terminated_length": 5387.70849609375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 1.0765233263373375, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004312732256948948, + "learning_rate": 1e-05, + "loss": 0.0478, + "num_tokens": 12517443.0, + "reward": 0.2578125, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999938428401947, + "sampling/importance_sampling_ratio/min": 1.0231680391825648e-07, + "sampling/sampling_logp_difference/max": 16.095191955566406, + "sampling/sampling_logp_difference/mean": 0.020093362778425217, + "step": 17 + }, + { + "clip_ratio/high_max": 2.0872469121968606e-05, + "clip_ratio/high_mean": 5.218117280492152e-06, + "clip_ratio/low_mean": 4.733878370188904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.255690120975487e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 6617.7578125, + "completions/mean_terminated_length": 6137.45068359375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8550976514816284, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021260723005980253, + "learning_rate": 1e-05, + "loss": 0.1382, + "num_tokens": 13384420.0, + "reward": 0.3828125, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999901294708252, + "sampling/importance_sampling_ratio/min": 0.02929881028831005, + "sampling/sampling_logp_difference/max": 3.5302083492279053, + "sampling/sampling_logp_difference/mean": 0.01808803342282772, + "step": 18 + }, + { + "clip_ratio/high_max": 6.404673058568733e-06, + "clip_ratio/high_mean": 1.6011682646421832e-06, + "clip_ratio/low_mean": 3.2195434073400975e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.379660131486162e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14382.0, + "completions/mean_length": 5285.7578125, + "completions/mean_terminated_length": 5109.595703125, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "entropy": 0.8321448192000389, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003563448553904891, + "learning_rate": 1e-05, + "loss": 0.0001, + "num_tokens": 14081197.0, + "reward": 0.375, + "reward_std": 0.31116873025894165, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998770952224731, + "sampling/importance_sampling_ratio/min": 0.000519682711455971, + "sampling/sampling_logp_difference/max": 7.562292098999023, + "sampling/sampling_logp_difference/mean": 0.017500173300504684, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.128390534991922e-05, + "clip_ratio/low_min": 1.2459845038392814e-05, + "clip_ratio/region_mean": 5.128390534991922e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13991.0, + "completions/max_terminated_length": 13991.0, + "completions/mean_length": 4918.1953125, + "completions/mean_terminated_length": 4918.1953125, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "entropy": 0.9329824000597, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0048850164748728275, + "learning_rate": 1e-05, + "loss": 0.0265, + "num_tokens": 14727798.0, + "reward": 0.359375, + "reward_std": 0.37716054916381836, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999402165412903, + "sampling/importance_sampling_ratio/min": 0.00041761461761780083, + "sampling/sampling_logp_difference/max": 7.780951499938965, + "sampling/sampling_logp_difference/mean": 0.01855182647705078, + "step": 20 + }, + { + "clip_ratio/high_max": 1.0364761692471802e-05, + "clip_ratio/high_mean": 2.5911904231179506e-06, + "clip_ratio/low_mean": 3.091395433330035e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.350514430167095e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16202.0, + "completions/max_terminated_length": 16202.0, + "completions/mean_length": 5268.5234375, + "completions/mean_terminated_length": 5268.5234375, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 1.1676538437604904, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030562332831323147, + "learning_rate": 1e-05, + "loss": 0.0443, + "num_tokens": 15421937.0, + "reward": 0.28125, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000249147415161, + "sampling/importance_sampling_ratio/min": 0.0016021198825910687, + "sampling/sampling_logp_difference/max": 6.436427593231201, + "sampling/sampling_logp_difference/mean": 0.021109789609909058, + "step": 21 + }, + { + "clip_ratio/high_max": 1.6653621514706174e-05, + "clip_ratio/high_mean": 4.1634053786765435e-06, + "clip_ratio/low_mean": 3.064284169340681e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.480624718577019e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15940.0, + "completions/mean_length": 5361.0703125, + "completions/mean_terminated_length": 5186.103515625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.9569757729768753, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003777366131544113, + "learning_rate": 1e-05, + "loss": 0.0058, + "num_tokens": 16128698.0, + "reward": 0.3359375, + "reward_std": 0.26409637928009033, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999063611030579, + "sampling/importance_sampling_ratio/min": 0.004354433622211218, + "sampling/sampling_logp_difference/max": 5.43656063079834, + "sampling/sampling_logp_difference/mean": 0.01940997503697872, + "step": 22 + }, + { + "clip_ratio/high_max": 3.227977140340954e-05, + "clip_ratio/high_mean": 9.227950727108691e-06, + "clip_ratio/low_mean": 4.881033578385541e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.8038286169903586e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15623.0, + "completions/mean_length": 7428.3125, + "completions/mean_terminated_length": 6910.21435546875, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 1.0387683138251305, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005749945063143969, + "learning_rate": 1e-05, + "loss": 0.0466, + "num_tokens": 17101202.0, + "reward": 0.2734375, + "reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999775886535645, + "sampling/importance_sampling_ratio/min": 0.010671229101717472, + "sampling/sampling_logp_difference/max": 4.540204048156738, + "sampling/sampling_logp_difference/mean": 0.021208524703979492, + "step": 23 + }, + { + "clip_ratio/high_max": 2.544114977354184e-06, + "clip_ratio/high_mean": 6.36028744338546e-07, + "clip_ratio/low_mean": 4.543399086287536e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6070018697719206e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15659.0, + "completions/mean_length": 5462.203125, + "completions/mean_terminated_length": 5288.841796875, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 1.088257022202015, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005364824552088976, + "learning_rate": 1e-05, + "loss": -0.0106, + "num_tokens": 17820796.0, + "reward": 0.1953125, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999232292175293, + "sampling/importance_sampling_ratio/min": 0.0013558369828388095, + "sampling/sampling_logp_difference/max": 6.603336334228516, + "sampling/sampling_logp_difference/mean": 0.020104583352804184, + "step": 24 + }, + { + "clip_ratio/high_max": 1.132360557676293e-05, + "clip_ratio/high_mean": 2.8309013941907324e-06, + "clip_ratio/low_mean": 3.686837260374887e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.96992739979396e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16211.0, + "completions/mean_length": 5423.234375, + "completions/mean_terminated_length": 5249.25439453125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 0.9123491793870926, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002392752794548869, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 18538546.0, + "reward": 0.3125, + "reward_std": 0.2590789198875427, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999918341636658, + "sampling/importance_sampling_ratio/min": 1.657394705034676e-06, + "sampling/sampling_logp_difference/max": 13.310263633728027, + "sampling/sampling_logp_difference/mean": 0.02011517994105816, + "step": 25 + }, + { + "clip_ratio/high_max": 2.127026391463005e-05, + "clip_ratio/high_mean": 6.648429234701325e-06, + "clip_ratio/low_mean": 1.4927492088645522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.157592166440736e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13978.0, + "completions/mean_length": 5574.1640625, + "completions/mean_terminated_length": 5489.04736328125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 1.0090710371732712, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0036540210712701082, + "learning_rate": 1e-05, + "loss": 0.0036, + "num_tokens": 19270439.0, + "reward": 0.3515625, + "reward_std": 0.23646268248558044, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 0.00021918962011113763, + "sampling/sampling_logp_difference/max": 8.425573348999023, + "sampling/sampling_logp_difference/mean": 0.02006707340478897, + "step": 26 + }, + { + "clip_ratio/high_max": 1.1303152405162109e-05, + "clip_ratio/high_mean": 2.8257881012905273e-06, + "clip_ratio/low_mean": 2.827990363130084e-05, + "clip_ratio/low_min": 5.86744272368378e-06, + "clip_ratio/region_mean": 3.1105691391530854e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15061.0, + "completions/mean_length": 6204.75, + "completions/mean_terminated_length": 6124.5986328125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.957111045718193, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006005869247019291, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 20083655.0, + "reward": 0.3046875, + "reward_std": 0.31616854667663574, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549388885498, + "sampling/importance_sampling_ratio/min": 1.1864853988186042e-08, + "sampling/sampling_logp_difference/max": 18.249685287475586, + "sampling/sampling_logp_difference/mean": 0.0189923457801342, + "step": 27 + }, + { + "clip_ratio/high_max": 8.289213610623847e-06, + "clip_ratio/high_mean": 2.0723034026559617e-06, + "clip_ratio/low_mean": 3.4569659419503296e-05, + "clip_ratio/low_min": 3.6480373637459707e-06, + "clip_ratio/region_mean": 3.664196310637635e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15274.0, + "completions/mean_length": 5659.5703125, + "completions/mean_terminated_length": 5489.341796875, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.9482033550739288, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004129618871957064, + "learning_rate": 1e-05, + "loss": -0.0023, + "num_tokens": 20829064.0, + "reward": 0.3515625, + "reward_std": 0.3745690584182739, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999293088912964, + "sampling/importance_sampling_ratio/min": 4.007936149719171e-05, + "sampling/sampling_logp_difference/max": 10.124649047851562, + "sampling/sampling_logp_difference/mean": 0.019232336431741714, + "step": 28 + }, + { + "clip_ratio/high_max": 1.3534072877519066e-05, + "clip_ratio/high_mean": 3.3835182193797664e-06, + "clip_ratio/low_mean": 2.9090757720950933e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.247427605401754e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14558.0, + "completions/mean_length": 5122.9609375, + "completions/mean_terminated_length": 5034.29150390625, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 1.020588956773281, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004814058542251587, + "learning_rate": 1e-05, + "loss": 0.0994, + "num_tokens": 21505483.0, + "reward": 0.3359375, + "reward_std": 0.31930169463157654, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 0.0003798597026616335, + "sampling/sampling_logp_difference/max": 7.87570858001709, + "sampling/sampling_logp_difference/mean": 0.019156761467456818, + "step": 29 + }, + { + "clip_ratio/high_max": 2.0903964468743652e-05, + "clip_ratio/high_mean": 5.225991117185913e-06, + "clip_ratio/low_mean": 4.13707307416189e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.659672185880481e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15926.0, + "completions/max_terminated_length": 15926.0, + "completions/mean_length": 4833.734375, + "completions/mean_terminated_length": 4833.734375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "entropy": 1.0276868790388107, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006946730427443981, + "learning_rate": 1e-05, + "loss": 0.0428, + "num_tokens": 22142657.0, + "reward": 0.421875, + "reward_std": 0.250127375125885, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998937845230103, + "sampling/importance_sampling_ratio/min": 0.000452048028819263, + "sampling/sampling_logp_difference/max": 7.701722145080566, + "sampling/sampling_logp_difference/mean": 0.019841451197862625, + "step": 30 + }, + { + "clip_ratio/high_max": 8.514986802765634e-06, + "clip_ratio/high_mean": 2.1287467006914085e-06, + "clip_ratio/low_mean": 3.9484380408794095e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.161312688211183e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15122.0, + "completions/mean_length": 6895.390625, + "completions/mean_terminated_length": 6589.30615234375, + "completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 1.1640124469995499, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0040768519975245, + "learning_rate": 1e-05, + "loss": 0.0397, + "num_tokens": 23045931.0, + "reward": 0.1484375, + "reward_std": 0.20175683498382568, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999603629112244, + "sampling/importance_sampling_ratio/min": 0.0006075318087823689, + "sampling/sampling_logp_difference/max": 7.406105995178223, + "sampling/sampling_logp_difference/mean": 0.02265278436243534, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.95245172057912e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.95245172057912e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 6504.0625, + "completions/mean_terminated_length": 6347.23828125, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "entropy": 1.1040372923016548, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004854958038777113, + "learning_rate": 1e-05, + "loss": 0.0853, + "num_tokens": 23899259.0, + "reward": 0.2265625, + "reward_std": 0.2619747221469879, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999864935874939, + "sampling/importance_sampling_ratio/min": 3.380438373667971e-09, + "sampling/sampling_logp_difference/max": 19.505260467529297, + "sampling/sampling_logp_difference/mean": 0.020535167306661606, + "step": 32 + }, + { + "clip_ratio/high_max": 1.8890462797571672e-05, + "clip_ratio/high_mean": 4.722615699392918e-06, + "clip_ratio/low_mean": 4.095688700544997e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.567950259115605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14787.0, + "completions/mean_length": 5553.2578125, + "completions/mean_terminated_length": 5467.9765625, + "completions/min_length": 634.0, + "completions/min_terminated_length": 634.0, + "entropy": 1.0357396975159645, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005473555997014046, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 24631956.0, + "reward": 0.4296875, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000377893447876, + "sampling/importance_sampling_ratio/min": 0.004898479674011469, + "sampling/sampling_logp_difference/max": 5.318830490112305, + "sampling/sampling_logp_difference/mean": 0.019490022212266922, + "step": 33 + }, + { + "clip_ratio/high_max": 8.08538152341498e-06, + "clip_ratio/high_mean": 2.021345380853745e-06, + "clip_ratio/low_mean": 2.4400278334724135e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6421623601891042e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15753.0, + "completions/max_terminated_length": 15753.0, + "completions/mean_length": 5357.46875, + "completions/mean_terminated_length": 5357.46875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0332984924316406, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003410332603380084, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 25336544.0, + "reward": 0.4453125, + "reward_std": 0.32613158226013184, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999037384986877, + "sampling/importance_sampling_ratio/min": 0.00010891074634855613, + "sampling/sampling_logp_difference/max": 9.124981880187988, + "sampling/sampling_logp_difference/mean": 0.01885366439819336, + "step": 34 + }, + { + "clip_ratio/high_max": 1.2948894436703995e-05, + "clip_ratio/high_mean": 3.2372236091759987e-06, + "clip_ratio/low_mean": 3.931040214411041e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.25476254122259e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16210.0, + "completions/mean_length": 7812.8984375, + "completions/mean_terminated_length": 7745.4091796875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 1.031004011631012, + "epoch": 0.03219871205151794, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003122704103589058, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 26355691.0, + "reward": 0.2890625, + "reward_std": 0.27905434370040894, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999194145202637, + "sampling/importance_sampling_ratio/min": 0.002222655341029167, + "sampling/sampling_logp_difference/max": 6.109052658081055, + "sampling/sampling_logp_difference/mean": 0.022181488573551178, + "step": 35 + }, + { + "clip_ratio/high_max": 1.3199577551858965e-05, + "clip_ratio/high_mean": 3.2998943879647413e-06, + "clip_ratio/low_mean": 3.742906312709238e-05, + "clip_ratio/low_min": 3.3127500955743017e-06, + "clip_ratio/region_mean": 4.072895751505712e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16160.0, + "completions/mean_length": 6402.6875, + "completions/mean_terminated_length": 5825.255859375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.96993837505579, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003292364301159978, + "learning_rate": 1e-05, + "loss": 0.0211, + "num_tokens": 27193267.0, + "reward": 0.375, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000267028808594, + "sampling/importance_sampling_ratio/min": 4.0287636693392415e-07, + "sampling/sampling_logp_difference/max": 14.72463607788086, + "sampling/sampling_logp_difference/mean": 0.019621271640062332, + "step": 36 + }, + { + "clip_ratio/high_max": 9.08137690203148e-06, + "clip_ratio/high_mean": 2.27034422550787e-06, + "clip_ratio/low_mean": 4.5394222524919314e-05, + "clip_ratio/low_min": 4.49300887339632e-06, + "clip_ratio/region_mean": 4.766456731886137e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + "completions/mean_length": 7525.40625, + "completions/mean_terminated_length": 7165.30078125, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.9819100275635719, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004192501772195101, + "learning_rate": 1e-05, + "loss": 0.0476, + "num_tokens": 28181183.0, + "reward": 0.3125, + "reward_std": 0.33797892928123474, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999439716339111, + "sampling/importance_sampling_ratio/min": 1.3309776477399282e-05, + "sampling/sampling_logp_difference/max": 11.227011680603027, + "sampling/sampling_logp_difference/mean": 0.019877666607499123, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.638440969349176e-05, + "clip_ratio/low_min": 6.698462129861582e-06, + "clip_ratio/region_mean": 2.638440969349176e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15572.0, + "completions/mean_length": 6930.8828125, + "completions/mean_terminated_length": 6625.943359375, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "entropy": 0.9183463454246521, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029556062072515488, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 29087384.0, + "reward": 0.3359375, + "reward_std": 0.2740417718887329, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999445676803589, + "sampling/importance_sampling_ratio/min": 4.6953626764434375e-08, + "sampling/sampling_logp_difference/max": 16.87410545349121, + "sampling/sampling_logp_difference/mean": 0.0197360310703516, + "step": 38 + }, + { + "clip_ratio/high_max": 2.4210238279920304e-05, + "clip_ratio/high_mean": 6.052559569980076e-06, + "clip_ratio/low_mean": 3.344960384765727e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9502163645011024e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13891.0, + "completions/mean_length": 5893.1796875, + "completions/mean_terminated_length": 5726.6591796875, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "entropy": 1.05657509714365, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0044409241527318954, + "learning_rate": 1e-05, + "loss": 0.0042, + "num_tokens": 29860767.0, + "reward": 0.2890625, + "reward_std": 0.3435155153274536, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999690055847168, + "sampling/importance_sampling_ratio/min": 2.243226049358782e-07, + "sampling/sampling_logp_difference/max": 15.3101806640625, + "sampling/sampling_logp_difference/mean": 0.02058839052915573, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.4493159887460934e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4493159887460934e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13471.0, + "completions/max_terminated_length": 13471.0, + "completions/mean_length": 5779.4765625, + "completions/mean_terminated_length": 5779.4765625, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.0302623957395554, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004552105907350779, + "learning_rate": 1e-05, + "loss": -0.0198, + "num_tokens": 30620388.0, + "reward": 0.3203125, + "reward_std": 0.3295513987541199, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999610185623169, + "sampling/importance_sampling_ratio/min": 0.011830558069050312, + "sampling/sampling_logp_difference/max": 4.437069416046143, + "sampling/sampling_logp_difference/mean": 0.020457806065678596, + "step": 40 + }, + { + "clip_ratio/high_max": 9.270246664527804e-06, + "clip_ratio/high_mean": 2.317561666131951e-06, + "clip_ratio/low_mean": 3.615360617459373e-05, + "clip_ratio/low_min": 4.283315774955554e-06, + "clip_ratio/region_mean": 3.8471167840725684e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13927.0, + "completions/max_terminated_length": 13927.0, + "completions/mean_length": 5429.1328125, + "completions/mean_terminated_length": 5429.1328125, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "entropy": 0.9245247691869736, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003300054930150509, + "learning_rate": 1e-05, + "loss": 0.1138, + "num_tokens": 31334221.0, + "reward": 0.4765625, + "reward_std": 0.23592591285705566, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999089241027832, + "sampling/importance_sampling_ratio/min": 0.00017977353127207607, + "sampling/sampling_logp_difference/max": 8.623812675476074, + "sampling/sampling_logp_difference/mean": 0.01882476732134819, + "step": 41 + }, + { + "clip_ratio/high_max": 1.5042513723528828e-05, + "clip_ratio/high_mean": 3.760628430882207e-06, + "clip_ratio/low_mean": 3.780993347390904e-05, + "clip_ratio/low_min": 3.7437480386870448e-06, + "clip_ratio/region_mean": 4.157056224585176e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14875.0, + "completions/mean_length": 6073.6328125, + "completions/mean_terminated_length": 5909.9765625, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "entropy": 1.0127769336104393, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004679495934396982, + "learning_rate": 1e-05, + "loss": 0.0055, + "num_tokens": 32134854.0, + "reward": 0.359375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999961793422699, + "sampling/importance_sampling_ratio/min": 0.0006151580018922687, + "sampling/sampling_logp_difference/max": 7.393631458282471, + "sampling/sampling_logp_difference/mean": 0.02106339856982231, + "step": 42 + }, + { + "clip_ratio/high_max": 1.8307343452761415e-05, + "clip_ratio/high_mean": 4.576835863190354e-06, + "clip_ratio/low_mean": 5.7316304378218774e-05, + "clip_ratio/low_min": 1.412125402566744e-05, + "clip_ratio/region_mean": 6.189314035509597e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15052.0, + "completions/max_terminated_length": 15052.0, + "completions/mean_length": 5773.015625, + "completions/mean_terminated_length": 5773.015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0045431107282639, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00485749589279294, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 32897040.0, + "reward": 0.46875, + "reward_std": 0.3595343232154846, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000348091125488, + "sampling/importance_sampling_ratio/min": 4.862526111537591e-06, + "sampling/sampling_logp_difference/max": 12.233952522277832, + "sampling/sampling_logp_difference/mean": 0.01966444030404091, + "step": 43 + }, + { + "clip_ratio/high_max": 1.578610726937768e-05, + "clip_ratio/high_mean": 3.94652681734442e-06, + "clip_ratio/low_mean": 1.772546147549292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1671988179150503e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14572.0, + "completions/mean_length": 4731.3515625, + "completions/mean_terminated_length": 4639.5986328125, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "entropy": 1.0001292675733566, + "epoch": 0.040478380864765406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004726089537143707, + "learning_rate": 1e-05, + "loss": 0.0589, + "num_tokens": 33522133.0, + "reward": 0.390625, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999152421951294, + "sampling/importance_sampling_ratio/min": 0.0001548011932754889, + "sampling/sampling_logp_difference/max": 8.773368835449219, + "sampling/sampling_logp_difference/mean": 0.019276604056358337, + "step": 44 + }, + { + "clip_ratio/high_max": 7.944579010654707e-06, + "clip_ratio/high_mean": 1.9861447526636766e-06, + "clip_ratio/low_mean": 8.259907644969644e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.024605239763332e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15661.0, + "completions/mean_length": 6908.8984375, + "completions/mean_terminated_length": 6834.29150390625, + "completions/min_length": 631.0, + "completions/min_terminated_length": 631.0, + "entropy": 1.0723063945770264, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0011808272683992982, + "learning_rate": 1e-05, + "loss": 0.0202, + "num_tokens": 34429384.0, + "reward": 0.2421875, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999626278877258, + "sampling/importance_sampling_ratio/min": 0.0007662919815629721, + "sampling/sampling_logp_difference/max": 7.173947334289551, + "sampling/sampling_logp_difference/mean": 0.021076666191220284, + "step": 45 + }, + { + "clip_ratio/high_max": 8.888357569958316e-06, + "clip_ratio/high_mean": 2.222089392489579e-06, + "clip_ratio/low_mean": 2.6357692036071967e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8579780860127357e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16348.0, + "completions/mean_length": 6679.140625, + "completions/mean_terminated_length": 6446.22412109375, + "completions/min_length": 828.0, + "completions/min_terminated_length": 828.0, + "entropy": 0.9413202852010727, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003600373398512602, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 35302474.0, + "reward": 0.3203125, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998806715011597, + "sampling/importance_sampling_ratio/min": 9.02900064829737e-05, + "sampling/sampling_logp_difference/max": 9.312483787536621, + "sampling/sampling_logp_difference/mean": 0.019808633252978325, + "step": 46 + }, + { + "clip_ratio/high_max": 9.364646757603623e-06, + "clip_ratio/high_mean": 2.3411616894009057e-06, + "clip_ratio/low_mean": 1.6833528775350715e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9174690351064783e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16253.0, + "completions/mean_length": 5954.5859375, + "completions/mean_terminated_length": 5872.46435546875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 1.200403742492199, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003302425378933549, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 36093941.0, + "reward": 0.1640625, + "reward_std": 0.1990984082221985, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998957514762878, + "sampling/importance_sampling_ratio/min": 0.0026806045789271593, + "sampling/sampling_logp_difference/max": 5.921712875366211, + "sampling/sampling_logp_difference/mean": 0.022528307512402534, + "step": 47 + }, + { + "clip_ratio/high_max": 1.2503618108894443e-05, + "clip_ratio/high_mean": 3.944288664570195e-06, + "clip_ratio/low_mean": 4.7836430894676596e-05, + "clip_ratio/low_min": 6.161485543998424e-06, + "clip_ratio/region_mean": 5.1780719331873115e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15867.0, + "completions/mean_length": 6109.1953125, + "completions/mean_terminated_length": 5946.103515625, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.9069097489118576, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005992463324218988, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 36893486.0, + "reward": 0.4921875, + "reward_std": 0.40373340249061584, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576807022095, + "sampling/importance_sampling_ratio/min": 2.4301432858919725e-05, + "sampling/sampling_logp_difference/max": 10.624975204467773, + "sampling/sampling_logp_difference/mean": 0.018979201093316078, + "step": 48 + }, + { + "clip_ratio/high_max": 1.1075947440986056e-05, + "clip_ratio/high_mean": 2.768986860246514e-06, + "clip_ratio/low_mean": 2.73638818271138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.013286891473399e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15331.0, + "completions/mean_length": 6265.5390625, + "completions/mean_terminated_length": 6022.6962890625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9107594564557076, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005304713733494282, + "learning_rate": 1e-05, + "loss": 0.1217, + "num_tokens": 37716027.0, + "reward": 0.484375, + "reward_std": 0.29272884130477905, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998800754547119, + "sampling/importance_sampling_ratio/min": 0.0003461121814325452, + "sampling/sampling_logp_difference/max": 7.968747615814209, + "sampling/sampling_logp_difference/mean": 0.019227473065257072, + "step": 49 + }, + { + "clip_ratio/high_max": 1.0917767667706357e-05, + "clip_ratio/high_mean": 3.674950448839809e-06, + "clip_ratio/low_mean": 3.135283236588293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.50277827010359e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15860.0, + "completions/mean_length": 6143.1796875, + "completions/mean_terminated_length": 5897.400390625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.9168931543827057, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017410843865945935, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 38519738.0, + "reward": 0.3984375, + "reward_std": 0.2301519215106964, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998773336410522, + "sampling/importance_sampling_ratio/min": 0.0036513316445052624, + "sampling/sampling_logp_difference/max": 5.612663269042969, + "sampling/sampling_logp_difference/mean": 0.019512062892317772, + "step": 50 + }, + { + "clip_ratio/high_max": 5.4981305765977595e-06, + "clip_ratio/high_mean": 3.7445629459398333e-06, + "clip_ratio/low_mean": 2.6178069106208568e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.99226320521484e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15616.0, + "completions/mean_length": 7165.265625, + "completions/mean_terminated_length": 7092.67724609375, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.9690218195319176, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004612576216459274, + "learning_rate": 1e-05, + "loss": 0.0544, + "num_tokens": 39461012.0, + "reward": 0.3125, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000245571136475, + "sampling/importance_sampling_ratio/min": 4.5421067625284195e-05, + "sampling/sampling_logp_difference/max": 9.999534606933594, + "sampling/sampling_logp_difference/mean": 0.0201116893440485, + "step": 51 + }, + { + "clip_ratio/high_max": 1.2953334362464375e-05, + "clip_ratio/high_mean": 3.2383335906160937e-06, + "clip_ratio/low_mean": 2.1866131419301382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5104465066760895e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16007.0, + "completions/max_terminated_length": 16007.0, + "completions/mean_length": 5617.9296875, + "completions/mean_terminated_length": 5617.9296875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 1.0479632839560509, + "epoch": 0.04783808647654094, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003172830445691943, + "learning_rate": 1e-05, + "loss": -0.0235, + "num_tokens": 40202979.0, + "reward": 0.3828125, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560117721558, + "sampling/importance_sampling_ratio/min": 1.229221084031451e-06, + "sampling/sampling_logp_difference/max": 13.609129905700684, + "sampling/sampling_logp_difference/mean": 0.020904643461108208, + "step": 52 + }, + { + "clip_ratio/high_max": 1.4129082956060302e-05, + "clip_ratio/high_mean": 4.841006557398941e-06, + "clip_ratio/low_mean": 4.556761541607557e-05, + "clip_ratio/low_min": 8.631802302261349e-06, + "clip_ratio/region_mean": 5.040862197347451e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 5995.3203125, + "completions/mean_terminated_length": 5913.51953125, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.022934041917324, + "epoch": 0.04875804967801288, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003333345288410783, + "learning_rate": 1e-05, + "loss": 0.0336, + "num_tokens": 40989532.0, + "reward": 0.3046875, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999282360076904, + "sampling/importance_sampling_ratio/min": 8.228168007917702e-05, + "sampling/sampling_logp_difference/max": 9.405362129211426, + "sampling/sampling_logp_difference/mean": 0.021745413541793823, + "step": 53 + }, + { + "clip_ratio/high_max": 2.286436574649997e-05, + "clip_ratio/high_mean": 6.531613848892448e-06, + "clip_ratio/low_mean": 3.960530659696815e-05, + "clip_ratio/low_min": 3.4269107800355414e-06, + "clip_ratio/region_mean": 4.6136920445860596e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15648.0, + "completions/mean_length": 6297.859375, + "completions/mean_terminated_length": 6055.79248046875, + "completions/min_length": 1243.0, + "completions/min_terminated_length": 1243.0, + "entropy": 0.9511058703064919, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005538261961191893, + "learning_rate": 1e-05, + "loss": 0.0799, + "num_tokens": 41813914.0, + "reward": 0.3671875, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532699584961, + "sampling/importance_sampling_ratio/min": 0.00017344337538816035, + "sampling/sampling_logp_difference/max": 8.659659385681152, + "sampling/sampling_logp_difference/mean": 0.019708994776010513, + "step": 54 + }, + { + "clip_ratio/high_max": 4.575737420964288e-06, + "clip_ratio/high_mean": 1.143934355241072e-06, + "clip_ratio/low_mean": 2.561447990956367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6758414151117904e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14051.0, + "completions/max_terminated_length": 14051.0, + "completions/mean_length": 4765.046875, + "completions/mean_terminated_length": 4765.046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.9130316227674484, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024318129289895296, + "learning_rate": 1e-05, + "loss": -0.0177, + "num_tokens": 42443288.0, + "reward": 0.4453125, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999518394470215, + "sampling/importance_sampling_ratio/min": 0.0001035423920257017, + "sampling/sampling_logp_difference/max": 9.175529479980469, + "sampling/sampling_logp_difference/mean": 0.01920286938548088, + "step": 55 + }, + { + "clip_ratio/high_max": 7.084159733494744e-06, + "clip_ratio/high_mean": 1.771039933373686e-06, + "clip_ratio/low_mean": 4.221943618176738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3990476115141064e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15445.0, + "completions/mean_length": 6411.5, + "completions/mean_terminated_length": 5834.578125, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "entropy": 0.8110766112804413, + "epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018957280553877354, + "learning_rate": 1e-05, + "loss": -0.0036, + "num_tokens": 43287600.0, + "reward": 0.3984375, + "reward_std": 0.1990983933210373, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999212622642517, + "sampling/importance_sampling_ratio/min": 0.0021892013028264046, + "sampling/sampling_logp_difference/max": 6.124218463897705, + "sampling/sampling_logp_difference/mean": 0.018554572016000748, + "step": 56 + }, + { + "clip_ratio/high_max": 6.7589489844976924e-06, + "clip_ratio/high_mean": 1.6897372461244231e-06, + "clip_ratio/low_mean": 4.334260950145108e-05, + "clip_ratio/low_min": 8.570448699174449e-06, + "clip_ratio/region_mean": 4.503234697494918e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15508.0, + "completions/mean_length": 6552.40625, + "completions/mean_terminated_length": 6235.2578125, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "entropy": 1.0034996420145035, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002431448083370924, + "learning_rate": 1e-05, + "loss": 0.0274, + "num_tokens": 44145524.0, + "reward": 0.25, + "reward_std": 0.3114011883735657, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999876081943512, + "sampling/importance_sampling_ratio/min": 0.051090992987155914, + "sampling/sampling_logp_difference/max": 2.974147081375122, + "sampling/sampling_logp_difference/mean": 0.020403606817126274, + "step": 57 + }, + { + "clip_ratio/high_max": 2.1032463337178342e-05, + "clip_ratio/high_mean": 6.422987098630983e-06, + "clip_ratio/low_mean": 1.0045687076853937e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.646867417548492e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13257.0, + "completions/mean_length": 4688.7734375, + "completions/mean_terminated_length": 4408.08837890625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.9620971381664276, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004493447951972485, + "learning_rate": 1e-05, + "loss": 0.0344, + "num_tokens": 44763895.0, + "reward": 0.53125, + "reward_std": 0.26196980476379395, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999460577964783, + "sampling/importance_sampling_ratio/min": 3.1526888051303104e-05, + "sampling/sampling_logp_difference/max": 10.364669799804688, + "sampling/sampling_logp_difference/mean": 0.01916680857539177, + "step": 58 + }, + { + "clip_ratio/high_max": 3.076594612139161e-05, + "clip_ratio/high_mean": 7.691486530347902e-06, + "clip_ratio/low_mean": 2.8500278403953416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.619176493430132e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 5381.1875, + "completions/mean_terminated_length": 5294.55126953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 1.0265433564782143, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0049595762975513935, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 45470335.0, + "reward": 0.4296875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998573660850525, + "sampling/importance_sampling_ratio/min": 1.6373864752949885e-07, + "sampling/sampling_logp_difference/max": 15.624994277954102, + "sampling/sampling_logp_difference/mean": 0.020656142383813858, + "step": 59 + }, + { + "clip_ratio/high_max": 2.6326441002311185e-05, + "clip_ratio/high_mean": 6.581610250577796e-06, + "clip_ratio/low_mean": 3.143254116366734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8014151868992485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15006.0, + "completions/mean_length": 5613.84375, + "completions/mean_terminated_length": 5529.03955078125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 1.0289503335952759, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00655899103730917, + "learning_rate": 1e-05, + "loss": 0.068, + "num_tokens": 46206971.0, + "reward": 0.3671875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999952495098114, + "sampling/importance_sampling_ratio/min": 0.03283476456999779, + "sampling/sampling_logp_difference/max": 3.4162673950195312, + "sampling/sampling_logp_difference/mean": 0.020495962351560593, + "step": 60 + }, + { + "clip_ratio/high_max": 3.233557390558417e-05, + "clip_ratio/high_mean": 8.083893476396042e-06, + "clip_ratio/low_mean": 3.3687326776998816e-05, + "clip_ratio/low_min": 5.745277576352237e-06, + "clip_ratio/region_mean": 4.1771219912334345e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14763.0, + "completions/mean_length": 5577.2890625, + "completions/mean_terminated_length": 5492.19677734375, + "completions/min_length": 784.0, + "completions/min_terminated_length": 784.0, + "entropy": 0.9836367890238762, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.007459669373929501, + "learning_rate": 1e-05, + "loss": 0.0459, + "num_tokens": 46940112.0, + "reward": 0.4453125, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000065565109253, + "sampling/importance_sampling_ratio/min": 8.196697649509588e-07, + "sampling/sampling_logp_difference/max": 14.014364242553711, + "sampling/sampling_logp_difference/mean": 0.018994126468896866, + "step": 61 + }, + { + "clip_ratio/high_max": 1.720242698866059e-05, + "clip_ratio/high_mean": 4.300606747165148e-06, + "clip_ratio/low_mean": 3.032099141364597e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.462159838818479e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16273.0, + "completions/mean_length": 6547.140625, + "completions/mean_terminated_length": 6311.05615234375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.9028418883681297, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005132914055138826, + "learning_rate": 1e-05, + "loss": -0.0013, + "num_tokens": 47796514.0, + "reward": 0.46875, + "reward_std": 0.2751026153564453, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970018863678, + "sampling/importance_sampling_ratio/min": 0.0005014563794247806, + "sampling/sampling_logp_difference/max": 7.597993850708008, + "sampling/sampling_logp_difference/mean": 0.02021491341292858, + "step": 62 + }, + { + "clip_ratio/high_max": 1.078213176697318e-05, + "clip_ratio/high_mean": 2.695532941743295e-06, + "clip_ratio/low_mean": 2.838153790207798e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1077070843821275e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 7409.3125, + "completions/mean_terminated_length": 6811.00048828125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8614663332700729, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034147046972066164, + "learning_rate": 1e-05, + "loss": 0.0196, + "num_tokens": 48765386.0, + "reward": 0.3125, + "reward_std": 0.27198708057403564, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998997449874878, + "sampling/importance_sampling_ratio/min": 4.202586751489434e-06, + "sampling/sampling_logp_difference/max": 12.379810333251953, + "sampling/sampling_logp_difference/mean": 0.01943383738398552, + "step": 63 + }, + { + "clip_ratio/high_max": 1.153353150584735e-05, + "clip_ratio/high_mean": 2.8833828764618374e-06, + "clip_ratio/low_mean": 3.695166174111364e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.98350443902018e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 6420.859375, + "completions/mean_terminated_length": 6181.744140625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.9671022593975067, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004101228900253773, + "learning_rate": 1e-05, + "loss": 0.0371, + "num_tokens": 49606280.0, + "reward": 0.34375, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 0.000259009946603328, + "sampling/sampling_logp_difference/max": 8.258644104003906, + "sampling/sampling_logp_difference/mean": 0.01929381489753723, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.231768923546042e-05, + "clip_ratio/low_min": 5.164009053260088e-06, + "clip_ratio/region_mean": 4.231768923546042e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14971.0, + "completions/mean_length": 4852.7578125, + "completions/mean_terminated_length": 4761.96044921875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.9933939427137375, + "epoch": 0.05979760809567617, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0077895247377455235, + "learning_rate": 1e-05, + "loss": 0.0601, + "num_tokens": 50246457.0, + "reward": 0.40625, + "reward_std": 0.35400262475013733, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 2.4301109078805894e-05, + "sampling/sampling_logp_difference/max": 10.624988555908203, + "sampling/sampling_logp_difference/mean": 0.01895500347018242, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.009997408298659e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.009997408298659e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15610.0, + "completions/max_terminated_length": 15610.0, + "completions/mean_length": 6840.03125, + "completions/mean_terminated_length": 6840.03125, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "entropy": 0.9959733113646507, + "epoch": 0.06071757129714812, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00207411777228117, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 51141597.0, + "reward": 0.28125, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 2.0275774659239687e-07, + "sampling/sampling_logp_difference/max": 15.411253929138184, + "sampling/sampling_logp_difference/mean": 0.02091015875339508, + "step": 66 + }, + { + "clip_ratio/high_max": 2.297391938554938e-05, + "clip_ratio/high_mean": 6.853683203189576e-06, + "clip_ratio/low_mean": 4.6152885829542356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3006569942226633e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15249.0, + "completions/mean_length": 6567.3828125, + "completions/mean_terminated_length": 6331.7841796875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 1.0921807065606117, + "epoch": 0.061637534498620056, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.006496666464954615, + "learning_rate": 1e-05, + "loss": 0.0238, + "num_tokens": 52001758.0, + "reward": 0.296875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999523162841797, + "sampling/importance_sampling_ratio/min": 0.0026403397787362337, + "sampling/sampling_logp_difference/max": 5.936847686767578, + "sampling/sampling_logp_difference/mean": 0.021580250933766365, + "step": 67 + }, + { + "clip_ratio/high_max": 1.2290649465285242e-05, + "clip_ratio/high_mean": 3.0726623663213104e-06, + "clip_ratio/low_mean": 1.7558751551405294e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0631413917726604e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16336.0, + "completions/mean_length": 6927.265625, + "completions/mean_terminated_length": 6542.84521484375, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "entropy": 0.8170016556978226, + "epoch": 0.062557497700092, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002272722776979208, + "learning_rate": 1e-05, + "loss": 0.021, + "num_tokens": 52907256.0, + "reward": 0.28125, + "reward_std": 0.22673700749874115, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999938011169434, + "sampling/importance_sampling_ratio/min": 6.70690099013882e-08, + "sampling/sampling_logp_difference/max": 16.51754379272461, + "sampling/sampling_logp_difference/mean": 0.01844738982617855, + "step": 68 + }, + { + "clip_ratio/high_max": 1.016177520796191e-05, + "clip_ratio/high_mean": 4.526967131823767e-06, + "clip_ratio/low_mean": 5.522496246612718e-05, + "clip_ratio/low_min": 4.129910394112812e-06, + "clip_ratio/region_mean": 5.9751928688456246e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16228.0, + "completions/max_terminated_length": 16228.0, + "completions/mean_length": 5889.28125, + "completions/mean_terminated_length": 5889.28125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 1.0794919431209564, + "epoch": 0.06347746090156393, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005651532672345638, + "learning_rate": 1e-05, + "loss": 0.0382, + "num_tokens": 53682100.0, + "reward": 0.3046875, + "reward_std": 0.32613158226013184, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998855590820312, + "sampling/importance_sampling_ratio/min": 4.226289718189946e-07, + "sampling/sampling_logp_difference/max": 14.67677116394043, + "sampling/sampling_logp_difference/mean": 0.020069826394319534, + "step": 69 + }, + { + "clip_ratio/high_max": 5.796966888738098e-06, + "clip_ratio/high_mean": 1.4492417221845244e-06, + "clip_ratio/low_mean": 4.575056436806335e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.719980597656104e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 5909.3125, + "completions/mean_terminated_length": 5394.16357421875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.8462172821164131, + "epoch": 0.06439742410303588, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002985857194289565, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 54456508.0, + "reward": 0.4296875, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999995231628418, + "sampling/importance_sampling_ratio/min": 0.000453252432635054, + "sampling/sampling_logp_difference/max": 7.699061393737793, + "sampling/sampling_logp_difference/mean": 0.01927822455763817, + "step": 70 + }, + { + "clip_ratio/high_max": 1.8927265045931563e-05, + "clip_ratio/high_mean": 5.821615673085034e-06, + "clip_ratio/low_mean": 3.1553636290482245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.737525207725412e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15579.0, + "completions/mean_length": 7465.3984375, + "completions/mean_terminated_length": 7177.701171875, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.8792542889714241, + "epoch": 0.06531738730450783, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036494233645498753, + "learning_rate": 1e-05, + "loss": 0.0218, + "num_tokens": 55429663.0, + "reward": 0.3671875, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998987913131714, + "sampling/importance_sampling_ratio/min": 0.0017587440088391304, + "sampling/sampling_logp_difference/max": 6.343155384063721, + "sampling/sampling_logp_difference/mean": 0.01909823715686798, + "step": 71 + }, + { + "clip_ratio/high_max": 8.78609989740653e-06, + "clip_ratio/high_mean": 2.1965249743516324e-06, + "clip_ratio/low_mean": 3.611839565564878e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.831492040262674e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15670.0, + "completions/mean_length": 5674.9609375, + "completions/mean_terminated_length": 5590.6376953125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.9117730036377907, + "epoch": 0.06623735050597976, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003951186314225197, + "learning_rate": 1e-05, + "loss": 0.0295, + "num_tokens": 56173314.0, + "reward": 0.4140625, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999845623970032, + "sampling/importance_sampling_ratio/min": 0.003206930123269558, + "sampling/sampling_logp_difference/max": 5.742441177368164, + "sampling/sampling_logp_difference/mean": 0.01932360976934433, + "step": 72 + }, + { + "clip_ratio/high_max": 1.7587798083695816e-05, + "clip_ratio/high_mean": 5.872955512131739e-06, + "clip_ratio/low_mean": 4.657158876852918e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.244454393960041e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16325.0, + "completions/max_terminated_length": 16325.0, + "completions/mean_length": 4754.5390625, + "completions/mean_terminated_length": 4754.5390625, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "entropy": 0.8350499644875526, + "epoch": 0.0671573137074517, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.005329386796802282, + "learning_rate": 1e-05, + "loss": 0.0328, + "num_tokens": 56799911.0, + "reward": 0.515625, + "reward_std": 0.4111049771308899, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999337196350098, + "sampling/importance_sampling_ratio/min": 8.575750689487904e-05, + "sampling/sampling_logp_difference/max": 9.36398696899414, + "sampling/sampling_logp_difference/mean": 0.01792578026652336, + "step": 73 + }, + { + "clip_ratio/high_max": 1.2575374057632871e-05, + "clip_ratio/high_mean": 3.1438435144082177e-06, + "clip_ratio/low_mean": 1.8536085917730816e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1679929204765358e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16316.0, + "completions/mean_length": 5744.2734375, + "completions/mean_terminated_length": 5488.92041015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8065197095274925, + "epoch": 0.06807727690892364, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036763548851013184, + "learning_rate": 1e-05, + "loss": 0.082, + "num_tokens": 57553986.0, + "reward": 0.515625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999231696128845, + "sampling/importance_sampling_ratio/min": 0.00011362064105924219, + "sampling/sampling_logp_difference/max": 9.082645416259766, + "sampling/sampling_logp_difference/mean": 0.018098725005984306, + "step": 74 + }, + { + "clip_ratio/high_max": 1.877081149359583e-05, + "clip_ratio/high_mean": 6.101248914092139e-06, + "clip_ratio/low_mean": 2.6290458890798618e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.239170769120392e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16088.0, + "completions/mean_length": 6754.5234375, + "completions/mean_terminated_length": 6523.41650390625, + "completions/min_length": 638.0, + "completions/min_terminated_length": 638.0, + "entropy": 1.013127624988556, + "epoch": 0.06899724011039558, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038320303428918123, + "learning_rate": 1e-05, + "loss": 0.0139, + "num_tokens": 58438333.0, + "reward": 0.2890625, + "reward_std": 0.2369818389415741, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999582767486572, + "sampling/importance_sampling_ratio/min": 2.284922175022075e-06, + "sampling/sampling_logp_difference/max": 12.989178657531738, + "sampling/sampling_logp_difference/mean": 0.02173798717558384, + "step": 75 + }, + { + "clip_ratio/high_max": 1.9026635982299922e-05, + "clip_ratio/high_mean": 6.682960474790889e-06, + "clip_ratio/low_mean": 3.252214798976638e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.920510800980992e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12421.0, + "completions/mean_length": 6203.5390625, + "completions/mean_terminated_length": 6123.3779296875, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 1.0302691981196404, + "epoch": 0.06991720331186753, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004985450301319361, + "learning_rate": 1e-05, + "loss": 0.0483, + "num_tokens": 59249562.0, + "reward": 0.421875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999778270721436, + "sampling/importance_sampling_ratio/min": 0.004553908482193947, + "sampling/sampling_logp_difference/max": 5.3917694091796875, + "sampling/sampling_logp_difference/mean": 0.019999932497739792, + "step": 76 + }, + { + "clip_ratio/high_max": 1.3964342088002013e-05, + "clip_ratio/high_mean": 3.4910855220005033e-06, + "clip_ratio/low_mean": 3.63567767180939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.984786212640756e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 5727.796875, + "completions/mean_terminated_length": 5643.8896484375, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "entropy": 0.9781062752008438, + "epoch": 0.07083716651333946, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0037541294004768133, + "learning_rate": 1e-05, + "loss": 0.0236, + "num_tokens": 60001208.0, + "reward": 0.3828125, + "reward_std": 0.20753079652786255, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 0.00022466933296527714, + "sampling/sampling_logp_difference/max": 8.400880813598633, + "sampling/sampling_logp_difference/mean": 0.020555900409817696, + "step": 77 + }, + { + "clip_ratio/high_max": 2.7796927497547586e-06, + "clip_ratio/high_mean": 6.949231874386896e-07, + "clip_ratio/low_mean": 3.516969627526123e-05, + "clip_ratio/low_min": 4.025116595585132e-06, + "clip_ratio/region_mean": 3.586461934901308e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15720.0, + "completions/mean_length": 5923.8359375, + "completions/mean_terminated_length": 5409.4013671875, + "completions/min_length": 597.0, + "completions/min_terminated_length": 597.0, + "entropy": 0.9449758678674698, + "epoch": 0.07175712971481141, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.007178841158747673, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 60777899.0, + "reward": 0.3359375, + "reward_std": 0.2977364659309387, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 0.0004897661856375635, + "sampling/sampling_logp_difference/max": 7.621582508087158, + "sampling/sampling_logp_difference/mean": 0.019868161529302597, + "step": 78 + }, + { + "clip_ratio/high_max": 2.7642782697512303e-05, + "clip_ratio/high_mean": 9.016423746288638e-06, + "clip_ratio/low_mean": 4.3257180891487224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.227360486514954e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14267.0, + "completions/mean_length": 6171.640625, + "completions/mean_terminated_length": 5926.54443359375, + "completions/min_length": 721.0, + "completions/min_terminated_length": 721.0, + "entropy": 0.8597526922821999, + "epoch": 0.07267709291628335, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004776299465447664, + "learning_rate": 1e-05, + "loss": 0.0136, + "num_tokens": 61587141.0, + "reward": 0.46875, + "reward_std": 0.36113685369491577, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999945878982544, + "sampling/importance_sampling_ratio/min": 4.0065449866233394e-05, + "sampling/sampling_logp_difference/max": 10.124996185302734, + "sampling/sampling_logp_difference/mean": 0.019484341144561768, + "step": 79 + }, + { + "clip_ratio/high_max": 4.145968978264136e-05, + "clip_ratio/high_mean": 1.036492244566034e-05, + "clip_ratio/low_mean": 3.6077020070024446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.644194200409402e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15310.0, + "completions/mean_length": 5501.59375, + "completions/mean_terminated_length": 5415.9052734375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.9705724790692329, + "epoch": 0.07359705611775529, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.007431659381836653, + "learning_rate": 1e-05, + "loss": 0.0539, + "num_tokens": 62308321.0, + "reward": 0.453125, + "reward_std": 0.400318443775177, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000393390655518, + "sampling/importance_sampling_ratio/min": 4.54318942502141e-05, + "sampling/sampling_logp_difference/max": 9.999296188354492, + "sampling/sampling_logp_difference/mean": 0.019636545330286026, + "step": 80 + }, + { + "clip_ratio/high_max": 4.327206170273712e-06, + "clip_ratio/high_mean": 1.081801542568428e-06, + "clip_ratio/low_mean": 6.429905033655814e-05, + "clip_ratio/low_min": 6.3626184783061035e-06, + "clip_ratio/region_mean": 6.538085153806605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15781.0, + "completions/mean_length": 5908.125, + "completions/mean_terminated_length": 5825.6376953125, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "entropy": 0.8575867265462875, + "epoch": 0.07451701931922723, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005465450696647167, + "learning_rate": 1e-05, + "loss": 0.0797, + "num_tokens": 63084113.0, + "reward": 0.34375, + "reward_std": 0.39400771260261536, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576210975647, + "sampling/importance_sampling_ratio/min": 4.766937126987614e-05, + "sampling/sampling_logp_difference/max": 9.951221466064453, + "sampling/sampling_logp_difference/mean": 0.018073562532663345, + "step": 81 + }, + { + "clip_ratio/high_max": 6.7512828536564484e-06, + "clip_ratio/high_mean": 1.6878207134141121e-06, + "clip_ratio/low_mean": 3.040744320514932e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.209526391856343e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15981.0, + "completions/max_terminated_length": 15981.0, + "completions/mean_length": 4906.734375, + "completions/mean_terminated_length": 4906.734375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.9647495672106743, + "epoch": 0.07543698252069918, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003077819012105465, + "learning_rate": 1e-05, + "loss": -0.0104, + "num_tokens": 63740015.0, + "reward": 0.4375, + "reward_std": 0.2251344621181488, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000091791152954, + "sampling/importance_sampling_ratio/min": 5.1233790145488456e-05, + "sampling/sampling_logp_difference/max": 9.879111289978027, + "sampling/sampling_logp_difference/mean": 0.01949312724173069, + "step": 82 + }, + { + "clip_ratio/high_max": 7.262375220307149e-06, + "clip_ratio/high_mean": 1.8155938050767872e-06, + "clip_ratio/low_mean": 3.626802561029763e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8083618960627064e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15716.0, + "completions/max_terminated_length": 15716.0, + "completions/mean_length": 5402.78125, + "completions/mean_terminated_length": 5402.78125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "entropy": 0.9809223562479019, + "epoch": 0.07635694572217111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018245981773361564, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 64450515.0, + "reward": 0.265625, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999257922172546, + "sampling/importance_sampling_ratio/min": 0.0009712215978652239, + "sampling/sampling_logp_difference/max": 6.93695592880249, + "sampling/sampling_logp_difference/mean": 0.019615523517131805, + "step": 83 + }, + { + "clip_ratio/high_max": 1.1774221320592915e-05, + "clip_ratio/high_mean": 2.9435553301482287e-06, + "clip_ratio/low_mean": 4.734331901090627e-05, + "clip_ratio/low_min": 1.1585900665522786e-05, + "clip_ratio/region_mean": 5.0286874625271594e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16329.0, + "completions/mean_length": 6198.703125, + "completions/mean_terminated_length": 5870.14501953125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.8571672514081001, + "epoch": 0.07727690892364306, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.006053395569324493, + "learning_rate": 1e-05, + "loss": 0.0645, + "num_tokens": 65269285.0, + "reward": 0.421875, + "reward_std": 0.3464113473892212, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 0.0010333011159673333, + "sampling/sampling_logp_difference/max": 6.874996662139893, + "sampling/sampling_logp_difference/mean": 0.01869945600628853, + "step": 84 + }, + { + "clip_ratio/high_max": 2.7282983865006827e-05, + "clip_ratio/high_mean": 7.78695198278001e-06, + "clip_ratio/low_mean": 3.2358174394175876e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0145126376955886e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15972.0, + "completions/mean_length": 6016.09375, + "completions/mean_terminated_length": 5851.52392578125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "entropy": 0.9883866459131241, + "epoch": 0.078196872125115, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030623299535363913, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 66058473.0, + "reward": 0.3203125, + "reward_std": 0.24883407354354858, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999305009841919, + "sampling/importance_sampling_ratio/min": 0.0016286972677335143, + "sampling/sampling_logp_difference/max": 6.4199748039245605, + "sampling/sampling_logp_difference/mean": 0.02085939608514309, + "step": 85 + }, + { + "clip_ratio/high_max": 2.9797377010254422e-06, + "clip_ratio/high_mean": 7.449344252563606e-07, + "clip_ratio/low_mean": 3.9277208315979806e-05, + "clip_ratio/low_min": 4.51475443696836e-06, + "clip_ratio/region_mean": 4.002214268439275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 7072.53125, + "completions/mean_terminated_length": 6924.73046875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 1.0157204791903496, + "epoch": 0.07911683532658693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038264680188149214, + "learning_rate": 1e-05, + "loss": 0.0389, + "num_tokens": 66984285.0, + "reward": 0.2890625, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999492764472961, + "sampling/importance_sampling_ratio/min": 0.0020860559307038784, + "sampling/sampling_logp_difference/max": 6.17248010635376, + "sampling/sampling_logp_difference/mean": 0.021116644144058228, + "step": 86 + }, + { + "clip_ratio/high_max": 6.0717920860042796e-06, + "clip_ratio/high_mean": 1.5179480215010699e-06, + "clip_ratio/low_mean": 3.757404465432046e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.909199278950837e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6539.8203125, + "completions/mean_terminated_length": 6303.56005859375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.0071343630552292, + "epoch": 0.08003679852805888, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0044838739559054375, + "learning_rate": 1e-05, + "loss": 0.0583, + "num_tokens": 67840310.0, + "reward": 0.390625, + "reward_std": 0.2722293734550476, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 0.001703627873212099, + "sampling/sampling_logp_difference/max": 6.374995231628418, + "sampling/sampling_logp_difference/mean": 0.020990263670682907, + "step": 87 + }, + { + "clip_ratio/high_max": 2.859010169231624e-05, + "clip_ratio/high_mean": 7.14752542307906e-06, + "clip_ratio/low_mean": 3.50394579982094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.218698381919239e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16224.0, + "completions/mean_length": 7204.09375, + "completions/mean_terminated_length": 6907.9677734375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.9221752807497978, + "epoch": 0.08095676172953081, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034659637603908777, + "learning_rate": 1e-05, + "loss": -0.0057, + "num_tokens": 68782042.0, + "reward": 0.4140625, + "reward_std": 0.27958327531814575, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999324083328247, + "sampling/importance_sampling_ratio/min": 0.0003347320598550141, + "sampling/sampling_logp_difference/max": 8.002180099487305, + "sampling/sampling_logp_difference/mean": 0.02053149789571762, + "step": 88 + }, + { + "clip_ratio/high_max": 1.7420219137420645e-05, + "clip_ratio/high_mean": 4.355054784355161e-06, + "clip_ratio/low_mean": 2.086669928758056e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.522175350350153e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14764.0, + "completions/mean_length": 5662.1640625, + "completions/mean_terminated_length": 5577.740234375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.9678512960672379, + "epoch": 0.08187672493100276, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0024458845146000385, + "learning_rate": 1e-05, + "loss": 0.0584, + "num_tokens": 69526295.0, + "reward": 0.4375, + "reward_std": 0.18543373048305511, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999278783798218, + "sampling/importance_sampling_ratio/min": 0.0033961546141654253, + "sampling/sampling_logp_difference/max": 5.6851115226745605, + "sampling/sampling_logp_difference/mean": 0.018346723169088364, + "step": 89 + }, + { + "clip_ratio/high_max": 8.09375796961831e-06, + "clip_ratio/high_mean": 2.0234394924045773e-06, + "clip_ratio/low_mean": 1.8629728629093734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0653167894124635e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16219.0, + "completions/mean_length": 5590.71875, + "completions/mean_terminated_length": 5505.732421875, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.9286820441484451, + "epoch": 0.0827966881324747, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004733253736048937, + "learning_rate": 1e-05, + "loss": 0.0719, + "num_tokens": 70262771.0, + "reward": 0.4609375, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999312162399292, + "sampling/importance_sampling_ratio/min": 1.233097464137245e-05, + "sampling/sampling_logp_difference/max": 11.303396224975586, + "sampling/sampling_logp_difference/mean": 0.019460031762719154, + "step": 90 + }, + { + "clip_ratio/high_max": 1.8371122678217944e-05, + "clip_ratio/high_mean": 4.592780669554486e-06, + "clip_ratio/low_mean": 2.489819087259093e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.949097142845858e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15879.0, + "completions/mean_length": 6182.484375, + "completions/mean_terminated_length": 6102.1572265625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 1.0872880518436432, + "epoch": 0.08371665133394664, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00468763243407011, + "learning_rate": 1e-05, + "loss": 0.0223, + "num_tokens": 71079953.0, + "reward": 0.2578125, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 9.611623681848869e-05, + "sampling/sampling_logp_difference/max": 9.24995231628418, + "sampling/sampling_logp_difference/mean": 0.02069907821714878, + "step": 91 + }, + { + "clip_ratio/high_max": 1.579416039021453e-05, + "clip_ratio/high_mean": 4.633066396309005e-06, + "clip_ratio/low_mean": 2.6412633246764017e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1045699415699346e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16235.0, + "completions/mean_length": 5909.2265625, + "completions/mean_terminated_length": 5826.748046875, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.9488153457641602, + "epoch": 0.08463661453541858, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034273737110197544, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 71856574.0, + "reward": 0.4140625, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998411536216736, + "sampling/importance_sampling_ratio/min": 0.00016871529805939645, + "sampling/sampling_logp_difference/max": 8.687297821044922, + "sampling/sampling_logp_difference/mean": 0.019539739936590195, + "step": 92 + }, + { + "clip_ratio/high_max": 3.7449817682499997e-06, + "clip_ratio/high_mean": 9.362454420624999e-07, + "clip_ratio/low_mean": 4.2946558664880286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.388280387956911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15933.0, + "completions/max_terminated_length": 15933.0, + "completions/mean_length": 6381.3125, + "completions/mean_terminated_length": 6381.3125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.9708949401974678, + "epoch": 0.08555657773689053, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003149663796648383, + "learning_rate": 1e-05, + "loss": 0.0314, + "num_tokens": 72696806.0, + "reward": 0.3828125, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999916136264801, + "sampling/importance_sampling_ratio/min": 0.00033631984842941165, + "sampling/sampling_logp_difference/max": 7.997447967529297, + "sampling/sampling_logp_difference/mean": 0.021038895472884178, + "step": 93 + }, + { + "clip_ratio/high_max": 6.492157353932271e-06, + "clip_ratio/high_mean": 1.6230393384830677e-06, + "clip_ratio/low_mean": 4.956343445883249e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.118647413837607e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16180.0, + "completions/max_terminated_length": 16180.0, + "completions/mean_length": 5726.03125, + "completions/mean_terminated_length": 5726.03125, + "completions/min_length": 831.0, + "completions/min_terminated_length": 831.0, + "entropy": 0.9100239053368568, + "epoch": 0.08647654093836246, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029015145264565945, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 73449210.0, + "reward": 0.3125, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999687671661377, + "sampling/importance_sampling_ratio/min": 0.0001686852192506194, + "sampling/sampling_logp_difference/max": 8.68747615814209, + "sampling/sampling_logp_difference/mean": 0.020026210695505142, + "step": 94 + }, + { + "clip_ratio/high_max": 1.199616144731408e-05, + "clip_ratio/high_mean": 2.99904036182852e-06, + "clip_ratio/low_mean": 1.4287397789303213e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7286438151131733e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15624.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 5824.90625, + "completions/mean_terminated_length": 5824.90625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9244210943579674, + "epoch": 0.08739650413983441, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0045582144521176815, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 74212662.0, + "reward": 0.4375, + "reward_std": 0.24777324497699738, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000125169754028, + "sampling/importance_sampling_ratio/min": 0.0021414682269096375, + "sampling/sampling_logp_difference/max": 6.146263599395752, + "sampling/sampling_logp_difference/mean": 0.019039880484342575, + "step": 95 + }, + { + "clip_ratio/high_max": 1.010842470350326e-05, + "clip_ratio/high_mean": 2.527106175875815e-06, + "clip_ratio/low_mean": 4.0637585470904014e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.316469153309299e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15600.0, + "completions/mean_length": 6432.7265625, + "completions/mean_terminated_length": 6274.77001953125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.8756264597177505, + "epoch": 0.08831646734130635, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0040692174807190895, + "learning_rate": 1e-05, + "loss": 0.0776, + "num_tokens": 75054003.0, + "reward": 0.4609375, + "reward_std": 0.35506343841552734, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998587369918823, + "sampling/importance_sampling_ratio/min": 0.005546991713345051, + "sampling/sampling_logp_difference/max": 5.194499492645264, + "sampling/sampling_logp_difference/mean": 0.019711513072252274, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.6582903135240485e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6582903135240485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14589.0, + "completions/mean_length": 5474.6796875, + "completions/mean_terminated_length": 5388.779296875, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9279408678412437, + "epoch": 0.08923643054277829, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0035478502977639437, + "learning_rate": 1e-05, + "loss": 0.1137, + "num_tokens": 75773194.0, + "reward": 0.546875, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000132322311401, + "sampling/importance_sampling_ratio/min": 0.004276251420378685, + "sampling/sampling_logp_difference/max": 5.454678535461426, + "sampling/sampling_logp_difference/mean": 0.018789665773510933, + "step": 97 + }, + { + "clip_ratio/high_max": 8.227548732975265e-06, + "clip_ratio/high_mean": 2.0568871832438163e-06, + "clip_ratio/low_mean": 4.1461861655989196e-05, + "clip_ratio/low_min": 3.5008122267754516e-06, + "clip_ratio/region_mean": 4.351874804342515e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 6730.2734375, + "completions/mean_terminated_length": 6577.0400390625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 1.0115349367260933, + "epoch": 0.09015639374425023, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004816337022930384, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 76654837.0, + "reward": 0.40625, + "reward_std": 0.35325103998184204, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000003576278687, + "sampling/importance_sampling_ratio/min": 1.4203919818100985e-06, + "sampling/sampling_logp_difference/max": 13.464577674865723, + "sampling/sampling_logp_difference/mean": 0.021000642329454422, + "step": 98 + }, + { + "clip_ratio/high_max": 2.0833075723203365e-05, + "clip_ratio/high_mean": 5.208268930800841e-06, + "clip_ratio/low_mean": 2.399133984454238e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.919960945746425e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14350.0, + "completions/mean_length": 4804.5859375, + "completions/mean_terminated_length": 4620.7861328125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "entropy": 0.8622925356030464, + "epoch": 0.09107635694572216, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00695947976782918, + "learning_rate": 1e-05, + "loss": -0.0188, + "num_tokens": 77287704.0, + "reward": 0.5859375, + "reward_std": 0.2688046097755432, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000001072883606, + "sampling/importance_sampling_ratio/min": 0.051502522081136703, + "sampling/sampling_logp_difference/max": 2.9661245346069336, + "sampling/sampling_logp_difference/mean": 0.019261913374066353, + "step": 99 + }, + { + "clip_ratio/high_max": 1.2886264812550507e-05, + "clip_ratio/high_mean": 3.221566203137627e-06, + "clip_ratio/low_mean": 3.53349669239833e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8556532899747253e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15997.0, + "completions/mean_length": 5836.25, + "completions/mean_terminated_length": 5753.19677734375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 0.8808795213699341, + "epoch": 0.09199632014719411, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034830078948289156, + "learning_rate": 1e-05, + "loss": 0.1412, + "num_tokens": 78054048.0, + "reward": 0.484375, + "reward_std": 0.29036492109298706, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999365210533142, + "sampling/importance_sampling_ratio/min": 2.7803641842183424e-06, + "sampling/sampling_logp_difference/max": 12.792928695678711, + "sampling/sampling_logp_difference/mean": 0.01845550537109375, + "step": 100 + }, + { + "clip_ratio/high_max": 2.630969447636744e-05, + "clip_ratio/high_mean": 6.57742361909186e-06, + "clip_ratio/low_mean": 3.4728200375866436e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1305623994958296e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13964.0, + "completions/mean_length": 5407.5703125, + "completions/mean_terminated_length": 5233.341796875, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.9438152015209198, + "epoch": 0.09291628334866606, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028610217850655317, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 78765225.0, + "reward": 0.390625, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999171495437622, + "sampling/importance_sampling_ratio/min": 5.874436828889884e-05, + "sampling/sampling_logp_difference/max": 9.742315292358398, + "sampling/sampling_logp_difference/mean": 0.018839653581380844, + "step": 101 + }, + { + "clip_ratio/high_max": 1.2485550996643724e-05, + "clip_ratio/high_mean": 3.917444360013178e-06, + "clip_ratio/low_mean": 3.569766681721376e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.961511060879275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15780.0, + "completions/mean_length": 6591.765625, + "completions/mean_terminated_length": 6436.33349609375, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.9185260459780693, + "epoch": 0.09383624655013799, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004098972305655479, + "learning_rate": 1e-05, + "loss": 0.0626, + "num_tokens": 79628691.0, + "reward": 0.40625, + "reward_std": 0.26932865381240845, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697208404541, + "sampling/importance_sampling_ratio/min": 0.001684795250184834, + "sampling/sampling_logp_difference/max": 6.386111259460449, + "sampling/sampling_logp_difference/mean": 0.02011241763830185, + "step": 102 + }, + { + "clip_ratio/high_max": 1.017276917991694e-05, + "clip_ratio/high_mean": 2.543192294979235e-06, + "clip_ratio/low_mean": 2.3897301389297354e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.644049368427659e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16014.0, + "completions/mean_length": 6762.40625, + "completions/mean_terminated_length": 6371.2841796875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 1.0496173724532127, + "epoch": 0.09475620975160993, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003109709592536092, + "learning_rate": 1e-05, + "loss": 0.0695, + "num_tokens": 80513135.0, + "reward": 0.296875, + "reward_std": 0.27274850010871887, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999967098236084, + "sampling/importance_sampling_ratio/min": 0.0036795397754758596, + "sampling/sampling_logp_difference/max": 5.6049675941467285, + "sampling/sampling_logp_difference/mean": 0.021886618807911873, + "step": 103 + }, + { + "clip_ratio/high_max": 1.0623295338518801e-05, + "clip_ratio/high_mean": 2.6558238346297003e-06, + "clip_ratio/low_mean": 5.0279177912671e-05, + "clip_ratio/low_min": 6.849113788121031e-06, + "clip_ratio/region_mean": 5.29350020315178e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 8151.421875, + "completions/mean_terminated_length": 7528.79052734375, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.8989155367016792, + "epoch": 0.09567617295308188, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0050065224058926105, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 81579941.0, + "reward": 0.375, + "reward_std": 0.36691081523895264, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999760389328003, + "sampling/importance_sampling_ratio/min": 0.0007560441154055297, + "sampling/sampling_logp_difference/max": 7.187410831451416, + "sampling/sampling_logp_difference/mean": 0.02017449401319027, + "step": 104 + }, + { + "clip_ratio/high_max": 7.662745701964013e-06, + "clip_ratio/high_mean": 1.9156864254910033e-06, + "clip_ratio/low_mean": 4.2927287609018094e-05, + "clip_ratio/low_min": 4.201963292871369e-06, + "clip_ratio/region_mean": 4.484297357976175e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 6871.7265625, + "completions/mean_terminated_length": 6643.43212890625, + "completions/min_length": 1044.0, + "completions/min_terminated_length": 1044.0, + "entropy": 1.006680078804493, + "epoch": 0.09659613615455381, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00352756236679852, + "learning_rate": 1e-05, + "loss": 0.0927, + "num_tokens": 82479474.0, + "reward": 0.3984375, + "reward_std": 0.33296146988868713, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000028133392334, + "sampling/importance_sampling_ratio/min": 0.023528963327407837, + "sampling/sampling_logp_difference/max": 3.749523162841797, + "sampling/sampling_logp_difference/mean": 0.021244853734970093, + "step": 105 + }, + { + "clip_ratio/high_max": 1.6621729173493804e-05, + "clip_ratio/high_mean": 5.544197733797773e-06, + "clip_ratio/low_mean": 2.3860119426899473e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9404316592263058e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14462.0, + "completions/max_terminated_length": 14462.0, + "completions/mean_length": 5705.6015625, + "completions/mean_terminated_length": 5705.6015625, + "completions/min_length": 802.0, + "completions/min_terminated_length": 802.0, + "entropy": 0.9162084609270096, + "epoch": 0.09751609935602576, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002348776441067457, + "learning_rate": 1e-05, + "loss": -0.0169, + "num_tokens": 83229071.0, + "reward": 0.4453125, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999898672103882, + "sampling/importance_sampling_ratio/min": 1.176890145870857e-05, + "sampling/sampling_logp_difference/max": 11.35004997253418, + "sampling/sampling_logp_difference/mean": 0.01885361783206463, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.359476631383586e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.359476631383586e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16110.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 6823.90625, + "completions/mean_terminated_length": 6823.90625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 1.0139815732836723, + "epoch": 0.0984360625574977, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005057404283434153, + "learning_rate": 1e-05, + "loss": 0.038, + "num_tokens": 84119947.0, + "reward": 0.328125, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.014701711013913155, + "sampling/sampling_logp_difference/max": 4.219791412353516, + "sampling/sampling_logp_difference/mean": 0.021600374951958656, + "step": 107 + }, + { + "clip_ratio/high_max": 1.642513325350592e-05, + "clip_ratio/high_mean": 4.10628331337648e-06, + "clip_ratio/low_mean": 3.813199691649061e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2238279775119736e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15516.0, + "completions/max_terminated_length": 15516.0, + "completions/mean_length": 5786.859375, + "completions/mean_terminated_length": 5786.859375, + "completions/min_length": 643.0, + "completions/min_terminated_length": 643.0, + "entropy": 1.0515320897102356, + "epoch": 0.09935602575896964, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.008517255075275898, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 84879833.0, + "reward": 0.3671875, + "reward_std": 0.3311441242694855, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999799728393555, + "sampling/importance_sampling_ratio/min": 0.00010231334454147145, + "sampling/sampling_logp_difference/max": 9.187470436096191, + "sampling/sampling_logp_difference/mean": 0.01993538998067379, + "step": 108 + }, + { + "clip_ratio/high_max": 7.0043706728029065e-06, + "clip_ratio/high_mean": 1.7510926682007266e-06, + "clip_ratio/low_mean": 1.4313530300569255e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.606462308245682e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15836.0, + "completions/mean_length": 4726.2578125, + "completions/mean_terminated_length": 4634.46435546875, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "entropy": 0.795353539288044, + "epoch": 0.10027598896044158, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034334585070610046, + "learning_rate": 1e-05, + "loss": 0.0214, + "num_tokens": 85503162.0, + "reward": 0.6015625, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.6015625, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000437498092651, + "sampling/importance_sampling_ratio/min": 0.0026589478366076946, + "sampling/sampling_logp_difference/max": 5.9298248291015625, + "sampling/sampling_logp_difference/mean": 0.018191032111644745, + "step": 109 + }, + { + "clip_ratio/high_max": 4.149239885009592e-06, + "clip_ratio/high_mean": 1.037309971252398e-06, + "clip_ratio/low_mean": 3.989860044839588e-05, + "clip_ratio/low_min": 4.927079316985328e-06, + "clip_ratio/region_mean": 4.093591041964828e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14514.0, + "completions/mean_length": 6450.140625, + "completions/mean_terminated_length": 5787.8837890625, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.8920315206050873, + "epoch": 0.10119595216191353, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006242698058485985, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 86350364.0, + "reward": 0.359375, + "reward_std": 0.27540695667266846, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999819993972778, + "sampling/importance_sampling_ratio/min": 0.00015162504860199988, + "sampling/sampling_logp_difference/max": 8.794099807739258, + "sampling/sampling_logp_difference/mean": 0.01948007568717003, + "step": 110 + }, + { + "clip_ratio/high_max": 4.065173015987966e-06, + "clip_ratio/high_mean": 1.8426849237584975e-06, + "clip_ratio/low_mean": 2.8560575628944207e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0403260552702704e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15253.0, + "completions/mean_length": 6597.9453125, + "completions/mean_terminated_length": 6442.611328125, + "completions/min_length": 657.0, + "completions/min_terminated_length": 657.0, + "entropy": 0.9351271465420723, + "epoch": 0.10211591536338546, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002594202058389783, + "learning_rate": 1e-05, + "loss": 0.018, + "num_tokens": 87213277.0, + "reward": 0.34375, + "reward_std": 0.3061561584472656, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998740553855896, + "sampling/importance_sampling_ratio/min": 0.007402713876217604, + "sampling/sampling_logp_difference/max": 4.905908584594727, + "sampling/sampling_logp_difference/mean": 0.02082553133368492, + "step": 111 + }, + { + "clip_ratio/high_max": 3.7528520806517918e-06, + "clip_ratio/high_mean": 9.382130201629479e-07, + "clip_ratio/low_mean": 4.297400278119312e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.391221568766923e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15955.0, + "completions/mean_length": 7109.9140625, + "completions/mean_terminated_length": 7036.8896484375, + "completions/min_length": 881.0, + "completions/min_terminated_length": 881.0, + "entropy": 0.8797949478030205, + "epoch": 0.10303587856485741, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002718541072681546, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 88144530.0, + "reward": 0.3984375, + "reward_std": 0.26485776901245117, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999973714351654, + "sampling/importance_sampling_ratio/min": 0.0002329955023014918, + "sampling/sampling_logp_difference/max": 8.36449146270752, + "sampling/sampling_logp_difference/mean": 0.01960277371108532, + "step": 112 + }, + { + "clip_ratio/high_max": 1.1254821401962545e-05, + "clip_ratio/high_mean": 2.813705350490636e-06, + "clip_ratio/low_mean": 4.423825043886609e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7051955789356725e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 7361.6796875, + "completions/mean_terminated_length": 6513.427734375, + "completions/min_length": 624.0, + "completions/min_terminated_length": 624.0, + "entropy": 0.9020541086792946, + "epoch": 0.10395584176632934, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003097688313573599, + "learning_rate": 1e-05, + "loss": 0.0854, + "num_tokens": 89109897.0, + "reward": 0.359375, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998917579650879, + "sampling/importance_sampling_ratio/min": 0.0010758653515949845, + "sampling/sampling_logp_difference/max": 6.834630012512207, + "sampling/sampling_logp_difference/mean": 0.01997425965964794, + "step": 113 + }, + { + "clip_ratio/high_max": 1.8235970401292434e-05, + "clip_ratio/high_mean": 5.248351158115838e-06, + "clip_ratio/low_mean": 7.228819413285237e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.753654563202872e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15608.0, + "completions/mean_length": 5957.28125, + "completions/mean_terminated_length": 5620.935546875, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "entropy": 0.8262394368648529, + "epoch": 0.10487580496780129, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0023438548669219017, + "learning_rate": 1e-05, + "loss": 0.0869, + "num_tokens": 89891429.0, + "reward": 0.421875, + "reward_std": 0.3713865876197815, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998185634613037, + "sampling/importance_sampling_ratio/min": 8.2406731962692e-06, + "sampling/sampling_logp_difference/max": 11.706428527832031, + "sampling/sampling_logp_difference/mean": 0.018976174294948578, + "step": 114 + }, + { + "clip_ratio/high_max": 1.6280149793601595e-05, + "clip_ratio/high_mean": 5.4644419833493885e-06, + "clip_ratio/low_mean": 5.1420432782833814e-05, + "clip_ratio/low_min": 6.1973228184797335e-06, + "clip_ratio/region_mean": 5.688487522093055e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 5405.53125, + "completions/mean_terminated_length": 5142.04833984375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.9246686547994614, + "epoch": 0.10579576816927323, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005619170609861612, + "learning_rate": 1e-05, + "loss": 0.0411, + "num_tokens": 90600721.0, + "reward": 0.421875, + "reward_std": 0.40821409225463867, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999438524246216, + "sampling/importance_sampling_ratio/min": 7.91921266340978e-08, + "sampling/sampling_logp_difference/max": 16.351388931274414, + "sampling/sampling_logp_difference/mean": 0.01931554079055786, + "step": 115 + }, + { + "clip_ratio/high_max": 9.228460612575873e-06, + "clip_ratio/high_mean": 2.307115153143968e-06, + "clip_ratio/low_mean": 3.463903834699522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.694615350013919e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 6754.859375, + "completions/mean_terminated_length": 6363.4306640625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.952000230550766, + "epoch": 0.10671573137074516, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006043895613402128, + "learning_rate": 1e-05, + "loss": 0.0379, + "num_tokens": 91486063.0, + "reward": 0.3125, + "reward_std": 0.2527858018875122, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 2.5612887384340866e-06, + "sampling/sampling_logp_difference/max": 12.875, + "sampling/sampling_logp_difference/mean": 0.02107170596718788, + "step": 116 + }, + { + "clip_ratio/high_max": 1.460792736907024e-05, + "clip_ratio/high_mean": 3.65198184226756e-06, + "clip_ratio/low_mean": 3.14642731495951e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.511625499186266e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16377.0, + "completions/mean_length": 8135.8203125, + "completions/mean_terminated_length": 7869.75, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "entropy": 1.0832853615283966, + "epoch": 0.10763569457221711, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00392121123149991, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 92546920.0, + "reward": 0.28125, + "reward_std": 0.2977413833141327, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999874830245972, + "sampling/importance_sampling_ratio/min": 3.757069134735502e-05, + "sampling/sampling_logp_difference/max": 10.189286231994629, + "sampling/sampling_logp_difference/mean": 0.02211480587720871, + "step": 117 + }, + { + "clip_ratio/high_max": 3.585687591112219e-05, + "clip_ratio/high_mean": 8.964218977780547e-06, + "clip_ratio/low_mean": 3.652223790595599e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.548645733848389e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15497.0, + "completions/mean_length": 5476.53125, + "completions/mean_terminated_length": 5214.75244140625, + "completions/min_length": 666.0, + "completions/min_terminated_length": 666.0, + "entropy": 1.0261689275503159, + "epoch": 0.10855565777368906, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00480870483443141, + "learning_rate": 1e-05, + "loss": 0.0378, + "num_tokens": 93270524.0, + "reward": 0.46875, + "reward_std": 0.3243093490600586, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000068187713623, + "sampling/importance_sampling_ratio/min": 0.02749602682888508, + "sampling/sampling_logp_difference/max": 3.5937137603759766, + "sampling/sampling_logp_difference/mean": 0.01990744285285473, + "step": 118 + }, + { + "clip_ratio/high_max": 5.126732958160574e-06, + "clip_ratio/high_mean": 1.2816832395401434e-06, + "clip_ratio/low_mean": 3.6732255466631614e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8013938819858595e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16209.0, + "completions/mean_length": 7670.0625, + "completions/mean_terminated_length": 7165.9501953125, + "completions/min_length": 964.0, + "completions/min_terminated_length": 964.0, + "entropy": 0.8719229996204376, + "epoch": 0.10947562097516099, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003411791054531932, + "learning_rate": 1e-05, + "loss": 0.0792, + "num_tokens": 94271404.0, + "reward": 0.4140625, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999955415725708, + "sampling/importance_sampling_ratio/min": 2.125909531969228e-06, + "sampling/sampling_logp_difference/max": 13.061310768127441, + "sampling/sampling_logp_difference/mean": 0.01960139349102974, + "step": 119 + }, + { + "clip_ratio/high_max": 3.2011115308705484e-05, + "clip_ratio/high_mean": 1.0189622685174982e-05, + "clip_ratio/low_mean": 3.3884271260831156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4073893604945624e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15284.0, + "completions/mean_length": 5499.0859375, + "completions/mean_terminated_length": 5413.3779296875, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "entropy": 0.8891193494200706, + "epoch": 0.11039558417663294, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036615384742617607, + "learning_rate": 1e-05, + "loss": 0.0852, + "num_tokens": 94998263.0, + "reward": 0.4296875, + "reward_std": 0.27776598930358887, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999586939811707, + "sampling/importance_sampling_ratio/min": 0.00029556488152593374, + "sampling/sampling_logp_difference/max": 8.126622200012207, + "sampling/sampling_logp_difference/mean": 0.01831059902906418, + "step": 120 + }, + { + "clip_ratio/high_max": 1.0020951322076144e-05, + "clip_ratio/high_mean": 2.505237830519036e-06, + "clip_ratio/low_mean": 3.4662164466681133e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.716740218351333e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 7831.1015625, + "completions/mean_terminated_length": 7410.466796875, + "completions/min_length": 855.0, + "completions/min_terminated_length": 855.0, + "entropy": 0.9511109218001366, + "epoch": 0.11131554737810488, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003688640194013715, + "learning_rate": 1e-05, + "loss": 0.0704, + "num_tokens": 96020572.0, + "reward": 0.34375, + "reward_std": 0.266974538564682, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000081062316895, + "sampling/importance_sampling_ratio/min": 0.0008284422219730914, + "sampling/sampling_logp_difference/max": 7.095963478088379, + "sampling/sampling_logp_difference/mean": 0.020766064524650574, + "step": 121 + }, + { + "clip_ratio/high_max": 4.31883336204919e-06, + "clip_ratio/high_mean": 1.0797083405122976e-06, + "clip_ratio/low_mean": 4.2512260733929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.359196918812813e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16267.0, + "completions/mean_length": 7928.5, + "completions/mean_terminated_length": 7584.7802734375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 1.053833745419979, + "epoch": 0.11223551057957681, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002527788048610091, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 97055892.0, + "reward": 0.2734375, + "reward_std": 0.22567617893218994, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999517560005188, + "sampling/importance_sampling_ratio/min": 8.097423233266454e-06, + "sampling/sampling_logp_difference/max": 11.72396469116211, + "sampling/sampling_logp_difference/mean": 0.02571871504187584, + "step": 122 + }, + { + "clip_ratio/high_max": 2.1440137970785145e-05, + "clip_ratio/high_mean": 5.360034492696286e-06, + "clip_ratio/low_mean": 5.3688914704252966e-05, + "clip_ratio/low_min": 1.0726187383625074e-05, + "clip_ratio/region_mean": 5.904894931063609e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15282.0, + "completions/mean_length": 7433.0, + "completions/mean_terminated_length": 7218.17626953125, + "completions/min_length": 1112.0, + "completions/min_terminated_length": 1112.0, + "entropy": 1.0001763850450516, + "epoch": 0.11315547378104876, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004057250916957855, + "learning_rate": 1e-05, + "loss": 0.0524, + "num_tokens": 98026604.0, + "reward": 0.3046875, + "reward_std": 0.30274903774261475, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999719858169556, + "sampling/importance_sampling_ratio/min": 0.0026400478091090918, + "sampling/sampling_logp_difference/max": 5.936958312988281, + "sampling/sampling_logp_difference/mean": 0.020892417058348656, + "step": 123 + }, + { + "clip_ratio/high_max": 7.200895424830378e-06, + "clip_ratio/high_mean": 1.8002238562075945e-06, + "clip_ratio/low_mean": 3.0267089357494115e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.206731355476222e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15435.0, + "completions/mean_length": 6529.8046875, + "completions/mean_terminated_length": 6211.92724609375, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 1.0204281583428383, + "epoch": 0.1140754369825207, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004367270041257143, + "learning_rate": 1e-05, + "loss": 0.0561, + "num_tokens": 98882667.0, + "reward": 0.421875, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999692440032959, + "sampling/importance_sampling_ratio/min": 0.00013984176621306688, + "sampling/sampling_logp_difference/max": 8.874999046325684, + "sampling/sampling_logp_difference/mean": 0.020555414259433746, + "step": 124 + }, + { + "clip_ratio/high_max": 3.583304760468309e-06, + "clip_ratio/high_mean": 8.958261901170772e-07, + "clip_ratio/low_mean": 3.819216192368913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.908798782958911e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 7329.9140625, + "completions/mean_terminated_length": 6806.12353515625, + "completions/min_length": 525.0, + "completions/min_terminated_length": 525.0, + "entropy": 0.8461082950234413, + "epoch": 0.11499540018399264, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014496444491669536, + "learning_rate": 1e-05, + "loss": 0.027, + "num_tokens": 99847384.0, + "reward": 0.375, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999719262123108, + "sampling/importance_sampling_ratio/min": 2.1445715901791118e-05, + "sampling/sampling_logp_difference/max": 10.749985694885254, + "sampling/sampling_logp_difference/mean": 0.019216356799006462, + "step": 125 + }, + { + "clip_ratio/high_max": 1.0716735232563224e-05, + "clip_ratio/high_mean": 2.679183808140806e-06, + "clip_ratio/low_mean": 3.4717084645308205e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7396268680822686e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15200.0, + "completions/mean_length": 6518.4765625, + "completions/mean_terminated_length": 6200.23388671875, + "completions/min_length": 969.0, + "completions/min_terminated_length": 969.0, + "entropy": 0.880072832107544, + "epoch": 0.11591536338546458, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006009541917592287, + "learning_rate": 1e-05, + "loss": 0.0475, + "num_tokens": 100699437.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999701976776123, + "sampling/importance_sampling_ratio/min": 6.729899905622005e-05, + "sampling/sampling_logp_difference/max": 9.606365203857422, + "sampling/sampling_logp_difference/mean": 0.01985173299908638, + "step": 126 + }, + { + "clip_ratio/high_max": 7.563064400528674e-06, + "clip_ratio/high_mean": 1.8907661001321685e-06, + "clip_ratio/low_mean": 3.8401355027417594e-05, + "clip_ratio/low_min": 3.4494178180466406e-06, + "clip_ratio/region_mean": 4.029212129808002e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16379.0, + "completions/mean_length": 8421.9296875, + "completions/mean_terminated_length": 8030.35205078125, + "completions/min_length": 1180.0, + "completions/min_terminated_length": 1180.0, + "entropy": 0.929582305252552, + "epoch": 0.11683532658693652, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00254544778726995, + "learning_rate": 1e-05, + "loss": 0.0562, + "num_tokens": 101797124.0, + "reward": 0.2890625, + "reward_std": 0.2603819966316223, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999698400497437, + "sampling/importance_sampling_ratio/min": 2.139152456948068e-05, + "sampling/sampling_logp_difference/max": 10.75251579284668, + "sampling/sampling_logp_difference/mean": 0.020804740488529205, + "step": 127 + }, + { + "clip_ratio/high_max": 8.503243861923693e-06, + "clip_ratio/high_mean": 2.125810965480923e-06, + "clip_ratio/low_mean": 3.5734614471039094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7860425095459505e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14871.0, + "completions/mean_length": 6452.5859375, + "completions/mean_terminated_length": 6214.232421875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.9258717745542526, + "epoch": 0.11775528978840846, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030309113208204508, + "learning_rate": 1e-05, + "loss": -0.0048, + "num_tokens": 102643751.0, + "reward": 0.4296875, + "reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 2.0162780856480822e-05, + "sampling/sampling_logp_difference/max": 10.81167221069336, + "sampling/sampling_logp_difference/mean": 0.02046305686235428, + "step": 128 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 102643751, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}