diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,11203 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.0093, + "eval_steps": 500, + "global_step": 465, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 741.390625, + "completions/mean_terminated_length": 741.390625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 0.19560225727036595, + "epoch": 2e-05, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6278125643730164, + "kl": 0.0, + "learning_rate": 0.0, + "loss": 0.0026, + "num_tokens": 102275.0, + "reward": -0.2660611569881439, + "reward_std": 9.006877899169922, + "rewards/rollout_reward_func/mean": -0.26606130599975586, + "rewards/rollout_reward_func/std": 10.133543014526367, + "sampling/importance_sampling_ratio/max": 1.4521965980529785, + "sampling/importance_sampling_ratio/mean": 1.0252978801727295, + "sampling/importance_sampling_ratio/min": 0.6192880272865295, + "sampling/sampling_logp_difference/max": 0.35935235023498535, + "sampling/sampling_logp_difference/mean": 0.013161457143723965, + "step": 1, + "step_time": 18.950907858999926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.19560225727036595, + "epoch": 4e-05, + "grad_norm": 0.6270994544029236, + "kl": 0.0, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.0026, + "step": 2, + "step_time": 6.845600487000297 + }, + { + "clip_ratio/high_max": 0.010416666977107525, + "clip_ratio/high_mean": 0.0026041667442768812, + "clip_ratio/low_mean": 0.0026041667442768812, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0052083334885537624, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 745.078125, + "completions/mean_terminated_length": 745.078125, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 0.1830942602828145, + "epoch": 6e-05, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6748153567314148, + "kl": 0.0004804102204616356, + "learning_rate": 5.7142857142857145e-06, + "loss": -0.0139, + "num_tokens": 204643.0, + "reward": 0.07987305521965027, + "reward_std": 6.112407207489014, + "rewards/rollout_reward_func/mean": 0.07987302541732788, + "rewards/rollout_reward_func/std": 6.9746317863464355, + "sampling/importance_sampling_ratio/max": 1.6137751340866089, + "sampling/importance_sampling_ratio/mean": 1.0131056308746338, + "sampling/importance_sampling_ratio/min": 0.5117371678352356, + "sampling/sampling_logp_difference/max": 0.6347737312316895, + "sampling/sampling_logp_difference/mean": 0.013132400810718536, + "step": 3, + "step_time": 20.457778603000065 + }, + { + "clip_ratio/high_max": 0.0062500000931322575, + "clip_ratio/high_mean": 0.0015625000232830644, + "clip_ratio/low_mean": 0.0013020833721384406, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002864583395421505, + "entropy": 0.18449228629469872, + "epoch": 8e-05, + "grad_norm": 0.7855743169784546, + "kl": 0.0004326992366259219, + "learning_rate": 8.571428571428573e-06, + "loss": -0.0127, + "step": 4, + "step_time": 7.153126219000001 + }, + { + "clip_ratio/high_max": 0.015625000465661287, + "clip_ratio/high_mean": 0.003906250116415322, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003906250116415322, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 773.3125, + "completions/mean_terminated_length": 773.3125, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "entropy": 0.19608404766768217, + "epoch": 0.0001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6154729723930359, + "kl": 0.0007404440693790093, + "learning_rate": 1.1428571428571429e-05, + "loss": -0.0267, + "num_tokens": 308926.0, + "reward": -2.357975721359253, + "reward_std": 5.998347282409668, + "rewards/rollout_reward_func/mean": -2.357975721359253, + "rewards/rollout_reward_func/std": 6.508192539215088, + "sampling/importance_sampling_ratio/max": 1.5696072578430176, + "sampling/importance_sampling_ratio/mean": 1.0018606185913086, + "sampling/importance_sampling_ratio/min": 0.6378414630889893, + "sampling/sampling_logp_difference/max": 0.4687232971191406, + "sampling/sampling_logp_difference/mean": 0.014497373253107071, + "step": 5, + "step_time": 21.077881563999767 + }, + { + "clip_ratio/high_max": 0.010416666977107525, + "clip_ratio/high_mean": 0.0026041667442768812, + "clip_ratio/low_mean": 0.0027225379599258304, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005326704704202712, + "entropy": 0.20008834172040224, + "epoch": 0.00012, + "grad_norm": 0.613211989402771, + "kl": 0.0017206422435265267, + "learning_rate": 1.4285714285714285e-05, + "loss": -0.0283, + "step": 6, + "step_time": 8.075609097999632 + }, + { + "clip_ratio/high_max": 0.0052083334885537624, + "clip_ratio/high_mean": 0.0013020833721384406, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013020833721384406, + "completions/clipped_ratio": 0.0, + "completions/max_length": 843.0, + "completions/max_terminated_length": 843.0, + "completions/mean_length": 754.3125, + "completions/mean_terminated_length": 754.3125, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "entropy": 0.21214309986680746, + "epoch": 0.00014, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5866758823394775, + "kl": 0.0038609652619925328, + "learning_rate": 1.7142857142857145e-05, + "loss": 0.0015, + "num_tokens": 413194.0, + "reward": -0.5192327499389648, + "reward_std": 8.747434616088867, + "rewards/rollout_reward_func/mean": -0.5192328095436096, + "rewards/rollout_reward_func/std": 9.696125030517578, + "sampling/importance_sampling_ratio/max": 1.3741450309753418, + "sampling/importance_sampling_ratio/mean": 0.988805890083313, + "sampling/importance_sampling_ratio/min": 0.6078794002532959, + "sampling/sampling_logp_difference/max": 0.25654804706573486, + "sampling/sampling_logp_difference/mean": 0.012450095266103745, + "step": 7, + "step_time": 21.18102895699974 + }, + { + "clip_ratio/high_max": 0.042140152771025896, + "clip_ratio/high_mean": 0.010535038192756474, + "clip_ratio/low_mean": 0.011718750349245965, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02225378854200244, + "entropy": 0.2212895406410098, + "epoch": 0.00016, + "grad_norm": 0.5727657675743103, + "kl": 0.01148045047011692, + "learning_rate": 2e-05, + "loss": 0.0002, + "step": 8, + "step_time": 8.206865795999875 + }, + { + "clip_ratio/high_max": 0.010416666977107525, + "clip_ratio/high_mean": 0.0026041667442768812, + "clip_ratio/low_mean": 0.004142992664128542, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006747159408405423, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 742.765625, + "completions/mean_terminated_length": 742.765625, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "entropy": 0.25475312024354935, + "epoch": 0.00018, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7087565064430237, + "kl": 0.03194100991822779, + "learning_rate": 2.2857142857142858e-05, + "loss": 0.0241, + "num_tokens": 516181.0, + "reward": -2.378840684890747, + "reward_std": 6.36100959777832, + "rewards/rollout_reward_func/mean": -2.378840446472168, + "rewards/rollout_reward_func/std": 7.315836429595947, + "sampling/importance_sampling_ratio/max": 1.6080894470214844, + "sampling/importance_sampling_ratio/mean": 1.0152499675750732, + "sampling/importance_sampling_ratio/min": 0.4359276592731476, + "sampling/sampling_logp_difference/max": 0.4399428367614746, + "sampling/sampling_logp_difference/mean": 0.028559193015098572, + "step": 9, + "step_time": 22.1539967839999 + }, + { + "clip_ratio/high_max": 0.04734848625957966, + "clip_ratio/high_mean": 0.013139204937033355, + "clip_ratio/low_mean": 0.007930871448479593, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.021070076385512948, + "entropy": 0.26416848599910736, + "epoch": 0.0002, + "grad_norm": 0.6573855876922607, + "kl": 0.03966027498245239, + "learning_rate": 2.5714285714285714e-05, + "loss": 0.021, + "step": 10, + "step_time": 7.0971175310000945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 750.75, + "completions/mean_terminated_length": 750.75, + "completions/min_length": 608.0, + "completions/min_terminated_length": 608.0, + "entropy": 0.2343001812696457, + "epoch": 0.00022, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.864137589931488, + "kl": 0.03614223480690271, + "learning_rate": 2.857142857142857e-05, + "loss": -0.018, + "num_tokens": 619774.0, + "reward": -1.144383430480957, + "reward_std": 9.403154373168945, + "rewards/rollout_reward_func/mean": -1.144383192062378, + "rewards/rollout_reward_func/std": 10.208455085754395, + "sampling/importance_sampling_ratio/max": 1.6737509965896606, + "sampling/importance_sampling_ratio/mean": 1.0005735158920288, + "sampling/importance_sampling_ratio/min": 0.5264889001846313, + "sampling/sampling_logp_difference/max": 0.7381381988525391, + "sampling/sampling_logp_difference/mean": 0.03099803999066353, + "step": 11, + "step_time": 24.1759815060002 + }, + { + "clip_ratio/high_max": 0.04261363763362169, + "clip_ratio/high_mean": 0.011955492780543864, + "clip_ratio/low_mean": 0.018229166977107525, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.030184659990482032, + "entropy": 0.23695118725299835, + "epoch": 0.00024, + "grad_norm": 0.5675711631774902, + "kl": 0.05231437139445916, + "learning_rate": 3.142857142857143e-05, + "loss": -0.0247, + "step": 12, + "step_time": 7.241505567000331 + }, + { + "clip_ratio/high_max": 0.0052083334885537624, + "clip_ratio/high_mean": 0.0013020833721384406, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013020833721384406, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 766.03125, + "completions/mean_terminated_length": 766.03125, + "completions/min_length": 638.0, + "completions/min_terminated_length": 638.0, + "entropy": 0.23159058205783367, + "epoch": 0.00026, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.560762345790863, + "kl": 0.10253941919654608, + "learning_rate": 3.428571428571429e-05, + "loss": -0.0132, + "num_tokens": 725286.0, + "reward": 0.9126645922660828, + "reward_std": 8.317488670349121, + "rewards/rollout_reward_func/mean": 0.9126646518707275, + "rewards/rollout_reward_func/std": 9.508187294006348, + "sampling/importance_sampling_ratio/max": 1.4912891387939453, + "sampling/importance_sampling_ratio/mean": 0.9157562255859375, + "sampling/importance_sampling_ratio/min": 0.15846048295497894, + "sampling/sampling_logp_difference/max": 0.9116353988647461, + "sampling/sampling_logp_difference/mean": 0.03342486917972565, + "step": 13, + "step_time": 24.797564555999315 + }, + { + "clip_ratio/high_max": 0.02083333395421505, + "clip_ratio/high_mean": 0.0052083334885537624, + "clip_ratio/low_mean": 0.04107481171377003, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046283145202323794, + "entropy": 0.21706843469291925, + "epoch": 0.00028, + "grad_norm": 0.737306535243988, + "kl": 0.20574123412370682, + "learning_rate": 3.7142857142857143e-05, + "loss": -0.0141, + "step": 14, + "step_time": 8.782559869000124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.002864583395421505, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002864583395421505, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 730.265625, + "completions/mean_terminated_length": 730.265625, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "entropy": 0.20442467741668224, + "epoch": 0.0003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7728816866874695, + "kl": 0.10564538510516286, + "learning_rate": 4e-05, + "loss": 0.0314, + "num_tokens": 827749.0, + "reward": -1.6128692626953125, + "reward_std": 6.231240272521973, + "rewards/rollout_reward_func/mean": -1.6128690242767334, + "rewards/rollout_reward_func/std": 6.545647621154785, + "sampling/importance_sampling_ratio/max": 1.7540509700775146, + "sampling/importance_sampling_ratio/mean": 1.0142356157302856, + "sampling/importance_sampling_ratio/min": 0.45990973711013794, + "sampling/sampling_logp_difference/max": 0.7248215675354004, + "sampling/sampling_logp_difference/mean": 0.030622530728578568, + "step": 15, + "step_time": 24.38517581200017 + }, + { + "clip_ratio/high_max": 0.047821971122175455, + "clip_ratio/high_mean": 0.013257576036266983, + "clip_ratio/low_mean": 0.025236743036657572, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0384943193057552, + "entropy": 0.19760818500071764, + "epoch": 0.00032, + "grad_norm": 0.6611685752868652, + "kl": 0.11387888877652586, + "learning_rate": 4.2857142857142856e-05, + "loss": 0.0262, + "step": 16, + "step_time": 7.110903799999505 + }, + { + "clip_ratio/high_max": 0.0052083334885537624, + "clip_ratio/high_mean": 0.0013020833721384406, + "clip_ratio/low_mean": 0.0013020833721384406, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026041667442768812, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 760.359375, + "completions/mean_terminated_length": 760.359375, + "completions/min_length": 659.0, + "completions/min_terminated_length": 659.0, + "entropy": 0.19120646081864834, + "epoch": 0.00034, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0170923471450806, + "kl": 0.08101693401113153, + "learning_rate": 4.5714285714285716e-05, + "loss": -0.015, + "num_tokens": 931841.0, + "reward": -1.6879972219467163, + "reward_std": 9.023077011108398, + "rewards/rollout_reward_func/mean": -1.6879971027374268, + "rewards/rollout_reward_func/std": 10.298378944396973, + "sampling/importance_sampling_ratio/max": 2.430154800415039, + "sampling/importance_sampling_ratio/mean": 1.065093755722046, + "sampling/importance_sampling_ratio/min": 0.6535128951072693, + "sampling/sampling_logp_difference/max": 0.7661471366882324, + "sampling/sampling_logp_difference/mean": 0.024486079812049866, + "step": 17, + "step_time": 27.987481355 + }, + { + "clip_ratio/high_max": 0.043560607358813286, + "clip_ratio/high_mean": 0.016335227992385626, + "clip_ratio/low_mean": 0.01846590987406671, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03480113763362169, + "entropy": 0.19553834106773138, + "epoch": 0.00036, + "grad_norm": 0.5111234784126282, + "kl": 0.088710677344352, + "learning_rate": 4.8571428571428576e-05, + "loss": -0.0206, + "step": 18, + "step_time": 7.182192339999801 + }, + { + "clip_ratio/high_max": 0.010416666977107525, + "clip_ratio/high_mean": 0.0026041667442768812, + "clip_ratio/low_mean": 0.0013020833721384406, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003906250116415322, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 733.640625, + "completions/mean_terminated_length": 733.640625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 0.1935133864171803, + "epoch": 0.00038, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7022229433059692, + "kl": 0.1404350029770285, + "learning_rate": 5.142857142857143e-05, + "loss": -0.0003, + "num_tokens": 1033723.0, + "reward": -1.2022110223770142, + "reward_std": 10.956363677978516, + "rewards/rollout_reward_func/mean": -1.2022109031677246, + "rewards/rollout_reward_func/std": 12.292625427246094, + "sampling/importance_sampling_ratio/max": 1.6157236099243164, + "sampling/importance_sampling_ratio/mean": 0.9594892263412476, + "sampling/importance_sampling_ratio/min": 0.3754613697528839, + "sampling/sampling_logp_difference/max": 0.9176025390625, + "sampling/sampling_logp_difference/mean": 0.028035998344421387, + "step": 19, + "step_time": 27.688288005999993 + }, + { + "clip_ratio/high_max": 0.04876894084736705, + "clip_ratio/high_mean": 0.012192235211841762, + "clip_ratio/low_mean": 0.018584280740469694, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0307765161851421, + "entropy": 0.20130611211061478, + "epoch": 0.0004, + "grad_norm": 0.4695027768611908, + "kl": 0.18750765593722463, + "learning_rate": 5.428571428571428e-05, + "loss": -0.0054, + "step": 20, + "step_time": 7.739605327000618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0014204545877873898, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014204545877873898, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 737.9375, + "completions/mean_terminated_length": 737.9375, + "completions/min_length": 618.0, + "completions/min_terminated_length": 618.0, + "entropy": 0.18132759165018797, + "epoch": 0.00042, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1652212142944336, + "kl": 0.13582510640844703, + "learning_rate": 5.714285714285714e-05, + "loss": 0.0262, + "num_tokens": 1135968.0, + "reward": -0.28913062810897827, + "reward_std": 7.3008809089660645, + "rewards/rollout_reward_func/mean": -0.28913065791130066, + "rewards/rollout_reward_func/std": 7.988962650299072, + "sampling/importance_sampling_ratio/max": 2.336996555328369, + "sampling/importance_sampling_ratio/mean": 1.0362560749053955, + "sampling/importance_sampling_ratio/min": 0.6398296356201172, + "sampling/sampling_logp_difference/max": 0.6417920589447021, + "sampling/sampling_logp_difference/mean": 0.022837379947304726, + "step": 21, + "step_time": 28.57662482000046 + }, + { + "clip_ratio/high_max": 0.02083333395421505, + "clip_ratio/high_mean": 0.006510416860692203, + "clip_ratio/low_mean": 0.025386679684743285, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03189709666185081, + "entropy": 0.1763849752023816, + "epoch": 0.00044, + "grad_norm": 0.3849461078643799, + "kl": 0.16632835287600756, + "learning_rate": 6e-05, + "loss": 0.0212, + "step": 22, + "step_time": 8.287740409000207 + }, + { + "clip_ratio/high_max": 0.010416666977107525, + "clip_ratio/high_mean": 0.0026041667442768812, + "clip_ratio/low_mean": 0.0014204545877873898, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004024621332064271, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 723.875, + "completions/mean_terminated_length": 723.875, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "entropy": 0.1840990763157606, + "epoch": 0.00046, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.860249936580658, + "kl": 0.25097968662157655, + "learning_rate": 6.285714285714286e-05, + "loss": 0.0286, + "num_tokens": 1237057.0, + "reward": 0.4839830696582794, + "reward_std": 10.420938491821289, + "rewards/rollout_reward_func/mean": 0.4839830994606018, + "rewards/rollout_reward_func/std": 11.429144859313965, + "sampling/importance_sampling_ratio/max": 2.106267213821411, + "sampling/importance_sampling_ratio/mean": 1.0313048362731934, + "sampling/importance_sampling_ratio/min": 0.574251651763916, + "sampling/sampling_logp_difference/max": 0.8508915901184082, + "sampling/sampling_logp_difference/mean": 0.02066868171095848, + "step": 23, + "step_time": 28.494462327999827 + }, + { + "clip_ratio/high_max": 0.06818181974813342, + "clip_ratio/high_mean": 0.021070076152682304, + "clip_ratio/low_mean": 0.018347538076341152, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03941761387977749, + "entropy": 0.19043638091534376, + "epoch": 0.00048, + "grad_norm": 0.6448091864585876, + "kl": 0.35418248968198895, + "learning_rate": 6.571428571428571e-05, + "loss": 0.0215, + "step": 24, + "step_time": 7.416647947999536 + }, + { + "clip_ratio/high_max": 0.0052083334885537624, + "clip_ratio/high_mean": 0.0013020833721384406, + "clip_ratio/low_mean": 0.004024621332064271, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005326704704202712, + "completions/clipped_ratio": 0.0, + "completions/max_length": 829.0, + "completions/max_terminated_length": 829.0, + "completions/mean_length": 731.0625, + "completions/mean_terminated_length": 731.0625, + "completions/min_length": 615.0, + "completions/min_terminated_length": 615.0, + "entropy": 0.1908296812325716, + "epoch": 0.0005, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.66495680809021, + "kl": 0.21043909061700106, + "learning_rate": 6.857142857142858e-05, + "loss": -0.0275, + "num_tokens": 1337760.0, + "reward": 0.9224299788475037, + "reward_std": 10.655890464782715, + "rewards/rollout_reward_func/mean": 0.9224300384521484, + "rewards/rollout_reward_func/std": 12.821269989013672, + "sampling/importance_sampling_ratio/max": 1.5019664764404297, + "sampling/importance_sampling_ratio/mean": 1.0262192487716675, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.9519531726837158, + "sampling/sampling_logp_difference/mean": 0.018259627744555473, + "step": 25, + "step_time": 29.797745564000707 + }, + { + "clip_ratio/high_max": 0.05823863809928298, + "clip_ratio/high_mean": 0.017163826269097626, + "clip_ratio/low_mean": 0.024147727992385626, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.041311554377898574, + "entropy": 0.1913931304588914, + "epoch": 0.00052, + "grad_norm": 0.5575593709945679, + "kl": 0.26408666698262095, + "learning_rate": 7.142857142857143e-05, + "loss": -0.0322, + "step": 26, + "step_time": 7.109563219000847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 731.640625, + "completions/mean_terminated_length": 731.640625, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "entropy": 0.19422233663499355, + "epoch": 0.00054, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6210283637046814, + "kl": 0.21635392913594842, + "learning_rate": 7.428571428571429e-05, + "loss": -0.0185, + "num_tokens": 1439214.0, + "reward": 0.326141357421875, + "reward_std": 13.388666152954102, + "rewards/rollout_reward_func/mean": 0.32614123821258545, + "rewards/rollout_reward_func/std": 14.97364616394043, + "sampling/importance_sampling_ratio/max": 1.5914506912231445, + "sampling/importance_sampling_ratio/mean": 1.0221253633499146, + "sampling/importance_sampling_ratio/min": 0.7667937874794006, + "sampling/sampling_logp_difference/max": 0.37548696994781494, + "sampling/sampling_logp_difference/mean": 0.012905368581414223, + "step": 27, + "step_time": 28.513997486000562 + }, + { + "clip_ratio/high_max": 0.05255681974813342, + "clip_ratio/high_mean": 0.01574337179772556, + "clip_ratio/low_mean": 0.01661931863054633, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.032362690777517855, + "entropy": 0.1939925504848361, + "epoch": 0.00056, + "grad_norm": 0.2964678406715393, + "kl": 0.22840850101783872, + "learning_rate": 7.714285714285715e-05, + "loss": -0.0252, + "step": 28, + "step_time": 8.46359607699992 + }, + { + "clip_ratio/high_max": 0.010890151839703321, + "clip_ratio/high_mean": 0.0027225379599258304, + "clip_ratio/low_mean": 0.0013020833721384406, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004024621448479593, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 714.359375, + "completions/mean_terminated_length": 714.359375, + "completions/min_length": 503.0, + "completions/min_terminated_length": 503.0, + "entropy": 0.1717732958495617, + "epoch": 0.00058, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3590966761112213, + "kl": 0.24717363435775042, + "learning_rate": 8e-05, + "loss": 0.0036, + "num_tokens": 1539055.0, + "reward": 1.930895447731018, + "reward_std": 8.148633003234863, + "rewards/rollout_reward_func/mean": 1.930895447731018, + "rewards/rollout_reward_func/std": 9.020356178283691, + "sampling/importance_sampling_ratio/max": 1.6024476289749146, + "sampling/importance_sampling_ratio/mean": 1.0161041021347046, + "sampling/importance_sampling_ratio/min": 0.7807760238647461, + "sampling/sampling_logp_difference/max": 0.35602256655693054, + "sampling/sampling_logp_difference/mean": 0.011149970814585686, + "step": 29, + "step_time": 28.064759372000253 + }, + { + "clip_ratio/high_max": 0.027083334047347307, + "clip_ratio/high_mean": 0.006770833511836827, + "clip_ratio/low_mean": 0.029711175127886236, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03648200852330774, + "entropy": 0.16414203867316246, + "epoch": 0.0006, + "grad_norm": 0.38951048254966736, + "kl": 0.28005583630874753, + "learning_rate": 8.285714285714287e-05, + "loss": 0.0013, + "step": 30, + "step_time": 7.401456857000312 + }, + { + "clip_ratio/high_max": 0.0052083334885537624, + "clip_ratio/high_mean": 0.0013020833721384406, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013020833721384406, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 708.046875, + "completions/mean_terminated_length": 708.046875, + "completions/min_length": 475.0, + "completions/min_terminated_length": 475.0, + "entropy": 0.16439654119312763, + "epoch": 0.00062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5445168614387512, + "kl": 0.2800124539062381, + "learning_rate": 8.571428571428571e-05, + "loss": 0.0097, + "num_tokens": 1638113.0, + "reward": 0.29781579971313477, + "reward_std": 10.009416580200195, + "rewards/rollout_reward_func/mean": 0.29781582951545715, + "rewards/rollout_reward_func/std": 11.176705360412598, + "sampling/importance_sampling_ratio/max": 1.755067229270935, + "sampling/importance_sampling_ratio/mean": 1.0180511474609375, + "sampling/importance_sampling_ratio/min": 0.580125629901886, + "sampling/sampling_logp_difference/max": 0.5197739601135254, + "sampling/sampling_logp_difference/mean": 0.013791397213935852, + "step": 31, + "step_time": 30.773730244999797 + }, + { + "clip_ratio/high_max": 0.03645833395421505, + "clip_ratio/high_mean": 0.013139204704202712, + "clip_ratio/low_mean": 0.03042140242177993, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04356060677673668, + "entropy": 0.15187342395074666, + "epoch": 0.00064, + "grad_norm": 0.30164626240730286, + "kl": 0.32055927254259586, + "learning_rate": 8.857142857142857e-05, + "loss": 0.0037, + "step": 32, + "step_time": 7.328695028999618 + }, + { + "clip_ratio/high_max": 0.0052083334885537624, + "clip_ratio/high_mean": 0.0013020833721384406, + "clip_ratio/low_mean": 0.0027225379599258304, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004024621332064271, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 701.5, + "completions/mean_terminated_length": 701.5, + "completions/min_length": 502.0, + "completions/min_terminated_length": 502.0, + "entropy": 0.1332990936934948, + "epoch": 0.00066, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43104609847068787, + "kl": 0.32164820563048124, + "learning_rate": 9.142857142857143e-05, + "loss": 0.0025, + "num_tokens": 1738075.0, + "reward": 3.1038765907287598, + "reward_std": 11.951395988464355, + "rewards/rollout_reward_func/mean": 3.1038765907287598, + "rewards/rollout_reward_func/std": 12.847871780395508, + "sampling/importance_sampling_ratio/max": 1.3508435487747192, + "sampling/importance_sampling_ratio/mean": 0.9952214360237122, + "sampling/importance_sampling_ratio/min": 0.6407750844955444, + "sampling/sampling_logp_difference/max": 0.47523796558380127, + "sampling/sampling_logp_difference/mean": 0.013571259565651417, + "step": 33, + "step_time": 27.829260915000077 + }, + { + "clip_ratio/high_max": 0.03219697065651417, + "clip_ratio/high_mean": 0.010653409408405423, + "clip_ratio/low_mean": 0.029000947601161897, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03965435700956732, + "entropy": 0.12380115175619721, + "epoch": 0.00068, + "grad_norm": 0.27367016673088074, + "kl": 0.423783166334033, + "learning_rate": 9.428571428571429e-05, + "loss": -0.0, + "step": 34, + "step_time": 7.799126809999507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 687.4375, + "completions/mean_terminated_length": 687.4375, + "completions/min_length": 618.0, + "completions/min_terminated_length": 618.0, + "entropy": 0.10798696288838983, + "epoch": 0.0007, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4482150673866272, + "kl": 0.3214763030409813, + "learning_rate": 9.714285714285715e-05, + "loss": 0.028, + "num_tokens": 1836043.0, + "reward": 3.037400960922241, + "reward_std": 12.985002517700195, + "rewards/rollout_reward_func/mean": 3.037400960922241, + "rewards/rollout_reward_func/std": 13.425616264343262, + "sampling/importance_sampling_ratio/max": 1.4862518310546875, + "sampling/importance_sampling_ratio/mean": 1.0146703720092773, + "sampling/importance_sampling_ratio/min": 0.5140225291252136, + "sampling/sampling_logp_difference/max": 0.8002816438674927, + "sampling/sampling_logp_difference/mean": 0.01363956555724144, + "step": 35, + "step_time": 28.662696071000028 + }, + { + "clip_ratio/high_max": 0.042140152771025896, + "clip_ratio/high_mean": 0.010535038192756474, + "clip_ratio/low_mean": 0.014441288309171796, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.024976326269097626, + "entropy": 0.1118780323304236, + "epoch": 0.00072, + "grad_norm": 0.1983855962753296, + "kl": 0.373223016038537, + "learning_rate": 0.0001, + "loss": 0.0232, + "step": 36, + "step_time": 8.269840026000338 + }, + { + "clip_ratio/high_max": 0.010416666977107525, + "clip_ratio/high_mean": 0.0026041667442768812, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026041667442768812, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 679.3125, + "completions/mean_terminated_length": 679.3125, + "completions/min_length": 449.0, + "completions/min_terminated_length": 449.0, + "entropy": 0.12342227855697274, + "epoch": 0.00074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6195780634880066, + "kl": 0.45714515913277864, + "learning_rate": 9.999999998148153e-05, + "loss": -0.0249, + "num_tokens": 1932947.0, + "reward": 3.72019362449646, + "reward_std": 11.354637145996094, + "rewards/rollout_reward_func/mean": 3.720193862915039, + "rewards/rollout_reward_func/std": 11.66490650177002, + "sampling/importance_sampling_ratio/max": 2.1260557174682617, + "sampling/importance_sampling_ratio/mean": 1.049971580505371, + "sampling/importance_sampling_ratio/min": 0.6164436340332031, + "sampling/sampling_logp_difference/max": 0.5450749397277832, + "sampling/sampling_logp_difference/mean": 0.01501537300646305, + "step": 37, + "step_time": 27.480367904999866 + }, + { + "clip_ratio/high_max": 0.05303030414506793, + "clip_ratio/high_mean": 0.014678030624054372, + "clip_ratio/low_mean": 0.014322917209938169, + "clip_ratio/low_min": 0.0052083334885537624, + "clip_ratio/region_mean": 0.029000947950407863, + "entropy": 0.13006606698036194, + "epoch": 0.00076, + "grad_norm": 0.2681926488876343, + "kl": 0.4847450293600559, + "learning_rate": 9.999999992592612e-05, + "loss": -0.0318, + "step": 38, + "step_time": 7.225284665000345 + }, + { + "clip_ratio/high_max": 0.010416666977107525, + "clip_ratio/high_mean": 0.0026041667442768812, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026041667442768812, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 700.09375, + "completions/mean_terminated_length": 700.09375, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "entropy": 0.15452369069680572, + "epoch": 0.00078, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4834868311882019, + "kl": 0.4672291334718466, + "learning_rate": 9.999999983333379e-05, + "loss": -0.0162, + "num_tokens": 2032280.0, + "reward": 5.62964391708374, + "reward_std": 9.88559341430664, + "rewards/rollout_reward_func/mean": 5.629644393920898, + "rewards/rollout_reward_func/std": 12.693258285522461, + "sampling/importance_sampling_ratio/max": 1.5066994428634644, + "sampling/importance_sampling_ratio/mean": 1.0094711780548096, + "sampling/importance_sampling_ratio/min": 0.6512829065322876, + "sampling/sampling_logp_difference/max": 0.4918508529663086, + "sampling/sampling_logp_difference/mean": 0.01460680365562439, + "step": 39, + "step_time": 30.803230847000123 + }, + { + "clip_ratio/high_max": 0.05823863809928298, + "clip_ratio/high_mean": 0.01976799312978983, + "clip_ratio/low_mean": 0.02734375069849193, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04711174394469708, + "entropy": 0.14933442790061235, + "epoch": 0.0008, + "grad_norm": 0.34873443841934204, + "kl": 0.5781354140490294, + "learning_rate": 9.99999997037045e-05, + "loss": -0.0203, + "step": 40, + "step_time": 7.3111222899999575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0028409091755747795, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0028409091755747795, + "completions/clipped_ratio": 0.0, + "completions/max_length": 829.0, + "completions/max_terminated_length": 829.0, + "completions/mean_length": 686.59375, + "completions/mean_terminated_length": 686.59375, + "completions/min_length": 615.0, + "completions/min_terminated_length": 615.0, + "entropy": 0.15176831698045135, + "epoch": 0.00082, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4504550099372864, + "kl": 0.6600655419752002, + "learning_rate": 9.999999953703829e-05, + "loss": -0.0185, + "num_tokens": 2130497.0, + "reward": 2.0073609352111816, + "reward_std": 8.8825044631958, + "rewards/rollout_reward_func/mean": 2.0073609352111816, + "rewards/rollout_reward_func/std": 9.321340560913086, + "sampling/importance_sampling_ratio/max": 1.5246989727020264, + "sampling/importance_sampling_ratio/mean": 1.0359078645706177, + "sampling/importance_sampling_ratio/min": 0.3844473361968994, + "sampling/sampling_logp_difference/max": 0.955810546875, + "sampling/sampling_logp_difference/mean": 0.012838078662753105, + "step": 41, + "step_time": 28.587795755999878 + }, + { + "clip_ratio/high_max": 0.03172348579391837, + "clip_ratio/high_mean": 0.009232954820618033, + "clip_ratio/low_mean": 0.022608901956118643, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03184185642749071, + "entropy": 0.14899979438632727, + "epoch": 0.00084, + "grad_norm": 2.304894208908081, + "kl": 1.5326191950589418, + "learning_rate": 9.999999933333512e-05, + "loss": -0.0201, + "step": 42, + "step_time": 8.04831712999976 + }, + { + "clip_ratio/high_max": 0.005681818351149559, + "clip_ratio/high_mean": 0.0014204545877873898, + "clip_ratio/low_mean": 0.0026041667442768812, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004024621332064271, + "completions/clipped_ratio": 0.0, + "completions/max_length": 825.0, + "completions/max_terminated_length": 825.0, + "completions/mean_length": 685.40625, + "completions/mean_terminated_length": 685.40625, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "entropy": 0.1393027831800282, + "epoch": 0.00086, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5649179816246033, + "kl": 0.7229090742766857, + "learning_rate": 9.999999909259503e-05, + "loss": -0.017, + "num_tokens": 2228288.0, + "reward": 1.6912901401519775, + "reward_std": 10.596427917480469, + "rewards/rollout_reward_func/mean": 1.691290020942688, + "rewards/rollout_reward_func/std": 12.0145263671875, + "sampling/importance_sampling_ratio/max": 1.3425889015197754, + "sampling/importance_sampling_ratio/mean": 0.9553788304328918, + "sampling/importance_sampling_ratio/min": 0.5974801778793335, + "sampling/sampling_logp_difference/max": 0.34511590003967285, + "sampling/sampling_logp_difference/mean": 0.01251951139420271, + "step": 43, + "step_time": 27.475775691999615 + }, + { + "clip_ratio/high_max": 0.04829545598477125, + "clip_ratio/high_mean": 0.015980114112608135, + "clip_ratio/low_mean": 0.024053030996583402, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.040033145574852824, + "entropy": 0.14617095375433564, + "epoch": 0.00088, + "grad_norm": 0.3445337116718292, + "kl": 0.5654929745942354, + "learning_rate": 9.9999998814818e-05, + "loss": -0.023, + "step": 44, + "step_time": 7.598477493999553 + }, + { + "clip_ratio/high_max": 0.0052083334885537624, + "clip_ratio/high_mean": 0.0013020833721384406, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013020833721384406, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 713.671875, + "completions/mean_terminated_length": 713.671875, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.14390681218355894, + "epoch": 0.0009, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45263612270355225, + "kl": 0.6904484182596207, + "learning_rate": 9.999999850000404e-05, + "loss": -0.005, + "num_tokens": 2328132.0, + "reward": 2.4324169158935547, + "reward_std": 13.961143493652344, + "rewards/rollout_reward_func/mean": 2.4324169158935547, + "rewards/rollout_reward_func/std": 14.438629150390625, + "sampling/importance_sampling_ratio/max": 1.3720179796218872, + "sampling/importance_sampling_ratio/mean": 1.00229012966156, + "sampling/importance_sampling_ratio/min": 0.6608520746231079, + "sampling/sampling_logp_difference/max": 0.301973819732666, + "sampling/sampling_logp_difference/mean": 0.010271631181240082, + "step": 45, + "step_time": 28.995988180999802 + }, + { + "clip_ratio/high_max": 0.026041667442768812, + "clip_ratio/high_mean": 0.006510416860692203, + "clip_ratio/low_mean": 0.02043087175115943, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.026941288728266954, + "entropy": 0.1375666274689138, + "epoch": 0.00092, + "grad_norm": 0.3008887469768524, + "kl": 0.6632084101438522, + "learning_rate": 9.999999814815312e-05, + "loss": -0.0106, + "step": 46, + "step_time": 7.42895066499932 + }, + { + "clip_ratio/high_max": 0.010416666977107525, + "clip_ratio/high_mean": 0.0026041667442768812, + "clip_ratio/low_mean": 0.0027225379599258304, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005326704704202712, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 698.640625, + "completions/mean_terminated_length": 698.640625, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "entropy": 0.14624580927193165, + "epoch": 0.00094, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36133161187171936, + "kl": 0.5184649843722582, + "learning_rate": 9.99999977592653e-05, + "loss": -0.0129, + "num_tokens": 2426521.0, + "reward": 1.375571846961975, + "reward_std": 11.66879940032959, + "rewards/rollout_reward_func/mean": 1.3755717277526855, + "rewards/rollout_reward_func/std": 11.796045303344727, + "sampling/importance_sampling_ratio/max": 1.8656487464904785, + "sampling/importance_sampling_ratio/mean": 1.0228910446166992, + "sampling/importance_sampling_ratio/min": 0.505867063999176, + "sampling/sampling_logp_difference/max": 0.6223084926605225, + "sampling/sampling_logp_difference/mean": 0.011709067039191723, + "step": 47, + "step_time": 29.763493531999984 + }, + { + "clip_ratio/high_max": 0.03172348579391837, + "clip_ratio/high_mean": 0.007930871448479593, + "clip_ratio/low_mean": 0.02568655402865261, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03361742536071688, + "entropy": 0.14968854701146483, + "epoch": 0.00096, + "grad_norm": 0.17635680735111237, + "kl": 0.5038973540067673, + "learning_rate": 9.999999733334051e-05, + "loss": -0.0167, + "step": 48, + "step_time": 7.652514348999603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0027225379599258304, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0027225379599258304, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 704.453125, + "completions/mean_terminated_length": 704.453125, + "completions/min_length": 635.0, + "completions/min_terminated_length": 635.0, + "entropy": 0.14840606460347772, + "epoch": 0.00098, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5855311751365662, + "kl": 0.5907826572656631, + "learning_rate": 9.99999968703788e-05, + "loss": 0.0381, + "num_tokens": 2526069.0, + "reward": 4.523091793060303, + "reward_std": 11.536006927490234, + "rewards/rollout_reward_func/mean": 4.523091793060303, + "rewards/rollout_reward_func/std": 12.290811538696289, + "sampling/importance_sampling_ratio/max": 2.122157573699951, + "sampling/importance_sampling_ratio/mean": 1.0083321332931519, + "sampling/importance_sampling_ratio/min": 0.6556381583213806, + "sampling/sampling_logp_difference/max": 0.5623667240142822, + "sampling/sampling_logp_difference/mean": 0.012646196410059929, + "step": 49, + "step_time": 27.48595007899985 + }, + { + "clip_ratio/high_max": 0.05445075919851661, + "clip_ratio/high_mean": 0.017518940148875117, + "clip_ratio/low_mean": 0.03401988744735718, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.051538827014155686, + "entropy": 0.14183657616376877, + "epoch": 0.001, + "grad_norm": 0.3584051728248596, + "kl": 0.5096510350704193, + "learning_rate": 9.999999637038015e-05, + "loss": 0.0365, + "step": 50, + "step_time": 9.165422230000104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0027225379599258304, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0027225379599258304, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 692.375, + "completions/mean_terminated_length": 692.375, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 0.14247119799256325, + "epoch": 0.00102, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.49646589159965515, + "kl": 0.4570716666057706, + "learning_rate": 9.999999583334457e-05, + "loss": -0.0101, + "num_tokens": 2623145.0, + "reward": 4.133634567260742, + "reward_std": 10.326797485351562, + "rewards/rollout_reward_func/mean": 4.133634567260742, + "rewards/rollout_reward_func/std": 10.82159423828125, + "sampling/importance_sampling_ratio/max": 1.6070019006729126, + "sampling/importance_sampling_ratio/mean": 0.996033787727356, + "sampling/importance_sampling_ratio/min": 0.5886021852493286, + "sampling/sampling_logp_difference/max": 0.543494701385498, + "sampling/sampling_logp_difference/mean": 0.010751021094620228, + "step": 51, + "step_time": 28.26161867099927 + }, + { + "clip_ratio/high_max": 0.04829545598477125, + "clip_ratio/high_mean": 0.013375947251915932, + "clip_ratio/low_mean": 0.02781723579391837, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0411931830458343, + "entropy": 0.13112169969826937, + "epoch": 0.00104, + "grad_norm": 0.34045207500457764, + "kl": 0.5393304694443941, + "learning_rate": 9.999999525927207e-05, + "loss": -0.016, + "step": 52, + "step_time": 6.901260032999289 + }, + { + "clip_ratio/high_max": 0.0052083334885537624, + "clip_ratio/high_mean": 0.0013020833721384406, + "clip_ratio/low_mean": 0.0013020833721384406, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026041667442768812, + "completions/clipped_ratio": 0.0, + "completions/max_length": 825.0, + "completions/max_terminated_length": 825.0, + "completions/mean_length": 691.875, + "completions/mean_terminated_length": 691.875, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "entropy": 0.11744047561660409, + "epoch": 0.00106, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3921552300453186, + "kl": 0.42071591690182686, + "learning_rate": 9.999999464816261e-05, + "loss": 0.0037, + "num_tokens": 2721107.0, + "reward": 4.605119705200195, + "reward_std": 12.441184997558594, + "rewards/rollout_reward_func/mean": 4.605119228363037, + "rewards/rollout_reward_func/std": 14.067066192626953, + "sampling/importance_sampling_ratio/max": 1.3290151357650757, + "sampling/importance_sampling_ratio/mean": 0.9739052057266235, + "sampling/importance_sampling_ratio/min": 0.38011765480041504, + "sampling/sampling_logp_difference/max": 0.929356575012207, + "sampling/sampling_logp_difference/mean": 0.010732135735452175, + "step": 53, + "step_time": 30.069013398000834 + }, + { + "clip_ratio/high_max": 0.02651515230536461, + "clip_ratio/high_mean": 0.006628788076341152, + "clip_ratio/low_mean": 0.022904830053448677, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.029533618362620473, + "entropy": 0.10777218686416745, + "epoch": 0.00108, + "grad_norm": 0.23905742168426514, + "kl": 0.5194222312420607, + "learning_rate": 9.999999400001624e-05, + "loss": 0.002, + "step": 54, + "step_time": 7.081706939000014 + }, + { + "clip_ratio/high_max": 0.0052083334885537624, + "clip_ratio/high_mean": 0.0013020833721384406, + "clip_ratio/low_mean": 0.0014204545877873898, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0027225379599258304, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 702.796875, + "completions/mean_terminated_length": 702.796875, + "completions/min_length": 614.0, + "completions/min_terminated_length": 614.0, + "entropy": 0.11523706745356321, + "epoch": 0.0011, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.500625491142273, + "kl": 0.5581346470862627, + "learning_rate": 9.999999331483292e-05, + "loss": -0.0203, + "num_tokens": 2818643.0, + "reward": 3.496170997619629, + "reward_std": 14.47857666015625, + "rewards/rollout_reward_func/mean": 3.496170997619629, + "rewards/rollout_reward_func/std": 14.920737266540527, + "sampling/importance_sampling_ratio/max": 1.5530641078948975, + "sampling/importance_sampling_ratio/mean": 1.0201001167297363, + "sampling/importance_sampling_ratio/min": 0.5336768627166748, + "sampling/sampling_logp_difference/max": 0.6660118103027344, + "sampling/sampling_logp_difference/mean": 0.013495232909917831, + "step": 55, + "step_time": 28.81458637300034 + }, + { + "clip_ratio/high_max": 0.036931819282472134, + "clip_ratio/high_mean": 0.011837121681310236, + "clip_ratio/low_mean": 0.02758049312978983, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03941761504393071, + "entropy": 0.10888301394879818, + "epoch": 0.00112, + "grad_norm": 0.29490140080451965, + "kl": 0.5603756010532379, + "learning_rate": 9.999999259261268e-05, + "loss": -0.0253, + "step": 56, + "step_time": 8.193311973000164 + }, + { + "clip_ratio/high_max": 0.010416666977107525, + "clip_ratio/high_mean": 0.0026041667442768812, + "clip_ratio/low_mean": 0.0026041667442768812, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0052083334885537624, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 693.90625, + "completions/mean_terminated_length": 693.90625, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "entropy": 0.12224696017801762, + "epoch": 0.00114, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4371468722820282, + "kl": 0.5304271820932627, + "learning_rate": 9.99999918333555e-05, + "loss": 0.0189, + "num_tokens": 2916279.0, + "reward": 3.36903715133667, + "reward_std": 12.011173248291016, + "rewards/rollout_reward_func/mean": 3.369036912918091, + "rewards/rollout_reward_func/std": 12.399989128112793, + "sampling/importance_sampling_ratio/max": 1.8561766147613525, + "sampling/importance_sampling_ratio/mean": 1.0033948421478271, + "sampling/importance_sampling_ratio/min": 0.3815801441669464, + "sampling/sampling_logp_difference/max": 0.957763671875, + "sampling/sampling_logp_difference/mean": 0.011768012307584286, + "step": 57, + "step_time": 28.027659202000223 + }, + { + "clip_ratio/high_max": 0.05255681974813342, + "clip_ratio/high_mean": 0.015743371564894915, + "clip_ratio/low_mean": 0.019767993013374507, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0355113644618541, + "entropy": 0.12679382599890232, + "epoch": 0.00116, + "grad_norm": 0.3022422790527344, + "kl": 0.5225307196378708, + "learning_rate": 9.999999103706142e-05, + "loss": 0.015, + "step": 58, + "step_time": 8.72335070799977 + }, + { + "clip_ratio/high_max": 0.0052083334885537624, + "clip_ratio/high_mean": 0.0013020833721384406, + "clip_ratio/low_mean": 0.0026041667442768812, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003906250116415322, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 681.265625, + "completions/mean_terminated_length": 681.265625, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.12399047752842307, + "epoch": 0.00118, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6583297848701477, + "kl": 0.5364211667329073, + "learning_rate": 9.999999020373037e-05, + "loss": 0.0117, + "num_tokens": 3012934.0, + "reward": 2.8170366287231445, + "reward_std": 12.926514625549316, + "rewards/rollout_reward_func/mean": 2.8170366287231445, + "rewards/rollout_reward_func/std": 13.227665901184082, + "sampling/importance_sampling_ratio/max": 2.4036612510681152, + "sampling/importance_sampling_ratio/mean": 0.9975829720497131, + "sampling/importance_sampling_ratio/min": 0.6259334683418274, + "sampling/sampling_logp_difference/max": 0.720775842666626, + "sampling/sampling_logp_difference/mean": 0.010457618162035942, + "step": 59, + "step_time": 28.9596313010004 + }, + { + "clip_ratio/high_max": 0.0416666679084301, + "clip_ratio/high_mean": 0.011718750349245965, + "clip_ratio/low_mean": 0.03385416732635349, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04557291779201478, + "entropy": 0.11637644609436393, + "epoch": 0.0012, + "grad_norm": 1.9307663440704346, + "kl": 1.8184253200888634, + "learning_rate": 9.999998933336241e-05, + "loss": 0.0213, + "step": 60, + "step_time": 7.307322721999981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0014204545877873898, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014204545877873898, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 685.78125, + "completions/mean_terminated_length": 685.78125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.11613691644743085, + "epoch": 0.00122, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44358986616134644, + "kl": 0.5193471424281597, + "learning_rate": 9.999998842595753e-05, + "loss": -0.0024, + "num_tokens": 3109806.0, + "reward": 4.651793479919434, + "reward_std": 12.063810348510742, + "rewards/rollout_reward_func/mean": 4.651793479919434, + "rewards/rollout_reward_func/std": 12.754688262939453, + "sampling/importance_sampling_ratio/max": 1.6620776653289795, + "sampling/importance_sampling_ratio/mean": 0.9981948137283325, + "sampling/importance_sampling_ratio/min": 0.6313586831092834, + "sampling/sampling_logp_difference/max": 0.4394187927246094, + "sampling/sampling_logp_difference/mean": 0.009169764816761017, + "step": 61, + "step_time": 30.805930039000714 + }, + { + "clip_ratio/high_max": 0.026988637167960405, + "clip_ratio/high_mean": 0.01065340917557478, + "clip_ratio/low_mean": 0.020951705169864, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03160511434543878, + "entropy": 0.12027787417173386, + "epoch": 0.00124, + "grad_norm": 0.3839333951473236, + "kl": 0.5386558780446649, + "learning_rate": 9.999998748151572e-05, + "loss": -0.0001, + "step": 62, + "step_time": 7.061834261000513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0013020833721384406, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013020833721384406, + "completions/clipped_ratio": 0.0, + "completions/max_length": 807.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 694.46875, + "completions/mean_terminated_length": 694.46875, + "completions/min_length": 465.0, + "completions/min_terminated_length": 465.0, + "entropy": 0.13281571818515658, + "epoch": 0.00126, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47250673174858093, + "kl": 0.5621049534529448, + "learning_rate": 9.999998650003696e-05, + "loss": -0.0068, + "num_tokens": 3207160.0, + "reward": 4.072500705718994, + "reward_std": 12.934675216674805, + "rewards/rollout_reward_func/mean": 4.072500705718994, + "rewards/rollout_reward_func/std": 13.5437650680542, + "sampling/importance_sampling_ratio/max": 1.4505815505981445, + "sampling/importance_sampling_ratio/mean": 1.0127054452896118, + "sampling/importance_sampling_ratio/min": 0.644386887550354, + "sampling/sampling_logp_difference/max": 0.46297478675842285, + "sampling/sampling_logp_difference/mean": 0.01112096942961216, + "step": 63, + "step_time": 27.765410665000445 + }, + { + "clip_ratio/high_max": 0.03645833441987634, + "clip_ratio/high_mean": 0.010416666977107525, + "clip_ratio/low_mean": 0.025236743153072894, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03565340966451913, + "entropy": 0.11304981098510325, + "epoch": 0.00128, + "grad_norm": 0.23461361229419708, + "kl": 0.707372922450304, + "learning_rate": 9.999998548152131e-05, + "loss": -0.0107, + "step": 64, + "step_time": 9.65409977299987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0013020833721384406, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013020833721384406, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 715.78125, + "completions/mean_terminated_length": 715.78125, + "completions/min_length": 618.0, + "completions/min_terminated_length": 618.0, + "entropy": 0.11744949175044894, + "epoch": 0.0013, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5356135368347168, + "kl": 2.483667228370905, + "learning_rate": 9.999998442596872e-05, + "loss": 0.0155, + "num_tokens": 3305784.0, + "reward": 3.657202959060669, + "reward_std": 10.959955215454102, + "rewards/rollout_reward_func/mean": 3.657203197479248, + "rewards/rollout_reward_func/std": 12.17599105834961, + "sampling/importance_sampling_ratio/max": 1.36454439163208, + "sampling/importance_sampling_ratio/mean": 1.0064573287963867, + "sampling/importance_sampling_ratio/min": 0.6259024739265442, + "sampling/sampling_logp_difference/max": 0.4463231563568115, + "sampling/sampling_logp_difference/mean": 0.008584607392549515, + "step": 65, + "step_time": 29.066453170999694 + }, + { + "clip_ratio/high_max": 0.031250000931322575, + "clip_ratio/high_mean": 0.010416666860692203, + "clip_ratio/low_mean": 0.02178030402865261, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.032196971122175455, + "entropy": 0.14252985129132867, + "epoch": 0.00132, + "grad_norm": 0.2563531696796417, + "kl": 0.6245546955615282, + "learning_rate": 9.999998333337922e-05, + "loss": -0.0004, + "step": 66, + "step_time": 8.02741467600049 + }, + { + "clip_ratio/high_max": 0.005681818351149559, + "clip_ratio/high_mean": 0.0014204545877873898, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014204545877873898, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 678.890625, + "completions/mean_terminated_length": 678.890625, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 0.15250376611948013, + "epoch": 0.00134, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5203387141227722, + "kl": 0.695558762177825, + "learning_rate": 9.999998220375278e-05, + "loss": -0.0145, + "num_tokens": 3401864.0, + "reward": 1.054423451423645, + "reward_std": 11.31953239440918, + "rewards/rollout_reward_func/mean": 1.054423451423645, + "rewards/rollout_reward_func/std": 12.172701835632324, + "sampling/importance_sampling_ratio/max": 1.2194569110870361, + "sampling/importance_sampling_ratio/mean": 0.9876125454902649, + "sampling/importance_sampling_ratio/min": 0.550414502620697, + "sampling/sampling_logp_difference/max": 0.5297477841377258, + "sampling/sampling_logp_difference/mean": 0.008570928126573563, + "step": 67, + "step_time": 30.060426205000795 + }, + { + "clip_ratio/high_max": 0.05965909268707037, + "clip_ratio/high_mean": 0.018939394736662507, + "clip_ratio/low_mean": 0.03338068269658834, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05232007708400488, + "entropy": 0.17216729745268822, + "epoch": 0.00136, + "grad_norm": 0.27647653222084045, + "kl": 0.6303851045668125, + "learning_rate": 9.999998103708944e-05, + "loss": -0.0169, + "step": 68, + "step_time": 7.55158718300072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0015625000232830644, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015625000232830644, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 692.171875, + "completions/mean_terminated_length": 692.171875, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "entropy": 0.17608004808425903, + "epoch": 0.00138, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.40714362263679504, + "kl": 0.5837149824947119, + "learning_rate": 9.999997983338918e-05, + "loss": 0.0075, + "num_tokens": 3498494.0, + "reward": 4.154041290283203, + "reward_std": 15.997432708740234, + "rewards/rollout_reward_func/mean": 4.154041290283203, + "rewards/rollout_reward_func/std": 18.081926345825195, + "sampling/importance_sampling_ratio/max": 1.240838885307312, + "sampling/importance_sampling_ratio/mean": 0.9964578747749329, + "sampling/importance_sampling_ratio/min": 0.756720781326294, + "sampling/sampling_logp_difference/max": 0.326712965965271, + "sampling/sampling_logp_difference/mean": 0.009719014167785645, + "step": 69, + "step_time": 28.760111235000295 + }, + { + "clip_ratio/high_max": 0.04450757708400488, + "clip_ratio/high_mean": 0.013731061248108745, + "clip_ratio/low_mean": 0.016698232851922512, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.030429294100031257, + "entropy": 0.1826375536620617, + "epoch": 0.0014, + "grad_norm": 0.4711000919342041, + "kl": 0.5743975602090359, + "learning_rate": 9.999997859265198e-05, + "loss": 0.0045, + "step": 70, + "step_time": 8.15129353600014 + }, + { + "clip_ratio/high_max": 0.0052083334885537624, + "clip_ratio/high_mean": 0.0013020833721384406, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013020833721384406, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 695.796875, + "completions/mean_terminated_length": 695.796875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "entropy": 0.20647307951003313, + "epoch": 0.00142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3142479956150055, + "kl": 0.5285101179033518, + "learning_rate": 9.999997731487787e-05, + "loss": -0.0177, + "num_tokens": 3595387.0, + "reward": 2.402292490005493, + "reward_std": 13.013188362121582, + "rewards/rollout_reward_func/mean": 2.402292251586914, + "rewards/rollout_reward_func/std": 13.636407852172852, + "sampling/importance_sampling_ratio/max": 1.3384240865707397, + "sampling/importance_sampling_ratio/mean": 1.011613368988037, + "sampling/importance_sampling_ratio/min": 0.776378870010376, + "sampling/sampling_logp_difference/max": 0.2462749481201172, + "sampling/sampling_logp_difference/mean": 0.009866164065897465, + "step": 71, + "step_time": 28.09002854999926 + }, + { + "clip_ratio/high_max": 0.043560607358813286, + "clip_ratio/high_mean": 0.013494318816810846, + "clip_ratio/low_mean": 0.012428977759554982, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.025923296343535185, + "entropy": 0.20543431770056486, + "epoch": 0.00144, + "grad_norm": 0.23259234428405762, + "kl": 0.5239685252308846, + "learning_rate": 9.999997600006685e-05, + "loss": -0.0218, + "step": 72, + "step_time": 8.672955195000668 + }, + { + "clip_ratio/high_max": 0.010416666977107525, + "clip_ratio/high_mean": 0.0026041667442768812, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026041667442768812, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 704.984375, + "completions/mean_terminated_length": 704.984375, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "entropy": 0.20190842729061842, + "epoch": 0.00146, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3707256019115448, + "kl": 0.5400361772626638, + "learning_rate": 9.999997464821892e-05, + "loss": 0.006, + "num_tokens": 3692772.0, + "reward": 2.049668073654175, + "reward_std": 15.488001823425293, + "rewards/rollout_reward_func/mean": 2.049668073654175, + "rewards/rollout_reward_func/std": 15.380194664001465, + "sampling/importance_sampling_ratio/max": 1.1559480428695679, + "sampling/importance_sampling_ratio/mean": 0.970598578453064, + "sampling/importance_sampling_ratio/min": 0.6524748802185059, + "sampling/sampling_logp_difference/max": 0.35463929176330566, + "sampling/sampling_logp_difference/mean": 0.009403295814990997, + "step": 73, + "step_time": 28.82742140900018 + }, + { + "clip_ratio/high_max": 0.06818182021379471, + "clip_ratio/high_mean": 0.018347538425587118, + "clip_ratio/low_mean": 0.02402935700956732, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04237689543515444, + "entropy": 0.2009204039350152, + "epoch": 0.00148, + "grad_norm": 0.2297271341085434, + "kl": 0.5404210295528173, + "learning_rate": 9.999997325933408e-05, + "loss": 0.001, + "step": 74, + "step_time": 7.489119195001422 + }, + { + "clip_ratio/high_max": 0.010416666977107525, + "clip_ratio/high_mean": 0.0026041667442768812, + "clip_ratio/low_mean": 0.0015625000232830644, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004166666767559946, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 688.1875, + "completions/mean_terminated_length": 688.1875, + "completions/min_length": 607.0, + "completions/min_terminated_length": 607.0, + "entropy": 0.19008919596672058, + "epoch": 0.0015, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39288049936294556, + "kl": 0.5065996870398521, + "learning_rate": 9.999997183341232e-05, + "loss": -0.0174, + "num_tokens": 3789251.0, + "reward": 5.712855339050293, + "reward_std": 12.491518020629883, + "rewards/rollout_reward_func/mean": 5.712855339050293, + "rewards/rollout_reward_func/std": 13.803718566894531, + "sampling/importance_sampling_ratio/max": 1.3862897157669067, + "sampling/importance_sampling_ratio/mean": 0.9820230007171631, + "sampling/importance_sampling_ratio/min": 0.7251328825950623, + "sampling/sampling_logp_difference/max": 0.38344359397888184, + "sampling/sampling_logp_difference/mean": 0.011255129240453243, + "step": 75, + "step_time": 29.904527067999425 + }, + { + "clip_ratio/high_max": 0.0416666679084301, + "clip_ratio/high_mean": 0.014322916977107525, + "clip_ratio/low_mean": 0.03125000069849193, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04557291744276881, + "entropy": 0.1766198892146349, + "epoch": 0.00152, + "grad_norm": 0.24856555461883545, + "kl": 0.5580815225839615, + "learning_rate": 9.999997037045364e-05, + "loss": -0.0236, + "step": 76, + "step_time": 7.936869918999946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 829.0, + "completions/max_terminated_length": 829.0, + "completions/mean_length": 704.921875, + "completions/mean_terminated_length": 704.921875, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 0.1587599003687501, + "epoch": 0.00154, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5212773680686951, + "kl": 0.5196739248931408, + "learning_rate": 9.999996887045807e-05, + "loss": -0.0035, + "num_tokens": 3886377.0, + "reward": 4.441685199737549, + "reward_std": 10.929279327392578, + "rewards/rollout_reward_func/mean": 4.441685676574707, + "rewards/rollout_reward_func/std": 12.737987518310547, + "sampling/importance_sampling_ratio/max": 1.4177803993225098, + "sampling/importance_sampling_ratio/mean": 0.9960745573043823, + "sampling/importance_sampling_ratio/min": 0.6403241157531738, + "sampling/sampling_logp_difference/max": 0.35891127586364746, + "sampling/sampling_logp_difference/mean": 0.009403642266988754, + "step": 77, + "step_time": 29.23442492700042 + }, + { + "clip_ratio/high_max": 0.03787878900766373, + "clip_ratio/high_mean": 0.009469697251915932, + "clip_ratio/low_mean": 0.02260890230536461, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03207859944086522, + "entropy": 0.14319165889173746, + "epoch": 0.00156, + "grad_norm": 0.21223606169223785, + "kl": 0.6083459779620171, + "learning_rate": 9.999996733342559e-05, + "loss": -0.0046, + "step": 78, + "step_time": 9.08587798599865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0052083334885537624, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0052083334885537624, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 699.296875, + "completions/mean_terminated_length": 699.296875, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 0.13262670719996095, + "epoch": 0.00158, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3686180114746094, + "kl": 0.5813394356518984, + "learning_rate": 9.99999657593562e-05, + "loss": 0.0239, + "num_tokens": 3983088.0, + "reward": 4.565882682800293, + "reward_std": 10.690776824951172, + "rewards/rollout_reward_func/mean": 4.565882682800293, + "rewards/rollout_reward_func/std": 10.94388484954834, + "sampling/importance_sampling_ratio/max": 2.301131010055542, + "sampling/importance_sampling_ratio/mean": 1.038649559020996, + "sampling/importance_sampling_ratio/min": 0.6781718730926514, + "sampling/sampling_logp_difference/max": 0.7350552082061768, + "sampling/sampling_logp_difference/mean": 0.009047108702361584, + "step": 79, + "step_time": 29.03609049600027 + }, + { + "clip_ratio/high_max": 0.0691287899389863, + "clip_ratio/high_mean": 0.02249053120613098, + "clip_ratio/low_mean": 0.017282197484746575, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03977272880729288, + "entropy": 0.1365647497586906, + "epoch": 0.0016, + "grad_norm": 0.26296547055244446, + "kl": 0.5871373657137156, + "learning_rate": 9.99999641482499e-05, + "loss": 0.0196, + "step": 80, + "step_time": 8.78226529199901 + }, + { + "clip_ratio/high_max": 0.0052083334885537624, + "clip_ratio/high_mean": 0.0013020833721384406, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013020833721384406, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 704.28125, + "completions/mean_terminated_length": 704.28125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.13410852942615747, + "epoch": 0.00162, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6391101479530334, + "kl": 0.49469868279993534, + "learning_rate": 9.999996250010672e-05, + "loss": -0.0038, + "num_tokens": 4080648.0, + "reward": 5.768660545349121, + "reward_std": 10.985546112060547, + "rewards/rollout_reward_func/mean": 5.768660068511963, + "rewards/rollout_reward_func/std": 11.962743759155273, + "sampling/importance_sampling_ratio/max": 1.4244225025177002, + "sampling/importance_sampling_ratio/mean": 1.0141850709915161, + "sampling/importance_sampling_ratio/min": 0.6568657755851746, + "sampling/sampling_logp_difference/max": 0.3986610174179077, + "sampling/sampling_logp_difference/mean": 0.009017249569296837, + "step": 81, + "step_time": 29.058386802999394 + }, + { + "clip_ratio/high_max": 0.06250000186264515, + "clip_ratio/high_mean": 0.016927083721384406, + "clip_ratio/low_mean": 0.025213068933226168, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04214015288744122, + "entropy": 0.1406740453094244, + "epoch": 0.00164, + "grad_norm": 0.3618878424167633, + "kl": 0.5326054207980633, + "learning_rate": 9.99999608149266e-05, + "loss": -0.0092, + "step": 82, + "step_time": 7.4923771910011965 + }, + { + "clip_ratio/high_max": 0.0052083334885537624, + "clip_ratio/high_mean": 0.0013020833721384406, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013020833721384406, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 690.765625, + "completions/mean_terminated_length": 690.765625, + "completions/min_length": 619.0, + "completions/min_terminated_length": 619.0, + "entropy": 0.13702308759093285, + "epoch": 0.00166, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7054314017295837, + "kl": 0.5327184200286865, + "learning_rate": 9.999995909270962e-05, + "loss": 0.0131, + "num_tokens": 4176944.0, + "reward": 6.398343563079834, + "reward_std": 12.486600875854492, + "rewards/rollout_reward_func/mean": 6.398343563079834, + "rewards/rollout_reward_func/std": 13.118927955627441, + "sampling/importance_sampling_ratio/max": 1.1626012325286865, + "sampling/importance_sampling_ratio/mean": 0.9923787117004395, + "sampling/importance_sampling_ratio/min": 0.6767197847366333, + "sampling/sampling_logp_difference/max": 0.27681541442871094, + "sampling/sampling_logp_difference/mean": 0.007814774289727211, + "step": 83, + "step_time": 30.334892443000626 + }, + { + "clip_ratio/high_max": 0.052083334885537624, + "clip_ratio/high_mean": 0.014322917093522847, + "clip_ratio/low_mean": 0.02935606148093939, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04367897834163159, + "entropy": 0.13890184368938208, + "epoch": 0.00168, + "grad_norm": 0.23295485973358154, + "kl": 0.583111148327589, + "learning_rate": 9.999995733345573e-05, + "loss": 0.0096, + "step": 84, + "step_time": 8.188888645999668 + }, + { + "clip_ratio/high_max": 0.005681818351149559, + "clip_ratio/high_mean": 0.0014204545877873898, + "clip_ratio/low_mean": 0.0013020833721384406, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0027225379599258304, + "completions/clipped_ratio": 0.0, + "completions/max_length": 847.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 709.140625, + "completions/mean_terminated_length": 709.140625, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "entropy": 0.1646800385788083, + "epoch": 0.0017, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.721032977104187, + "kl": 0.5625268556177616, + "learning_rate": 9.999995553716494e-05, + "loss": -0.003, + "num_tokens": 4273965.0, + "reward": 5.8386735916137695, + "reward_std": 13.300103187561035, + "rewards/rollout_reward_func/mean": 5.8386735916137695, + "rewards/rollout_reward_func/std": 13.629975318908691, + "sampling/importance_sampling_ratio/max": 1.314743995666504, + "sampling/importance_sampling_ratio/mean": 1.0051491260528564, + "sampling/importance_sampling_ratio/min": 0.7047513127326965, + "sampling/sampling_logp_difference/max": 0.2584061622619629, + "sampling/sampling_logp_difference/mean": 0.009669218212366104, + "step": 85, + "step_time": 28.418585942000846 + }, + { + "clip_ratio/high_max": 0.06912878947332501, + "clip_ratio/high_mean": 0.019886364112608135, + "clip_ratio/low_mean": 0.04139046813361347, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.061276831780560315, + "entropy": 0.16476231161504984, + "epoch": 0.00172, + "grad_norm": 0.3757534921169281, + "kl": 0.6113345008343458, + "learning_rate": 9.999995370383726e-05, + "loss": -0.0069, + "step": 86, + "step_time": 8.756163650000417 + }, + { + "clip_ratio/high_max": 0.011363636702299118, + "clip_ratio/high_mean": 0.0028409091755747795, + "clip_ratio/low_mean": 0.0013020833721384406, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00414299254771322, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 684.84375, + "completions/mean_terminated_length": 684.84375, + "completions/min_length": 619.0, + "completions/min_terminated_length": 619.0, + "entropy": 0.15467680245637894, + "epoch": 0.00174, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39151129126548767, + "kl": 0.5371765866875648, + "learning_rate": 9.999995183347267e-05, + "loss": 0.0105, + "num_tokens": 4369299.0, + "reward": 5.963912010192871, + "reward_std": 12.684013366699219, + "rewards/rollout_reward_func/mean": 5.963912010192871, + "rewards/rollout_reward_func/std": 13.017167091369629, + "sampling/importance_sampling_ratio/max": 1.2570720911026, + "sampling/importance_sampling_ratio/mean": 1.0000150203704834, + "sampling/importance_sampling_ratio/min": 0.6576955914497375, + "sampling/sampling_logp_difference/max": 0.23494529724121094, + "sampling/sampling_logp_difference/mean": 0.009037286043167114, + "step": 87, + "step_time": 27.959497561998433 + }, + { + "clip_ratio/high_max": 0.04876894038170576, + "clip_ratio/high_mean": 0.014796401956118643, + "clip_ratio/low_mean": 0.030184660223312676, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04498106241226196, + "entropy": 0.1502314694225788, + "epoch": 0.00176, + "grad_norm": 0.24433566629886627, + "kl": 0.518398828804493, + "learning_rate": 9.999994992607121e-05, + "loss": 0.0052, + "step": 88, + "step_time": 6.98385682199978 + }, + { + "clip_ratio/high_max": 0.0052083334885537624, + "clip_ratio/high_mean": 0.0013020833721384406, + "clip_ratio/low_mean": 0.0013020833721384406, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026041667442768812, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 674.84375, + "completions/mean_terminated_length": 674.84375, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "entropy": 0.1636304627172649, + "epoch": 0.00178, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4070056080818176, + "kl": 0.44829913787543774, + "learning_rate": 9.999994798163285e-05, + "loss": 0.0028, + "num_tokens": 4464636.0, + "reward": 4.596271991729736, + "reward_std": 12.002615928649902, + "rewards/rollout_reward_func/mean": 4.5962724685668945, + "rewards/rollout_reward_func/std": 12.03700065612793, + "sampling/importance_sampling_ratio/max": 1.8310773372650146, + "sampling/importance_sampling_ratio/mean": 1.0015285015106201, + "sampling/importance_sampling_ratio/min": 0.6802361011505127, + "sampling/sampling_logp_difference/max": 0.63387131690979, + "sampling/sampling_logp_difference/mean": 0.01002519205212593, + "step": 89, + "step_time": 29.20652451100068 + }, + { + "clip_ratio/high_max": 0.053503789473325014, + "clip_ratio/high_mean": 0.014678030740469694, + "clip_ratio/low_mean": 0.014914773171767592, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02959280402865261, + "entropy": 0.16931697819381952, + "epoch": 0.0018, + "grad_norm": 0.22567316889762878, + "kl": 0.45854073390364647, + "learning_rate": 9.999994600015763e-05, + "loss": -0.0044, + "step": 90, + "step_time": 7.806028198999684 + }, + { + "clip_ratio/high_max": 0.010416666977107525, + "clip_ratio/high_mean": 0.0026041667442768812, + "clip_ratio/low_mean": 0.0013020833721384406, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003906250116415322, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 690.921875, + "completions/mean_terminated_length": 690.921875, + "completions/min_length": 573.0, + "completions/min_terminated_length": 573.0, + "entropy": 0.17555938381701708, + "epoch": 0.00182, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6369052529335022, + "kl": 0.5130380634218454, + "learning_rate": 9.99999439816455e-05, + "loss": 0.0097, + "num_tokens": 4560466.0, + "reward": 4.266380786895752, + "reward_std": 8.932316780090332, + "rewards/rollout_reward_func/mean": 4.266380786895752, + "rewards/rollout_reward_func/std": 9.506205558776855, + "sampling/importance_sampling_ratio/max": 1.4317197799682617, + "sampling/importance_sampling_ratio/mean": 0.9800074100494385, + "sampling/importance_sampling_ratio/min": 0.6640469431877136, + "sampling/sampling_logp_difference/max": 0.39695852994918823, + "sampling/sampling_logp_difference/mean": 0.011851027607917786, + "step": 91, + "step_time": 30.03264877599986 + }, + { + "clip_ratio/high_max": 0.07812500186264515, + "clip_ratio/high_mean": 0.027343750349245965, + "clip_ratio/low_mean": 0.02604166732635349, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05338541814126074, + "entropy": 0.16226398199796677, + "epoch": 0.00184, + "grad_norm": 0.49021783471107483, + "kl": 0.678026232868433, + "learning_rate": 9.999994192609649e-05, + "loss": 0.0008, + "step": 92, + "step_time": 9.04058756600034 + }, + { + "clip_ratio/high_max": 0.0052083334885537624, + "clip_ratio/high_mean": 0.0013020833721384406, + "clip_ratio/low_mean": 0.0026041667442768812, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003906250116415322, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 706.40625, + "completions/mean_terminated_length": 706.40625, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "entropy": 0.1653224742040038, + "epoch": 0.00186, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5930750966072083, + "kl": 0.533378497697413, + "learning_rate": 9.999993983351059e-05, + "loss": 0.0049, + "num_tokens": 4657400.0, + "reward": 4.687631607055664, + "reward_std": 12.176762580871582, + "rewards/rollout_reward_func/mean": 4.687631607055664, + "rewards/rollout_reward_func/std": 13.946465492248535, + "sampling/importance_sampling_ratio/max": 2.0640549659729004, + "sampling/importance_sampling_ratio/mean": 1.0510772466659546, + "sampling/importance_sampling_ratio/min": 0.6677830219268799, + "sampling/sampling_logp_difference/max": 0.5909380912780762, + "sampling/sampling_logp_difference/mean": 0.01191171444952488, + "step": 93, + "step_time": 28.226474250999672 + }, + { + "clip_ratio/high_max": 0.0416666679084301, + "clip_ratio/high_mean": 0.011718750349245965, + "clip_ratio/low_mean": 0.022135417442768812, + "clip_ratio/low_min": 0.0052083334885537624, + "clip_ratio/region_mean": 0.033854167675599456, + "entropy": 0.15958264330402017, + "epoch": 0.00188, + "grad_norm": 0.35930758714675903, + "kl": 0.7466034032404423, + "learning_rate": 9.999993770388783e-05, + "loss": 0.0032, + "step": 94, + "step_time": 8.234778083000037 + }, + { + "clip_ratio/high_max": 0.0052083334885537624, + "clip_ratio/high_mean": 0.0013020833721384406, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013020833721384406, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 697.828125, + "completions/mean_terminated_length": 697.828125, + "completions/min_length": 623.0, + "completions/min_terminated_length": 623.0, + "entropy": 0.16395951714366674, + "epoch": 0.0019, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30130699276924133, + "kl": 0.4953720346093178, + "learning_rate": 9.99999355372282e-05, + "loss": 0.0087, + "num_tokens": 4753836.0, + "reward": 4.204550743103027, + "reward_std": 11.951547622680664, + "rewards/rollout_reward_func/mean": 4.204550743103027, + "rewards/rollout_reward_func/std": 13.192495346069336, + "sampling/importance_sampling_ratio/max": 1.7262465953826904, + "sampling/importance_sampling_ratio/mean": 1.0100435018539429, + "sampling/importance_sampling_ratio/min": 0.6937407851219177, + "sampling/sampling_logp_difference/max": 0.5034514665603638, + "sampling/sampling_logp_difference/mean": 0.008402319625020027, + "step": 95, + "step_time": 29.812369647001105 + }, + { + "clip_ratio/high_max": 0.03645833441987634, + "clip_ratio/high_mean": 0.010416666977107525, + "clip_ratio/low_mean": 0.009114583604969084, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01953125058207661, + "entropy": 0.18544823909178376, + "epoch": 0.00192, + "grad_norm": 0.2023509442806244, + "kl": 0.44245083443820477, + "learning_rate": 9.999993333353168e-05, + "loss": 0.0061, + "step": 96, + "step_time": 7.195571093998296 + }, + { + "clip_ratio/high_max": 0.0052083334885537624, + "clip_ratio/high_mean": 0.0026041667442768812, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026041667442768812, + "completions/clipped_ratio": 0.0, + "completions/max_length": 829.0, + "completions/max_terminated_length": 829.0, + "completions/mean_length": 687.921875, + "completions/mean_terminated_length": 687.921875, + "completions/min_length": 596.0, + "completions/min_terminated_length": 596.0, + "entropy": 0.20835321862250566, + "epoch": 0.00194, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35837119817733765, + "kl": 0.4389466196298599, + "learning_rate": 9.999993109279828e-05, + "loss": 0.0044, + "num_tokens": 4849131.0, + "reward": 3.880918264389038, + "reward_std": 8.090033531188965, + "rewards/rollout_reward_func/mean": 3.880918264389038, + "rewards/rollout_reward_func/std": 9.26294231414795, + "sampling/importance_sampling_ratio/max": 1.2344332933425903, + "sampling/importance_sampling_ratio/mean": 0.9644654989242554, + "sampling/importance_sampling_ratio/min": 0.7370292544364929, + "sampling/sampling_logp_difference/max": 0.29116082191467285, + "sampling/sampling_logp_difference/mean": 0.009601429104804993, + "step": 97, + "step_time": 30.270599251999556 + }, + { + "clip_ratio/high_max": 0.052083334885537624, + "clip_ratio/high_mean": 0.016927083721384406, + "clip_ratio/low_mean": 0.014559659757651389, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.031486743479035795, + "entropy": 0.21354177221655846, + "epoch": 0.00196, + "grad_norm": 0.20351360738277435, + "kl": 0.43841097690165043, + "learning_rate": 9.999992881502804e-05, + "loss": 0.0004, + "step": 98, + "step_time": 7.506363271999817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 706.140625, + "completions/mean_terminated_length": 706.140625, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.21307788416743279, + "epoch": 0.00198, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4080103039741516, + "kl": 0.5334641952067614, + "learning_rate": 9.99999265002209e-05, + "loss": -0.003, + "num_tokens": 4945915.0, + "reward": 5.200403213500977, + "reward_std": 14.344334602355957, + "rewards/rollout_reward_func/mean": 5.200403213500977, + "rewards/rollout_reward_func/std": 14.294367790222168, + "sampling/importance_sampling_ratio/max": 1.2247880697250366, + "sampling/importance_sampling_ratio/mean": 1.0129998922348022, + "sampling/importance_sampling_ratio/min": 0.7771543860435486, + "sampling/sampling_logp_difference/max": 0.23006606101989746, + "sampling/sampling_logp_difference/mean": 0.00854739174246788, + "step": 99, + "step_time": 29.33993570499979 + }, + { + "clip_ratio/high_max": 0.015625000465661287, + "clip_ratio/high_mean": 0.006510416744276881, + "clip_ratio/low_mean": 0.023555872030556202, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.030066288774833083, + "entropy": 0.21015852224081755, + "epoch": 0.002, + "grad_norm": 0.2297798991203308, + "kl": 0.5835338849574327, + "learning_rate": 9.999992414837691e-05, + "loss": -0.008, + "step": 100, + "step_time": 8.775622698999086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 837.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 711.671875, + "completions/mean_terminated_length": 711.671875, + "completions/min_length": 616.0, + "completions/min_terminated_length": 616.0, + "entropy": 0.2137407148256898, + "epoch": 0.00202, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4429977834224701, + "kl": 0.4639507979154587, + "learning_rate": 9.999992175949606e-05, + "loss": -0.0173, + "num_tokens": 5042733.0, + "reward": 3.351179838180542, + "reward_std": 8.503268241882324, + "rewards/rollout_reward_func/mean": 3.351179838180542, + "rewards/rollout_reward_func/std": 8.948554039001465, + "sampling/importance_sampling_ratio/max": 1.328324556350708, + "sampling/importance_sampling_ratio/mean": 1.0001481771469116, + "sampling/importance_sampling_ratio/min": 0.5792597532272339, + "sampling/sampling_logp_difference/max": 0.4302701950073242, + "sampling/sampling_logp_difference/mean": 0.008802896365523338, + "step": 101, + "step_time": 29.50366010900052 + }, + { + "clip_ratio/high_max": 0.0572916679084301, + "clip_ratio/high_mean": 0.02083333407063037, + "clip_ratio/low_mean": 0.021188447950407863, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04202178155537695, + "entropy": 0.19818230718374252, + "epoch": 0.00204, + "grad_norm": 0.228831484913826, + "kl": 0.523833503946662, + "learning_rate": 9.999991933357836e-05, + "loss": -0.0238, + "step": 102, + "step_time": 7.743058271999871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 679.5, + "completions/mean_terminated_length": 679.5, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "entropy": 0.16811883123591542, + "epoch": 0.00206, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2717866897583008, + "kl": 0.5116975158452988, + "learning_rate": 9.999991687062378e-05, + "loss": 0.0026, + "num_tokens": 5137485.0, + "reward": 3.233732223510742, + "reward_std": 12.289377212524414, + "rewards/rollout_reward_func/mean": 3.233732223510742, + "rewards/rollout_reward_func/std": 14.167500495910645, + "sampling/importance_sampling_ratio/max": 1.1452041864395142, + "sampling/importance_sampling_ratio/mean": 0.9949536323547363, + "sampling/importance_sampling_ratio/min": 0.8263934254646301, + "sampling/sampling_logp_difference/max": 0.11179852485656738, + "sampling/sampling_logp_difference/mean": 0.00560589786618948, + "step": 103, + "step_time": 28.410943980999036 + }, + { + "clip_ratio/high_max": 0.03645833441987634, + "clip_ratio/high_mean": 0.009114583604969084, + "clip_ratio/low_mean": 0.036576704937033355, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04569128877483308, + "entropy": 0.1520394361577928, + "epoch": 0.00208, + "grad_norm": 0.1855272352695465, + "kl": 0.548751313239336, + "learning_rate": 9.999991437063234e-05, + "loss": -0.0007, + "step": 104, + "step_time": 7.630572153999765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0013020833721384406, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013020833721384406, + "completions/clipped_ratio": 0.0, + "completions/max_length": 821.0, + "completions/max_terminated_length": 821.0, + "completions/mean_length": 689.109375, + "completions/mean_terminated_length": 689.109375, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "entropy": 0.15672127809375525, + "epoch": 0.0021, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45729750394821167, + "kl": 0.6510039251297712, + "learning_rate": 9.999991183360407e-05, + "loss": -0.011, + "num_tokens": 5232831.0, + "reward": 4.220555305480957, + "reward_std": 10.952154159545898, + "rewards/rollout_reward_func/mean": 4.220555305480957, + "rewards/rollout_reward_func/std": 11.161866188049316, + "sampling/importance_sampling_ratio/max": 1.289732813835144, + "sampling/importance_sampling_ratio/mean": 0.9952840209007263, + "sampling/importance_sampling_ratio/min": 0.6639890074729919, + "sampling/sampling_logp_difference/max": 0.4248615503311157, + "sampling/sampling_logp_difference/mean": 0.009283961728215218, + "step": 105, + "step_time": 29.269957731999057 + }, + { + "clip_ratio/high_max": 0.02083333395421505, + "clip_ratio/high_mean": 0.006510416860692203, + "clip_ratio/low_mean": 0.015625000232830644, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.022135417093522847, + "entropy": 0.15183987142518163, + "epoch": 0.00212, + "grad_norm": 0.19675187766551971, + "kl": 0.7388164456933737, + "learning_rate": 9.999990925953892e-05, + "loss": -0.0165, + "step": 106, + "step_time": 7.576425396001014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0013020833721384406, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013020833721384406, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 695.0625, + "completions/mean_terminated_length": 695.0625, + "completions/min_length": 619.0, + "completions/min_terminated_length": 619.0, + "entropy": 0.14494483266025782, + "epoch": 0.00214, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4538170397281647, + "kl": 0.678084384649992, + "learning_rate": 9.999990664843695e-05, + "loss": 0.0147, + "num_tokens": 5328578.0, + "reward": 9.525361061096191, + "reward_std": 13.358152389526367, + "rewards/rollout_reward_func/mean": 9.525361061096191, + "rewards/rollout_reward_func/std": 14.251992225646973, + "sampling/importance_sampling_ratio/max": 1.1812809705734253, + "sampling/importance_sampling_ratio/mean": 0.9926539659500122, + "sampling/importance_sampling_ratio/min": 0.7029387950897217, + "sampling/sampling_logp_difference/max": 0.35564422607421875, + "sampling/sampling_logp_difference/mean": 0.007083391770720482, + "step": 107, + "step_time": 28.04022229299926 + }, + { + "clip_ratio/high_max": 0.046875000931322575, + "clip_ratio/high_mean": 0.015625000349245965, + "clip_ratio/low_mean": 0.015861742896959186, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03148674312978983, + "entropy": 0.1589709185063839, + "epoch": 0.00216, + "grad_norm": 0.22844459116458893, + "kl": 0.6251159347593784, + "learning_rate": 9.999990400029812e-05, + "loss": 0.0106, + "step": 108, + "step_time": 8.196292393000022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0013020833721384406, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013020833721384406, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 703.75, + "completions/mean_terminated_length": 703.75, + "completions/min_length": 522.0, + "completions/min_terminated_length": 522.0, + "entropy": 0.17329717054963112, + "epoch": 0.00218, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3682776987552643, + "kl": 0.6051198206841946, + "learning_rate": 9.999990131512245e-05, + "loss": 0.0061, + "num_tokens": 5424927.0, + "reward": 6.206368923187256, + "reward_std": 10.578010559082031, + "rewards/rollout_reward_func/mean": 6.206368923187256, + "rewards/rollout_reward_func/std": 11.067666053771973, + "sampling/importance_sampling_ratio/max": 1.4831089973449707, + "sampling/importance_sampling_ratio/mean": 1.002763271331787, + "sampling/importance_sampling_ratio/min": 0.7234499454498291, + "sampling/sampling_logp_difference/max": 0.3583219051361084, + "sampling/sampling_logp_difference/mean": 0.007746794261038303, + "step": 109, + "step_time": 27.809638274999543 + }, + { + "clip_ratio/high_max": 0.026988637167960405, + "clip_ratio/high_mean": 0.010653409641236067, + "clip_ratio/low_mean": 0.014441288309171796, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.025094697950407863, + "entropy": 0.17460143100470304, + "epoch": 0.0022, + "grad_norm": 0.1794712245464325, + "kl": 0.6243367586284876, + "learning_rate": 9.999989859290995e-05, + "loss": 0.0027, + "step": 110, + "step_time": 7.0755484739993335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 820.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 697.578125, + "completions/mean_terminated_length": 697.578125, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "entropy": 0.17424820829182863, + "epoch": 0.00222, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42331627011299133, + "kl": 0.586581215262413, + "learning_rate": 9.99998958336606e-05, + "loss": -0.0044, + "num_tokens": 5520852.0, + "reward": 3.5279414653778076, + "reward_std": 14.582866668701172, + "rewards/rollout_reward_func/mean": 3.5279414653778076, + "rewards/rollout_reward_func/std": 15.890913963317871, + "sampling/importance_sampling_ratio/max": 1.2239214181900024, + "sampling/importance_sampling_ratio/mean": 0.9994624853134155, + "sampling/importance_sampling_ratio/min": 0.6852503418922424, + "sampling/sampling_logp_difference/max": 0.31956130266189575, + "sampling/sampling_logp_difference/mean": 0.006933148950338364, + "step": 111, + "step_time": 29.204085013999247 + }, + { + "clip_ratio/high_max": 0.02651515230536461, + "clip_ratio/high_mean": 0.006628788076341152, + "clip_ratio/low_mean": 0.018129006726667285, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02475779491942376, + "entropy": 0.1639441135339439, + "epoch": 0.00224, + "grad_norm": 0.19418354332447052, + "kl": 0.650929281488061, + "learning_rate": 9.999989303737441e-05, + "loss": -0.0109, + "step": 112, + "step_time": 7.643361527999332 + }, + { + "clip_ratio/high_max": 0.0052083334885537624, + "clip_ratio/high_mean": 0.0013020833721384406, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013020833721384406, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 693.09375, + "completions/mean_terminated_length": 693.09375, + "completions/min_length": 630.0, + "completions/min_terminated_length": 630.0, + "entropy": 0.14537212159484625, + "epoch": 0.00226, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4737316071987152, + "kl": 0.669768800958991, + "learning_rate": 9.99998902040514e-05, + "loss": 0.0169, + "num_tokens": 5616460.0, + "reward": 3.8732333183288574, + "reward_std": 9.794268608093262, + "rewards/rollout_reward_func/mean": 3.8732333183288574, + "rewards/rollout_reward_func/std": 10.40365982055664, + "sampling/importance_sampling_ratio/max": 1.1869113445281982, + "sampling/importance_sampling_ratio/mean": 0.9964576959609985, + "sampling/importance_sampling_ratio/min": 0.5200645923614502, + "sampling/sampling_logp_difference/max": 0.6150112152099609, + "sampling/sampling_logp_difference/mean": 0.007128065451979637, + "step": 113, + "step_time": 27.900884353000038 + }, + { + "clip_ratio/high_max": 0.042140152771025896, + "clip_ratio/high_mean": 0.011837121681310236, + "clip_ratio/low_mean": 0.006510416860692203, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01834753854200244, + "entropy": 0.14897123211994767, + "epoch": 0.00228, + "grad_norm": 0.20660799741744995, + "kl": 0.7189689762890339, + "learning_rate": 9.999988733369157e-05, + "loss": 0.0137, + "step": 114, + "step_time": 7.532232160000149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 828.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 689.0625, + "completions/mean_terminated_length": 689.0625, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "entropy": 0.16697307769209146, + "epoch": 0.0023, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37319666147232056, + "kl": 0.6085000336170197, + "learning_rate": 9.999988442629488e-05, + "loss": -0.015, + "num_tokens": 5711756.0, + "reward": 3.845529079437256, + "reward_std": 9.702705383300781, + "rewards/rollout_reward_func/mean": 3.845529079437256, + "rewards/rollout_reward_func/std": 9.905435562133789, + "sampling/importance_sampling_ratio/max": 1.3216222524642944, + "sampling/importance_sampling_ratio/mean": 1.0128694772720337, + "sampling/importance_sampling_ratio/min": 0.7146333456039429, + "sampling/sampling_logp_difference/max": 0.3742462396621704, + "sampling/sampling_logp_difference/mean": 0.006911748554557562, + "step": 115, + "step_time": 29.116642522001257 + }, + { + "clip_ratio/high_max": 0.0416666679084301, + "clip_ratio/high_mean": 0.011718750349245965, + "clip_ratio/low_mean": 0.020951705169864, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.032670455519109964, + "entropy": 0.17214004416018724, + "epoch": 0.00232, + "grad_norm": 0.19411630928516388, + "kl": 0.6454576198011637, + "learning_rate": 9.99998814818614e-05, + "loss": -0.0191, + "step": 116, + "step_time": 7.846242159999747 + }, + { + "clip_ratio/high_max": 0.0052083334885537624, + "clip_ratio/high_mean": 0.0013020833721384406, + "clip_ratio/low_mean": 0.0013020833721384406, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026041667442768812, + "completions/clipped_ratio": 0.0, + "completions/max_length": 815.0, + "completions/max_terminated_length": 815.0, + "completions/mean_length": 674.78125, + "completions/mean_terminated_length": 674.78125, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.16358821745961905, + "epoch": 0.00234, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3246734142303467, + "kl": 0.5800395030528307, + "learning_rate": 9.999987850039107e-05, + "loss": 0.0099, + "num_tokens": 5806145.0, + "reward": 1.2733659744262695, + "reward_std": 12.069713592529297, + "rewards/rollout_reward_func/mean": 1.2733662128448486, + "rewards/rollout_reward_func/std": 12.829185485839844, + "sampling/importance_sampling_ratio/max": 1.306739330291748, + "sampling/importance_sampling_ratio/mean": 1.0012977123260498, + "sampling/importance_sampling_ratio/min": 0.8135073781013489, + "sampling/sampling_logp_difference/max": 0.19866454601287842, + "sampling/sampling_logp_difference/mean": 0.006336529273539782, + "step": 117, + "step_time": 27.930649275999258 + }, + { + "clip_ratio/high_max": 0.02083333395421505, + "clip_ratio/high_mean": 0.006510416860692203, + "clip_ratio/low_mean": 0.013020833721384406, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01953125058207661, + "entropy": 0.16348634008318186, + "epoch": 0.00236, + "grad_norm": 0.11877016723155975, + "kl": 0.587722685188055, + "learning_rate": 9.999987548188396e-05, + "loss": 0.0055, + "step": 118, + "step_time": 7.173724952000157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 685.921875, + "completions/mean_terminated_length": 685.921875, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "entropy": 0.17666231095790863, + "epoch": 0.00238, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2972959578037262, + "kl": 0.5733677446842194, + "learning_rate": 9.999987242634001e-05, + "loss": 0.0156, + "num_tokens": 5901319.0, + "reward": 6.098433494567871, + "reward_std": 11.96851921081543, + "rewards/rollout_reward_func/mean": 6.098433494567871, + "rewards/rollout_reward_func/std": 14.112695693969727, + "sampling/importance_sampling_ratio/max": 1.2103277444839478, + "sampling/importance_sampling_ratio/mean": 1.0073938369750977, + "sampling/importance_sampling_ratio/min": 0.7692804932594299, + "sampling/sampling_logp_difference/max": 0.13658356666564941, + "sampling/sampling_logp_difference/mean": 0.0063937013037502766, + "step": 119, + "step_time": 28.342584406000242 + }, + { + "clip_ratio/high_max": 0.010416666977107525, + "clip_ratio/high_mean": 0.0026041667442768812, + "clip_ratio/low_mean": 0.009114583488553762, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.011718750232830644, + "entropy": 0.16604932164773345, + "epoch": 0.0024, + "grad_norm": 0.23078079521656036, + "kl": 0.5974587891250849, + "learning_rate": 9.999986933375924e-05, + "loss": 0.0105, + "step": 120, + "step_time": 7.440147934999914 + }, + { + "clip_ratio/high_max": 0.005681818351149559, + "clip_ratio/high_mean": 0.0014204545877873898, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014204545877873898, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 677.953125, + "completions/mean_terminated_length": 677.953125, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.13314053160138428, + "epoch": 0.00242, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2688275873661041, + "kl": 0.6813949979841709, + "learning_rate": 9.999986620414167e-05, + "loss": -0.0055, + "num_tokens": 5995970.0, + "reward": 4.1811299324035645, + "reward_std": 11.76725959777832, + "rewards/rollout_reward_func/mean": 4.1811299324035645, + "rewards/rollout_reward_func/std": 12.213129997253418, + "sampling/importance_sampling_ratio/max": 1.4055489301681519, + "sampling/importance_sampling_ratio/mean": 1.0007095336914062, + "sampling/importance_sampling_ratio/min": 0.7907775640487671, + "sampling/sampling_logp_difference/max": 0.2328205108642578, + "sampling/sampling_logp_difference/mean": 0.0057580312713980675, + "step": 121, + "step_time": 25.73195371799966 + }, + { + "clip_ratio/high_max": 0.03172348579391837, + "clip_ratio/high_mean": 0.007930871448479593, + "clip_ratio/low_mean": 0.007812500232830644, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015743371681310236, + "entropy": 0.12837151251733303, + "epoch": 0.00244, + "grad_norm": 0.19652943313121796, + "kl": 0.6755912862718105, + "learning_rate": 9.99998630374873e-05, + "loss": -0.0109, + "step": 122, + "step_time": 7.963045050999881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 777.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 676.75, + "completions/mean_terminated_length": 676.75, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "entropy": 0.14288373803719878, + "epoch": 0.00246, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5129408836364746, + "kl": 0.6468502469360828, + "learning_rate": 9.999985983379613e-05, + "loss": -0.002, + "num_tokens": 6090409.0, + "reward": 5.090976238250732, + "reward_std": 8.817068099975586, + "rewards/rollout_reward_func/mean": 5.090975761413574, + "rewards/rollout_reward_func/std": 9.348170280456543, + "sampling/importance_sampling_ratio/max": 1.2873598337173462, + "sampling/importance_sampling_ratio/mean": 0.9989021420478821, + "sampling/importance_sampling_ratio/min": 0.8453167676925659, + "sampling/sampling_logp_difference/max": 0.1934504508972168, + "sampling/sampling_logp_difference/mean": 0.0064778015948832035, + "step": 123, + "step_time": 28.174620942000274 + }, + { + "clip_ratio/high_max": 0.026041667442768812, + "clip_ratio/high_mean": 0.006510416860692203, + "clip_ratio/low_mean": 0.02367424312978983, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03018465987406671, + "entropy": 0.12851850083097816, + "epoch": 0.00248, + "grad_norm": 0.170461967587471, + "kl": 0.6984463054686785, + "learning_rate": 9.999985659306817e-05, + "loss": -0.0077, + "step": 124, + "step_time": 6.415902794999965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 686.765625, + "completions/mean_terminated_length": 686.765625, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "entropy": 0.12949980096891522, + "epoch": 0.0025, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4389054477214813, + "kl": 0.8373041488230228, + "learning_rate": 9.999985331530339e-05, + "loss": -0.0001, + "num_tokens": 6185533.0, + "reward": 6.523627281188965, + "reward_std": 12.731056213378906, + "rewards/rollout_reward_func/mean": 6.523627281188965, + "rewards/rollout_reward_func/std": 13.220861434936523, + "sampling/importance_sampling_ratio/max": 1.4951905012130737, + "sampling/importance_sampling_ratio/mean": 1.0012614727020264, + "sampling/importance_sampling_ratio/min": 0.7251157760620117, + "sampling/sampling_logp_difference/max": 0.39764922857284546, + "sampling/sampling_logp_difference/mean": 0.006425045896321535, + "step": 125, + "step_time": 27.883570014999805 + }, + { + "clip_ratio/high_max": 0.03645833441987634, + "clip_ratio/high_mean": 0.009114583604969084, + "clip_ratio/low_mean": 0.02854567370377481, + "clip_ratio/low_min": 0.0052083334885537624, + "clip_ratio/region_mean": 0.03766025695949793, + "entropy": 0.11564141698181629, + "epoch": 0.00252, + "grad_norm": 0.24558016657829285, + "kl": 1.0033343844115734, + "learning_rate": 9.999985000050182e-05, + "loss": -0.0041, + "step": 126, + "step_time": 6.9546678629999406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0013020833721384406, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013020833721384406, + "completions/clipped_ratio": 0.0, + "completions/max_length": 837.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 683.515625, + "completions/mean_terminated_length": 683.515625, + "completions/min_length": 617.0, + "completions/min_terminated_length": 617.0, + "entropy": 0.10271549178287387, + "epoch": 0.00254, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.48136869072914124, + "kl": 0.8534571155905724, + "learning_rate": 9.999984664866347e-05, + "loss": 0.0132, + "num_tokens": 6280443.0, + "reward": 4.674668788909912, + "reward_std": 11.713541030883789, + "rewards/rollout_reward_func/mean": 4.67466926574707, + "rewards/rollout_reward_func/std": 13.705061912536621, + "sampling/importance_sampling_ratio/max": 1.1515135765075684, + "sampling/importance_sampling_ratio/mean": 0.9829530715942383, + "sampling/importance_sampling_ratio/min": 0.6125902533531189, + "sampling/sampling_logp_difference/max": 0.4248628616333008, + "sampling/sampling_logp_difference/mean": 0.00649910606443882, + "step": 127, + "step_time": 27.126788691001366 + }, + { + "clip_ratio/high_max": 0.03645833441987634, + "clip_ratio/high_mean": 0.009114583604969084, + "clip_ratio/low_mean": 0.015625000465661287, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02473958395421505, + "entropy": 0.10252567520365119, + "epoch": 0.00256, + "grad_norm": 0.25049537420272827, + "kl": 0.9629664830863476, + "learning_rate": 9.999984325978833e-05, + "loss": 0.0108, + "step": 128, + "step_time": 7.304040701002123 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020833334419876337, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1019.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 959.40625, + "completions/mean_terminated_length": 959.40625, + "completions/min_length": 910.0, + "completions/min_terminated_length": 910.0, + "entropy": 0.13085902528837323, + "epoch": 0.00258, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6536189913749695, + "kl": 0.8716370463371277, + "learning_rate": 9.99998398338764e-05, + "loss": 0.0229, + "num_tokens": 6393090.0, + "reward": 5.6695556640625, + "reward_std": 11.05074405670166, + "rewards/rollout_reward_func/mean": 5.669555187225342, + "rewards/rollout_reward_func/std": 12.366477966308594, + "sampling/importance_sampling_ratio/max": 1.2895963191986084, + "sampling/importance_sampling_ratio/mean": 1.023085355758667, + "sampling/importance_sampling_ratio/min": 0.7725162506103516, + "sampling/sampling_logp_difference/max": 0.30040407180786133, + "sampling/sampling_logp_difference/mean": 0.008372966200113297, + "step": 129, + "step_time": 33.28244163999989 + }, + { + "clip_ratio/high_max": 0.07559524197131395, + "clip_ratio/high_mean": 0.028273811331018806, + "clip_ratio/low_mean": 0.02299107296857983, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.051264884416013956, + "entropy": 0.14083249866962433, + "epoch": 0.0026, + "grad_norm": 0.30618196725845337, + "kl": 0.9511819295585155, + "learning_rate": 9.999983637092769e-05, + "loss": 0.0154, + "step": 130, + "step_time": 8.346246693000012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1034.0, + "completions/max_terminated_length": 1034.0, + "completions/mean_length": 954.09375, + "completions/mean_terminated_length": 954.09375, + "completions/min_length": 446.0, + "completions/min_terminated_length": 446.0, + "entropy": 0.1475105220451951, + "epoch": 0.00262, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5288283228874207, + "kl": 0.912773609161377, + "learning_rate": 9.999983287094222e-05, + "loss": -0.0212, + "num_tokens": 6505385.0, + "reward": 7.057158470153809, + "reward_std": 10.295648574829102, + "rewards/rollout_reward_func/mean": 7.057158470153809, + "rewards/rollout_reward_func/std": 10.425559997558594, + "sampling/importance_sampling_ratio/max": 1.3561307191848755, + "sampling/importance_sampling_ratio/mean": 0.9847082495689392, + "sampling/importance_sampling_ratio/min": 0.6632312536239624, + "sampling/sampling_logp_difference/max": 0.22558808326721191, + "sampling/sampling_logp_difference/mean": 0.00731184845790267, + "step": 131, + "step_time": 33.95278312799974 + }, + { + "clip_ratio/high_max": 0.06726190773770213, + "clip_ratio/high_mean": 0.02313988225068897, + "clip_ratio/low_mean": 0.03020833560731262, + "clip_ratio/low_min": 0.004166666883975267, + "clip_ratio/region_mean": 0.0533482184400782, + "entropy": 0.16657310537993908, + "epoch": 0.00264, + "grad_norm": 0.29056552052497864, + "kl": 0.7771002501249313, + "learning_rate": 9.999982933391997e-05, + "loss": -0.0284, + "step": 132, + "step_time": 7.300914149999244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0031250001629814506, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0031250001629814506, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1041.0, + "completions/max_terminated_length": 1041.0, + "completions/mean_length": 929.96875, + "completions/mean_terminated_length": 929.96875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.18474403023719788, + "epoch": 0.00266, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6026266813278198, + "kl": 0.7070890348404646, + "learning_rate": 9.999982575986094e-05, + "loss": -0.0, + "num_tokens": 6616176.0, + "reward": 3.4366226196289062, + "reward_std": 14.906189918518066, + "rewards/rollout_reward_func/mean": 3.436622381210327, + "rewards/rollout_reward_func/std": 16.053083419799805, + "sampling/importance_sampling_ratio/max": 1.3064157962799072, + "sampling/importance_sampling_ratio/mean": 1.0054032802581787, + "sampling/importance_sampling_ratio/min": 0.5862367749214172, + "sampling/sampling_logp_difference/max": 0.5461184978485107, + "sampling/sampling_logp_difference/mean": 0.010052897036075592, + "step": 133, + "step_time": 32.881761665999875 + }, + { + "clip_ratio/high_max": 0.06369047937914729, + "clip_ratio/high_mean": 0.022172620403580368, + "clip_ratio/low_mean": 0.03557477821595967, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05774739931803197, + "entropy": 0.18857589829713106, + "epoch": 0.00268, + "grad_norm": 0.2586621344089508, + "kl": 0.8268643505871296, + "learning_rate": 9.999982214876515e-05, + "loss": -0.0078, + "step": 134, + "step_time": 7.700307692000479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1035.0, + "completions/max_terminated_length": 1035.0, + "completions/mean_length": 949.171875, + "completions/mean_terminated_length": 949.171875, + "completions/min_length": 820.0, + "completions/min_terminated_length": 820.0, + "entropy": 0.21391641069203615, + "epoch": 0.0027, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5073988437652588, + "kl": 0.669338870793581, + "learning_rate": 9.999981850063262e-05, + "loss": -0.0078, + "num_tokens": 6728116.0, + "reward": 5.17537784576416, + "reward_std": 13.093953132629395, + "rewards/rollout_reward_func/mean": 5.175378322601318, + "rewards/rollout_reward_func/std": 13.309264183044434, + "sampling/importance_sampling_ratio/max": 1.3000229597091675, + "sampling/importance_sampling_ratio/mean": 0.9869031310081482, + "sampling/importance_sampling_ratio/min": 0.7261144518852234, + "sampling/sampling_logp_difference/max": 0.1514453887939453, + "sampling/sampling_logp_difference/mean": 0.008218428120017052, + "step": 135, + "step_time": 32.26767973099959 + }, + { + "clip_ratio/high_max": 0.06815476482734084, + "clip_ratio/high_mean": 0.022321430151350796, + "clip_ratio/low_mean": 0.04136904957704246, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.06369047961197793, + "entropy": 0.21292453352361917, + "epoch": 0.00272, + "grad_norm": 0.3758169710636139, + "kl": 0.6697604712098837, + "learning_rate": 9.99998148154633e-05, + "loss": -0.0147, + "step": 136, + "step_time": 8.984081079998305 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010416667209938169, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1033.0, + "completions/max_terminated_length": 1033.0, + "completions/mean_length": 941.34375, + "completions/mean_terminated_length": 941.34375, + "completions/min_length": 618.0, + "completions/min_terminated_length": 618.0, + "entropy": 0.23286819364875555, + "epoch": 0.00274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6691973805427551, + "kl": 0.6865591164678335, + "learning_rate": 9.999981109325724e-05, + "loss": 0.0217, + "num_tokens": 6839571.0, + "reward": 4.762706756591797, + "reward_std": 11.42410659790039, + "rewards/rollout_reward_func/mean": 4.762706756591797, + "rewards/rollout_reward_func/std": 11.434100151062012, + "sampling/importance_sampling_ratio/max": 1.5375083684921265, + "sampling/importance_sampling_ratio/mean": 1.0140312910079956, + "sampling/importance_sampling_ratio/min": 0.691352128982544, + "sampling/sampling_logp_difference/max": 0.24680709838867188, + "sampling/sampling_logp_difference/mean": 0.010121582075953484, + "step": 137, + "step_time": 31.738008715999968 + }, + { + "clip_ratio/high_max": 0.07113095559179783, + "clip_ratio/high_mean": 0.022098215762525797, + "clip_ratio/low_mean": 0.0486922818236053, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.07079049723688513, + "entropy": 0.2127716289833188, + "epoch": 0.00276, + "grad_norm": 0.30709579586982727, + "kl": 0.6930392682552338, + "learning_rate": 9.999980733401442e-05, + "loss": 0.0087, + "step": 138, + "step_time": 8.016800426000827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1036.0, + "completions/max_terminated_length": 1036.0, + "completions/mean_length": 978.6875, + "completions/mean_terminated_length": 978.6875, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "entropy": 0.19525799248367548, + "epoch": 0.00278, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7576553225517273, + "kl": 0.7210239768028259, + "learning_rate": 9.999980353773486e-05, + "loss": 0.0087, + "num_tokens": 6953628.0, + "reward": 7.9931640625, + "reward_std": 14.572214126586914, + "rewards/rollout_reward_func/mean": 7.993164539337158, + "rewards/rollout_reward_func/std": 15.543896675109863, + "sampling/importance_sampling_ratio/max": 1.4368523359298706, + "sampling/importance_sampling_ratio/mean": 1.020465612411499, + "sampling/importance_sampling_ratio/min": 0.6616964340209961, + "sampling/sampling_logp_difference/max": 0.3545997142791748, + "sampling/sampling_logp_difference/mean": 0.009700989350676537, + "step": 139, + "step_time": 31.6874715999993 + }, + { + "clip_ratio/high_max": 0.07712912419810891, + "clip_ratio/high_mean": 0.02553228137549013, + "clip_ratio/low_mean": 0.053521828493103385, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.07905411045067012, + "entropy": 0.1893756091594696, + "epoch": 0.0028, + "grad_norm": 0.31711098551750183, + "kl": 0.786970479413867, + "learning_rate": 9.999979970441856e-05, + "loss": -0.0032, + "step": 140, + "step_time": 8.092264081999474 + }, + { + "clip_ratio/high_max": 0.008333333767950535, + "clip_ratio/high_mean": 0.0020833334419876337, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020833334419876337, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1037.0, + "completions/max_terminated_length": 1037.0, + "completions/mean_length": 955.375, + "completions/mean_terminated_length": 955.375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.17851338349282742, + "epoch": 0.00282, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5795699954032898, + "kl": 0.7661695275455713, + "learning_rate": 9.999979583406551e-05, + "loss": -0.0028, + "num_tokens": 7066060.0, + "reward": 5.970464706420898, + "reward_std": 14.057101249694824, + "rewards/rollout_reward_func/mean": 5.970464706420898, + "rewards/rollout_reward_func/std": 15.589529991149902, + "sampling/importance_sampling_ratio/max": 1.2306140661239624, + "sampling/importance_sampling_ratio/mean": 1.0006752014160156, + "sampling/importance_sampling_ratio/min": 0.7063568830490112, + "sampling/sampling_logp_difference/max": 0.24321842193603516, + "sampling/sampling_logp_difference/mean": 0.008507179096341133, + "step": 141, + "step_time": 31.519457149999653 + }, + { + "clip_ratio/high_max": 0.10007440904155374, + "clip_ratio/high_mean": 0.03335193661041558, + "clip_ratio/low_mean": 0.04136905015911907, + "clip_ratio/low_min": 0.004166666883975267, + "clip_ratio/region_mean": 0.07472098711878061, + "entropy": 0.16169621469452977, + "epoch": 0.00284, + "grad_norm": 0.21583755314350128, + "kl": 0.8030649330466986, + "learning_rate": 9.999979192667573e-05, + "loss": -0.0127, + "step": 142, + "step_time": 8.37791394099986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1048.0, + "completions/max_terminated_length": 1048.0, + "completions/mean_length": 965.875, + "completions/mean_terminated_length": 965.875, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 0.1351936119608581, + "epoch": 0.00286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6277215480804443, + "kl": 0.6056302916258574, + "learning_rate": 9.999978798224921e-05, + "loss": -0.0037, + "num_tokens": 7179154.0, + "reward": 7.006319046020508, + "reward_std": 16.71393394470215, + "rewards/rollout_reward_func/mean": 7.006319522857666, + "rewards/rollout_reward_func/std": 17.009944915771484, + "sampling/importance_sampling_ratio/max": 1.4720120429992676, + "sampling/importance_sampling_ratio/mean": 1.0313916206359863, + "sampling/importance_sampling_ratio/min": 0.8535375595092773, + "sampling/sampling_logp_difference/max": 0.33231019973754883, + "sampling/sampling_logp_difference/mean": 0.007416378241032362, + "step": 143, + "step_time": 31.253298032000657 + }, + { + "clip_ratio/high_max": 0.03363095410168171, + "clip_ratio/high_mean": 0.010565476841293275, + "clip_ratio/low_mean": 0.022564054117538035, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03312953084241599, + "entropy": 0.12869372498244047, + "epoch": 0.00288, + "grad_norm": 0.3357137143611908, + "kl": 0.6441880892962217, + "learning_rate": 9.999978400078598e-05, + "loss": -0.011, + "step": 144, + "step_time": 8.612604698998894 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020833334419876337, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1033.0, + "completions/max_terminated_length": 1033.0, + "completions/mean_length": 981.234375, + "completions/mean_terminated_length": 981.234375, + "completions/min_length": 922.0, + "completions/min_terminated_length": 922.0, + "entropy": 0.14574182452633977, + "epoch": 0.0029, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6256417632102966, + "kl": 0.680026089772582, + "learning_rate": 9.9999779982286e-05, + "loss": 0.0066, + "num_tokens": 7293276.0, + "reward": 9.345416069030762, + "reward_std": 12.761893272399902, + "rewards/rollout_reward_func/mean": 9.345417022705078, + "rewards/rollout_reward_func/std": 14.231216430664062, + "sampling/importance_sampling_ratio/max": 1.2866383790969849, + "sampling/importance_sampling_ratio/mean": 0.9946876764297485, + "sampling/importance_sampling_ratio/min": 0.7063043117523193, + "sampling/sampling_logp_difference/max": 0.3008323907852173, + "sampling/sampling_logp_difference/mean": 0.007616790477186441, + "step": 145, + "step_time": 31.996783762999257 + }, + { + "clip_ratio/high_max": 0.05424107378348708, + "clip_ratio/high_mean": 0.020851935259997845, + "clip_ratio/low_mean": 0.03377976384945214, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05463169957511127, + "entropy": 0.14952234365046024, + "epoch": 0.00292, + "grad_norm": 0.3231852948665619, + "kl": 0.7531629204750061, + "learning_rate": 9.999977592674931e-05, + "loss": -0.0032, + "step": 146, + "step_time": 8.073437064001155 + }, + { + "clip_ratio/high_max": 0.012500000651925802, + "clip_ratio/high_mean": 0.0031250001629814506, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0031250001629814506, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1024.0, + "completions/mean_length": 951.1875, + "completions/mean_terminated_length": 951.1875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "entropy": 0.14304543379694223, + "epoch": 0.00294, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4544009566307068, + "kl": 0.6561761032789946, + "learning_rate": 9.999977183417592e-05, + "loss": -0.0136, + "num_tokens": 7405394.0, + "reward": 9.592363357543945, + "reward_std": 11.82339859008789, + "rewards/rollout_reward_func/mean": 9.592363357543945, + "rewards/rollout_reward_func/std": 12.213863372802734, + "sampling/importance_sampling_ratio/max": 1.3994261026382446, + "sampling/importance_sampling_ratio/mean": 0.9877851009368896, + "sampling/importance_sampling_ratio/min": 0.5693183541297913, + "sampling/sampling_logp_difference/max": 0.5401673913002014, + "sampling/sampling_logp_difference/mean": 0.007635599002242088, + "step": 147, + "step_time": 31.870756492000055 + }, + { + "clip_ratio/high_max": 0.054166669491678476, + "clip_ratio/high_mean": 0.013541667372919619, + "clip_ratio/low_mean": 0.036681550089269876, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05022321757860482, + "entropy": 0.14903255039826035, + "epoch": 0.00296, + "grad_norm": 0.34076768159866333, + "kl": 0.6760309524834156, + "learning_rate": 9.99997677045658e-05, + "loss": -0.0174, + "step": 148, + "step_time": 8.03263958799971 + }, + { + "clip_ratio/high_max": 0.008333333767950535, + "clip_ratio/high_mean": 0.0020833334419876337, + "clip_ratio/low_mean": 0.0022435898426920176, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004326923284679651, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1022.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 950.578125, + "completions/mean_terminated_length": 950.578125, + "completions/min_length": 673.0, + "completions/min_terminated_length": 673.0, + "entropy": 0.16968106850981712, + "epoch": 0.00298, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5638662576675415, + "kl": 0.6232388503849506, + "learning_rate": 9.999976353791898e-05, + "loss": -0.0115, + "num_tokens": 7517436.0, + "reward": 6.506036281585693, + "reward_std": 12.593399047851562, + "rewards/rollout_reward_func/mean": 6.506035804748535, + "rewards/rollout_reward_func/std": 13.552786827087402, + "sampling/importance_sampling_ratio/max": 1.6476225852966309, + "sampling/importance_sampling_ratio/mean": 0.9991188645362854, + "sampling/importance_sampling_ratio/min": 0.5213066935539246, + "sampling/sampling_logp_difference/max": 0.576519250869751, + "sampling/sampling_logp_difference/mean": 0.01059242058545351, + "step": 149, + "step_time": 30.528242389000752 + }, + { + "clip_ratio/high_max": 0.05000000260770321, + "clip_ratio/high_mean": 0.01458333432674408, + "clip_ratio/low_mean": 0.03889938397333026, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05348271911498159, + "entropy": 0.17780038248747587, + "epoch": 0.003, + "grad_norm": 0.5385463833808899, + "kl": 0.8597960155457258, + "learning_rate": 9.999975933423545e-05, + "loss": -0.0172, + "step": 150, + "step_time": 8.0192518380004 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.002157738199457526, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003199404920451343, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1020.0, + "completions/max_terminated_length": 1020.0, + "completions/mean_length": 953.953125, + "completions/mean_terminated_length": 953.953125, + "completions/min_length": 664.0, + "completions/min_terminated_length": 664.0, + "entropy": 0.1825277367606759, + "epoch": 0.00302, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6920294165611267, + "kl": 0.6721424907445908, + "learning_rate": 9.999975509351522e-05, + "loss": -0.0165, + "num_tokens": 7629697.0, + "reward": 6.279596328735352, + "reward_std": 13.454200744628906, + "rewards/rollout_reward_func/mean": 6.279596328735352, + "rewards/rollout_reward_func/std": 15.490900039672852, + "sampling/importance_sampling_ratio/max": 1.2544176578521729, + "sampling/importance_sampling_ratio/mean": 0.9968298673629761, + "sampling/importance_sampling_ratio/min": 0.5891286730766296, + "sampling/sampling_logp_difference/max": 0.36822509765625, + "sampling/sampling_logp_difference/mean": 0.009644769132137299, + "step": 151, + "step_time": 30.041253716999563 + }, + { + "clip_ratio/high_max": 0.06250000279396772, + "clip_ratio/high_mean": 0.02187500149011612, + "clip_ratio/low_mean": 0.027847783756442368, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0497227858286351, + "entropy": 0.19313342962414026, + "epoch": 0.00304, + "grad_norm": 0.3150973320007324, + "kl": 0.6543413959443569, + "learning_rate": 9.99997508157583e-05, + "loss": -0.0263, + "step": 152, + "step_time": 8.048088266000377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 993.0, + "completions/max_terminated_length": 993.0, + "completions/mean_length": 933.640625, + "completions/mean_terminated_length": 933.640625, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 0.1851256461814046, + "epoch": 0.00306, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6191554665565491, + "kl": 0.5646015591919422, + "learning_rate": 9.999974650096467e-05, + "loss": -0.0157, + "num_tokens": 7740640.0, + "reward": 7.951285362243652, + "reward_std": 13.322220802307129, + "rewards/rollout_reward_func/mean": 7.951285362243652, + "rewards/rollout_reward_func/std": 15.29836654663086, + "sampling/importance_sampling_ratio/max": 1.1902070045471191, + "sampling/importance_sampling_ratio/mean": 0.9911805987358093, + "sampling/importance_sampling_ratio/min": 0.6955353617668152, + "sampling/sampling_logp_difference/max": 0.37529921531677246, + "sampling/sampling_logp_difference/mean": 0.007848689332604408, + "step": 153, + "step_time": 30.541750664000574 + }, + { + "clip_ratio/high_max": 0.04301470750942826, + "clip_ratio/high_mean": 0.013878677156753838, + "clip_ratio/low_mean": 0.039536832249723375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.053415509522892535, + "entropy": 0.16637779865413904, + "epoch": 0.00308, + "grad_norm": 0.3494158089160919, + "kl": 0.6059492044150829, + "learning_rate": 9.999974214913437e-05, + "loss": -0.0231, + "step": 154, + "step_time": 8.139173758999277 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020833334419876337, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1041.0, + "completions/max_terminated_length": 1041.0, + "completions/mean_length": 972.640625, + "completions/mean_terminated_length": 972.640625, + "completions/min_length": 935.0, + "completions/min_terminated_length": 935.0, + "entropy": 0.1503364727832377, + "epoch": 0.0031, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6759209036827087, + "kl": 0.6219805851578712, + "learning_rate": 9.999973776026739e-05, + "loss": 0.0152, + "num_tokens": 7854154.0, + "reward": 5.902735710144043, + "reward_std": 12.42209243774414, + "rewards/rollout_reward_func/mean": 5.902735710144043, + "rewards/rollout_reward_func/std": 12.867145538330078, + "sampling/importance_sampling_ratio/max": 1.4259474277496338, + "sampling/importance_sampling_ratio/mean": 1.0006431341171265, + "sampling/importance_sampling_ratio/min": 0.6987265348434448, + "sampling/sampling_logp_difference/max": 0.35797882080078125, + "sampling/sampling_logp_difference/mean": 0.008803295902907848, + "step": 155, + "step_time": 31.54653142600091 + }, + { + "clip_ratio/high_max": 0.054464288521558046, + "clip_ratio/high_mean": 0.018824405618943274, + "clip_ratio/low_mean": 0.0364583358168602, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05528274178504944, + "entropy": 0.1241895561106503, + "epoch": 0.00312, + "grad_norm": 0.955508828163147, + "kl": 0.9998617265373468, + "learning_rate": 9.999973333436372e-05, + "loss": 0.017, + "step": 156, + "step_time": 7.910055370999544 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0006127451197244227, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016544118407182395, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1040.0, + "completions/max_terminated_length": 1040.0, + "completions/mean_length": 972.546875, + "completions/mean_terminated_length": 972.546875, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "entropy": 0.11084589222446084, + "epoch": 0.00314, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9052144885063171, + "kl": 0.9162529278546572, + "learning_rate": 9.999972887142338e-05, + "loss": 0.0236, + "num_tokens": 7967770.0, + "reward": 10.1655855178833, + "reward_std": 15.845230102539062, + "rewards/rollout_reward_func/mean": 10.1655855178833, + "rewards/rollout_reward_func/std": 17.717178344726562, + "sampling/importance_sampling_ratio/max": 1.5550763607025146, + "sampling/importance_sampling_ratio/mean": 1.0152667760849, + "sampling/importance_sampling_ratio/min": 0.6825421452522278, + "sampling/sampling_logp_difference/max": 0.38708627223968506, + "sampling/sampling_logp_difference/mean": 0.006948791444301605, + "step": 157, + "step_time": 30.977979516999312 + }, + { + "clip_ratio/high_max": 0.041964287869632244, + "clip_ratio/high_mean": 0.013616072130389512, + "clip_ratio/low_mean": 0.019929535686969757, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03354560805018991, + "entropy": 0.11029910668730736, + "epoch": 0.00316, + "grad_norm": 0.3586527705192566, + "kl": 0.996163547039032, + "learning_rate": 9.999972437144637e-05, + "loss": 0.018, + "step": 158, + "step_time": 8.73399685899949 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010416667209938169, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1026.0, + "completions/max_terminated_length": 1026.0, + "completions/mean_length": 954.4375, + "completions/mean_terminated_length": 954.4375, + "completions/min_length": 686.0, + "completions/min_terminated_length": 686.0, + "entropy": 0.14973071590065956, + "epoch": 0.00318, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7992783784866333, + "kl": 0.5131530929356813, + "learning_rate": 9.999971983443269e-05, + "loss": -0.0019, + "num_tokens": 8080082.0, + "reward": 5.8201141357421875, + "reward_std": 11.146739959716797, + "rewards/rollout_reward_func/mean": 5.8201141357421875, + "rewards/rollout_reward_func/std": 11.795808792114258, + "sampling/importance_sampling_ratio/max": 1.2158492803573608, + "sampling/importance_sampling_ratio/mean": 0.9923404455184937, + "sampling/importance_sampling_ratio/min": 0.623603343963623, + "sampling/sampling_logp_difference/max": 0.24274826049804688, + "sampling/sampling_logp_difference/mean": 0.007134515792131424, + "step": 159, + "step_time": 31.143712819999564 + }, + { + "clip_ratio/high_max": 0.06250000232830644, + "clip_ratio/high_mean": 0.017708334140479565, + "clip_ratio/low_mean": 0.028382036020047963, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046090369927696884, + "entropy": 0.15424074092879891, + "epoch": 0.0032, + "grad_norm": 0.4114607274532318, + "kl": 0.5258241277188063, + "learning_rate": 9.999971526038235e-05, + "loss": -0.0105, + "step": 160, + "step_time": 7.376053459000104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0020833334419876337, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020833334419876337, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1066.0, + "completions/max_terminated_length": 1066.0, + "completions/mean_length": 964.390625, + "completions/mean_terminated_length": 964.390625, + "completions/min_length": 795.0, + "completions/min_terminated_length": 795.0, + "entropy": 0.14214739575982094, + "epoch": 0.00322, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6593955159187317, + "kl": 0.7137422636151314, + "learning_rate": 9.999971064929537e-05, + "loss": 0.0221, + "num_tokens": 8193063.0, + "reward": 7.681003093719482, + "reward_std": 11.441247940063477, + "rewards/rollout_reward_func/mean": 7.681002616882324, + "rewards/rollout_reward_func/std": 13.56708812713623, + "sampling/importance_sampling_ratio/max": 1.4164402484893799, + "sampling/importance_sampling_ratio/mean": 1.0107839107513428, + "sampling/importance_sampling_ratio/min": 0.6920035481452942, + "sampling/sampling_logp_difference/max": 0.3535594940185547, + "sampling/sampling_logp_difference/mean": 0.007559535559266806, + "step": 161, + "step_time": 32.16549203100021 + }, + { + "clip_ratio/high_max": 0.045833335258066654, + "clip_ratio/high_mean": 0.014657739084213972, + "clip_ratio/low_mean": 0.033670345321297646, + "clip_ratio/low_min": 0.004166666883975267, + "clip_ratio/region_mean": 0.04832808405626565, + "entropy": 0.1284659137018025, + "epoch": 0.00324, + "grad_norm": 0.44948309659957886, + "kl": 0.8788620755076408, + "learning_rate": 9.999970600117172e-05, + "loss": 0.0155, + "step": 162, + "step_time": 8.349364119001166 + }, + { + "clip_ratio/high_max": 0.012500000651925802, + "clip_ratio/high_mean": 0.0031250001629814506, + "clip_ratio/low_mean": 0.0010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004166666883975267, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1028.0, + "completions/max_terminated_length": 1028.0, + "completions/mean_length": 958.1875, + "completions/mean_terminated_length": 958.1875, + "completions/min_length": 688.0, + "completions/min_terminated_length": 688.0, + "entropy": 0.1297779600135982, + "epoch": 0.00326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45152774453163147, + "kl": 0.6032252982258797, + "learning_rate": 9.999970131601142e-05, + "loss": -0.007, + "num_tokens": 8305653.0, + "reward": 9.560303688049316, + "reward_std": 12.965145111083984, + "rewards/rollout_reward_func/mean": 9.560302734375, + "rewards/rollout_reward_func/std": 13.572053909301758, + "sampling/importance_sampling_ratio/max": 1.3970085382461548, + "sampling/importance_sampling_ratio/mean": 0.9942675828933716, + "sampling/importance_sampling_ratio/min": 0.5912600755691528, + "sampling/sampling_logp_difference/max": 0.43671131134033203, + "sampling/sampling_logp_difference/mean": 0.006968793459236622, + "step": 163, + "step_time": 29.62484441499919 + }, + { + "clip_ratio/high_max": 0.04534313944168389, + "clip_ratio/high_mean": 0.013419118302408606, + "clip_ratio/low_mean": 0.028385418467223644, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04180453671142459, + "entropy": 0.12798475893214345, + "epoch": 0.00328, + "grad_norm": 0.37086573243141174, + "kl": 0.5329502020031214, + "learning_rate": 9.99996965938145e-05, + "loss": -0.0114, + "step": 164, + "step_time": 9.19148286500058 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020833334419876337, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1068.0, + "completions/max_terminated_length": 1068.0, + "completions/mean_length": 958.28125, + "completions/mean_terminated_length": 958.28125, + "completions/min_length": 714.0, + "completions/min_terminated_length": 714.0, + "entropy": 0.14121837774291635, + "epoch": 0.0033, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.445486456155777, + "kl": 0.6868670284748077, + "learning_rate": 9.999969183458092e-05, + "loss": 0.017, + "num_tokens": 8418180.0, + "reward": 6.036255836486816, + "reward_std": 14.006401062011719, + "rewards/rollout_reward_func/mean": 6.036255836486816, + "rewards/rollout_reward_func/std": 15.667006492614746, + "sampling/importance_sampling_ratio/max": 1.4084051847457886, + "sampling/importance_sampling_ratio/mean": 0.9844825267791748, + "sampling/importance_sampling_ratio/min": 0.6458684802055359, + "sampling/sampling_logp_difference/max": 0.35437726974487305, + "sampling/sampling_logp_difference/mean": 0.008984029293060303, + "step": 165, + "step_time": 30.86212910500126 + }, + { + "clip_ratio/high_max": 0.041964287869632244, + "clip_ratio/high_mean": 0.012574405525811017, + "clip_ratio/low_mean": 0.02604166802484542, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.038616073317825794, + "entropy": 0.13694474566727877, + "epoch": 0.00332, + "grad_norm": 0.2597510814666748, + "kl": 0.670884259045124, + "learning_rate": 9.999968703831071e-05, + "loss": 0.012, + "step": 166, + "step_time": 8.765868728999521 + }, + { + "clip_ratio/high_max": 0.008333333767950535, + "clip_ratio/high_mean": 0.0020833334419876337, + "clip_ratio/low_mean": 0.0020833334419876337, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004166666883975267, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1028.0, + "completions/max_terminated_length": 1028.0, + "completions/mean_length": 964.015625, + "completions/mean_terminated_length": 964.015625, + "completions/min_length": 773.0, + "completions/min_terminated_length": 773.0, + "entropy": 0.13714495720341802, + "epoch": 0.00334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.742760181427002, + "kl": 0.5935596115887165, + "learning_rate": 9.999968220500386e-05, + "loss": 0.0264, + "num_tokens": 8531148.0, + "reward": 6.6519269943237305, + "reward_std": 14.873868942260742, + "rewards/rollout_reward_func/mean": 6.6519269943237305, + "rewards/rollout_reward_func/std": 15.216424942016602, + "sampling/importance_sampling_ratio/max": 1.4992643594741821, + "sampling/importance_sampling_ratio/mean": 1.0216107368469238, + "sampling/importance_sampling_ratio/min": 0.7036370635032654, + "sampling/sampling_logp_difference/max": 0.351947546005249, + "sampling/sampling_logp_difference/mean": 0.008944995701313019, + "step": 167, + "step_time": 30.057006109999747 + }, + { + "clip_ratio/high_max": 0.03750000195577741, + "clip_ratio/high_mean": 0.013541667489334941, + "clip_ratio/low_mean": 0.03437500225845724, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04791666998062283, + "entropy": 0.13065697345882654, + "epoch": 0.00336, + "grad_norm": 8.381538391113281, + "kl": 7.166379388421774, + "learning_rate": 9.999967733466041e-05, + "loss": 0.0808, + "step": 168, + "step_time": 8.213664751000124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010416667209938169, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1016.0, + "completions/max_terminated_length": 1016.0, + "completions/mean_length": 943.078125, + "completions/mean_terminated_length": 943.078125, + "completions/min_length": 868.0, + "completions/min_terminated_length": 868.0, + "entropy": 0.13596792286261916, + "epoch": 0.00338, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6320606470108032, + "kl": 0.5089176166802645, + "learning_rate": 9.999967242728034e-05, + "loss": -0.0005, + "num_tokens": 8642652.0, + "reward": 9.83786392211914, + "reward_std": 12.724628448486328, + "rewards/rollout_reward_func/mean": 9.83786392211914, + "rewards/rollout_reward_func/std": 13.589927673339844, + "sampling/importance_sampling_ratio/max": 1.5156316757202148, + "sampling/importance_sampling_ratio/mean": 1.001371145248413, + "sampling/importance_sampling_ratio/min": 0.75341796875, + "sampling/sampling_logp_difference/max": 0.40897202491760254, + "sampling/sampling_logp_difference/mean": 0.006749385967850685, + "step": 169, + "step_time": 30.052868118000788 + }, + { + "clip_ratio/high_max": 0.020833334419876337, + "clip_ratio/high_mean": 0.007291667046956718, + "clip_ratio/low_mean": 0.03333333553746343, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04062500281725079, + "entropy": 0.13328771898522973, + "epoch": 0.0034, + "grad_norm": 0.27786943316459656, + "kl": 0.5417735707014799, + "learning_rate": 9.999966748286363e-05, + "loss": -0.004, + "step": 170, + "step_time": 7.808134698000686 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010416667209938169, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1033.0, + "completions/max_terminated_length": 1033.0, + "completions/mean_length": 977.46875, + "completions/mean_terminated_length": 977.46875, + "completions/min_length": 890.0, + "completions/min_terminated_length": 890.0, + "entropy": 0.14305478753522038, + "epoch": 0.00342, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.477180153131485, + "kl": 0.9006227869540453, + "learning_rate": 9.999966250141033e-05, + "loss": -0.016, + "num_tokens": 8756508.0, + "reward": 9.534229278564453, + "reward_std": 10.647237777709961, + "rewards/rollout_reward_func/mean": 9.534229278564453, + "rewards/rollout_reward_func/std": 11.566615104675293, + "sampling/importance_sampling_ratio/max": 1.4990143775939941, + "sampling/importance_sampling_ratio/mean": 1.0070048570632935, + "sampling/importance_sampling_ratio/min": 0.6254692077636719, + "sampling/sampling_logp_difference/max": 0.4892125129699707, + "sampling/sampling_logp_difference/mean": 0.008062894456088543, + "step": 171, + "step_time": 29.967204156000207 + }, + { + "clip_ratio/high_max": 0.03333333507180214, + "clip_ratio/high_mean": 0.009375000605359674, + "clip_ratio/low_mean": 0.03333333553746343, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.042708336492069066, + "entropy": 0.13219841895624995, + "epoch": 0.00344, + "grad_norm": 0.2979583740234375, + "kl": 0.9737532902508974, + "learning_rate": 9.999965748292042e-05, + "loss": -0.0247, + "step": 172, + "step_time": 8.450734508001005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0031250001629814506, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0031250001629814506, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1043.0, + "completions/max_terminated_length": 1043.0, + "completions/mean_length": 977.578125, + "completions/mean_terminated_length": 977.578125, + "completions/min_length": 911.0, + "completions/min_terminated_length": 911.0, + "entropy": 0.13215081067755818, + "epoch": 0.00346, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.570915699005127, + "kl": 0.7707110401242971, + "learning_rate": 9.999965242739393e-05, + "loss": 0.0115, + "num_tokens": 8870395.0, + "reward": 7.963113784790039, + "reward_std": 12.185734748840332, + "rewards/rollout_reward_func/mean": 7.963113784790039, + "rewards/rollout_reward_func/std": 12.419037818908691, + "sampling/importance_sampling_ratio/max": 1.2637660503387451, + "sampling/importance_sampling_ratio/mean": 0.9871397614479065, + "sampling/importance_sampling_ratio/min": 0.6115806102752686, + "sampling/sampling_logp_difference/max": 0.3316690921783447, + "sampling/sampling_logp_difference/mean": 0.0069004204124212265, + "step": 173, + "step_time": 29.865270385998883 + }, + { + "clip_ratio/high_max": 0.05000000214204192, + "clip_ratio/high_mean": 0.013541667256504297, + "clip_ratio/low_mean": 0.025976563920266926, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03951823094394058, + "entropy": 0.1266618687659502, + "epoch": 0.00348, + "grad_norm": 0.3126421570777893, + "kl": 0.7724483050405979, + "learning_rate": 9.999964733483083e-05, + "loss": 0.0074, + "step": 174, + "step_time": 8.14785716599863 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020833334419876337, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1040.0, + "completions/max_terminated_length": 1040.0, + "completions/mean_length": 986.78125, + "completions/mean_terminated_length": 986.78125, + "completions/min_length": 814.0, + "completions/min_terminated_length": 814.0, + "entropy": 0.12785040121525526, + "epoch": 0.0035, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4320923089981079, + "kl": 0.5314337890595198, + "learning_rate": 9.999964220523112e-05, + "loss": 0.0134, + "num_tokens": 8984945.0, + "reward": 11.988597869873047, + "reward_std": 11.876688957214355, + "rewards/rollout_reward_func/mean": 11.988597869873047, + "rewards/rollout_reward_func/std": 12.529437065124512, + "sampling/importance_sampling_ratio/max": 1.5641355514526367, + "sampling/importance_sampling_ratio/mean": 1.0155951976776123, + "sampling/importance_sampling_ratio/min": 0.7307262420654297, + "sampling/sampling_logp_difference/max": 0.28014975786209106, + "sampling/sampling_logp_difference/mean": 0.006405924912542105, + "step": 175, + "step_time": 30.800439373998415 + }, + { + "clip_ratio/high_max": 0.025000001303851604, + "clip_ratio/high_mean": 0.008333333767950535, + "clip_ratio/low_mean": 0.015625001047737896, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023958335164934397, + "entropy": 0.12503943219780922, + "epoch": 0.00352, + "grad_norm": 0.25414347648620605, + "kl": 0.545308168977499, + "learning_rate": 9.999963703859485e-05, + "loss": 0.0068, + "step": 176, + "step_time": 8.294947108000088 + }, + { + "clip_ratio/high_max": 0.012500000651925802, + "clip_ratio/high_mean": 0.0031250001629814506, + "clip_ratio/low_mean": 0.0010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004166666883975267, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1032.0, + "completions/max_terminated_length": 1032.0, + "completions/mean_length": 947.78125, + "completions/mean_terminated_length": 947.78125, + "completions/min_length": 877.0, + "completions/min_terminated_length": 877.0, + "entropy": 0.11795077985152602, + "epoch": 0.00354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5428488850593567, + "kl": 0.5484364293515682, + "learning_rate": 9.9999631834922e-05, + "loss": 0.0209, + "num_tokens": 9096764.0, + "reward": 7.462541580200195, + "reward_std": 9.003820419311523, + "rewards/rollout_reward_func/mean": 7.462541103363037, + "rewards/rollout_reward_func/std": 9.709749221801758, + "sampling/importance_sampling_ratio/max": 1.6056361198425293, + "sampling/importance_sampling_ratio/mean": 1.0011367797851562, + "sampling/importance_sampling_ratio/min": 0.6226766109466553, + "sampling/sampling_logp_difference/max": 0.48480892181396484, + "sampling/sampling_logp_difference/mean": 0.007405002135783434, + "step": 177, + "step_time": 30.438013943000442 + }, + { + "clip_ratio/high_max": 0.025000001303851604, + "clip_ratio/high_mean": 0.006250000325962901, + "clip_ratio/low_mean": 0.021875001140870154, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.028125001583248377, + "entropy": 0.11180919618345797, + "epoch": 0.00356, + "grad_norm": 1.0773159265518188, + "kl": 0.7693799175322056, + "learning_rate": 9.999962659421255e-05, + "loss": 0.0218, + "step": 178, + "step_time": 8.289468396000302 + }, + { + "clip_ratio/high_max": 0.012500000651925802, + "clip_ratio/high_mean": 0.0031250001629814506, + "clip_ratio/low_mean": 0.0020833334419876337, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005208333604969084, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1017.0, + "completions/max_terminated_length": 1017.0, + "completions/mean_length": 962.765625, + "completions/mean_terminated_length": 962.765625, + "completions/min_length": 893.0, + "completions/min_terminated_length": 893.0, + "entropy": 0.12420041672885418, + "epoch": 0.00358, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5452980399131775, + "kl": 0.5841826293617487, + "learning_rate": 9.999962131646658e-05, + "loss": 0.0223, + "num_tokens": 9209601.0, + "reward": 9.949074745178223, + "reward_std": 11.123800277709961, + "rewards/rollout_reward_func/mean": 9.949074745178223, + "rewards/rollout_reward_func/std": 11.492538452148438, + "sampling/importance_sampling_ratio/max": 1.846232295036316, + "sampling/importance_sampling_ratio/mean": 1.0060797929763794, + "sampling/importance_sampling_ratio/min": 0.692804217338562, + "sampling/sampling_logp_difference/max": 0.6036995649337769, + "sampling/sampling_logp_difference/mean": 0.0071367728523910046, + "step": 179, + "step_time": 29.633916566999687 + }, + { + "clip_ratio/high_max": 0.03750000195577741, + "clip_ratio/high_mean": 0.014583334093913436, + "clip_ratio/low_mean": 0.018824405735358596, + "clip_ratio/low_min": 0.004166666883975267, + "clip_ratio/region_mean": 0.033407740062102675, + "entropy": 0.11627750238403678, + "epoch": 0.0036, + "grad_norm": 0.38062411546707153, + "kl": 0.639982882887125, + "learning_rate": 9.999961600168402e-05, + "loss": 0.0192, + "step": 180, + "step_time": 8.508149862998835 + }, + { + "clip_ratio/high_max": 0.012500000651925802, + "clip_ratio/high_mean": 0.0031250001629814506, + "clip_ratio/low_mean": 0.0010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004166666883975267, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1031.0, + "completions/max_terminated_length": 1031.0, + "completions/mean_length": 964.46875, + "completions/mean_terminated_length": 964.46875, + "completions/min_length": 816.0, + "completions/min_terminated_length": 816.0, + "entropy": 0.10024931281805038, + "epoch": 0.00362, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7843199968338013, + "kl": 0.5290133021771908, + "learning_rate": 9.999961064986489e-05, + "loss": -0.0105, + "num_tokens": 9322591.0, + "reward": 9.743326187133789, + "reward_std": 11.718559265136719, + "rewards/rollout_reward_func/mean": 9.743326187133789, + "rewards/rollout_reward_func/std": 11.767054557800293, + "sampling/importance_sampling_ratio/max": 1.2395166158676147, + "sampling/importance_sampling_ratio/mean": 0.9893835783004761, + "sampling/importance_sampling_ratio/min": 0.7077917456626892, + "sampling/sampling_logp_difference/max": 0.36174678802490234, + "sampling/sampling_logp_difference/mean": 0.0061057801358401775, + "step": 181, + "step_time": 30.010152957000173 + }, + { + "clip_ratio/high_max": 0.04583333572372794, + "clip_ratio/high_mean": 0.01458333432674408, + "clip_ratio/low_mean": 0.019791668048128486, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.034375002374872565, + "entropy": 0.0898241214454174, + "epoch": 0.00364, + "grad_norm": 0.898304283618927, + "kl": 1.3444663938134909, + "learning_rate": 9.999960526100922e-05, + "loss": -0.0074, + "step": 182, + "step_time": 8.117577253999116 + }, + { + "clip_ratio/high_max": 0.012500000651925802, + "clip_ratio/high_mean": 0.0031250001629814506, + "clip_ratio/low_mean": 0.0011160714784637094, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00424107164144516, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1037.0, + "completions/max_terminated_length": 1037.0, + "completions/mean_length": 965.296875, + "completions/mean_terminated_length": 965.296875, + "completions/min_length": 887.0, + "completions/min_terminated_length": 887.0, + "entropy": 0.12110280524939299, + "epoch": 0.00366, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47400349378585815, + "kl": 0.513819495216012, + "learning_rate": 9.999959983511699e-05, + "loss": 0.0011, + "num_tokens": 9435640.0, + "reward": 11.970619201660156, + "reward_std": 16.7136287689209, + "rewards/rollout_reward_func/mean": 11.970619201660156, + "rewards/rollout_reward_func/std": 17.193565368652344, + "sampling/importance_sampling_ratio/max": 1.4852927923202515, + "sampling/importance_sampling_ratio/mean": 0.9956411123275757, + "sampling/importance_sampling_ratio/min": 0.58425372838974, + "sampling/sampling_logp_difference/max": 0.4939703941345215, + "sampling/sampling_logp_difference/mean": 0.007358514238148928, + "step": 183, + "step_time": 30.018645907000064 + }, + { + "clip_ratio/high_max": 0.03750000195577741, + "clip_ratio/high_mean": 0.01041666732635349, + "clip_ratio/low_mean": 0.01875000086147338, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.029166668420657516, + "entropy": 0.12478661234490573, + "epoch": 0.00368, + "grad_norm": 0.29323798418045044, + "kl": 0.46843259409070015, + "learning_rate": 9.999959437218822e-05, + "loss": -0.0073, + "step": 184, + "step_time": 8.045792180003446 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020833334419876337, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1026.0, + "completions/max_terminated_length": 1026.0, + "completions/mean_length": 959.875, + "completions/mean_terminated_length": 959.875, + "completions/min_length": 676.0, + "completions/min_terminated_length": 676.0, + "entropy": 0.12372714094817638, + "epoch": 0.0037, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.49018675088882446, + "kl": 0.5567406937479973, + "learning_rate": 9.999958887222293e-05, + "loss": -0.0266, + "num_tokens": 9548327.0, + "reward": 8.300872802734375, + "reward_std": 11.473505020141602, + "rewards/rollout_reward_func/mean": 8.300872802734375, + "rewards/rollout_reward_func/std": 13.137120246887207, + "sampling/importance_sampling_ratio/max": 1.3434193134307861, + "sampling/importance_sampling_ratio/mean": 1.0231890678405762, + "sampling/importance_sampling_ratio/min": 0.8001201748847961, + "sampling/sampling_logp_difference/max": 0.24235105514526367, + "sampling/sampling_logp_difference/mean": 0.006944713182747364, + "step": 185, + "step_time": 30.03380806199948 + }, + { + "clip_ratio/high_max": 0.058333336375653744, + "clip_ratio/high_mean": 0.01770833437331021, + "clip_ratio/low_mean": 0.012500000651925802, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03020833490882069, + "entropy": 0.13152629090473056, + "epoch": 0.00372, + "grad_norm": 0.23521849513053894, + "kl": 0.5634740013629198, + "learning_rate": 9.999958333522109e-05, + "loss": -0.0341, + "step": 186, + "step_time": 8.600791754000966 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0020833334419876337, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0031250001629814506, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1045.0, + "completions/max_terminated_length": 1045.0, + "completions/mean_length": 945.21875, + "completions/mean_terminated_length": 945.21875, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 0.1315653999336064, + "epoch": 0.00374, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36500951647758484, + "kl": 0.5178995914757252, + "learning_rate": 9.999957776118273e-05, + "loss": -0.0136, + "num_tokens": 9660136.0, + "reward": 7.931632041931152, + "reward_std": 11.40542984008789, + "rewards/rollout_reward_func/mean": 7.931632041931152, + "rewards/rollout_reward_func/std": 12.151664733886719, + "sampling/importance_sampling_ratio/max": 1.7536835670471191, + "sampling/importance_sampling_ratio/mean": 1.001771092414856, + "sampling/importance_sampling_ratio/min": 0.7216951251029968, + "sampling/sampling_logp_difference/max": 0.5699708461761475, + "sampling/sampling_logp_difference/mean": 0.0067958529107272625, + "step": 187, + "step_time": 29.347854906000975 + }, + { + "clip_ratio/high_max": 0.054166669491678476, + "clip_ratio/high_mean": 0.01770833448972553, + "clip_ratio/low_mean": 0.025694445823319256, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04340278054587543, + "entropy": 0.13414463540539145, + "epoch": 0.00376, + "grad_norm": 0.21745486557483673, + "kl": 0.5746774040162563, + "learning_rate": 9.999957215010784e-05, + "loss": -0.019, + "step": 188, + "step_time": 8.856123159000163 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020833334419876337, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1026.0, + "completions/max_terminated_length": 1026.0, + "completions/mean_length": 947.34375, + "completions/mean_terminated_length": 947.34375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "entropy": 0.14527452224865556, + "epoch": 0.00378, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4758737087249756, + "kl": 0.6551676895469427, + "learning_rate": 9.999956650199645e-05, + "loss": -0.0064, + "num_tokens": 9771998.0, + "reward": 8.513150215148926, + "reward_std": 14.811095237731934, + "rewards/rollout_reward_func/mean": 8.513150215148926, + "rewards/rollout_reward_func/std": 15.769759178161621, + "sampling/importance_sampling_ratio/max": 1.4140323400497437, + "sampling/importance_sampling_ratio/mean": 1.0076611042022705, + "sampling/importance_sampling_ratio/min": 0.5691302418708801, + "sampling/sampling_logp_difference/max": 0.7131770253181458, + "sampling/sampling_logp_difference/mean": 0.009376442059874535, + "step": 189, + "step_time": 30.213357230003567 + }, + { + "clip_ratio/high_max": 0.054166669491678476, + "clip_ratio/high_mean": 0.014583334210328758, + "clip_ratio/low_mean": 0.0281250016996637, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.042708336375653744, + "entropy": 0.13438974926248193, + "epoch": 0.0038, + "grad_norm": 0.2324807345867157, + "kl": 0.737682543694973, + "learning_rate": 9.999956081684854e-05, + "loss": -0.0149, + "step": 190, + "step_time": 7.734431613998822 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020833334419876337, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1019.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 962.203125, + "completions/mean_terminated_length": 962.203125, + "completions/min_length": 881.0, + "completions/min_terminated_length": 881.0, + "entropy": 0.1253855088725686, + "epoch": 0.00382, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.41439345479011536, + "kl": 0.709712341427803, + "learning_rate": 9.999955509466414e-05, + "loss": 0.0269, + "num_tokens": 9884808.0, + "reward": 9.057092666625977, + "reward_std": 9.098945617675781, + "rewards/rollout_reward_func/mean": 9.05709171295166, + "rewards/rollout_reward_func/std": 10.38012981414795, + "sampling/importance_sampling_ratio/max": 1.3585758209228516, + "sampling/importance_sampling_ratio/mean": 0.989570677280426, + "sampling/importance_sampling_ratio/min": 0.6827925443649292, + "sampling/sampling_logp_difference/max": 0.40184950828552246, + "sampling/sampling_logp_difference/mean": 0.00655590184032917, + "step": 191, + "step_time": 31.590866651999022 + }, + { + "clip_ratio/high_max": 0.03392857313156128, + "clip_ratio/high_mean": 0.010565476841293275, + "clip_ratio/low_mean": 0.0293154779355973, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0398809548933059, + "entropy": 0.11225170968100429, + "epoch": 0.00384, + "grad_norm": 0.23349761962890625, + "kl": 0.8278532009571791, + "learning_rate": 9.999954933544323e-05, + "loss": 0.0201, + "step": 192, + "step_time": 7.970918687003177 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0020833334419876337, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0031250001629814506, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1031.0, + "completions/max_terminated_length": 1031.0, + "completions/mean_length": 977.734375, + "completions/mean_terminated_length": 977.734375, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "entropy": 0.11717891087755561, + "epoch": 0.00386, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4010028839111328, + "kl": 0.6346222888678312, + "learning_rate": 9.999954353918583e-05, + "loss": 0.0125, + "num_tokens": 9998710.0, + "reward": 12.752401351928711, + "reward_std": 15.009429931640625, + "rewards/rollout_reward_func/mean": 12.752399444580078, + "rewards/rollout_reward_func/std": 15.288240432739258, + "sampling/importance_sampling_ratio/max": 1.3140867948532104, + "sampling/importance_sampling_ratio/mean": 0.9636229276657104, + "sampling/importance_sampling_ratio/min": 0.5537927746772766, + "sampling/sampling_logp_difference/max": 0.36048221588134766, + "sampling/sampling_logp_difference/mean": 0.007171455770730972, + "step": 193, + "step_time": 30.459012025998163 + }, + { + "clip_ratio/high_max": 0.029166667722165585, + "clip_ratio/high_mean": 0.007291666930541396, + "clip_ratio/low_mean": 0.03020833502523601, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03750000218860805, + "entropy": 0.11490702140145004, + "epoch": 0.00388, + "grad_norm": 0.23535722494125366, + "kl": 0.6073946505784988, + "learning_rate": 9.999953770589194e-05, + "loss": 0.006, + "step": 194, + "step_time": 8.631377130000146 + }, + { + "clip_ratio/high_max": 0.008333333767950535, + "clip_ratio/high_mean": 0.0020833334419876337, + "clip_ratio/low_mean": 0.0010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0031250001629814506, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1033.0, + "completions/max_terminated_length": 1033.0, + "completions/mean_length": 970.203125, + "completions/mean_terminated_length": 970.203125, + "completions/min_length": 898.0, + "completions/min_terminated_length": 898.0, + "entropy": 0.11113500501960516, + "epoch": 0.0039, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.604935348033905, + "kl": 0.6790309809148312, + "learning_rate": 9.999953183556157e-05, + "loss": 0.0026, + "num_tokens": 10112081.0, + "reward": 7.972203731536865, + "reward_std": 13.011554718017578, + "rewards/rollout_reward_func/mean": 7.972204208374023, + "rewards/rollout_reward_func/std": 13.773921966552734, + "sampling/importance_sampling_ratio/max": 1.3542617559432983, + "sampling/importance_sampling_ratio/mean": 0.9855128526687622, + "sampling/importance_sampling_ratio/min": 0.597061276435852, + "sampling/sampling_logp_difference/max": 0.4635782241821289, + "sampling/sampling_logp_difference/mean": 0.006834958214312792, + "step": 195, + "step_time": 30.052685054003632 + }, + { + "clip_ratio/high_max": 0.029166668187826872, + "clip_ratio/high_mean": 0.007291667046956718, + "clip_ratio/low_mean": 0.015625000814907253, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.022916667978279293, + "entropy": 0.11238743201829493, + "epoch": 0.00392, + "grad_norm": 0.4268299341201782, + "kl": 0.700402544811368, + "learning_rate": 9.999952592819473e-05, + "loss": -0.0015, + "step": 196, + "step_time": 8.260044886000287 + }, + { + "clip_ratio/high_max": 0.008333333767950535, + "clip_ratio/high_mean": 0.0020833334419876337, + "clip_ratio/low_mean": 0.0032738096779212356, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005357143119908869, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1043.0, + "completions/max_terminated_length": 1043.0, + "completions/mean_length": 934.5, + "completions/mean_terminated_length": 934.5, + "completions/min_length": 878.0, + "completions/min_terminated_length": 878.0, + "entropy": 0.1125073074363172, + "epoch": 0.00394, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6487244963645935, + "kl": 0.6208249572664499, + "learning_rate": 9.99995199837914e-05, + "loss": 0.0046, + "num_tokens": 10223022.0, + "reward": 8.661399841308594, + "reward_std": 15.73376178741455, + "rewards/rollout_reward_func/mean": 8.661399841308594, + "rewards/rollout_reward_func/std": 15.457544326782227, + "sampling/importance_sampling_ratio/max": 1.324127435684204, + "sampling/importance_sampling_ratio/mean": 1.0008368492126465, + "sampling/importance_sampling_ratio/min": 0.6733382344245911, + "sampling/sampling_logp_difference/max": 0.35140299797058105, + "sampling/sampling_logp_difference/mean": 0.007979365065693855, + "step": 197, + "step_time": 31.166683005998493 + }, + { + "clip_ratio/high_max": 0.021130953449755907, + "clip_ratio/high_mean": 0.00840773864183575, + "clip_ratio/low_mean": 0.02730654936749488, + "clip_ratio/low_min": 0.004166666883975267, + "clip_ratio/region_mean": 0.035714288242161274, + "entropy": 0.11146878870204091, + "epoch": 0.00396, + "grad_norm": 0.5962705016136169, + "kl": 0.9501709761098027, + "learning_rate": 9.999951400235163e-05, + "loss": 0.004, + "step": 198, + "step_time": 8.287430281997331 + }, + { + "clip_ratio/high_max": 0.012797619681805372, + "clip_ratio/high_mean": 0.003199404920451343, + "clip_ratio/low_mean": 0.0020833334419876337, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005282738362438977, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.0, + "completions/max_terminated_length": 1047.0, + "completions/mean_length": 980.65625, + "completions/mean_terminated_length": 980.65625, + "completions/min_length": 902.0, + "completions/min_terminated_length": 902.0, + "entropy": 0.11779335234314203, + "epoch": 0.00398, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.523526668548584, + "kl": 0.5669353120028973, + "learning_rate": 9.999950798387541e-05, + "loss": 0.0049, + "num_tokens": 10337112.0, + "reward": 10.420181274414062, + "reward_std": 16.354602813720703, + "rewards/rollout_reward_func/mean": 10.420181274414062, + "rewards/rollout_reward_func/std": 17.055269241333008, + "sampling/importance_sampling_ratio/max": 1.23856782913208, + "sampling/importance_sampling_ratio/mean": 0.9714287519454956, + "sampling/importance_sampling_ratio/min": 0.7061982750892639, + "sampling/sampling_logp_difference/max": 0.447023868560791, + "sampling/sampling_logp_difference/mean": 0.00747651606798172, + "step": 199, + "step_time": 30.34761462899951 + }, + { + "clip_ratio/high_max": 0.029464287217706442, + "clip_ratio/high_mean": 0.010491072083823383, + "clip_ratio/low_mean": 0.02091703994665295, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.031408111681230366, + "entropy": 0.11395548144355416, + "epoch": 0.004, + "grad_norm": 0.3284382224082947, + "kl": 0.5632808655500412, + "learning_rate": 9.999950192836271e-05, + "loss": -0.001, + "step": 200, + "step_time": 8.547375084998748 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010416667209938169, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1034.0, + "completions/max_terminated_length": 1034.0, + "completions/mean_length": 971.3125, + "completions/mean_terminated_length": 971.3125, + "completions/min_length": 873.0, + "completions/min_terminated_length": 873.0, + "entropy": 0.1107462802901864, + "epoch": 0.00402, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42466774582862854, + "kl": 0.504519259557128, + "learning_rate": 9.999949583581359e-05, + "loss": 0.0037, + "num_tokens": 10450565.0, + "reward": 12.199589729309082, + "reward_std": 12.77005672454834, + "rewards/rollout_reward_func/mean": 12.199588775634766, + "rewards/rollout_reward_func/std": 13.816198348999023, + "sampling/importance_sampling_ratio/max": 1.1825975179672241, + "sampling/importance_sampling_ratio/mean": 0.9908883571624756, + "sampling/importance_sampling_ratio/min": 0.6934873461723328, + "sampling/sampling_logp_difference/max": 0.3765444755554199, + "sampling/sampling_logp_difference/mean": 0.006183322053402662, + "step": 201, + "step_time": 30.161383785001817 + }, + { + "clip_ratio/high_max": 0.03750000195577741, + "clip_ratio/high_mean": 0.011458334047347307, + "clip_ratio/low_mean": 0.021875001257285476, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03333333553746343, + "entropy": 0.10510897357016802, + "epoch": 0.00404, + "grad_norm": 0.21419784426689148, + "kl": 0.5648845955729485, + "learning_rate": 9.999948970622802e-05, + "loss": -0.0012, + "step": 202, + "step_time": 8.714965140998174 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020833334419876337, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1046.0, + "completions/max_terminated_length": 1046.0, + "completions/mean_length": 979.578125, + "completions/mean_terminated_length": 979.578125, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "entropy": 0.12641333835199475, + "epoch": 0.00406, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6306821703910828, + "kl": 0.5104430429637432, + "learning_rate": 9.9999483539606e-05, + "loss": -0.0021, + "num_tokens": 10564630.0, + "reward": 10.778827667236328, + "reward_std": 13.483461380004883, + "rewards/rollout_reward_func/mean": 10.778827667236328, + "rewards/rollout_reward_func/std": 14.313225746154785, + "sampling/importance_sampling_ratio/max": 1.4068244695663452, + "sampling/importance_sampling_ratio/mean": 0.9891500473022461, + "sampling/importance_sampling_ratio/min": 0.6753217577934265, + "sampling/sampling_logp_difference/max": 0.3969893455505371, + "sampling/sampling_logp_difference/mean": 0.007549474947154522, + "step": 203, + "step_time": 29.916299866999907 + }, + { + "clip_ratio/high_max": 0.04583333572372794, + "clip_ratio/high_mean": 0.013541667489334941, + "clip_ratio/low_mean": 0.03132440650369972, + "clip_ratio/low_min": 0.004166666883975267, + "clip_ratio/region_mean": 0.04486607445869595, + "entropy": 0.12076347460970283, + "epoch": 0.00408, + "grad_norm": 0.29815390706062317, + "kl": 0.5736292470246553, + "learning_rate": 9.999947733594757e-05, + "loss": -0.0096, + "step": 204, + "step_time": 7.709945141001299 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.006250000325962901, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007291667046956718, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1016.0, + "completions/max_terminated_length": 1016.0, + "completions/mean_length": 948.3125, + "completions/mean_terminated_length": 948.3125, + "completions/min_length": 878.0, + "completions/min_terminated_length": 878.0, + "entropy": 0.10954847000539303, + "epoch": 0.0041, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8904768228530884, + "kl": 0.5102124018594623, + "learning_rate": 9.999947109525271e-05, + "loss": 0.0269, + "num_tokens": 10676487.0, + "reward": 7.509866237640381, + "reward_std": 12.055532455444336, + "rewards/rollout_reward_func/mean": 7.509865760803223, + "rewards/rollout_reward_func/std": 12.425904273986816, + "sampling/importance_sampling_ratio/max": 2.821709156036377, + "sampling/importance_sampling_ratio/mean": 1.0446405410766602, + "sampling/importance_sampling_ratio/min": 0.6838214993476868, + "sampling/sampling_logp_difference/max": 0.6221010684967041, + "sampling/sampling_logp_difference/mean": 0.007641012314707041, + "step": 205, + "step_time": 32.10779399100011 + }, + { + "clip_ratio/high_max": 0.029166668187826872, + "clip_ratio/high_mean": 0.008333333767950535, + "clip_ratio/low_mean": 0.0238932310603559, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.032226564711891115, + "entropy": 0.09176747733727098, + "epoch": 0.00412, + "grad_norm": 0.5064001083374023, + "kl": 0.6276722047477961, + "learning_rate": 9.999946481752144e-05, + "loss": 0.0257, + "step": 206, + "step_time": 8.04664100899663 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010416667209938169, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1032.0, + "completions/max_terminated_length": 1032.0, + "completions/mean_length": 948.5, + "completions/mean_terminated_length": 948.5, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "entropy": 0.0839753916952759, + "epoch": 0.00414, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6526506543159485, + "kl": 0.5436345608904958, + "learning_rate": 9.999945850275377e-05, + "loss": -0.0066, + "num_tokens": 10788398.0, + "reward": 4.734495639801025, + "reward_std": 13.251731872558594, + "rewards/rollout_reward_func/mean": 4.734495639801025, + "rewards/rollout_reward_func/std": 15.050627708435059, + "sampling/importance_sampling_ratio/max": 1.249489426612854, + "sampling/importance_sampling_ratio/mean": 1.0017802715301514, + "sampling/importance_sampling_ratio/min": 0.5872460603713989, + "sampling/sampling_logp_difference/max": 0.5519323348999023, + "sampling/sampling_logp_difference/mean": 0.007509762421250343, + "step": 207, + "step_time": 30.502280216000145 + }, + { + "clip_ratio/high_max": 0.041666668839752674, + "clip_ratio/high_mean": 0.010416667209938169, + "clip_ratio/low_mean": 0.020126489107497036, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.030543157132342458, + "entropy": 0.0718412920832634, + "epoch": 0.00416, + "grad_norm": 0.9516690969467163, + "kl": 1.0864872355014086, + "learning_rate": 9.999945215094969e-05, + "loss": -0.0086, + "step": 208, + "step_time": 8.340999965001174 + }, + { + "clip_ratio/high_max": 0.008333333767950535, + "clip_ratio/high_mean": 0.0020833334419876337, + "clip_ratio/low_mean": 0.0020833334419876337, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004166666883975267, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1014.0, + "completions/max_terminated_length": 1014.0, + "completions/mean_length": 948.453125, + "completions/mean_terminated_length": 948.453125, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.0782642443664372, + "epoch": 0.00418, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9959932565689087, + "kl": 0.5945225208997726, + "learning_rate": 9.99994457621092e-05, + "loss": 0.0171, + "num_tokens": 10900280.0, + "reward": 9.700709342956543, + "reward_std": 13.213409423828125, + "rewards/rollout_reward_func/mean": 9.700709342956543, + "rewards/rollout_reward_func/std": 14.225313186645508, + "sampling/importance_sampling_ratio/max": 1.3207358121871948, + "sampling/importance_sampling_ratio/mean": 0.968299150466919, + "sampling/importance_sampling_ratio/min": 0.3971961438655853, + "sampling/sampling_logp_difference/max": 0.8888199329376221, + "sampling/sampling_logp_difference/mean": 0.00841559562832117, + "step": 209, + "step_time": 29.9366263410011 + }, + { + "clip_ratio/high_max": 0.025000001303851604, + "clip_ratio/high_mean": 0.006250000325962901, + "clip_ratio/low_mean": 0.02656250144354999, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03281250165309757, + "entropy": 0.07524953898973763, + "epoch": 0.0042, + "grad_norm": 0.22927281260490417, + "kl": 0.5741278808563948, + "learning_rate": 9.999943933623233e-05, + "loss": 0.0142, + "step": 210, + "step_time": 8.610201505000987 + }, + { + "clip_ratio/high_max": 0.012500000651925802, + "clip_ratio/high_mean": 0.0031250001629814506, + "clip_ratio/low_mean": 0.0011160714784637094, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00424107164144516, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1033.0, + "completions/max_terminated_length": 1033.0, + "completions/mean_length": 961.625, + "completions/mean_terminated_length": 961.625, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.08438117569312453, + "epoch": 0.00422, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6056447625160217, + "kl": 0.4765475448220968, + "learning_rate": 9.999943287331907e-05, + "loss": -0.0396, + "num_tokens": 11013133.0, + "reward": 6.143215179443359, + "reward_std": 9.006479263305664, + "rewards/rollout_reward_func/mean": 6.143215179443359, + "rewards/rollout_reward_func/std": 10.255783081054688, + "sampling/importance_sampling_ratio/max": 1.5546733140945435, + "sampling/importance_sampling_ratio/mean": 0.9941245913505554, + "sampling/importance_sampling_ratio/min": 0.5497701168060303, + "sampling/sampling_logp_difference/max": 0.6002916693687439, + "sampling/sampling_logp_difference/mean": 0.0072316620498895645, + "step": 211, + "step_time": 29.858850818000974 + }, + { + "clip_ratio/high_max": 0.020833334419876337, + "clip_ratio/high_mean": 0.006250000325962901, + "clip_ratio/low_mean": 0.01889881060924381, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02514881093520671, + "entropy": 0.08111793245188892, + "epoch": 0.00424, + "grad_norm": 0.3952238857746124, + "kl": 0.5354121858254075, + "learning_rate": 9.999942637336943e-05, + "loss": -0.0419, + "step": 212, + "step_time": 8.145115041997997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0031250001629814506, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0031250001629814506, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1046.0, + "completions/max_terminated_length": 1046.0, + "completions/mean_length": 974.1875, + "completions/mean_terminated_length": 974.1875, + "completions/min_length": 853.0, + "completions/min_terminated_length": 853.0, + "entropy": 0.08407697454094887, + "epoch": 0.00426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5582248568534851, + "kl": 0.5609004180878401, + "learning_rate": 9.999941983638342e-05, + "loss": -0.0096, + "num_tokens": 11126805.0, + "reward": 7.366800308227539, + "reward_std": 11.575126647949219, + "rewards/rollout_reward_func/mean": 7.366800785064697, + "rewards/rollout_reward_func/std": 12.478679656982422, + "sampling/importance_sampling_ratio/max": 1.7624305486679077, + "sampling/importance_sampling_ratio/mean": 1.0073318481445312, + "sampling/importance_sampling_ratio/min": 0.5805040001869202, + "sampling/sampling_logp_difference/max": 0.5259637832641602, + "sampling/sampling_logp_difference/mean": 0.007181447930634022, + "step": 213, + "step_time": 30.68919447299777 + }, + { + "clip_ratio/high_max": 0.012500000651925802, + "clip_ratio/high_mean": 0.005208333721384406, + "clip_ratio/low_mean": 0.02083333453629166, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.026041668257676065, + "entropy": 0.08252408797852695, + "epoch": 0.00428, + "grad_norm": 0.4972332715988159, + "kl": 0.8270881623029709, + "learning_rate": 9.999941326236106e-05, + "loss": -0.0102, + "step": 214, + "step_time": 8.636868058998516 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020833334419876337, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1021.0, + "completions/max_terminated_length": 1021.0, + "completions/mean_length": 947.234375, + "completions/mean_terminated_length": 947.234375, + "completions/min_length": 827.0, + "completions/min_terminated_length": 827.0, + "entropy": 0.09235736005939543, + "epoch": 0.0043, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7149195075035095, + "kl": 0.6700677536427975, + "learning_rate": 9.999940665130233e-05, + "loss": 0.0269, + "num_tokens": 11238594.0, + "reward": 8.236663818359375, + "reward_std": 12.342934608459473, + "rewards/rollout_reward_func/mean": 8.236662864685059, + "rewards/rollout_reward_func/std": 13.346291542053223, + "sampling/importance_sampling_ratio/max": 1.3215135335922241, + "sampling/importance_sampling_ratio/mean": 1.0117213726043701, + "sampling/importance_sampling_ratio/min": 0.607474684715271, + "sampling/sampling_logp_difference/max": 0.3575429916381836, + "sampling/sampling_logp_difference/mean": 0.00792029220610857, + "step": 215, + "step_time": 30.079993358000138 + }, + { + "clip_ratio/high_max": 0.03333333507180214, + "clip_ratio/high_mean": 0.009375000605359674, + "clip_ratio/low_mean": 0.03020833502523601, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.039583335630595684, + "entropy": 0.09003956150263548, + "epoch": 0.00432, + "grad_norm": 0.22825075685977936, + "kl": 0.7063372246921062, + "learning_rate": 9.999940000320725e-05, + "loss": 0.0204, + "step": 216, + "step_time": 8.830438550000508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010416667209938169, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1038.0, + "completions/max_terminated_length": 1038.0, + "completions/mean_length": 973.421875, + "completions/mean_terminated_length": 973.421875, + "completions/min_length": 899.0, + "completions/min_terminated_length": 899.0, + "entropy": 0.08580271410755813, + "epoch": 0.00434, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.544740617275238, + "kl": 0.8732884284108877, + "learning_rate": 9.999939331807582e-05, + "loss": -0.0038, + "num_tokens": 11352163.0, + "reward": 6.939154148101807, + "reward_std": 12.035371780395508, + "rewards/rollout_reward_func/mean": 6.939153671264648, + "rewards/rollout_reward_func/std": 12.4366455078125, + "sampling/importance_sampling_ratio/max": 1.316995620727539, + "sampling/importance_sampling_ratio/mean": 1.0068674087524414, + "sampling/importance_sampling_ratio/min": 0.7823165059089661, + "sampling/sampling_logp_difference/max": 0.2636311650276184, + "sampling/sampling_logp_difference/mean": 0.0060178861021995544, + "step": 217, + "step_time": 30.360063980000632 + }, + { + "clip_ratio/high_max": 0.025000001303851604, + "clip_ratio/high_mean": 0.006250000325962901, + "clip_ratio/low_mean": 0.018750000977888703, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.025000001420266926, + "entropy": 0.08710528793744743, + "epoch": 0.00436, + "grad_norm": 0.38059887290000916, + "kl": 0.908846540376544, + "learning_rate": 9.999938659590807e-05, + "loss": -0.0104, + "step": 218, + "step_time": 7.607007630000226 + }, + { + "clip_ratio/high_max": 0.008333333767950535, + "clip_ratio/high_mean": 0.0031250001629814506, + "clip_ratio/low_mean": 0.0030598959419876337, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006184896221384406, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1034.0, + "completions/max_terminated_length": 1034.0, + "completions/mean_length": 963.703125, + "completions/mean_terminated_length": 963.703125, + "completions/min_length": 890.0, + "completions/min_terminated_length": 890.0, + "entropy": 0.09868060098960996, + "epoch": 0.00438, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.48504236340522766, + "kl": 0.5343823749572039, + "learning_rate": 9.999937983670398e-05, + "loss": 0.0194, + "num_tokens": 11465042.0, + "reward": 6.008334636688232, + "reward_std": 13.721019744873047, + "rewards/rollout_reward_func/mean": 6.008334636688232, + "rewards/rollout_reward_func/std": 14.5517578125, + "sampling/importance_sampling_ratio/max": 1.471420407295227, + "sampling/importance_sampling_ratio/mean": 0.9755445718765259, + "sampling/importance_sampling_ratio/min": 0.5715925097465515, + "sampling/sampling_logp_difference/max": 0.46605920791625977, + "sampling/sampling_logp_difference/mean": 0.008881919085979462, + "step": 219, + "step_time": 31.71598935200018 + }, + { + "clip_ratio/high_max": 0.03750000195577741, + "clip_ratio/high_mean": 0.010416667209938169, + "clip_ratio/low_mean": 0.022851563524454832, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.033268230734393, + "entropy": 0.09918246325105429, + "epoch": 0.0044, + "grad_norm": 0.22379587590694427, + "kl": 0.5816311649978161, + "learning_rate": 9.999937304046355e-05, + "loss": 0.0147, + "step": 220, + "step_time": 8.254640479001864 + }, + { + "clip_ratio/high_max": 0.012500000651925802, + "clip_ratio/high_mean": 0.0031250001629814506, + "clip_ratio/low_mean": 0.0020833334419876337, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005208333604969084, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1030.0, + "completions/max_terminated_length": 1030.0, + "completions/mean_length": 955.53125, + "completions/mean_terminated_length": 955.53125, + "completions/min_length": 894.0, + "completions/min_terminated_length": 894.0, + "entropy": 0.09134439891204238, + "epoch": 0.00442, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7109507322311401, + "kl": 0.6976822856813669, + "learning_rate": 9.999936620718681e-05, + "loss": 0.0063, + "num_tokens": 11577407.0, + "reward": 7.297264099121094, + "reward_std": 9.222230911254883, + "rewards/rollout_reward_func/mean": 7.297264575958252, + "rewards/rollout_reward_func/std": 10.19138240814209, + "sampling/importance_sampling_ratio/max": 1.4536468982696533, + "sampling/importance_sampling_ratio/mean": 0.9984610080718994, + "sampling/importance_sampling_ratio/min": 0.7001582384109497, + "sampling/sampling_logp_difference/max": 0.37113046646118164, + "sampling/sampling_logp_difference/mean": 0.006036648992449045, + "step": 221, + "step_time": 29.85640163500102 + }, + { + "clip_ratio/high_max": 0.03333333507180214, + "clip_ratio/high_mean": 0.009375000488944352, + "clip_ratio/low_mean": 0.014583334210328758, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02395833469927311, + "entropy": 0.09828592231497169, + "epoch": 0.00444, + "grad_norm": 1.0275782346725464, + "kl": 0.5333473347127438, + "learning_rate": 9.999935933687375e-05, + "loss": 0.0064, + "step": 222, + "step_time": 8.916792385998633 + }, + { + "clip_ratio/high_max": 0.008333333767950535, + "clip_ratio/high_mean": 0.0020833334419876337, + "clip_ratio/low_mean": 0.0020833334419876337, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004166666883975267, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1024.0, + "completions/mean_length": 955.5625, + "completions/mean_terminated_length": 955.5625, + "completions/min_length": 864.0, + "completions/min_terminated_length": 864.0, + "entropy": 0.11322583490982652, + "epoch": 0.00446, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6080738306045532, + "kl": 0.4430428724735975, + "learning_rate": 9.999935242952441e-05, + "loss": 0.0136, + "num_tokens": 11689757.0, + "reward": 7.010858535766602, + "reward_std": 12.169811248779297, + "rewards/rollout_reward_func/mean": 7.010858535766602, + "rewards/rollout_reward_func/std": 12.808332443237305, + "sampling/importance_sampling_ratio/max": 1.297809362411499, + "sampling/importance_sampling_ratio/mean": 0.9824950695037842, + "sampling/importance_sampling_ratio/min": 0.6718153953552246, + "sampling/sampling_logp_difference/max": 0.3088874816894531, + "sampling/sampling_logp_difference/mean": 0.007251087576150894, + "step": 223, + "step_time": 31.12233561499943 + }, + { + "clip_ratio/high_max": 0.05000000260770321, + "clip_ratio/high_mean": 0.01770833437331021, + "clip_ratio/low_mean": 0.02285156410653144, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04055989871267229, + "entropy": 0.11228394089266658, + "epoch": 0.00448, + "grad_norm": 0.5497627258300781, + "kl": 0.6058794800192118, + "learning_rate": 9.999934548513874e-05, + "loss": 0.0127, + "step": 224, + "step_time": 8.354907415001435 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0031250001629814506, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004166666883975267, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1039.0, + "completions/max_terminated_length": 1039.0, + "completions/mean_length": 941.9375, + "completions/mean_terminated_length": 941.9375, + "completions/min_length": 811.0, + "completions/min_terminated_length": 811.0, + "entropy": 0.10548029001802206, + "epoch": 0.0045, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5894492864608765, + "kl": 0.5073134936392307, + "learning_rate": 9.999933850371681e-05, + "loss": 0.0086, + "num_tokens": 11801157.0, + "reward": 6.051244258880615, + "reward_std": 9.136930465698242, + "rewards/rollout_reward_func/mean": 6.051244258880615, + "rewards/rollout_reward_func/std": 9.732189178466797, + "sampling/importance_sampling_ratio/max": 1.4082491397857666, + "sampling/importance_sampling_ratio/mean": 0.9974700212478638, + "sampling/importance_sampling_ratio/min": 0.6021063923835754, + "sampling/sampling_logp_difference/max": 0.5747667551040649, + "sampling/sampling_logp_difference/mean": 0.0068025123327970505, + "step": 225, + "step_time": 31.567075909998493 + }, + { + "clip_ratio/high_max": 0.025000001303851604, + "clip_ratio/high_mean": 0.006250000325962901, + "clip_ratio/low_mean": 0.027083334745839238, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.033333335421048105, + "entropy": 0.10574616026133299, + "epoch": 0.00452, + "grad_norm": 0.2798631489276886, + "kl": 0.7715174313634634, + "learning_rate": 9.999933148525857e-05, + "loss": 0.007, + "step": 226, + "step_time": 7.815335610000147 + }, + { + "clip_ratio/high_max": 0.012500000651925802, + "clip_ratio/high_mean": 0.0031250001629814506, + "clip_ratio/low_mean": 0.0020833334419876337, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005208333604969084, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1017.0, + "completions/max_terminated_length": 1017.0, + "completions/mean_length": 959.5625, + "completions/mean_terminated_length": 959.5625, + "completions/min_length": 894.0, + "completions/min_terminated_length": 894.0, + "entropy": 0.11399172944948077, + "epoch": 0.00454, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.568816602230072, + "kl": 0.5976129788905382, + "learning_rate": 9.999932442976408e-05, + "loss": -0.0166, + "num_tokens": 11913755.0, + "reward": 8.459915161132812, + "reward_std": 15.611612319946289, + "rewards/rollout_reward_func/mean": 8.459915161132812, + "rewards/rollout_reward_func/std": 15.882699012756348, + "sampling/importance_sampling_ratio/max": 1.710551381111145, + "sampling/importance_sampling_ratio/mean": 1.017797827720642, + "sampling/importance_sampling_ratio/min": 0.7439659833908081, + "sampling/sampling_logp_difference/max": 0.36053359508514404, + "sampling/sampling_logp_difference/mean": 0.008504325523972511, + "step": 227, + "step_time": 31.015096133000043 + }, + { + "clip_ratio/high_max": 0.03750000195577741, + "clip_ratio/high_mean": 0.013541667489334941, + "clip_ratio/low_mean": 0.020833334769122303, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03437500272411853, + "entropy": 0.11824841611087322, + "epoch": 0.00456, + "grad_norm": 0.31711068749427795, + "kl": 0.6181838270276785, + "learning_rate": 9.999931733723329e-05, + "loss": -0.0224, + "step": 228, + "step_time": 8.539993245000005 + }, + { + "clip_ratio/high_max": 0.008333333767950535, + "clip_ratio/high_mean": 0.0020833334419876337, + "clip_ratio/low_mean": 0.0020833334419876337, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004166666883975267, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1029.0, + "completions/max_terminated_length": 1029.0, + "completions/mean_length": 949.875, + "completions/mean_terminated_length": 949.875, + "completions/min_length": 829.0, + "completions/min_terminated_length": 829.0, + "entropy": 0.11835443088784814, + "epoch": 0.00458, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.510497510433197, + "kl": 0.562442360445857, + "learning_rate": 9.999931020766625e-05, + "loss": -0.0151, + "num_tokens": 12025731.0, + "reward": 7.8150177001953125, + "reward_std": 10.93730640411377, + "rewards/rollout_reward_func/mean": 7.8150177001953125, + "rewards/rollout_reward_func/std": 12.047165870666504, + "sampling/importance_sampling_ratio/max": 1.8081343173980713, + "sampling/importance_sampling_ratio/mean": 1.0230156183242798, + "sampling/importance_sampling_ratio/min": 0.5872366428375244, + "sampling/sampling_logp_difference/max": 0.5174302458763123, + "sampling/sampling_logp_difference/mean": 0.008099589496850967, + "step": 229, + "step_time": 30.196818825002993 + }, + { + "clip_ratio/high_max": 0.029166668187826872, + "clip_ratio/high_mean": 0.007291667046956718, + "clip_ratio/low_mean": 0.028125001466833055, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03541666886303574, + "entropy": 0.12235437287017703, + "epoch": 0.0046, + "grad_norm": 0.6880154013633728, + "kl": 0.5698418729007244, + "learning_rate": 9.999930304106295e-05, + "loss": -0.0198, + "step": 230, + "step_time": 9.264137213997856 + }, + { + "clip_ratio/high_max": 0.01666666753590107, + "clip_ratio/high_mean": 0.004166666883975267, + "clip_ratio/low_mean": 0.0031250001629814506, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007291667046956718, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1022.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 953.078125, + "completions/mean_terminated_length": 953.078125, + "completions/min_length": 641.0, + "completions/min_terminated_length": 641.0, + "entropy": 0.10955408262088895, + "epoch": 0.00462, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4517523944377899, + "kl": 0.562993137165904, + "learning_rate": 9.99992958374234e-05, + "loss": 0.0172, + "num_tokens": 12137928.0, + "reward": 6.407680034637451, + "reward_std": 12.907535552978516, + "rewards/rollout_reward_func/mean": 6.407680034637451, + "rewards/rollout_reward_func/std": 14.238213539123535, + "sampling/importance_sampling_ratio/max": 1.3800395727157593, + "sampling/importance_sampling_ratio/mean": 0.989479124546051, + "sampling/importance_sampling_ratio/min": 0.5886368155479431, + "sampling/sampling_logp_difference/max": 0.4858388900756836, + "sampling/sampling_logp_difference/mean": 0.007202588953077793, + "step": 231, + "step_time": 30.985224181000376 + }, + { + "clip_ratio/high_max": 0.04583333572372794, + "clip_ratio/high_mean": 0.012500000768341124, + "clip_ratio/low_mean": 0.023177084513008595, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03567708551418036, + "entropy": 0.09367383690550923, + "epoch": 0.00464, + "grad_norm": 0.3370003402233124, + "kl": 0.5898754354566336, + "learning_rate": 9.99992885967476e-05, + "loss": 0.0141, + "step": 232, + "step_time": 7.848031210001864 + }, + { + "clip_ratio/high_max": 0.008333333767950535, + "clip_ratio/high_mean": 0.0020833334419876337, + "clip_ratio/low_mean": 0.0031250001629814506, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005208333604969084, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1034.0, + "completions/max_terminated_length": 1034.0, + "completions/mean_length": 956.953125, + "completions/mean_terminated_length": 956.953125, + "completions/min_length": 643.0, + "completions/min_terminated_length": 643.0, + "entropy": 0.09109799051657319, + "epoch": 0.00466, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9314411878585815, + "kl": 0.6043513156473637, + "learning_rate": 9.999928131903557e-05, + "loss": -0.0214, + "num_tokens": 12250409.0, + "reward": 4.949494361877441, + "reward_std": 13.414144515991211, + "rewards/rollout_reward_func/mean": 4.949494361877441, + "rewards/rollout_reward_func/std": 14.449867248535156, + "sampling/importance_sampling_ratio/max": 1.7543931007385254, + "sampling/importance_sampling_ratio/mean": 1.0081617832183838, + "sampling/importance_sampling_ratio/min": 0.7344788908958435, + "sampling/sampling_logp_difference/max": 0.40094685554504395, + "sampling/sampling_logp_difference/mean": 0.007154828868806362, + "step": 233, + "step_time": 31.97192203099803 + }, + { + "clip_ratio/high_max": 0.06250000279396772, + "clip_ratio/high_mean": 0.018750001094304025, + "clip_ratio/low_mean": 0.026041668141260743, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04479166946839541, + "entropy": 0.07302908715792, + "epoch": 0.00468, + "grad_norm": 0.6798368692398071, + "kl": 1.063211616128683, + "learning_rate": 9.999927400428733e-05, + "loss": -0.0247, + "step": 234, + "step_time": 8.365730943999552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0017361111240461469, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017361111240461469, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1030.0, + "completions/max_terminated_length": 1030.0, + "completions/mean_length": 939.296875, + "completions/mean_terminated_length": 939.296875, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "entropy": 0.08065455732867122, + "epoch": 0.0047, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6933859586715698, + "kl": 0.5536066945642233, + "learning_rate": 9.999926665250286e-05, + "loss": -0.0262, + "num_tokens": 12361673.0, + "reward": 6.229083061218262, + "reward_std": 13.382326126098633, + "rewards/rollout_reward_func/mean": 6.229083061218262, + "rewards/rollout_reward_func/std": 14.236706733703613, + "sampling/importance_sampling_ratio/max": 1.6296361684799194, + "sampling/importance_sampling_ratio/mean": 0.9904996752738953, + "sampling/importance_sampling_ratio/min": 0.554724395275116, + "sampling/sampling_logp_difference/max": 0.5841927528381348, + "sampling/sampling_logp_difference/mean": 0.007334005553275347, + "step": 235, + "step_time": 30.556930122000267 + }, + { + "clip_ratio/high_max": 0.041666668839752674, + "clip_ratio/high_mean": 0.012500000768341124, + "clip_ratio/low_mean": 0.023177084629423916, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03567708586342633, + "entropy": 0.07718153693713248, + "epoch": 0.00472, + "grad_norm": 0.373847097158432, + "kl": 0.7903371974825859, + "learning_rate": 9.999925926368217e-05, + "loss": -0.0281, + "step": 236, + "step_time": 8.405578448001506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010416667209938169, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1029.0, + "completions/max_terminated_length": 1029.0, + "completions/mean_length": 969.71875, + "completions/mean_terminated_length": 969.71875, + "completions/min_length": 882.0, + "completions/min_terminated_length": 882.0, + "entropy": 0.09033584129065275, + "epoch": 0.00474, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.672709584236145, + "kl": 0.4807140491902828, + "learning_rate": 9.999925183782528e-05, + "loss": 0.0206, + "num_tokens": 12475023.0, + "reward": 7.968780517578125, + "reward_std": 14.767425537109375, + "rewards/rollout_reward_func/mean": 7.968780517578125, + "rewards/rollout_reward_func/std": 15.451577186584473, + "sampling/importance_sampling_ratio/max": 1.446393370628357, + "sampling/importance_sampling_ratio/mean": 1.0078678131103516, + "sampling/importance_sampling_ratio/min": 0.7536318898200989, + "sampling/sampling_logp_difference/max": 0.36260342597961426, + "sampling/sampling_logp_difference/mean": 0.006248952820897102, + "step": 237, + "step_time": 30.60275455199826 + }, + { + "clip_ratio/high_max": 0.054166669491678476, + "clip_ratio/high_mean": 0.01770833448972553, + "clip_ratio/low_mean": 0.010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.028125002165324986, + "entropy": 0.10236532241106033, + "epoch": 0.00476, + "grad_norm": 0.1718801110982895, + "kl": 0.45644159242510796, + "learning_rate": 9.999924437493219e-05, + "loss": 0.0137, + "step": 238, + "step_time": 8.174696675001542 + }, + { + "clip_ratio/high_max": 0.008333333767950535, + "clip_ratio/high_mean": 0.0020833334419876337, + "clip_ratio/low_mean": 0.0020833334419876337, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004166666883975267, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1038.0, + "completions/max_terminated_length": 1038.0, + "completions/mean_length": 986.125, + "completions/mean_terminated_length": 986.125, + "completions/min_length": 910.0, + "completions/min_terminated_length": 910.0, + "entropy": 0.11793840350583196, + "epoch": 0.00478, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.403202623128891, + "kl": 0.5491157062351704, + "learning_rate": 9.99992368750029e-05, + "loss": 0.0205, + "num_tokens": 12589470.0, + "reward": 7.7764387130737305, + "reward_std": 10.855308532714844, + "rewards/rollout_reward_func/mean": 7.776438236236572, + "rewards/rollout_reward_func/std": 11.845745086669922, + "sampling/importance_sampling_ratio/max": 1.4125927686691284, + "sampling/importance_sampling_ratio/mean": 1.0074553489685059, + "sampling/importance_sampling_ratio/min": 0.6787428855895996, + "sampling/sampling_logp_difference/max": 0.36117464303970337, + "sampling/sampling_logp_difference/mean": 0.007153394166380167, + "step": 239, + "step_time": 31.446214477003195 + }, + { + "clip_ratio/high_max": 0.06614583590999246, + "clip_ratio/high_mean": 0.02070312586147338, + "clip_ratio/low_mean": 0.018750000977888703, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03945312695577741, + "entropy": 0.12702699471265078, + "epoch": 0.0048, + "grad_norm": 0.28333526849746704, + "kl": 0.5486433319747448, + "learning_rate": 9.999922933803743e-05, + "loss": 0.0157, + "step": 240, + "step_time": 8.18167577299937 + }, + { + "clip_ratio/high_max": 0.012500000651925802, + "clip_ratio/high_mean": 0.004101562546566129, + "clip_ratio/low_mean": 0.0015997024602256715, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0057012650067918, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1028.0, + "completions/max_terminated_length": 1028.0, + "completions/mean_length": 947.078125, + "completions/mean_terminated_length": 947.078125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.15819989750161767, + "epoch": 0.00482, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5497854948043823, + "kl": 0.6653936766088009, + "learning_rate": 9.999922176403578e-05, + "loss": 0.0274, + "num_tokens": 12701387.0, + "reward": 4.367884635925293, + "reward_std": 14.958491325378418, + "rewards/rollout_reward_func/mean": 4.367884635925293, + "rewards/rollout_reward_func/std": 15.79384708404541, + "sampling/importance_sampling_ratio/max": 1.7386236190795898, + "sampling/importance_sampling_ratio/mean": 1.0037915706634521, + "sampling/importance_sampling_ratio/min": 1.5100153958014693e-17, + "sampling/sampling_logp_difference/max": 32.36700439453125, + "sampling/sampling_logp_difference/mean": 0.050702136009931564, + "step": 241, + "step_time": 30.312135679001585 + }, + { + "clip_ratio/high_max": 0.0713541698642075, + "clip_ratio/high_mean": 0.02304687607102096, + "clip_ratio/low_mean": 0.019182722782716155, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.042229598737321794, + "entropy": 0.1654459210112691, + "epoch": 0.00484, + "grad_norm": 0.2647717595100403, + "kl": 0.6706695519387722, + "learning_rate": 9.999921415299796e-05, + "loss": 0.0208, + "step": 242, + "step_time": 8.75198459999956 + }, + { + "clip_ratio/high_max": 0.008333333767950535, + "clip_ratio/high_mean": 0.0020833334419876337, + "clip_ratio/low_mean": 0.0010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0031250001629814506, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1034.0, + "completions/max_terminated_length": 1034.0, + "completions/mean_length": 975.015625, + "completions/mean_terminated_length": 975.015625, + "completions/min_length": 829.0, + "completions/min_terminated_length": 829.0, + "entropy": 0.1656077685765922, + "epoch": 0.00486, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5207899808883667, + "kl": 0.4836368393152952, + "learning_rate": 9.999920650492399e-05, + "loss": -0.0058, + "num_tokens": 12815104.0, + "reward": 8.474200248718262, + "reward_std": 13.949186325073242, + "rewards/rollout_reward_func/mean": 8.474200248718262, + "rewards/rollout_reward_func/std": 15.287591934204102, + "sampling/importance_sampling_ratio/max": 1.3931519985198975, + "sampling/importance_sampling_ratio/mean": 0.9963239431381226, + "sampling/importance_sampling_ratio/min": 7.17475301392767e-10, + "sampling/sampling_logp_difference/max": 14.08260726928711, + "sampling/sampling_logp_difference/mean": 0.0297236330807209, + "step": 243, + "step_time": 29.95098099400184 + }, + { + "clip_ratio/high_max": 0.04583333572372794, + "clip_ratio/high_mean": 0.01562500116415322, + "clip_ratio/low_mean": 0.019308037008158863, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.034933038405142725, + "entropy": 0.17500293161720037, + "epoch": 0.00488, + "grad_norm": 0.2083873599767685, + "kl": 0.4789597373455763, + "learning_rate": 9.999919881981386e-05, + "loss": -0.0127, + "step": 244, + "step_time": 9.603725530998418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0005580357392318547, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005580357392318547, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1032.0, + "completions/max_terminated_length": 1032.0, + "completions/mean_length": 960.71875, + "completions/mean_terminated_length": 960.71875, + "completions/min_length": 870.0, + "completions/min_terminated_length": 870.0, + "entropy": 0.1825911095365882, + "epoch": 0.0049, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5889225602149963, + "kl": 0.9096355475485325, + "learning_rate": 9.999919109766759e-05, + "loss": 0.0086, + "num_tokens": 12927807.0, + "reward": 4.357412338256836, + "reward_std": 10.907012939453125, + "rewards/rollout_reward_func/mean": 4.357412338256836, + "rewards/rollout_reward_func/std": 11.52568531036377, + "sampling/importance_sampling_ratio/max": 1.709380030632019, + "sampling/importance_sampling_ratio/mean": 1.0086195468902588, + "sampling/importance_sampling_ratio/min": 0.7435536980628967, + "sampling/sampling_logp_difference/max": 0.24706459045410156, + "sampling/sampling_logp_difference/mean": 0.007025801111012697, + "step": 245, + "step_time": 31.461640581997926 + }, + { + "clip_ratio/high_max": 0.04583333572372794, + "clip_ratio/high_mean": 0.014583334210328758, + "clip_ratio/low_mean": 0.02460007555782795, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.039183410233817995, + "entropy": 0.1923405658453703, + "epoch": 0.00492, + "grad_norm": 0.2593821585178375, + "kl": 0.6401933804154396, + "learning_rate": 9.999918333848517e-05, + "loss": -0.0009, + "step": 246, + "step_time": 7.859529026000018 + }, + { + "clip_ratio/high_max": 0.004464285913854837, + "clip_ratio/high_mean": 0.0011160714784637094, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0011160714784637094, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1032.0, + "completions/max_terminated_length": 1032.0, + "completions/mean_length": 962.265625, + "completions/mean_terminated_length": 962.265625, + "completions/min_length": 699.0, + "completions/min_terminated_length": 699.0, + "entropy": 0.21913561783730984, + "epoch": 0.00494, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5455278158187866, + "kl": 0.5414405167102814, + "learning_rate": 9.999917554226662e-05, + "loss": 0.0071, + "num_tokens": 13040672.0, + "reward": 8.484970092773438, + "reward_std": 13.802679061889648, + "rewards/rollout_reward_func/mean": 8.484970092773438, + "rewards/rollout_reward_func/std": 13.872236251831055, + "sampling/importance_sampling_ratio/max": 1.3358259201049805, + "sampling/importance_sampling_ratio/mean": 0.9915522336959839, + "sampling/importance_sampling_ratio/min": 0.004653692711144686, + "sampling/sampling_logp_difference/max": 4.521495819091797, + "sampling/sampling_logp_difference/mean": 0.016463816165924072, + "step": 247, + "step_time": 32.296578387999034 + }, + { + "clip_ratio/high_max": 0.07113095559179783, + "clip_ratio/high_mean": 0.025074406410567462, + "clip_ratio/low_mean": 0.03020833490882069, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05528274131938815, + "entropy": 0.2346064280718565, + "epoch": 0.00496, + "grad_norm": 0.28457146883010864, + "kl": 0.5182771291583776, + "learning_rate": 9.999916770901196e-05, + "loss": 0.0003, + "step": 248, + "step_time": 8.322071146998496 + }, + { + "clip_ratio/high_max": 0.008333333767950535, + "clip_ratio/high_mean": 0.0020833334419876337, + "clip_ratio/low_mean": 0.0010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0031250001629814506, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1038.0, + "completions/max_terminated_length": 1038.0, + "completions/mean_length": 947.125, + "completions/mean_terminated_length": 947.125, + "completions/min_length": 887.0, + "completions/min_terminated_length": 887.0, + "entropy": 0.22256971709430218, + "epoch": 0.00498, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4911050498485565, + "kl": 0.5106718242168427, + "learning_rate": 9.999915983872117e-05, + "loss": 0.0185, + "num_tokens": 13152426.0, + "reward": 8.859310150146484, + "reward_std": 14.251840591430664, + "rewards/rollout_reward_func/mean": 8.859310150146484, + "rewards/rollout_reward_func/std": 15.995503425598145, + "sampling/importance_sampling_ratio/max": 1.1877168416976929, + "sampling/importance_sampling_ratio/mean": 1.0040578842163086, + "sampling/importance_sampling_ratio/min": 0.7722747921943665, + "sampling/sampling_logp_difference/max": 0.2686450481414795, + "sampling/sampling_logp_difference/mean": 0.008781258016824722, + "step": 249, + "step_time": 31.257320647998313 + }, + { + "clip_ratio/high_max": 0.058333335909992456, + "clip_ratio/high_mean": 0.019791667815297842, + "clip_ratio/low_mean": 0.02544642984867096, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04523809743113816, + "entropy": 0.22759789694100618, + "epoch": 0.005, + "grad_norm": 0.32039502263069153, + "kl": 0.5074543356895447, + "learning_rate": 9.999915193139428e-05, + "loss": 0.0067, + "step": 250, + "step_time": 8.727711221999925 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0010416667209938169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020833334419876337, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1033.0, + "completions/max_terminated_length": 1033.0, + "completions/mean_length": 972.3125, + "completions/mean_terminated_length": 972.3125, + "completions/min_length": 885.0, + "completions/min_terminated_length": 885.0, + "entropy": 0.2637898661196232, + "epoch": 0.00502, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6115661263465881, + "kl": 0.5041398257017136, + "learning_rate": 9.999914398703127e-05, + "loss": 0.0222, + "num_tokens": 13265926.0, + "reward": 8.36276626586914, + "reward_std": 12.634754180908203, + "rewards/rollout_reward_func/mean": 8.36276626586914, + "rewards/rollout_reward_func/std": 13.79938793182373, + "sampling/importance_sampling_ratio/max": 1.3860008716583252, + "sampling/importance_sampling_ratio/mean": 0.9989358186721802, + "sampling/importance_sampling_ratio/min": 0.6789365410804749, + "sampling/sampling_logp_difference/max": 0.4403858184814453, + "sampling/sampling_logp_difference/mean": 0.011640775017440319, + "step": 251, + "step_time": 30.78323078500125 + }, + { + "clip_ratio/high_max": 0.08333333721384406, + "clip_ratio/high_mean": 0.026041668141260743, + "clip_ratio/low_mean": 0.024479167768731713, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05052083625923842, + "entropy": 0.25762353744357824, + "epoch": 0.00504, + "grad_norm": 0.33068326115608215, + "kl": 0.5315965916961432, + "learning_rate": 9.99991360056322e-05, + "loss": 0.0108, + "step": 252, + "step_time": 8.417674423999415 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010416667209938169, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1039.0, + "completions/max_terminated_length": 1039.0, + "completions/mean_length": 962.484375, + "completions/mean_terminated_length": 962.484375, + "completions/min_length": 640.0, + "completions/min_terminated_length": 640.0, + "entropy": 0.2320685014128685, + "epoch": 0.00506, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.606693685054779, + "kl": 0.5243742056190968, + "learning_rate": 9.999912798719702e-05, + "loss": 0.0154, + "num_tokens": 13378838.0, + "reward": 5.471320152282715, + "reward_std": 16.305179595947266, + "rewards/rollout_reward_func/mean": 5.471320629119873, + "rewards/rollout_reward_func/std": 16.513338088989258, + "sampling/importance_sampling_ratio/max": 1.3763768672943115, + "sampling/importance_sampling_ratio/mean": 0.9975243806838989, + "sampling/importance_sampling_ratio/min": 0.706875205039978, + "sampling/sampling_logp_difference/max": 0.28901320695877075, + "sampling/sampling_logp_difference/mean": 0.009924216195940971, + "step": 253, + "step_time": 31.570553302000008 + }, + { + "clip_ratio/high_max": 0.058333336375653744, + "clip_ratio/high_mean": 0.018750001094304025, + "clip_ratio/low_mean": 0.026041667792014778, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.044791669119149446, + "entropy": 0.21940706949681044, + "epoch": 0.00508, + "grad_norm": 0.3425885736942291, + "kl": 0.6501965597271919, + "learning_rate": 9.999911993172577e-05, + "loss": 0.0077, + "step": 254, + "step_time": 8.264053588000024 + }, + { + "clip_ratio/high_max": 0.004166666883975267, + "clip_ratio/high_mean": 0.0010416667209938169, + "clip_ratio/low_mean": 0.0020833334419876337, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0031250001629814506, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1031.0, + "completions/max_terminated_length": 1031.0, + "completions/mean_length": 953.46875, + "completions/mean_terminated_length": 953.46875, + "completions/min_length": 900.0, + "completions/min_terminated_length": 900.0, + "entropy": 0.21140480507165194, + "epoch": 0.0051, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5503631234169006, + "kl": 0.5327232480049133, + "learning_rate": 9.999911183921846e-05, + "loss": 0.0038, + "num_tokens": 13491042.0, + "reward": 8.259774208068848, + "reward_std": 10.848722457885742, + "rewards/rollout_reward_func/mean": 8.259774208068848, + "rewards/rollout_reward_func/std": 11.306256294250488, + "sampling/importance_sampling_ratio/max": 1.4545994997024536, + "sampling/importance_sampling_ratio/mean": 0.9899890422821045, + "sampling/importance_sampling_ratio/min": 0.6251944303512573, + "sampling/sampling_logp_difference/max": 0.42076706886291504, + "sampling/sampling_logp_difference/mean": 0.01122802309691906, + "step": 255, + "step_time": 31.768314117000045 + }, + { + "clip_ratio/high_max": 0.05476190708577633, + "clip_ratio/high_mean": 0.022023811121471226, + "clip_ratio/low_mean": 0.022916668327525258, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04494047968182713, + "entropy": 0.21271847933530807, + "epoch": 0.00512, + "grad_norm": 1.165165662765503, + "kl": 0.5640581175684929, + "learning_rate": 9.999910370967507e-05, + "loss": -0.0008, + "step": 256, + "step_time": 8.655397303001337 + }, + { + "clip_ratio/high_max": 0.0034722222480922937, + "clip_ratio/high_mean": 0.0008680555620230734, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008680555620230734, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1343.0, + "completions/max_terminated_length": 1343.0, + "completions/mean_length": 1241.921875, + "completions/mean_terminated_length": 1241.921875, + "completions/min_length": 1084.0, + "completions/min_terminated_length": 1084.0, + "entropy": 0.2426544101908803, + "epoch": 0.00514, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8530539870262146, + "kl": 0.5651203468441963, + "learning_rate": 9.999909554309565e-05, + "loss": 0.0047, + "num_tokens": 13621717.0, + "reward": 4.329623699188232, + "reward_std": 14.394445419311523, + "rewards/rollout_reward_func/mean": 4.329623222351074, + "rewards/rollout_reward_func/std": 14.93822193145752, + "sampling/importance_sampling_ratio/max": 1.2321135997772217, + "sampling/importance_sampling_ratio/mean": 0.9581992626190186, + "sampling/importance_sampling_ratio/min": 0.2660026550292969, + "sampling/sampling_logp_difference/max": 1.2229857444763184, + "sampling/sampling_logp_difference/mean": 0.013221165165305138, + "step": 257, + "step_time": 37.82865672800108 + }, + { + "clip_ratio/high_max": 0.06597222317941487, + "clip_ratio/high_mean": 0.027732091082725674, + "clip_ratio/low_mean": 0.038194445020053536, + "clip_ratio/low_min": 0.0034722222480922937, + "clip_ratio/region_mean": 0.06592653610277921, + "entropy": 0.2270987592637539, + "epoch": 0.00516, + "grad_norm": 0.43574994802474976, + "kl": 0.6589642316102982, + "learning_rate": 9.999908733948017e-05, + "loss": -0.0093, + "step": 258, + "step_time": 10.512357729995529 + }, + { + "clip_ratio/high_max": 0.0034722222480922937, + "clip_ratio/high_mean": 0.0008680555620230734, + "clip_ratio/low_mean": 0.0017361111240461469, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026041666860692203, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1366.0, + "completions/max_terminated_length": 1366.0, + "completions/mean_length": 1250.96875, + "completions/mean_terminated_length": 1250.96875, + "completions/min_length": 735.0, + "completions/min_terminated_length": 735.0, + "entropy": 0.213697855360806, + "epoch": 0.00518, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6505308747291565, + "kl": 0.5941296126693487, + "learning_rate": 9.999907909882866e-05, + "loss": -0.0204, + "num_tokens": 13753091.0, + "reward": 6.001728057861328, + "reward_std": 15.827871322631836, + "rewards/rollout_reward_func/mean": 6.001728057861328, + "rewards/rollout_reward_func/std": 16.02460479736328, + "sampling/importance_sampling_ratio/max": 1.3350346088409424, + "sampling/importance_sampling_ratio/mean": 0.9674654006958008, + "sampling/importance_sampling_ratio/min": 0.5470981001853943, + "sampling/sampling_logp_difference/max": 0.5512038469314575, + "sampling/sampling_logp_difference/mean": 0.011880462057888508, + "step": 259, + "step_time": 38.35317581399886 + }, + { + "clip_ratio/high_max": 0.05208333395421505, + "clip_ratio/high_mean": 0.015625000174622983, + "clip_ratio/low_mean": 0.04037990275537595, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.056004903570283204, + "entropy": 0.1957033844664693, + "epoch": 0.0052, + "grad_norm": 0.45430490374565125, + "kl": 0.781089099124074, + "learning_rate": 9.999907082114112e-05, + "loss": -0.0313, + "step": 260, + "step_time": 9.020615039000404 + }, + { + "clip_ratio/high_max": 0.016812865156680346, + "clip_ratio/high_mean": 0.004203216289170086, + "clip_ratio/low_mean": 0.0026041666860692203, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006807382975239307, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1335.0, + "completions/max_terminated_length": 1335.0, + "completions/mean_length": 1213.078125, + "completions/mean_terminated_length": 1213.078125, + "completions/min_length": 626.0, + "completions/min_terminated_length": 626.0, + "entropy": 0.18847014661878347, + "epoch": 0.00522, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6835371851921082, + "kl": 0.542348800227046, + "learning_rate": 9.999906250641758e-05, + "loss": 0.0145, + "num_tokens": 13881882.0, + "reward": 5.304417610168457, + "reward_std": 14.105676651000977, + "rewards/rollout_reward_func/mean": 5.304417133331299, + "rewards/rollout_reward_func/std": 14.791868209838867, + "sampling/importance_sampling_ratio/max": 1.3720983266830444, + "sampling/importance_sampling_ratio/mean": 0.9722362756729126, + "sampling/importance_sampling_ratio/min": 8.055465437370129e-20, + "sampling/sampling_logp_difference/max": 38.39814376831055, + "sampling/sampling_logp_difference/mean": 0.05052501708269119, + "step": 261, + "step_time": 39.094199752998065 + }, + { + "clip_ratio/high_max": 0.05559855583123863, + "clip_ratio/high_mean": 0.018239916767925024, + "clip_ratio/low_mean": 0.02690972271375358, + "clip_ratio/low_min": 0.0034722222480922937, + "clip_ratio/region_mean": 0.04514963936526328, + "entropy": 0.1835553077980876, + "epoch": 0.00524, + "grad_norm": 0.266696572303772, + "kl": 0.5767297390848398, + "learning_rate": 9.9999054154658e-05, + "loss": 0.0025, + "step": 262, + "step_time": 9.272368914001163 + }, + { + "clip_ratio/high_max": 0.010416666744276881, + "clip_ratio/high_mean": 0.0026041666860692203, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026041666860692203, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1344.0, + "completions/max_terminated_length": 1344.0, + "completions/mean_length": 1241.4375, + "completions/mean_terminated_length": 1241.4375, + "completions/min_length": 902.0, + "completions/min_terminated_length": 902.0, + "entropy": 0.1920191366225481, + "epoch": 0.00526, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8278535604476929, + "kl": 0.5627734903246164, + "learning_rate": 9.999904576586242e-05, + "loss": -0.0123, + "num_tokens": 14012567.0, + "reward": 3.131194591522217, + "reward_std": 12.649508476257324, + "rewards/rollout_reward_func/mean": 3.131195068359375, + "rewards/rollout_reward_func/std": 12.768006324768066, + "sampling/importance_sampling_ratio/max": 1.5249695777893066, + "sampling/importance_sampling_ratio/mean": 1.0084636211395264, + "sampling/importance_sampling_ratio/min": 0.6291685700416565, + "sampling/sampling_logp_difference/max": 0.48067259788513184, + "sampling/sampling_logp_difference/mean": 0.011031190864741802, + "step": 263, + "step_time": 37.94770654900185 + }, + { + "clip_ratio/high_max": 0.06597222364507616, + "clip_ratio/high_mean": 0.02170138922519982, + "clip_ratio/low_mean": 0.027777778508607298, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04947916854871437, + "entropy": 0.1821621311828494, + "epoch": 0.00528, + "grad_norm": 0.30830591917037964, + "kl": 0.6204142663627863, + "learning_rate": 9.999903734003084e-05, + "loss": -0.0238, + "step": 264, + "step_time": 9.788196208000045 + }, + { + "clip_ratio/high_max": 0.010416666744276881, + "clip_ratio/high_mean": 0.0026041666860692203, + "clip_ratio/low_mean": 0.0017361111240461469, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004340277810115367, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1354.0, + "completions/max_terminated_length": 1354.0, + "completions/mean_length": 1215.875, + "completions/mean_terminated_length": 1215.875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.18026093766093254, + "epoch": 0.0053, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7985097765922546, + "kl": 0.5250384621322155, + "learning_rate": 9.999902887716329e-05, + "loss": -0.0455, + "num_tokens": 14141610.0, + "reward": 2.849423408508301, + "reward_std": 12.35162353515625, + "rewards/rollout_reward_func/mean": 2.849423408508301, + "rewards/rollout_reward_func/std": 12.910691261291504, + "sampling/importance_sampling_ratio/max": 1.7354861497879028, + "sampling/importance_sampling_ratio/mean": 0.9913997650146484, + "sampling/importance_sampling_ratio/min": 0.53452068567276, + "sampling/sampling_logp_difference/max": 0.5425161123275757, + "sampling/sampling_logp_difference/mean": 0.012209449894726276, + "step": 265, + "step_time": 38.11352587100009 + }, + { + "clip_ratio/high_max": 0.06311274622566998, + "clip_ratio/high_mean": 0.01925040880450979, + "clip_ratio/low_mean": 0.030831291631329805, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.050081700494047254, + "entropy": 0.1743807177990675, + "epoch": 0.00532, + "grad_norm": 0.9081993103027344, + "kl": 1.4623642209917307, + "learning_rate": 9.999902037725976e-05, + "loss": -0.0483, + "step": 266, + "step_time": 9.7086338709978 + }, + { + "clip_ratio/high_max": 0.010416666744276881, + "clip_ratio/high_mean": 0.0026041666860692203, + "clip_ratio/low_mean": 0.005259395460598171, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007863562146667391, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1349.0, + "completions/max_terminated_length": 1349.0, + "completions/mean_length": 1235.953125, + "completions/mean_terminated_length": 1235.953125, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + "entropy": 0.1784328306093812, + "epoch": 0.00534, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8215274810791016, + "kl": 0.49557984061539173, + "learning_rate": 9.999901184032026e-05, + "loss": 0.0099, + "num_tokens": 14271910.0, + "reward": 6.570675849914551, + "reward_std": 11.428293228149414, + "rewards/rollout_reward_func/mean": 6.570675849914551, + "rewards/rollout_reward_func/std": 11.919609069824219, + "sampling/importance_sampling_ratio/max": 1.5103188753128052, + "sampling/importance_sampling_ratio/mean": 1.018727421760559, + "sampling/importance_sampling_ratio/min": 1.0843930725359919e-15, + "sampling/sampling_logp_difference/max": 27.71844482421875, + "sampling/sampling_logp_difference/mean": 0.04110131412744522, + "step": 267, + "step_time": 40.345479872002215 + }, + { + "clip_ratio/high_max": 0.08261846494860947, + "clip_ratio/high_mean": 0.025862949551083148, + "clip_ratio/low_mean": 0.025904605397954583, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05176755564752966, + "entropy": 0.18014734331518412, + "epoch": 0.00536, + "grad_norm": 0.3775624632835388, + "kl": 0.5100179798901081, + "learning_rate": 9.99990032663448e-05, + "loss": -0.0005, + "step": 268, + "step_time": 8.8038962849987 + }, + { + "clip_ratio/high_max": 0.010620915098115802, + "clip_ratio/high_mean": 0.0026552287745289505, + "clip_ratio/low_mean": 0.001787173212505877, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0044424019870348275, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1348.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 1209.5625, + "completions/mean_terminated_length": 1209.5625, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "entropy": 0.19260858092457056, + "epoch": 0.00538, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9969385266304016, + "kl": 0.48063670098781586, + "learning_rate": 9.999899465533337e-05, + "loss": -0.0145, + "num_tokens": 14400520.0, + "reward": 4.870312690734863, + "reward_std": 12.755669593811035, + "rewards/rollout_reward_func/mean": 4.870312690734863, + "rewards/rollout_reward_func/std": 12.786203384399414, + "sampling/importance_sampling_ratio/max": 1.391904354095459, + "sampling/importance_sampling_ratio/mean": 0.9837595224380493, + "sampling/importance_sampling_ratio/min": 0.5111071467399597, + "sampling/sampling_logp_difference/max": 0.6141395568847656, + "sampling/sampling_logp_difference/mean": 0.012509889900684357, + "step": 269, + "step_time": 39.08969898599935 + }, + { + "clip_ratio/high_max": 0.07679738639853895, + "clip_ratio/high_mean": 0.025275735883042216, + "clip_ratio/low_mean": 0.03416053985711187, + "clip_ratio/low_min": 0.0034722222480922937, + "clip_ratio/region_mean": 0.05943627591477707, + "entropy": 0.190420214086771, + "epoch": 0.0054, + "grad_norm": 2.133584499359131, + "kl": 1.2963667679578066, + "learning_rate": 9.999898600728599e-05, + "loss": -0.0154, + "step": 270, + "step_time": 10.067689283000618 + }, + { + "clip_ratio/high_max": 0.0034722222480922937, + "clip_ratio/high_mean": 0.0008680555620230734, + "clip_ratio/low_mean": 0.0008680555620230734, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017361111240461469, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1356.0, + "completions/max_terminated_length": 1356.0, + "completions/mean_length": 1241.09375, + "completions/mean_terminated_length": 1241.09375, + "completions/min_length": 641.0, + "completions/min_terminated_length": 641.0, + "entropy": 0.17894164565950632, + "epoch": 0.00542, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.759871780872345, + "kl": 0.4996040966361761, + "learning_rate": 9.999897732220269e-05, + "loss": -0.0452, + "num_tokens": 14531215.0, + "reward": 6.598260879516602, + "reward_std": 12.557649612426758, + "rewards/rollout_reward_func/mean": 6.598260402679443, + "rewards/rollout_reward_func/std": 12.858835220336914, + "sampling/importance_sampling_ratio/max": 1.6857366561889648, + "sampling/importance_sampling_ratio/mean": 1.0335665941238403, + "sampling/importance_sampling_ratio/min": 0.555221676826477, + "sampling/sampling_logp_difference/max": 0.583274245262146, + "sampling/sampling_logp_difference/mean": 0.010182222351431847, + "step": 271, + "step_time": 38.82532325500051 + }, + { + "clip_ratio/high_max": 0.03513071942143142, + "clip_ratio/high_mean": 0.009650735417380929, + "clip_ratio/low_mean": 0.020450367941521108, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.030101103708148003, + "entropy": 0.17972450237721205, + "epoch": 0.00544, + "grad_norm": 0.39173194766044617, + "kl": 0.5205750651657581, + "learning_rate": 9.999896860008347e-05, + "loss": -0.052, + "step": 272, + "step_time": 10.316601943999558 + }, + { + "clip_ratio/high_max": 0.0069444444961845875, + "clip_ratio/high_mean": 0.0017361111240461469, + "clip_ratio/low_mean": 0.0026041666860692203, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004340277810115367, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1340.0, + "completions/max_terminated_length": 1340.0, + "completions/mean_length": 1249.921875, + "completions/mean_terminated_length": 1249.921875, + "completions/min_length": 663.0, + "completions/min_terminated_length": 663.0, + "entropy": 0.18446057755500078, + "epoch": 0.00546, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5517368912696838, + "kl": 0.5441189091652632, + "learning_rate": 9.999895984092831e-05, + "loss": 0.0131, + "num_tokens": 14662474.0, + "reward": 5.8217644691467285, + "reward_std": 11.078777313232422, + "rewards/rollout_reward_func/mean": 5.8217644691467285, + "rewards/rollout_reward_func/std": 11.748489379882812, + "sampling/importance_sampling_ratio/max": 2.5323374271392822, + "sampling/importance_sampling_ratio/mean": 0.9794174432754517, + "sampling/importance_sampling_ratio/min": 1.1464784742225287e-13, + "sampling/sampling_logp_difference/max": 23.46839714050293, + "sampling/sampling_logp_difference/mean": 0.03605649992823601, + "step": 273, + "step_time": 38.64782374399874 + }, + { + "clip_ratio/high_max": 0.041483918437734246, + "clip_ratio/high_mean": 0.01210709079168737, + "clip_ratio/low_mean": 0.049096201779320836, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.06120329239638522, + "entropy": 0.17116414476186037, + "epoch": 0.00548, + "grad_norm": 0.3144800662994385, + "kl": 0.6644695494323969, + "learning_rate": 9.999895104473725e-05, + "loss": 0.0043, + "step": 274, + "step_time": 8.841590996999912 + }, + { + "clip_ratio/high_max": 0.01756535959430039, + "clip_ratio/high_mean": 0.006127451022621244, + "clip_ratio/low_mean": 0.0026552287745289505, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008782679797150195, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1370.0, + "completions/max_terminated_length": 1370.0, + "completions/mean_length": 1246.375, + "completions/mean_terminated_length": 1246.375, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.17010682448744774, + "epoch": 0.0055, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5666040182113647, + "kl": 0.5793958213180304, + "learning_rate": 9.99989422115103e-05, + "loss": 0.0183, + "num_tokens": 14793449.0, + "reward": 2.9848508834838867, + "reward_std": 12.649776458740234, + "rewards/rollout_reward_func/mean": 2.984851121902466, + "rewards/rollout_reward_func/std": 13.012813568115234, + "sampling/importance_sampling_ratio/max": 1.5569446086883545, + "sampling/importance_sampling_ratio/mean": 0.9847633838653564, + "sampling/importance_sampling_ratio/min": 0.6424822807312012, + "sampling/sampling_logp_difference/max": 0.4623146057128906, + "sampling/sampling_logp_difference/mean": 0.00930742733180523, + "step": 275, + "step_time": 39.34428709400072 + }, + { + "clip_ratio/high_max": 0.054125817492604256, + "clip_ratio/high_mean": 0.016135621059220284, + "clip_ratio/low_mean": 0.0301164222182706, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046252043335698545, + "entropy": 0.16701707802712917, + "epoch": 0.00552, + "grad_norm": 0.6509947180747986, + "kl": 0.6457913182675838, + "learning_rate": 9.999893334124744e-05, + "loss": 0.0127, + "step": 276, + "step_time": 9.492375128998901 + }, + { + "clip_ratio/high_max": 0.010416666744276881, + "clip_ratio/high_mean": 0.0026041666860692203, + "clip_ratio/low_mean": 0.0008680555620230734, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0034722222480922937, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1342.0, + "completions/max_terminated_length": 1342.0, + "completions/mean_length": 1211.75, + "completions/mean_terminated_length": 1211.75, + "completions/min_length": 786.0, + "completions/min_terminated_length": 786.0, + "entropy": 0.16395934531465173, + "epoch": 0.00554, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6039373874664307, + "kl": 0.6709765158593655, + "learning_rate": 9.999892443394869e-05, + "loss": -0.0199, + "num_tokens": 14922202.0, + "reward": 9.567506790161133, + "reward_std": 12.886774063110352, + "rewards/rollout_reward_func/mean": 9.567506790161133, + "rewards/rollout_reward_func/std": 14.272911071777344, + "sampling/importance_sampling_ratio/max": 1.3702117204666138, + "sampling/importance_sampling_ratio/mean": 0.9926258325576782, + "sampling/importance_sampling_ratio/min": 2.4751771812714374e-13, + "sampling/sampling_logp_difference/max": 22.218292236328125, + "sampling/sampling_logp_difference/mean": 0.03479118272662163, + "step": 277, + "step_time": 38.66672314299831 + }, + { + "clip_ratio/high_max": 0.056832108180969954, + "clip_ratio/high_mean": 0.015944138227496296, + "clip_ratio/low_mean": 0.026416234264615923, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04236037301598117, + "entropy": 0.16752836294472218, + "epoch": 0.00556, + "grad_norm": 0.2952568829059601, + "kl": 0.7055745627731085, + "learning_rate": 9.999891548961409e-05, + "loss": -0.0283, + "step": 278, + "step_time": 10.249573535998024 + }, + { + "clip_ratio/high_max": 0.0034722222480922937, + "clip_ratio/high_mean": 0.0008680555620230734, + "clip_ratio/low_mean": 0.0026552287745289505, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003523284336552024, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1344.0, + "completions/max_terminated_length": 1344.0, + "completions/mean_length": 1206.921875, + "completions/mean_terminated_length": 1206.921875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "entropy": 0.16296257637441158, + "epoch": 0.00558, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6338427662849426, + "kl": 0.5948988310992718, + "learning_rate": 9.99989065082436e-05, + "loss": -0.0193, + "num_tokens": 15050626.0, + "reward": 6.859679222106934, + "reward_std": 13.335336685180664, + "rewards/rollout_reward_func/mean": 6.859679222106934, + "rewards/rollout_reward_func/std": 13.806427955627441, + "sampling/importance_sampling_ratio/max": 1.73651123046875, + "sampling/importance_sampling_ratio/mean": 1.0235867500305176, + "sampling/importance_sampling_ratio/min": 0.6915313005447388, + "sampling/sampling_logp_difference/max": 0.3299523591995239, + "sampling/sampling_logp_difference/mean": 0.00790142547339201, + "step": 279, + "step_time": 37.97661670400066 + }, + { + "clip_ratio/high_max": 0.04227941203862429, + "clip_ratio/high_mean": 0.01235702628036961, + "clip_ratio/low_mean": 0.023852379759773612, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.036209406214766204, + "entropy": 0.17103052884340286, + "epoch": 0.0056, + "grad_norm": 0.310857355594635, + "kl": 0.6127588897943497, + "learning_rate": 9.999889748983726e-05, + "loss": -0.0289, + "step": 280, + "step_time": 9.740956478998669 + }, + { + "clip_ratio/high_max": 0.007148692850023508, + "clip_ratio/high_mean": 0.001787173212505877, + "clip_ratio/low_mean": 0.0035807291860692203, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005367902398575097, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1347.0, + "completions/max_terminated_length": 1347.0, + "completions/mean_length": 1242.203125, + "completions/mean_terminated_length": 1242.203125, + "completions/min_length": 1071.0, + "completions/min_terminated_length": 1071.0, + "entropy": 0.18003392685204744, + "epoch": 0.00562, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5553699731826782, + "kl": 0.5920679531991482, + "learning_rate": 9.999888843439508e-05, + "loss": 0.0235, + "num_tokens": 15181392.0, + "reward": 3.9797325134277344, + "reward_std": 11.883782386779785, + "rewards/rollout_reward_func/mean": 3.9797325134277344, + "rewards/rollout_reward_func/std": 12.557183265686035, + "sampling/importance_sampling_ratio/max": 2.3216447830200195, + "sampling/importance_sampling_ratio/mean": 1.0259038209915161, + "sampling/importance_sampling_ratio/min": 0.37790024280548096, + "sampling/sampling_logp_difference/max": 1.4781968593597412, + "sampling/sampling_logp_difference/mean": 0.011046608909964561, + "step": 281, + "step_time": 39.572295284000575 + }, + { + "clip_ratio/high_max": 0.04312193673104048, + "clip_ratio/high_mean": 0.01343571295728907, + "clip_ratio/low_mean": 0.023201337666250765, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03663705079816282, + "entropy": 0.18677309434860945, + "epoch": 0.00564, + "grad_norm": 0.5401102304458618, + "kl": 0.5960894413292408, + "learning_rate": 9.999887934191704e-05, + "loss": 0.0166, + "step": 282, + "step_time": 9.04144253800041 + }, + { + "clip_ratio/high_max": 0.007582720601931214, + "clip_ratio/high_mean": 0.0018956801504828036, + "clip_ratio/low_mean": 0.0009191176504828036, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002814797800965607, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1338.0, + "completions/max_terminated_length": 1338.0, + "completions/mean_length": 1215.859375, + "completions/mean_terminated_length": 1215.859375, + "completions/min_length": 1067.0, + "completions/min_terminated_length": 1067.0, + "entropy": 0.18976869899779558, + "epoch": 0.00566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6778466105461121, + "kl": 0.8045855388045311, + "learning_rate": 9.99988702124032e-05, + "loss": 0.0435, + "num_tokens": 15310402.0, + "reward": 9.532302856445312, + "reward_std": 13.447786331176758, + "rewards/rollout_reward_func/mean": 9.532302856445312, + "rewards/rollout_reward_func/std": 14.893537521362305, + "sampling/importance_sampling_ratio/max": 1.5014206171035767, + "sampling/importance_sampling_ratio/mean": 0.984979510307312, + "sampling/importance_sampling_ratio/min": 0.5918754935264587, + "sampling/sampling_logp_difference/max": 0.49157631397247314, + "sampling/sampling_logp_difference/mean": 0.009644631296396255, + "step": 283, + "step_time": 37.75929147200077 + }, + { + "clip_ratio/high_max": 0.04337724717333913, + "clip_ratio/high_mean": 0.017022824671585113, + "clip_ratio/low_mean": 0.019767412508372217, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.036790237470995635, + "entropy": 0.19258240424096584, + "epoch": 0.00568, + "grad_norm": 0.23295848071575165, + "kl": 0.7846251800656319, + "learning_rate": 9.999886104585351e-05, + "loss": 0.0377, + "step": 284, + "step_time": 9.718582901003174 + }, + { + "clip_ratio/high_max": 0.0034722222480922937, + "clip_ratio/high_mean": 0.0008680555620230734, + "clip_ratio/low_mean": 0.0036764706601388752, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004544526163954288, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1352.0, + "completions/max_terminated_length": 1352.0, + "completions/mean_length": 1182.8125, + "completions/mean_terminated_length": 1182.8125, + "completions/min_length": 644.0, + "completions/min_terminated_length": 644.0, + "entropy": 0.21535112708806992, + "epoch": 0.0057, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8615608215332031, + "kl": 0.8677664548158646, + "learning_rate": 9.999885184226802e-05, + "loss": -0.0133, + "num_tokens": 15437277.0, + "reward": 4.226072788238525, + "reward_std": 8.478089332580566, + "rewards/rollout_reward_func/mean": 4.226072788238525, + "rewards/rollout_reward_func/std": 9.509638786315918, + "sampling/importance_sampling_ratio/max": 1.921204924583435, + "sampling/importance_sampling_ratio/mean": 0.9768272638320923, + "sampling/importance_sampling_ratio/min": 0.7244350910186768, + "sampling/sampling_logp_difference/max": 0.4400520324707031, + "sampling/sampling_logp_difference/mean": 0.010012689046561718, + "step": 285, + "step_time": 36.26275280400023 + }, + { + "clip_ratio/high_max": 0.04353043343871832, + "clip_ratio/high_mean": 0.01611264329403639, + "clip_ratio/low_mean": 0.029692606767639518, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.045805250061675906, + "entropy": 0.21773250121623278, + "epoch": 0.00572, + "grad_norm": 0.48840415477752686, + "kl": 1.049767030403018, + "learning_rate": 9.999884260164671e-05, + "loss": -0.0254, + "step": 286, + "step_time": 10.79683871799898 + }, + { + "clip_ratio/high_max": 0.01797385630197823, + "clip_ratio/high_mean": 0.004493464075494558, + "clip_ratio/low_mean": 0.003523284336552024, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008016748412046582, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1349.0, + "completions/max_terminated_length": 1349.0, + "completions/mean_length": 1224.109375, + "completions/mean_terminated_length": 1224.109375, + "completions/min_length": 697.0, + "completions/min_terminated_length": 697.0, + "entropy": 0.22404625453054905, + "epoch": 0.00574, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1797651052474976, + "kl": 0.8139622360467911, + "learning_rate": 9.999883332398962e-05, + "loss": -0.0606, + "num_tokens": 15566944.0, + "reward": 5.630161762237549, + "reward_std": 12.35897445678711, + "rewards/rollout_reward_func/mean": 5.630161762237549, + "rewards/rollout_reward_func/std": 13.792024612426758, + "sampling/importance_sampling_ratio/max": 2.566322088241577, + "sampling/importance_sampling_ratio/mean": 0.9811519384384155, + "sampling/importance_sampling_ratio/min": 0.388390451669693, + "sampling/sampling_logp_difference/max": 1.9641337394714355, + "sampling/sampling_logp_difference/mean": 0.015188181772828102, + "step": 287, + "step_time": 36.44446017899918 + }, + { + "clip_ratio/high_max": 0.053717320784926414, + "clip_ratio/high_mean": 0.01695261470740661, + "clip_ratio/low_mean": 0.04400914063444361, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.060961755632888526, + "entropy": 0.2100704526528716, + "epoch": 0.00576, + "grad_norm": 1.2667500972747803, + "kl": 2.074540827423334, + "learning_rate": 9.999882400929674e-05, + "loss": -0.057, + "step": 288, + "step_time": 8.99862431100064 + }, + { + "clip_ratio/high_max": 0.010850694496184587, + "clip_ratio/high_mean": 0.0035807291860692203, + "clip_ratio/low_mean": 0.001787173212505877, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005367902398575097, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1377.0, + "completions/max_terminated_length": 1377.0, + "completions/mean_length": 1206.5, + "completions/mean_terminated_length": 1206.5, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.21379029098898172, + "epoch": 0.00578, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9330686926841736, + "kl": 0.6734739989042282, + "learning_rate": 9.999881465756809e-05, + "loss": -0.0075, + "num_tokens": 15695392.0, + "reward": 5.131735801696777, + "reward_std": 13.59388256072998, + "rewards/rollout_reward_func/mean": 5.1317362785339355, + "rewards/rollout_reward_func/std": 15.563157081604004, + "sampling/importance_sampling_ratio/max": 1.5120335817337036, + "sampling/importance_sampling_ratio/mean": 0.9915132522583008, + "sampling/importance_sampling_ratio/min": 0.7389032244682312, + "sampling/sampling_logp_difference/max": 0.3448265790939331, + "sampling/sampling_logp_difference/mean": 0.010639440268278122, + "step": 289, + "step_time": 38.13999567300107 + }, + { + "clip_ratio/high_max": 0.052309082355350256, + "clip_ratio/high_mean": 0.015732499363366514, + "clip_ratio/low_mean": 0.04032860859297216, + "clip_ratio/low_min": 0.0034722222480922937, + "clip_ratio/region_mean": 0.056061108596622944, + "entropy": 0.20592329651117325, + "epoch": 0.0058, + "grad_norm": 1.7719804048538208, + "kl": 2.317722400650382, + "learning_rate": 9.999880526880367e-05, + "loss": 0.0124, + "step": 290, + "step_time": 9.157000397000957 + }, + { + "clip_ratio/high_max": 0.0034722222480922937, + "clip_ratio/high_mean": 0.0008680555620230734, + "clip_ratio/low_mean": 0.0017361111240461469, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026041666860692203, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 1351.0, + "completions/max_terminated_length": 1351.0, + "completions/mean_length": 1227.28125, + "completions/mean_terminated_length": 1227.2381591796875, + "completions/min_length": 1085.0, + "completions/min_terminated_length": 1085.0, + "entropy": 0.2259034337475896, + "epoch": 0.00582, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5913375616073608, + "kl": 0.5888024400919676, + "learning_rate": 9.999879584300349e-05, + "loss": -0.0201, + "num_tokens": 15825170.0, + "reward": 4.390281677246094, + "reward_std": 13.705522537231445, + "rewards/rollout_reward_func/mean": 4.390281677246094, + "rewards/rollout_reward_func/std": 13.848193168640137, + "sampling/importance_sampling_ratio/max": 1.4006119966506958, + "sampling/importance_sampling_ratio/mean": 0.9845026731491089, + "sampling/importance_sampling_ratio/min": 0.57123863697052, + "sampling/sampling_logp_difference/max": 0.4590674638748169, + "sampling/sampling_logp_difference/mean": 0.010318214073777199, + "step": 291, + "step_time": 38.378031336002095 + }, + { + "clip_ratio/high_max": 0.05575980432331562, + "clip_ratio/high_mean": 0.018331290979404002, + "clip_ratio/low_mean": 0.025366254791151732, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0436975461198017, + "entropy": 0.25106900557875633, + "epoch": 0.00584, + "grad_norm": 0.3503086268901825, + "kl": 0.6040437389165163, + "learning_rate": 9.999878638016755e-05, + "loss": -0.0261, + "step": 292, + "step_time": 10.2765881680034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1355.0, + "completions/max_terminated_length": 1355.0, + "completions/mean_length": 1203.28125, + "completions/mean_terminated_length": 1203.28125, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.255804393440485, + "epoch": 0.00586, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.733392059803009, + "kl": 0.7084890268743038, + "learning_rate": 9.99987768802959e-05, + "loss": -0.0295, + "num_tokens": 15953483.0, + "reward": 4.652621269226074, + "reward_std": 13.876627922058105, + "rewards/rollout_reward_func/mean": 4.652621269226074, + "rewards/rollout_reward_func/std": 14.444734573364258, + "sampling/importance_sampling_ratio/max": 1.5388283729553223, + "sampling/importance_sampling_ratio/mean": 0.9943192005157471, + "sampling/importance_sampling_ratio/min": 0.66633540391922, + "sampling/sampling_logp_difference/max": 0.3228440284729004, + "sampling/sampling_logp_difference/mean": 0.009999222122132778, + "step": 293, + "step_time": 36.11094247699839 + }, + { + "clip_ratio/high_max": 0.07255117082968354, + "clip_ratio/high_mean": 0.020741959451697767, + "clip_ratio/low_mean": 0.026092729007359594, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0468346884008497, + "entropy": 0.2808321360498667, + "epoch": 0.00588, + "grad_norm": 0.3306209444999695, + "kl": 0.6640463471412659, + "learning_rate": 9.99987673433885e-05, + "loss": -0.0368, + "step": 294, + "step_time": 9.500305491999825 + }, + { + "clip_ratio/high_max": 0.0034722222480922937, + "clip_ratio/high_mean": 0.0008680555620230734, + "clip_ratio/low_mean": 0.0026041666860692203, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0034722222480922937, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1353.0, + "completions/max_terminated_length": 1353.0, + "completions/mean_length": 1203.546875, + "completions/mean_terminated_length": 1203.546875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.29141946602612734, + "epoch": 0.0059, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7943304777145386, + "kl": 0.6770852543413639, + "learning_rate": 9.999875776944538e-05, + "loss": -0.0049, + "num_tokens": 16081715.0, + "reward": 2.4234745502471924, + "reward_std": 10.446115493774414, + "rewards/rollout_reward_func/mean": 2.4234743118286133, + "rewards/rollout_reward_func/std": 11.454586029052734, + "sampling/importance_sampling_ratio/max": 1.456477165222168, + "sampling/importance_sampling_ratio/mean": 0.9954730272293091, + "sampling/importance_sampling_ratio/min": 0.6373972296714783, + "sampling/sampling_logp_difference/max": 0.3991684913635254, + "sampling/sampling_logp_difference/mean": 0.010583357885479927, + "step": 295, + "step_time": 37.65815602800012 + }, + { + "clip_ratio/high_max": 0.049019608180969954, + "clip_ratio/high_mean": 0.022671569080557674, + "clip_ratio/low_mean": 0.029513889458030462, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05218545877141878, + "entropy": 0.2855409812182188, + "epoch": 0.00592, + "grad_norm": 0.3455994129180908, + "kl": 0.6785521320998669, + "learning_rate": 9.999874815846655e-05, + "loss": -0.0152, + "step": 296, + "step_time": 8.978216180002164 + }, + { + "clip_ratio/high_max": 0.010620915098115802, + "clip_ratio/high_mean": 0.003523284336552024, + "clip_ratio/low_mean": 0.0008680555620230734, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004391339898575097, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1353.0, + "completions/max_terminated_length": 1353.0, + "completions/mean_length": 1205.796875, + "completions/mean_terminated_length": 1205.796875, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.3199264472350478, + "epoch": 0.00594, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5850762128829956, + "kl": 0.8113113101571798, + "learning_rate": 9.999873851045201e-05, + "loss": -0.0035, + "num_tokens": 16210207.0, + "reward": 2.8288328647613525, + "reward_std": 14.778526306152344, + "rewards/rollout_reward_func/mean": 2.8288326263427734, + "rewards/rollout_reward_func/std": 16.02610969543457, + "sampling/importance_sampling_ratio/max": 1.4778478145599365, + "sampling/importance_sampling_ratio/mean": 1.0175740718841553, + "sampling/importance_sampling_ratio/min": 0.6004241108894348, + "sampling/sampling_logp_difference/max": 0.35140562057495117, + "sampling/sampling_logp_difference/mean": 0.012288028374314308, + "step": 297, + "step_time": 36.8206370079979 + }, + { + "clip_ratio/high_max": 0.0490196084138006, + "clip_ratio/high_mean": 0.015590063121635467, + "clip_ratio/low_mean": 0.026909722771961242, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04249978606821969, + "entropy": 0.3119704835116863, + "epoch": 0.00596, + "grad_norm": 0.6210339069366455, + "kl": 0.8435764815658331, + "learning_rate": 9.99987288254018e-05, + "loss": -0.0125, + "step": 298, + "step_time": 9.949690105999252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0017361111240461469, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017361111240461469, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1320.0, + "completions/max_terminated_length": 1320.0, + "completions/mean_length": 1166.171875, + "completions/mean_terminated_length": 1166.171875, + "completions/min_length": 865.0, + "completions/min_terminated_length": 865.0, + "entropy": 0.3107016496360302, + "epoch": 0.00598, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7741795778274536, + "kl": 0.8151105176657438, + "learning_rate": 9.99987191033159e-05, + "loss": 0.013, + "num_tokens": 16335952.0, + "reward": 0.605268120765686, + "reward_std": 9.44769287109375, + "rewards/rollout_reward_func/mean": 0.6052679419517517, + "rewards/rollout_reward_func/std": 10.618112564086914, + "sampling/importance_sampling_ratio/max": 1.4987179040908813, + "sampling/importance_sampling_ratio/mean": 1.0136826038360596, + "sampling/importance_sampling_ratio/min": 0.7334418892860413, + "sampling/sampling_logp_difference/max": 0.23920416831970215, + "sampling/sampling_logp_difference/mean": 0.011820180341601372, + "step": 299, + "step_time": 35.96857580300002 + }, + { + "clip_ratio/high_max": 0.07089971494860947, + "clip_ratio/high_mean": 0.022029462968930602, + "clip_ratio/low_mean": 0.03730450588045642, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05933396948967129, + "entropy": 0.28032723255455494, + "epoch": 0.006, + "grad_norm": 0.37243181467056274, + "kl": 0.8571038488298655, + "learning_rate": 9.999870934419433e-05, + "loss": -0.0014, + "step": 300, + "step_time": 9.851978641999267 + }, + { + "clip_ratio/high_max": 0.014093137346208096, + "clip_ratio/high_mean": 0.004391339898575097, + "clip_ratio/low_mean": 0.0008680555620230734, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005259395460598171, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1347.0, + "completions/max_terminated_length": 1347.0, + "completions/mean_length": 1216.4375, + "completions/mean_terminated_length": 1216.4375, + "completions/min_length": 825.0, + "completions/min_terminated_length": 825.0, + "entropy": 0.2521855002269149, + "epoch": 0.00602, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6381392478942871, + "kl": 0.8058282844722271, + "learning_rate": 9.999869954803708e-05, + "loss": 0.0246, + "num_tokens": 16465057.0, + "reward": 4.353306293487549, + "reward_std": 11.903841018676758, + "rewards/rollout_reward_func/mean": 4.353306293487549, + "rewards/rollout_reward_func/std": 13.071228981018066, + "sampling/importance_sampling_ratio/max": 1.870285153388977, + "sampling/importance_sampling_ratio/mean": 1.014232873916626, + "sampling/importance_sampling_ratio/min": 0.6221296191215515, + "sampling/sampling_logp_difference/max": 0.5893880128860474, + "sampling/sampling_logp_difference/mean": 0.010516786947846413, + "step": 301, + "step_time": 37.21239350799988 + }, + { + "clip_ratio/high_max": 0.0890522887930274, + "clip_ratio/high_mean": 0.02920751681085676, + "clip_ratio/low_mean": 0.026308735250495374, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05551625177031383, + "entropy": 0.26322738360613585, + "epoch": 0.00604, + "grad_norm": 0.3497966527938843, + "kl": 0.8271188456565142, + "learning_rate": 9.999868971484418e-05, + "loss": 0.0178, + "step": 302, + "step_time": 9.60695320800096 + }, + { + "clip_ratio/high_max": 0.007504480192437768, + "clip_ratio/high_mean": 0.0027441755519248545, + "clip_ratio/low_mean": 0.004391339898575097, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007135515450499952, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 1357.0, + "completions/max_terminated_length": 1357.0, + "completions/mean_length": 1233.578125, + "completions/mean_terminated_length": 1232.1270751953125, + "completions/min_length": 963.0, + "completions/min_terminated_length": 963.0, + "entropy": 0.3282460719347, + "epoch": 0.00606, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8028053641319275, + "kl": 1.0723739713430405, + "learning_rate": 9.999867984461563e-05, + "loss": -0.009, + "num_tokens": 16595305.0, + "reward": 4.948282718658447, + "reward_std": 13.311994552612305, + "rewards/rollout_reward_func/mean": 4.9482831954956055, + "rewards/rollout_reward_func/std": 13.96406078338623, + "sampling/importance_sampling_ratio/max": 1.4959264993667603, + "sampling/importance_sampling_ratio/mean": 1.0020052194595337, + "sampling/importance_sampling_ratio/min": 0.664537250995636, + "sampling/sampling_logp_difference/max": 0.4060518741607666, + "sampling/sampling_logp_difference/mean": 0.01389513909816742, + "step": 303, + "step_time": 37.319652419000704 + }, + { + "clip_ratio/high_max": 0.10214776475913823, + "clip_ratio/high_mean": 0.0419843090348877, + "clip_ratio/low_mean": 0.032362769707106054, + "clip_ratio/low_min": 0.0029761905316263437, + "clip_ratio/region_mean": 0.07434707973152399, + "entropy": 0.35452230647206306, + "epoch": 0.00608, + "grad_norm": 0.5315479040145874, + "kl": 0.8896235972642899, + "learning_rate": 9.999866993735147e-05, + "loss": -0.0191, + "step": 304, + "step_time": 9.31032760600101 + }, + { + "clip_ratio/high_max": 0.010416666744276881, + "clip_ratio/high_mean": 0.0026041666860692203, + "clip_ratio/low_mean": 0.0013720877468585968, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003976254432927817, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1349.0, + "completions/max_terminated_length": 1349.0, + "completions/mean_length": 1178.75, + "completions/mean_terminated_length": 1178.75, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "entropy": 0.3360240999609232, + "epoch": 0.0061, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8160866498947144, + "kl": 0.7584020271897316, + "learning_rate": 9.999865999305169e-05, + "loss": 0.0343, + "num_tokens": 16721992.0, + "reward": 5.459122657775879, + "reward_std": 12.891645431518555, + "rewards/rollout_reward_func/mean": 5.459122657775879, + "rewards/rollout_reward_func/std": 13.743046760559082, + "sampling/importance_sampling_ratio/max": 1.5625214576721191, + "sampling/importance_sampling_ratio/mean": 0.9862264394760132, + "sampling/importance_sampling_ratio/min": 0.7355522513389587, + "sampling/sampling_logp_difference/max": 0.3090386390686035, + "sampling/sampling_logp_difference/mean": 0.012195384129881859, + "step": 305, + "step_time": 36.28136819499923 + }, + { + "clip_ratio/high_max": 0.10116884484887123, + "clip_ratio/high_mean": 0.03489725984400138, + "clip_ratio/low_mean": 0.055271854158490896, + "clip_ratio/low_min": 0.0069444444961845875, + "clip_ratio/region_mean": 0.09016911429353058, + "entropy": 0.3460291214287281, + "epoch": 0.00612, + "grad_norm": 0.4136711359024048, + "kl": 0.7873252909630537, + "learning_rate": 9.999865001171627e-05, + "loss": 0.0177, + "step": 306, + "step_time": 10.277970246998848 + }, + { + "clip_ratio/high_max": 0.017422385746613145, + "clip_ratio/high_mean": 0.004355596436653286, + "clip_ratio/low_mean": 0.0008680555620230734, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00522365199867636, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1335.0, + "completions/max_terminated_length": 1335.0, + "completions/mean_length": 1148.03125, + "completions/mean_terminated_length": 1148.03125, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.4305746052414179, + "epoch": 0.00614, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9119190573692322, + "kl": 0.862309418618679, + "learning_rate": 9.999863999334527e-05, + "loss": -0.039, + "num_tokens": 16846641.0, + "reward": 4.154011249542236, + "reward_std": 13.017316818237305, + "rewards/rollout_reward_func/mean": 4.1540117263793945, + "rewards/rollout_reward_func/std": 12.931968688964844, + "sampling/importance_sampling_ratio/max": 1.4475128650665283, + "sampling/importance_sampling_ratio/mean": 0.9774882793426514, + "sampling/importance_sampling_ratio/min": 9.214395739476355e-13, + "sampling/sampling_logp_difference/max": 23.965322494506836, + "sampling/sampling_logp_difference/mean": 0.03728090599179268, + "step": 307, + "step_time": 33.57406117500068 + }, + { + "clip_ratio/high_max": 0.08014640025794506, + "clip_ratio/high_mean": 0.026986419688910246, + "clip_ratio/low_mean": 0.03807902126573026, + "clip_ratio/low_min": 0.0034722222480922937, + "clip_ratio/region_mean": 0.06506544025614858, + "entropy": 0.4615292586386204, + "epoch": 0.00616, + "grad_norm": 0.544769287109375, + "kl": 0.8701771721243858, + "learning_rate": 9.999862993793865e-05, + "loss": -0.0498, + "step": 308, + "step_time": 9.750469571001304 + }, + { + "clip_ratio/high_max": 0.0021551724057644606, + "clip_ratio/high_mean": 0.0005387931014411151, + "clip_ratio/low_mean": 0.004073183808941394, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004611976910382509, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1337.0, + "completions/max_terminated_length": 1337.0, + "completions/mean_length": 1176.34375, + "completions/mean_terminated_length": 1176.34375, + "completions/min_length": 862.0, + "completions/min_terminated_length": 862.0, + "entropy": 0.4520879667252302, + "epoch": 0.00618, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6592026352882385, + "kl": 1.228204183280468, + "learning_rate": 9.999861984549645e-05, + "loss": 0.0146, + "num_tokens": 16973130.0, + "reward": 5.186724662780762, + "reward_std": 12.892146110534668, + "rewards/rollout_reward_func/mean": 5.186724662780762, + "rewards/rollout_reward_func/std": 12.396245002746582, + "sampling/importance_sampling_ratio/max": 1.4907543659210205, + "sampling/importance_sampling_ratio/mean": 0.992376446723938, + "sampling/importance_sampling_ratio/min": 0.6941927671432495, + "sampling/sampling_logp_difference/max": 0.338625431060791, + "sampling/sampling_logp_difference/mean": 0.014916637912392616, + "step": 309, + "step_time": 35.957562657998096 + }, + { + "clip_ratio/high_max": 0.07991837477311492, + "clip_ratio/high_mean": 0.02518792706541717, + "clip_ratio/low_mean": 0.04024840978672728, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.06543633691035211, + "entropy": 0.44149017706513405, + "epoch": 0.0062, + "grad_norm": 0.5282915234565735, + "kl": 1.2467477656900883, + "learning_rate": 9.999860971601868e-05, + "loss": -0.002, + "step": 310, + "step_time": 8.980034561999673 + }, + { + "clip_ratio/high_max": 0.0034722222480922937, + "clip_ratio/high_mean": 0.0008680555620230734, + "clip_ratio/low_mean": 0.0008223684271797538, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016904239892028272, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1348.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 1184.96875, + "completions/mean_terminated_length": 1184.96875, + "completions/min_length": 917.0, + "completions/min_terminated_length": 917.0, + "entropy": 0.5129956435412169, + "epoch": 0.00622, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7465701699256897, + "kl": 1.0012187995016575, + "learning_rate": 9.999859954950535e-05, + "loss": 0.018, + "num_tokens": 17100245.0, + "reward": 4.89565372467041, + "reward_std": 13.874456405639648, + "rewards/rollout_reward_func/mean": 4.89565372467041, + "rewards/rollout_reward_func/std": 14.702526092529297, + "sampling/importance_sampling_ratio/max": 1.5052223205566406, + "sampling/importance_sampling_ratio/mean": 1.027785062789917, + "sampling/importance_sampling_ratio/min": 0.5468899607658386, + "sampling/sampling_logp_difference/max": 0.4049875736236572, + "sampling/sampling_logp_difference/mean": 0.017239127308130264, + "step": 311, + "step_time": 34.84647880300054 + }, + { + "clip_ratio/high_max": 0.08282635360956192, + "clip_ratio/high_mean": 0.031164190906565636, + "clip_ratio/low_mean": 0.05087516509229317, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.08203935588244349, + "entropy": 0.5392583776265383, + "epoch": 0.00624, + "grad_norm": 0.5834032297134399, + "kl": 1.0580051615834236, + "learning_rate": 9.999858934595648e-05, + "loss": 0.0006, + "step": 312, + "step_time": 9.797768173001714 + }, + { + "clip_ratio/high_max": 0.006076388992369175, + "clip_ratio/high_mean": 0.0015190972480922937, + "clip_ratio/low_mean": 0.0006793478387407959, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021984450868330896, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 1312.0, + "completions/max_terminated_length": 1312.0, + "completions/mean_length": 1134.828125, + "completions/mean_terminated_length": 1133.3968505859375, + "completions/min_length": 492.0, + "completions/min_terminated_length": 492.0, + "entropy": 0.5534908715635538, + "epoch": 0.00626, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7070833444595337, + "kl": 0.9324860982596874, + "learning_rate": 9.999857910537204e-05, + "loss": 0.0171, + "num_tokens": 17224021.0, + "reward": 2.2121267318725586, + "reward_std": 12.934229850769043, + "rewards/rollout_reward_func/mean": 2.2121264934539795, + "rewards/rollout_reward_func/std": 13.348692893981934, + "sampling/importance_sampling_ratio/max": 1.3357430696487427, + "sampling/importance_sampling_ratio/mean": 0.9801706075668335, + "sampling/importance_sampling_ratio/min": 0.6364750862121582, + "sampling/sampling_logp_difference/max": 0.28098082542419434, + "sampling/sampling_logp_difference/mean": 0.01595621556043625, + "step": 313, + "step_time": 34.49526453500039 + }, + { + "clip_ratio/high_max": 0.09578519035130739, + "clip_ratio/high_mean": 0.028240888088475913, + "clip_ratio/low_mean": 0.043463885551318526, + "clip_ratio/low_min": 0.003289473708719015, + "clip_ratio/region_mean": 0.07170477387262508, + "entropy": 0.5145694836974144, + "epoch": 0.00628, + "grad_norm": 8.991610527038574, + "kl": 2.500880379229784, + "learning_rate": 9.999856882775207e-05, + "loss": 0.0362, + "step": 314, + "step_time": 9.686568430998705 + }, + { + "clip_ratio/high_max": 0.0034722222480922937, + "clip_ratio/high_mean": 0.0008680555620230734, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008680555620230734, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1341.0, + "completions/max_terminated_length": 1341.0, + "completions/mean_length": 1152.8125, + "completions/mean_terminated_length": 1152.8125, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "entropy": 0.47399672865867615, + "epoch": 0.0063, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9034998416900635, + "kl": 0.8184376284480095, + "learning_rate": 9.999855851309658e-05, + "loss": 0.0293, + "num_tokens": 17349042.0, + "reward": 2.9872653484344482, + "reward_std": 10.313895225524902, + "rewards/rollout_reward_func/mean": 2.9872655868530273, + "rewards/rollout_reward_func/std": 11.123116493225098, + "sampling/importance_sampling_ratio/max": 1.5023268461227417, + "sampling/importance_sampling_ratio/mean": 0.9912445545196533, + "sampling/importance_sampling_ratio/min": 0.5293837189674377, + "sampling/sampling_logp_difference/max": 0.49621057510375977, + "sampling/sampling_logp_difference/mean": 0.01648723892867565, + "step": 315, + "step_time": 36.106211563002034 + }, + { + "clip_ratio/high_max": 0.07373366155661643, + "clip_ratio/high_mean": 0.02871302078710869, + "clip_ratio/low_mean": 0.052897133806254715, + "clip_ratio/low_min": 0.0034722222480922937, + "clip_ratio/region_mean": 0.08161015470977873, + "entropy": 0.439556997269392, + "epoch": 0.00632, + "grad_norm": 1.1542975902557373, + "kl": 0.8081017658114433, + "learning_rate": 9.999854816140556e-05, + "loss": 0.0112, + "step": 316, + "step_time": 9.529359940000177 + }, + { + "clip_ratio/high_max": 0.0069444444961845875, + "clip_ratio/high_mean": 0.0017361111240461469, + "clip_ratio/low_mean": 0.0008680555620230734, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026041666860692203, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 1345.0, + "completions/max_terminated_length": 1345.0, + "completions/mean_length": 1153.015625, + "completions/mean_terminated_length": 1152.2857666015625, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.4202824104577303, + "epoch": 0.00634, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8387591242790222, + "kl": 0.7999376337975264, + "learning_rate": 9.999853777267906e-05, + "loss": -0.0113, + "num_tokens": 17474080.0, + "reward": 3.8928003311157227, + "reward_std": 13.945871353149414, + "rewards/rollout_reward_func/mean": 3.8928003311157227, + "rewards/rollout_reward_func/std": 14.018685340881348, + "sampling/importance_sampling_ratio/max": 1.3972409963607788, + "sampling/importance_sampling_ratio/mean": 0.9933174252510071, + "sampling/importance_sampling_ratio/min": 0.66861891746521, + "sampling/sampling_logp_difference/max": 0.3364081382751465, + "sampling/sampling_logp_difference/mean": 0.013292517513036728, + "step": 317, + "step_time": 35.11344056699909 + }, + { + "clip_ratio/high_max": 0.062046968610957265, + "clip_ratio/high_mean": 0.02252296026563272, + "clip_ratio/low_mean": 0.06663749110884964, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.08916045201476663, + "entropy": 0.3698996100574732, + "epoch": 0.00636, + "grad_norm": 0.5773271918296814, + "kl": 0.9309169836342335, + "learning_rate": 9.999852734691706e-05, + "loss": -0.0303, + "step": 318, + "step_time": 9.084739140999773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0034635705524124205, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0034635705524124205, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1360.0, + "completions/max_terminated_length": 1360.0, + "completions/mean_length": 1189.828125, + "completions/mean_terminated_length": 1189.828125, + "completions/min_length": 1056.0, + "completions/min_terminated_length": 1056.0, + "entropy": 0.3289623577147722, + "epoch": 0.00638, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7116008400917053, + "kl": 0.9500053711235523, + "learning_rate": 9.999851688411959e-05, + "loss": 0.0123, + "num_tokens": 17601410.0, + "reward": 4.444620609283447, + "reward_std": 12.232638359069824, + "rewards/rollout_reward_func/mean": 4.444620609283447, + "rewards/rollout_reward_func/std": 12.037857055664062, + "sampling/importance_sampling_ratio/max": 1.8450855016708374, + "sampling/importance_sampling_ratio/mean": 0.9873309135437012, + "sampling/importance_sampling_ratio/min": 2.6370022485067146e-11, + "sampling/sampling_logp_difference/max": 11.255170822143555, + "sampling/sampling_logp_difference/mean": 0.033629726618528366, + "step": 319, + "step_time": 38.23992628900032 + }, + { + "clip_ratio/high_max": 0.06827694294042885, + "clip_ratio/high_mean": 0.024013680347707123, + "clip_ratio/low_mean": 0.04197527136420831, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.06598895112983882, + "entropy": 0.3273693434894085, + "epoch": 0.0064, + "grad_norm": 0.5724111795425415, + "kl": 1.0788306891918182, + "learning_rate": 9.999850638428662e-05, + "loss": 0.0049, + "step": 320, + "step_time": 10.348325264999403 + }, + { + "clip_ratio/high_max": 0.009027777938172221, + "clip_ratio/high_mean": 0.0022569444845430553, + "clip_ratio/low_mean": 0.002170138934161514, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004427083418704569, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1336.0, + "completions/max_terminated_length": 1336.0, + "completions/mean_length": 1203.65625, + "completions/mean_terminated_length": 1203.65625, + "completions/min_length": 898.0, + "completions/min_terminated_length": 898.0, + "entropy": 0.3083435148000717, + "epoch": 0.00642, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8611142039299011, + "kl": 0.9252029061317444, + "learning_rate": 9.99984958474182e-05, + "loss": 0.024, + "num_tokens": 17729704.0, + "reward": 2.3030171394348145, + "reward_std": 10.394119262695312, + "rewards/rollout_reward_func/mean": 2.3030171394348145, + "rewards/rollout_reward_func/std": 11.775047302246094, + "sampling/importance_sampling_ratio/max": 1.6587164402008057, + "sampling/importance_sampling_ratio/mean": 1.0117642879486084, + "sampling/importance_sampling_ratio/min": 0.4190000295639038, + "sampling/sampling_logp_difference/max": 0.3578883409500122, + "sampling/sampling_logp_difference/mean": 0.015003521926701069, + "step": 321, + "step_time": 35.224505068000326 + }, + { + "clip_ratio/high_max": 0.07542938669212162, + "clip_ratio/high_mean": 0.03128034179098904, + "clip_ratio/low_mean": 0.03897239360958338, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0702527352841571, + "entropy": 0.2796294568106532, + "epoch": 0.00644, + "grad_norm": 0.5799975395202637, + "kl": 0.8807330075651407, + "learning_rate": 9.999848527351433e-05, + "loss": 0.0091, + "step": 322, + "step_time": 9.679342022999663 + }, + { + "clip_ratio/high_max": 0.0069444444961845875, + "clip_ratio/high_mean": 0.0017361111240461469, + "clip_ratio/low_mean": 0.0017361111240461469, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0034722222480922937, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1294.0, + "completions/max_terminated_length": 1294.0, + "completions/mean_length": 1194.96875, + "completions/mean_terminated_length": 1194.96875, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "entropy": 0.24392448458820581, + "epoch": 0.00646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7959789633750916, + "kl": 0.779301343485713, + "learning_rate": 9.9998474662575e-05, + "loss": -0.0199, + "num_tokens": 17857450.0, + "reward": 4.581869125366211, + "reward_std": 11.262429237365723, + "rewards/rollout_reward_func/mean": 4.581869602203369, + "rewards/rollout_reward_func/std": 12.287596702575684, + "sampling/importance_sampling_ratio/max": 2.240818977355957, + "sampling/importance_sampling_ratio/mean": 1.018520712852478, + "sampling/importance_sampling_ratio/min": 0.3999040722846985, + "sampling/sampling_logp_difference/max": 0.5928263664245605, + "sampling/sampling_logp_difference/mean": 0.012124484404921532, + "step": 323, + "step_time": 37.114893339001355 + }, + { + "clip_ratio/high_max": 0.0683479537256062, + "clip_ratio/high_mean": 0.022346383950207382, + "clip_ratio/low_mean": 0.03898888279218227, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.06133526662597433, + "entropy": 0.2431696206331253, + "epoch": 0.00648, + "grad_norm": 0.35803645849227905, + "kl": 0.767679963260889, + "learning_rate": 9.999846401460026e-05, + "loss": -0.0339, + "step": 324, + "step_time": 8.890772448001371 + }, + { + "clip_ratio/high_max": 0.015318243764340878, + "clip_ratio/high_mean": 0.0038295609410852194, + "clip_ratio/low_mean": 0.0023561508278362453, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006185711768921465, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1359.0, + "completions/max_terminated_length": 1359.0, + "completions/mean_length": 1233.796875, + "completions/mean_terminated_length": 1233.796875, + "completions/min_length": 663.0, + "completions/min_terminated_length": 663.0, + "entropy": 0.24552472867071629, + "epoch": 0.0065, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7995591163635254, + "kl": 0.8211700264364481, + "learning_rate": 9.99984533295901e-05, + "loss": -0.0057, + "num_tokens": 17987636.0, + "reward": 2.361278533935547, + "reward_std": 11.01347541809082, + "rewards/rollout_reward_func/mean": 2.361278533935547, + "rewards/rollout_reward_func/std": 11.316116333007812, + "sampling/importance_sampling_ratio/max": 1.4373282194137573, + "sampling/importance_sampling_ratio/mean": 0.9916459321975708, + "sampling/importance_sampling_ratio/min": 0.7290171384811401, + "sampling/sampling_logp_difference/max": 0.3705787658691406, + "sampling/sampling_logp_difference/mean": 0.01046331413090229, + "step": 325, + "step_time": 38.97714790000191 + }, + { + "clip_ratio/high_max": 0.05977182672359049, + "clip_ratio/high_mean": 0.02233774628257379, + "clip_ratio/low_mean": 0.027810412109829485, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05014815804315731, + "entropy": 0.24062953237444162, + "epoch": 0.00652, + "grad_norm": 0.6732361316680908, + "kl": 0.9134266618639231, + "learning_rate": 9.999844260754451e-05, + "loss": -0.011, + "step": 326, + "step_time": 9.702275648000068 + }, + { + "clip_ratio/high_max": 0.0069444444961845875, + "clip_ratio/high_mean": 0.0017361111240461469, + "clip_ratio/low_mean": 0.0017361111240461469, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0034722222480922937, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1343.0, + "completions/max_terminated_length": 1343.0, + "completions/mean_length": 1266.625, + "completions/mean_terminated_length": 1266.625, + "completions/min_length": 1005.0, + "completions/min_terminated_length": 1005.0, + "entropy": 0.19475865550339222, + "epoch": 0.00654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7081814408302307, + "kl": 0.7645703088492155, + "learning_rate": 9.999843184846354e-05, + "loss": 0.0194, + "num_tokens": 18120014.0, + "reward": 6.7441205978393555, + "reward_std": 12.950173377990723, + "rewards/rollout_reward_func/mean": 6.7441205978393555, + "rewards/rollout_reward_func/std": 13.17819881439209, + "sampling/importance_sampling_ratio/max": 2.733876943588257, + "sampling/importance_sampling_ratio/mean": 1.017435908317566, + "sampling/importance_sampling_ratio/min": 0.8077232837677002, + "sampling/sampling_logp_difference/max": 1.0649070739746094, + "sampling/sampling_logp_difference/mean": 0.008961044251918793, + "step": 327, + "step_time": 38.45977644300001 + }, + { + "clip_ratio/high_max": 0.05813231039792299, + "clip_ratio/high_mean": 0.0188276685657911, + "clip_ratio/low_mean": 0.017785656382329762, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03661332529736683, + "entropy": 0.18874722812324762, + "epoch": 0.00656, + "grad_norm": 0.29629969596862793, + "kl": 0.7521160487085581, + "learning_rate": 9.999842105234716e-05, + "loss": 0.0089, + "step": 328, + "step_time": 9.240072366999811 + }, + { + "clip_ratio/high_max": 0.01736111124046147, + "clip_ratio/high_mean": 0.004340277810115367, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004340277810115367, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1335.0, + "completions/max_terminated_length": 1335.0, + "completions/mean_length": 1223.046875, + "completions/mean_terminated_length": 1223.046875, + "completions/min_length": 1069.0, + "completions/min_terminated_length": 1069.0, + "entropy": 0.17775962874293327, + "epoch": 0.00658, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5002116560935974, + "kl": 0.5365802068263292, + "learning_rate": 9.999841021919543e-05, + "loss": -0.0003, + "num_tokens": 18249422.0, + "reward": 5.926024436950684, + "reward_std": 10.913434028625488, + "rewards/rollout_reward_func/mean": 5.926024436950684, + "rewards/rollout_reward_func/std": 11.495051383972168, + "sampling/importance_sampling_ratio/max": 1.340820550918579, + "sampling/importance_sampling_ratio/mean": 0.9783110618591309, + "sampling/importance_sampling_ratio/min": 0.5937914848327637, + "sampling/sampling_logp_difference/max": 0.4624512195587158, + "sampling/sampling_logp_difference/mean": 0.009367045015096664, + "step": 329, + "step_time": 40.06546166299813 + }, + { + "clip_ratio/high_max": 0.049223856534808874, + "clip_ratio/high_mean": 0.015778186498209834, + "clip_ratio/low_mean": 0.02711397095117718, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04289215768221766, + "entropy": 0.16470052115619183, + "epoch": 0.0066, + "grad_norm": 0.272549033164978, + "kl": 0.5706925727427006, + "learning_rate": 9.999839934900832e-05, + "loss": -0.0098, + "step": 330, + "step_time": 9.584335595999619 + }, + { + "clip_ratio/high_max": 0.006761695956811309, + "clip_ratio/high_mean": 0.0016904239892028272, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016904239892028272, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1346.0, + "completions/max_terminated_length": 1346.0, + "completions/mean_length": 1230.453125, + "completions/mean_terminated_length": 1230.453125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "entropy": 0.15301176952198148, + "epoch": 0.00662, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5951566696166992, + "kl": 0.6225019320845604, + "learning_rate": 9.999838844178584e-05, + "loss": -0.0415, + "num_tokens": 18379457.0, + "reward": 5.441021919250488, + "reward_std": 11.596078872680664, + "rewards/rollout_reward_func/mean": 5.441021919250488, + "rewards/rollout_reward_func/std": 13.130385398864746, + "sampling/importance_sampling_ratio/max": 1.2981815338134766, + "sampling/importance_sampling_ratio/mean": 0.9712120294570923, + "sampling/importance_sampling_ratio/min": 0.5313878655433655, + "sampling/sampling_logp_difference/max": 0.4391303062438965, + "sampling/sampling_logp_difference/mean": 0.008532309904694557, + "step": 331, + "step_time": 38.03544032799982 + }, + { + "clip_ratio/high_max": 0.040491855004802346, + "clip_ratio/high_mean": 0.015382359270006418, + "clip_ratio/low_mean": 0.02635878958972171, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04174114967463538, + "entropy": 0.13200736604630947, + "epoch": 0.00664, + "grad_norm": 0.4791216552257538, + "kl": 0.6594886407256126, + "learning_rate": 9.999837749752803e-05, + "loss": -0.0494, + "step": 332, + "step_time": 9.623436052000216 + }, + { + "clip_ratio/high_max": 0.010416666744276881, + "clip_ratio/high_mean": 0.0034722222480922937, + "clip_ratio/low_mean": 0.0008680555620230734, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004340277810115367, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1344.0, + "completions/max_terminated_length": 1344.0, + "completions/mean_length": 1219.734375, + "completions/mean_terminated_length": 1219.734375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.11573670757934451, + "epoch": 0.00666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6563036441802979, + "kl": 0.851641334593296, + "learning_rate": 9.999836651623487e-05, + "loss": -0.0048, + "num_tokens": 18508741.0, + "reward": 5.738734245300293, + "reward_std": 12.408971786499023, + "rewards/rollout_reward_func/mean": 5.738734245300293, + "rewards/rollout_reward_func/std": 12.671599388122559, + "sampling/importance_sampling_ratio/max": 1.3496527671813965, + "sampling/importance_sampling_ratio/mean": 1.0077811479568481, + "sampling/importance_sampling_ratio/min": 0.6974970102310181, + "sampling/sampling_logp_difference/max": 0.3328993320465088, + "sampling/sampling_logp_difference/mean": 0.00713011808693409, + "step": 333, + "step_time": 39.30893147200186 + }, + { + "clip_ratio/high_max": 0.03472222248092294, + "clip_ratio/high_mean": 0.011284722364507616, + "clip_ratio/low_mean": 0.024913194763939828, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03619791695382446, + "entropy": 0.11344034224748611, + "epoch": 0.00668, + "grad_norm": 0.4923565983772278, + "kl": 0.6980615984648466, + "learning_rate": 9.999835549790641e-05, + "loss": -0.0079, + "step": 334, + "step_time": 10.117216201999327 + }, + { + "clip_ratio/high_max": 0.0036764706019312143, + "clip_ratio/high_mean": 0.0009191176504828036, + "clip_ratio/low_mean": 0.0034722222480922937, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004391339898575097, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1359.0, + "completions/max_terminated_length": 1359.0, + "completions/mean_length": 1238.984375, + "completions/mean_terminated_length": 1238.984375, + "completions/min_length": 635.0, + "completions/min_terminated_length": 635.0, + "entropy": 0.12856985442340374, + "epoch": 0.0067, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8339588642120361, + "kl": 0.6355916745960712, + "learning_rate": 9.999834444254262e-05, + "loss": -0.0042, + "num_tokens": 18639311.0, + "reward": 6.144355297088623, + "reward_std": 13.870889663696289, + "rewards/rollout_reward_func/mean": 6.144355297088623, + "rewards/rollout_reward_func/std": 14.220029830932617, + "sampling/importance_sampling_ratio/max": 1.3300856351852417, + "sampling/importance_sampling_ratio/mean": 0.9924861788749695, + "sampling/importance_sampling_ratio/min": 0.6479190587997437, + "sampling/sampling_logp_difference/max": 0.2639361619949341, + "sampling/sampling_logp_difference/mean": 0.006817285902798176, + "step": 335, + "step_time": 37.15376971599926 + }, + { + "clip_ratio/high_max": 0.03817401989363134, + "clip_ratio/high_mean": 0.012147671717684716, + "clip_ratio/low_mean": 0.020067402394488454, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03221507422858849, + "entropy": 0.12425063038244843, + "epoch": 0.00672, + "grad_norm": 0.32718658447265625, + "kl": 0.7219895403832197, + "learning_rate": 9.999833335014352e-05, + "loss": -0.011, + "step": 336, + "step_time": 9.780628326000624 + }, + { + "clip_ratio/high_max": 0.0069444444961845875, + "clip_ratio/high_mean": 0.0017361111240461469, + "clip_ratio/low_mean": 0.0008680555620230734, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026041666860692203, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1356.0, + "completions/max_terminated_length": 1356.0, + "completions/mean_length": 1266.78125, + "completions/mean_terminated_length": 1266.78125, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "entropy": 0.11600295826792717, + "epoch": 0.00674, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7365610003471375, + "kl": 0.5392901804298162, + "learning_rate": 9.999832222070914e-05, + "loss": 0.0023, + "num_tokens": 18771742.0, + "reward": 5.880302429199219, + "reward_std": 12.320051193237305, + "rewards/rollout_reward_func/mean": 5.880302429199219, + "rewards/rollout_reward_func/std": 12.716879844665527, + "sampling/importance_sampling_ratio/max": 1.3457348346710205, + "sampling/importance_sampling_ratio/mean": 0.9991644620895386, + "sampling/importance_sampling_ratio/min": 0.6999140381813049, + "sampling/sampling_logp_difference/max": 0.3562436103820801, + "sampling/sampling_logp_difference/mean": 0.005911126732826233, + "step": 337, + "step_time": 38.84034024799803 + }, + { + "clip_ratio/high_max": 0.03513071918860078, + "clip_ratio/high_mean": 0.011386846599634737, + "clip_ratio/low_mean": 0.024994894862174988, + "clip_ratio/low_min": 0.0034722222480922937, + "clip_ratio/region_mean": 0.036381741403602064, + "entropy": 0.1102461889386177, + "epoch": 0.00676, + "grad_norm": 0.2775817811489105, + "kl": 0.6620934028178453, + "learning_rate": 9.999831105423947e-05, + "loss": -0.006, + "step": 338, + "step_time": 9.023721004000436 + }, + { + "clip_ratio/high_max": 0.0069444444961845875, + "clip_ratio/high_mean": 0.0017361111240461469, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017361111240461469, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1350.0, + "completions/max_terminated_length": 1350.0, + "completions/mean_length": 1238.953125, + "completions/mean_terminated_length": 1238.953125, + "completions/min_length": 1062.0, + "completions/min_terminated_length": 1062.0, + "entropy": 0.11055759433656931, + "epoch": 0.00678, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5181728601455688, + "kl": 0.462925398722291, + "learning_rate": 9.999829985073453e-05, + "loss": 0.0105, + "num_tokens": 18902239.0, + "reward": 7.53302526473999, + "reward_std": 12.4171142578125, + "rewards/rollout_reward_func/mean": 7.533025741577148, + "rewards/rollout_reward_func/std": 13.036537170410156, + "sampling/importance_sampling_ratio/max": 1.3853559494018555, + "sampling/importance_sampling_ratio/mean": 1.000986933708191, + "sampling/importance_sampling_ratio/min": 0.702711284160614, + "sampling/sampling_logp_difference/max": 0.4794572591781616, + "sampling/sampling_logp_difference/mean": 0.00590522913262248, + "step": 339, + "step_time": 39.08791527499943 + }, + { + "clip_ratio/high_max": 0.039215686498209834, + "clip_ratio/high_mean": 0.013276143989060074, + "clip_ratio/low_mean": 0.02185995056061074, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03513609484070912, + "entropy": 0.11709691304713488, + "epoch": 0.0068, + "grad_norm": 0.30310577154159546, + "kl": 0.5310502368956804, + "learning_rate": 9.999828861019435e-05, + "loss": 0.006, + "step": 340, + "step_time": 9.792470953999327 + }, + { + "clip_ratio/high_max": 0.013888888992369175, + "clip_ratio/high_mean": 0.0034722222480922937, + "clip_ratio/low_mean": 0.0009191176504828036, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004391339898575097, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 1347.0, + "completions/max_terminated_length": 1347.0, + "completions/mean_length": 1241.765625, + "completions/mean_terminated_length": 1240.635009765625, + "completions/min_length": 1101.0, + "completions/min_terminated_length": 1101.0, + "entropy": 0.12758585345000029, + "epoch": 0.00682, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6102613210678101, + "kl": 0.6113391723483801, + "learning_rate": 9.99982773326189e-05, + "loss": 0.0158, + "num_tokens": 19032925.0, + "reward": 3.9826180934906006, + "reward_std": 12.427906036376953, + "rewards/rollout_reward_func/mean": 3.9826183319091797, + "rewards/rollout_reward_func/std": 13.354879379272461, + "sampling/importance_sampling_ratio/max": 1.1800951957702637, + "sampling/importance_sampling_ratio/mean": 0.997043251991272, + "sampling/importance_sampling_ratio/min": 0.7389504313468933, + "sampling/sampling_logp_difference/max": 0.2936210632324219, + "sampling/sampling_logp_difference/mean": 0.005317248869687319, + "step": 341, + "step_time": 39.12008871799935 + }, + { + "clip_ratio/high_max": 0.021037581842392683, + "clip_ratio/high_mean": 0.006995506584644318, + "clip_ratio/low_mean": 0.016595179855357856, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023590686498209834, + "entropy": 0.12726877955719829, + "epoch": 0.00684, + "grad_norm": 0.49857455492019653, + "kl": 0.6323374789208174, + "learning_rate": 9.999826601800824e-05, + "loss": 0.0106, + "step": 342, + "step_time": 9.27907708300063 + }, + { + "clip_ratio/high_max": 0.0034722222480922937, + "clip_ratio/high_mean": 0.0008680555620230734, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008680555620230734, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1341.0, + "completions/max_terminated_length": 1341.0, + "completions/mean_length": 1219.203125, + "completions/mean_terminated_length": 1219.203125, + "completions/min_length": 735.0, + "completions/min_terminated_length": 735.0, + "entropy": 0.11009268835186958, + "epoch": 0.00686, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6059837341308594, + "kl": 0.7338532544672489, + "learning_rate": 9.999825466636233e-05, + "loss": -0.0167, + "num_tokens": 19162127.0, + "reward": 4.720416069030762, + "reward_std": 10.753931999206543, + "rewards/rollout_reward_func/mean": 4.720416069030762, + "rewards/rollout_reward_func/std": 12.976871490478516, + "sampling/importance_sampling_ratio/max": 1.5183767080307007, + "sampling/importance_sampling_ratio/mean": 1.0038487911224365, + "sampling/importance_sampling_ratio/min": 0.6935895681381226, + "sampling/sampling_logp_difference/max": 0.4249706268310547, + "sampling/sampling_logp_difference/mean": 0.004785279743373394, + "step": 343, + "step_time": 39.04033868700208 + }, + { + "clip_ratio/high_max": 0.02798202633857727, + "clip_ratio/high_mean": 0.012203840189613402, + "clip_ratio/low_mean": 0.013071895577013493, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.025275735824834555, + "entropy": 0.11364737106487155, + "epoch": 0.00688, + "grad_norm": 0.2909540832042694, + "kl": 0.7444342169910669, + "learning_rate": 9.999824327768122e-05, + "loss": -0.0205, + "step": 344, + "step_time": 9.620514073000777 + }, + { + "clip_ratio/high_max": 0.0034722222480922937, + "clip_ratio/high_mean": 0.0008680555620230734, + "clip_ratio/low_mean": 0.0008680555620230734, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017361111240461469, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1359.0, + "completions/max_terminated_length": 1359.0, + "completions/mean_length": 1240.3125, + "completions/mean_terminated_length": 1240.3125, + "completions/min_length": 1088.0, + "completions/min_terminated_length": 1088.0, + "entropy": 0.13259067060425878, + "epoch": 0.0069, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8596341609954834, + "kl": 0.7895576078444719, + "learning_rate": 9.99982318519649e-05, + "loss": -0.0057, + "num_tokens": 19292776.0, + "reward": 2.67919659614563, + "reward_std": 14.777613639831543, + "rewards/rollout_reward_func/mean": 2.679196357727051, + "rewards/rollout_reward_func/std": 15.276268005371094, + "sampling/importance_sampling_ratio/max": 1.4407294988632202, + "sampling/importance_sampling_ratio/mean": 0.9704160690307617, + "sampling/importance_sampling_ratio/min": 0.6675639152526855, + "sampling/sampling_logp_difference/max": 0.4319186210632324, + "sampling/sampling_logp_difference/mean": 0.007419218309223652, + "step": 345, + "step_time": 38.83812410200153 + }, + { + "clip_ratio/high_max": 0.03472222248092294, + "clip_ratio/high_mean": 0.009548611182253808, + "clip_ratio/low_mean": 0.03599877539090812, + "clip_ratio/low_min": 0.0034722222480922937, + "clip_ratio/region_mean": 0.045547386282123625, + "entropy": 0.1278433846309781, + "epoch": 0.00692, + "grad_norm": 0.6303772330284119, + "kl": 1.1542848944664001, + "learning_rate": 9.999822038921338e-05, + "loss": -0.0049, + "step": 346, + "step_time": 9.405012558002454 + }, + { + "clip_ratio/high_max": 0.0069444444961845875, + "clip_ratio/high_mean": 0.0025584796094335616, + "clip_ratio/low_mean": 0.0026041666860692203, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005162646295502782, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1353.0, + "completions/max_terminated_length": 1353.0, + "completions/mean_length": 1241.171875, + "completions/mean_terminated_length": 1241.171875, + "completions/min_length": 719.0, + "completions/min_terminated_length": 719.0, + "entropy": 0.1106796741951257, + "epoch": 0.00694, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4797411561012268, + "kl": 0.6278085261583328, + "learning_rate": 9.99982088894267e-05, + "loss": 0.0106, + "num_tokens": 19423491.0, + "reward": 5.54637336730957, + "reward_std": 12.041938781738281, + "rewards/rollout_reward_func/mean": 5.54637336730957, + "rewards/rollout_reward_func/std": 13.066041946411133, + "sampling/importance_sampling_ratio/max": 1.482460618019104, + "sampling/importance_sampling_ratio/mean": 0.9943655133247375, + "sampling/importance_sampling_ratio/min": 0.6257169246673584, + "sampling/sampling_logp_difference/max": 0.510839581489563, + "sampling/sampling_logp_difference/mean": 0.006749512627720833, + "step": 347, + "step_time": 39.46550670700071 + }, + { + "clip_ratio/high_max": 0.05993883335031569, + "clip_ratio/high_mean": 0.015852763841394335, + "clip_ratio/low_mean": 0.02315665892092511, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390094225294888, + "entropy": 0.11026378069072962, + "epoch": 0.00696, + "grad_norm": 0.3181508183479309, + "kl": 0.6370288580656052, + "learning_rate": 9.999819735260483e-05, + "loss": 0.0068, + "step": 348, + "step_time": 10.100482684999406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.002517361135687679, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002517361135687679, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1332.0, + "completions/max_terminated_length": 1332.0, + "completions/mean_length": 1208.171875, + "completions/mean_terminated_length": 1208.171875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.13927970174700022, + "epoch": 0.00698, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5229349732398987, + "kl": 0.5507344976067543, + "learning_rate": 9.999818577874781e-05, + "loss": 0.0234, + "num_tokens": 19552011.0, + "reward": 5.104192733764648, + "reward_std": 11.615788459777832, + "rewards/rollout_reward_func/mean": 5.104192733764648, + "rewards/rollout_reward_func/std": 12.1382474899292, + "sampling/importance_sampling_ratio/max": 1.415814995765686, + "sampling/importance_sampling_ratio/mean": 1.0048539638519287, + "sampling/importance_sampling_ratio/min": 1.834242700438695e-16, + "sampling/sampling_logp_difference/max": 27.188508987426758, + "sampling/sampling_logp_difference/mean": 0.039306361228227615, + "step": 349, + "step_time": 37.862728561997756 + }, + { + "clip_ratio/high_max": 0.05868378118611872, + "clip_ratio/high_mean": 0.017275112157221884, + "clip_ratio/low_mean": 0.013766340038273484, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03104145231191069, + "entropy": 0.14939681394025683, + "epoch": 0.007, + "grad_norm": 0.3000262379646301, + "kl": 0.5239376667886972, + "learning_rate": 9.999817416785565e-05, + "loss": 0.0173, + "step": 350, + "step_time": 9.859687822999149 + }, + { + "clip_ratio/high_max": 0.013706140452995896, + "clip_ratio/high_mean": 0.003426535113248974, + "clip_ratio/low_mean": 0.0026041666860692203, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006030701799318194, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1342.0, + "completions/max_terminated_length": 1342.0, + "completions/mean_length": 1236.90625, + "completions/mean_terminated_length": 1236.90625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "entropy": 0.14154944382607937, + "epoch": 0.00702, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47570475935935974, + "kl": 0.5030098669230938, + "learning_rate": 9.999816251992836e-05, + "loss": -0.0158, + "num_tokens": 19682494.0, + "reward": 4.190635681152344, + "reward_std": 14.216930389404297, + "rewards/rollout_reward_func/mean": 4.190635681152344, + "rewards/rollout_reward_func/std": 14.30445671081543, + "sampling/importance_sampling_ratio/max": 1.5223360061645508, + "sampling/importance_sampling_ratio/mean": 1.0101966857910156, + "sampling/importance_sampling_ratio/min": 0.7218723297119141, + "sampling/sampling_logp_difference/max": 0.302712082862854, + "sampling/sampling_logp_difference/mean": 0.007096399553120136, + "step": 351, + "step_time": 39.30754639200131 + }, + { + "clip_ratio/high_max": 0.03492647083476186, + "clip_ratio/high_mean": 0.013026208442170173, + "clip_ratio/low_mean": 0.018280228949151933, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.031306437274906784, + "entropy": 0.1433765795081854, + "epoch": 0.00704, + "grad_norm": 0.2731061577796936, + "kl": 0.5208645444363356, + "learning_rate": 9.999815083496594e-05, + "loss": -0.0214, + "step": 352, + "step_time": 9.398018900999887 + }, + { + "clip_ratio/high_max": 0.013888888992369175, + "clip_ratio/high_mean": 0.0034722222480922937, + "clip_ratio/low_mean": 0.0034722222480922937, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0069444444961845875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1348.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 1237.3125, + "completions/mean_terminated_length": 1237.3125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.1480951178818941, + "epoch": 0.00706, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5071465373039246, + "kl": 0.5567373130470514, + "learning_rate": 9.99981391129684e-05, + "loss": -0.008, + "num_tokens": 19812942.0, + "reward": 4.355041980743408, + "reward_std": 13.132366180419922, + "rewards/rollout_reward_func/mean": 4.355041980743408, + "rewards/rollout_reward_func/std": 13.851308822631836, + "sampling/importance_sampling_ratio/max": 1.4629567861557007, + "sampling/importance_sampling_ratio/mean": 1.0300343036651611, + "sampling/importance_sampling_ratio/min": 0.6640676856040955, + "sampling/sampling_logp_difference/max": 0.5005507469177246, + "sampling/sampling_logp_difference/mean": 0.007855242118239403, + "step": 353, + "step_time": 38.34223270599978 + }, + { + "clip_ratio/high_max": 0.024305555736646056, + "clip_ratio/high_mean": 0.007812500116415322, + "clip_ratio/low_mean": 0.024994894512929022, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.032807394745759666, + "entropy": 0.13850665464997292, + "epoch": 0.00708, + "grad_norm": 0.27875232696533203, + "kl": 0.5900795683264732, + "learning_rate": 9.999812735393576e-05, + "loss": -0.0167, + "step": 354, + "step_time": 9.843489302002126 + }, + { + "clip_ratio/high_max": 0.010416666744276881, + "clip_ratio/high_mean": 0.0026041666860692203, + "clip_ratio/low_mean": 0.0008680555620230734, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0034722222480922937, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1358.0, + "completions/max_terminated_length": 1358.0, + "completions/mean_length": 1228.9375, + "completions/mean_terminated_length": 1228.9375, + "completions/min_length": 699.0, + "completions/min_terminated_length": 699.0, + "entropy": 0.13509350316599011, + "epoch": 0.0071, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5068008899688721, + "kl": 0.4818702656775713, + "learning_rate": 9.999811555786804e-05, + "loss": 0.0278, + "num_tokens": 19942820.0, + "reward": 5.804719924926758, + "reward_std": 13.167655944824219, + "rewards/rollout_reward_func/mean": 5.804719924926758, + "rewards/rollout_reward_func/std": 13.18018913269043, + "sampling/importance_sampling_ratio/max": 1.306014895439148, + "sampling/importance_sampling_ratio/mean": 1.0091025829315186, + "sampling/importance_sampling_ratio/min": 0.625792384147644, + "sampling/sampling_logp_difference/max": 0.3519221544265747, + "sampling/sampling_logp_difference/mean": 0.006864185445010662, + "step": 355, + "step_time": 39.10083819900228 + }, + { + "clip_ratio/high_max": 0.031250000232830644, + "clip_ratio/high_mean": 0.013888889225199819, + "clip_ratio/low_mean": 0.026143791212234646, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.040032680495642126, + "entropy": 0.1260006483644247, + "epoch": 0.00712, + "grad_norm": 0.2814493179321289, + "kl": 0.5503856968134642, + "learning_rate": 9.999810372476525e-05, + "loss": 0.0244, + "step": 356, + "step_time": 9.433313735999036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1365.0, + "completions/max_terminated_length": 1365.0, + "completions/mean_length": 1269.96875, + "completions/mean_terminated_length": 1269.96875, + "completions/min_length": 1127.0, + "completions/min_terminated_length": 1127.0, + "entropy": 0.1161547633819282, + "epoch": 0.00714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4718828499317169, + "kl": 0.9211016893386841, + "learning_rate": 9.999809185462739e-05, + "loss": 0.039, + "num_tokens": 20075371.0, + "reward": 3.961371898651123, + "reward_std": 11.789936065673828, + "rewards/rollout_reward_func/mean": 3.961371898651123, + "rewards/rollout_reward_func/std": 12.59416675567627, + "sampling/importance_sampling_ratio/max": 1.223926067352295, + "sampling/importance_sampling_ratio/mean": 0.9972316026687622, + "sampling/importance_sampling_ratio/min": 0.7068163156509399, + "sampling/sampling_logp_difference/max": 0.21517443656921387, + "sampling/sampling_logp_difference/mean": 0.0053621698170900345, + "step": 357, + "step_time": 38.67136614899937 + }, + { + "clip_ratio/high_max": 0.049019608180969954, + "clip_ratio/high_mean": 0.013991013227496296, + "clip_ratio/low_mean": 0.01996527804294601, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03395629138685763, + "entropy": 0.11733251390978694, + "epoch": 0.00716, + "grad_norm": 0.16502372920513153, + "kl": 0.7801671754568815, + "learning_rate": 9.999807994745449e-05, + "loss": 0.0324, + "step": 358, + "step_time": 9.794801944999563 + }, + { + "clip_ratio/high_max": 0.0069444444961845875, + "clip_ratio/high_mean": 0.0017361111240461469, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017361111240461469, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1343.0, + "completions/max_terminated_length": 1343.0, + "completions/mean_length": 1256.96875, + "completions/mean_terminated_length": 1256.96875, + "completions/min_length": 1011.0, + "completions/min_terminated_length": 1011.0, + "entropy": 0.13261962542310357, + "epoch": 0.00718, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4223484992980957, + "kl": 0.6454224642366171, + "learning_rate": 9.999806800324652e-05, + "loss": -0.0021, + "num_tokens": 20207093.0, + "reward": 4.6267523765563965, + "reward_std": 13.086963653564453, + "rewards/rollout_reward_func/mean": 4.626751899719238, + "rewards/rollout_reward_func/std": 14.676898956298828, + "sampling/importance_sampling_ratio/max": 1.336045265197754, + "sampling/importance_sampling_ratio/mean": 0.9978616237640381, + "sampling/importance_sampling_ratio/min": 0.6580431461334229, + "sampling/sampling_logp_difference/max": 0.287054181098938, + "sampling/sampling_logp_difference/mean": 0.0057748714461922646, + "step": 359, + "step_time": 38.74908411499746 + }, + { + "clip_ratio/high_max": 0.058662281604483724, + "clip_ratio/high_mean": 0.018137792707420886, + "clip_ratio/low_mean": 0.018183479725848883, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03632127266610041, + "entropy": 0.1370791387744248, + "epoch": 0.0072, + "grad_norm": 0.26030489802360535, + "kl": 0.6394520290195942, + "learning_rate": 9.999805602200354e-05, + "loss": -0.0085, + "step": 360, + "step_time": 9.32019592799952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0008680555620230734, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008680555620230734, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1344.0, + "completions/max_terminated_length": 1344.0, + "completions/mean_length": 1241.734375, + "completions/mean_terminated_length": 1241.734375, + "completions/min_length": 1025.0, + "completions/min_terminated_length": 1025.0, + "entropy": 0.1345509896054864, + "epoch": 0.00722, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5100898742675781, + "kl": 0.8387140035629272, + "learning_rate": 9.999804400372554e-05, + "loss": 0.007, + "num_tokens": 20337789.0, + "reward": 9.449151039123535, + "reward_std": 12.286431312561035, + "rewards/rollout_reward_func/mean": 9.449151039123535, + "rewards/rollout_reward_func/std": 13.57576847076416, + "sampling/importance_sampling_ratio/max": 1.4115562438964844, + "sampling/importance_sampling_ratio/mean": 0.990313708782196, + "sampling/importance_sampling_ratio/min": 0.6950281858444214, + "sampling/sampling_logp_difference/max": 0.3388124704360962, + "sampling/sampling_logp_difference/mean": 0.00530852098017931, + "step": 361, + "step_time": 39.235169910002696 + }, + { + "clip_ratio/high_max": 0.0349264710675925, + "clip_ratio/high_mean": 0.010467728832736611, + "clip_ratio/low_mean": 0.01741217344533652, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.027879902394488454, + "entropy": 0.15170079609379172, + "epoch": 0.00724, + "grad_norm": 0.33363601565361023, + "kl": 0.6247174255549908, + "learning_rate": 9.999803194841253e-05, + "loss": 0.0003, + "step": 362, + "step_time": 9.35076707999906 + }, + { + "clip_ratio/high_max": 0.0034722222480922937, + "clip_ratio/high_mean": 0.0008680555620230734, + "clip_ratio/low_mean": 0.0017361111240461469, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026041666860692203, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1359.0, + "completions/max_terminated_length": 1359.0, + "completions/mean_length": 1251.59375, + "completions/mean_terminated_length": 1251.59375, + "completions/min_length": 700.0, + "completions/min_terminated_length": 700.0, + "entropy": 0.1958311009220779, + "epoch": 0.00726, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5317199230194092, + "kl": 0.5815525501966476, + "learning_rate": 9.999801985606452e-05, + "loss": 0.0042, + "num_tokens": 20469218.0, + "reward": 3.8903188705444336, + "reward_std": 13.076482772827148, + "rewards/rollout_reward_func/mean": 3.8903186321258545, + "rewards/rollout_reward_func/std": 13.372103691101074, + "sampling/importance_sampling_ratio/max": 1.3623380661010742, + "sampling/importance_sampling_ratio/mean": 1.0135592222213745, + "sampling/importance_sampling_ratio/min": 0.7123748064041138, + "sampling/sampling_logp_difference/max": 0.29522740840911865, + "sampling/sampling_logp_difference/mean": 0.006953438278287649, + "step": 363, + "step_time": 39.74288477299888 + }, + { + "clip_ratio/high_max": 0.06527777831070125, + "clip_ratio/high_mean": 0.01979166700039059, + "clip_ratio/low_mean": 0.021701389166992158, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04149305640021339, + "entropy": 0.2018888248130679, + "epoch": 0.00728, + "grad_norm": 0.2644880712032318, + "kl": 0.5717838387936354, + "learning_rate": 9.999800772668153e-05, + "loss": -0.0029, + "step": 364, + "step_time": 9.321073625997087 + }, + { + "clip_ratio/high_max": 0.0069444444961845875, + "clip_ratio/high_mean": 0.0017361111240461469, + "clip_ratio/low_mean": 0.0008680555620230734, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026041666860692203, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1354.0, + "completions/max_terminated_length": 1354.0, + "completions/mean_length": 1249.453125, + "completions/mean_terminated_length": 1249.453125, + "completions/min_length": 467.0, + "completions/min_terminated_length": 467.0, + "entropy": 0.18510928004980087, + "epoch": 0.0073, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.442364364862442, + "kl": 0.4795332048088312, + "learning_rate": 9.999799556026358e-05, + "loss": -0.0238, + "num_tokens": 20600462.0, + "reward": 6.273903846740723, + "reward_std": 12.39173698425293, + "rewards/rollout_reward_func/mean": 6.273903846740723, + "rewards/rollout_reward_func/std": 13.681985855102539, + "sampling/importance_sampling_ratio/max": 1.3438879251480103, + "sampling/importance_sampling_ratio/mean": 0.9609812498092651, + "sampling/importance_sampling_ratio/min": 0.6316797733306885, + "sampling/sampling_logp_difference/max": 0.33423590660095215, + "sampling/sampling_logp_difference/mean": 0.007498072925955057, + "step": 365, + "step_time": 38.43022035099784 + }, + { + "clip_ratio/high_max": 0.05171783687546849, + "clip_ratio/high_mean": 0.013797514839097857, + "clip_ratio/low_mean": 0.007766812981572002, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0215643277624622, + "entropy": 0.18268039543181658, + "epoch": 0.00732, + "grad_norm": 0.2666545808315277, + "kl": 0.47542588133364916, + "learning_rate": 9.999798335681066e-05, + "loss": -0.0309, + "step": 366, + "step_time": 9.454387761999897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0016904239892028272, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016904239892028272, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1344.0, + "completions/max_terminated_length": 1344.0, + "completions/mean_length": 1222.125, + "completions/mean_terminated_length": 1222.125, + "completions/min_length": 999.0, + "completions/min_terminated_length": 999.0, + "entropy": 0.21282331459224224, + "epoch": 0.00734, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8037834763526917, + "kl": 0.6722489278763533, + "learning_rate": 9.99979711163228e-05, + "loss": 0.0148, + "num_tokens": 20729886.0, + "reward": 5.174856662750244, + "reward_std": 11.355770111083984, + "rewards/rollout_reward_func/mean": 5.174857139587402, + "rewards/rollout_reward_func/std": 11.9678955078125, + "sampling/importance_sampling_ratio/max": 1.8758124113082886, + "sampling/importance_sampling_ratio/mean": 1.0103557109832764, + "sampling/importance_sampling_ratio/min": 0.7285647392272949, + "sampling/sampling_logp_difference/max": 0.3263084888458252, + "sampling/sampling_logp_difference/mean": 0.008695240132510662, + "step": 367, + "step_time": 37.67410640100388 + }, + { + "clip_ratio/high_max": 0.036011905409395695, + "clip_ratio/high_mean": 0.01073908741818741, + "clip_ratio/low_mean": 0.02561856439569965, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0363576520467177, + "entropy": 0.21391737554222345, + "epoch": 0.00736, + "grad_norm": 0.4400097727775574, + "kl": 0.749302851036191, + "learning_rate": 9.999795883880001e-05, + "loss": 0.005, + "step": 368, + "step_time": 9.906740201999128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.003426535113248974, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003426535113248974, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1348.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 1231.171875, + "completions/mean_terminated_length": 1231.171875, + "completions/min_length": 994.0, + "completions/min_terminated_length": 994.0, + "entropy": 0.2234082594513893, + "epoch": 0.00738, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0522061586380005, + "kl": 0.9143509455025196, + "learning_rate": 9.999794652424228e-05, + "loss": 0.0039, + "num_tokens": 20859908.0, + "reward": 8.738959312438965, + "reward_std": 11.845466613769531, + "rewards/rollout_reward_func/mean": 8.738959312438965, + "rewards/rollout_reward_func/std": 12.123114585876465, + "sampling/importance_sampling_ratio/max": 1.3250545263290405, + "sampling/importance_sampling_ratio/mean": 1.020609974861145, + "sampling/importance_sampling_ratio/min": 0.5885343551635742, + "sampling/sampling_logp_difference/max": 0.4251088500022888, + "sampling/sampling_logp_difference/mean": 0.008847212418913841, + "step": 369, + "step_time": 38.576368669004296 + }, + { + "clip_ratio/high_max": 0.05455874605104327, + "clip_ratio/high_mean": 0.016198166005779058, + "clip_ratio/low_mean": 0.03593064745655283, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05212881352053955, + "entropy": 0.22893889155238867, + "epoch": 0.0074, + "grad_norm": 0.39881831407546997, + "kl": 1.0176227018237114, + "learning_rate": 9.999793417264966e-05, + "loss": -0.0017, + "step": 370, + "step_time": 8.888960126005259 + }, + { + "clip_ratio/high_max": 0.0034722222480922937, + "clip_ratio/high_mean": 0.0008680555620230734, + "clip_ratio/low_mean": 0.0008680555620230734, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017361111240461469, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1348.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 1242.28125, + "completions/mean_terminated_length": 1242.28125, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.2308051260188222, + "epoch": 0.00742, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5715855956077576, + "kl": 0.87254199385643, + "learning_rate": 9.999792178402214e-05, + "loss": -0.0234, + "num_tokens": 20990697.0, + "reward": 6.123772621154785, + "reward_std": 10.485084533691406, + "rewards/rollout_reward_func/mean": 6.123772144317627, + "rewards/rollout_reward_func/std": 11.30632209777832, + "sampling/importance_sampling_ratio/max": 1.4539350271224976, + "sampling/importance_sampling_ratio/mean": 1.0005998611450195, + "sampling/importance_sampling_ratio/min": 0.5505736470222473, + "sampling/sampling_logp_difference/max": 0.3510777950286865, + "sampling/sampling_logp_difference/mean": 0.009203520603477955, + "step": 371, + "step_time": 38.65458783400027 + }, + { + "clip_ratio/high_max": 0.045200163731351495, + "clip_ratio/high_mean": 0.013036152173299342, + "clip_ratio/low_mean": 0.023381332110147923, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03641748463269323, + "entropy": 0.2447242382913828, + "epoch": 0.00744, + "grad_norm": 0.298229455947876, + "kl": 0.8313354179263115, + "learning_rate": 9.999790935835973e-05, + "loss": -0.0303, + "step": 372, + "step_time": 9.79756171700501 + }, + { + "clip_ratio/high_max": 0.010051169665530324, + "clip_ratio/high_mean": 0.002512792416382581, + "clip_ratio/low_mean": 0.0008680555620230734, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0033808479784056544, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1358.0, + "completions/max_terminated_length": 1358.0, + "completions/mean_length": 1223.75, + "completions/mean_terminated_length": 1223.75, + "completions/min_length": 908.0, + "completions/min_terminated_length": 908.0, + "entropy": 0.25220474135130644, + "epoch": 0.00746, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5470872521400452, + "kl": 0.7501334678381681, + "learning_rate": 9.999789689566245e-05, + "loss": 0.0016, + "num_tokens": 21120250.0, + "reward": 4.980414867401123, + "reward_std": 13.811859130859375, + "rewards/rollout_reward_func/mean": 4.980414867401123, + "rewards/rollout_reward_func/std": 15.705443382263184, + "sampling/importance_sampling_ratio/max": 1.4358227252960205, + "sampling/importance_sampling_ratio/mean": 0.9660643339157104, + "sampling/importance_sampling_ratio/min": 0.4660157859325409, + "sampling/sampling_logp_difference/max": 0.5715584754943848, + "sampling/sampling_logp_difference/mean": 0.011051887646317482, + "step": 373, + "step_time": 37.840678287995615 + }, + { + "clip_ratio/high_max": 0.07236842135898769, + "clip_ratio/high_mean": 0.021564327646046877, + "clip_ratio/low_mean": 0.025087612215429544, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04665194044355303, + "entropy": 0.2590667102485895, + "epoch": 0.00748, + "grad_norm": 0.33827197551727295, + "kl": 0.7311984747648239, + "learning_rate": 9.999788439593031e-05, + "loss": -0.0111, + "step": 374, + "step_time": 8.941922394003996 + }, + { + "clip_ratio/high_max": 0.0034722222480922937, + "clip_ratio/high_mean": 0.0008680555620230734, + "clip_ratio/low_mean": 0.0008680555620230734, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017361111240461469, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1351.0, + "completions/max_terminated_length": 1351.0, + "completions/mean_length": 1216.34375, + "completions/mean_terminated_length": 1216.34375, + "completions/min_length": 993.0, + "completions/min_terminated_length": 993.0, + "entropy": 0.2550716269761324, + "epoch": 0.0075, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8335476517677307, + "kl": 0.9307033438235521, + "learning_rate": 9.999787185916331e-05, + "loss": 0.0294, + "num_tokens": 21249311.0, + "reward": 5.41689920425415, + "reward_std": 12.388166427612305, + "rewards/rollout_reward_func/mean": 5.416898727416992, + "rewards/rollout_reward_func/std": 13.267603874206543, + "sampling/importance_sampling_ratio/max": 1.4684741497039795, + "sampling/importance_sampling_ratio/mean": 1.0054916143417358, + "sampling/importance_sampling_ratio/min": 0.6331810355186462, + "sampling/sampling_logp_difference/max": 0.2799299955368042, + "sampling/sampling_logp_difference/mean": 0.01017037034034729, + "step": 375, + "step_time": 38.13824768100312 + }, + { + "clip_ratio/high_max": 0.06950894417241216, + "clip_ratio/high_mean": 0.021580452797934413, + "clip_ratio/low_mean": 0.02794391370844096, + "clip_ratio/low_min": 0.0034722222480922937, + "clip_ratio/region_mean": 0.049524366680998355, + "entropy": 0.2559032328426838, + "epoch": 0.00752, + "grad_norm": 0.2704547047615051, + "kl": 0.9575543515384197, + "learning_rate": 9.999785928536148e-05, + "loss": 0.0164, + "step": 376, + "step_time": 9.173922988000413 + }, + { + "clip_ratio/high_max": 0.00657894741743803, + "clip_ratio/high_mean": 0.0016447368543595076, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016447368543595076, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1330.0, + "completions/max_terminated_length": 1330.0, + "completions/mean_length": 1201.6875, + "completions/mean_terminated_length": 1201.6875, + "completions/min_length": 1002.0, + "completions/min_terminated_length": 1002.0, + "entropy": 0.2529036393389106, + "epoch": 0.00754, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5121884346008301, + "kl": 0.7823121659457684, + "learning_rate": 9.999784667452484e-05, + "loss": -0.0058, + "num_tokens": 21377388.0, + "reward": 5.937844753265381, + "reward_std": 10.75206184387207, + "rewards/rollout_reward_func/mean": 5.937845230102539, + "rewards/rollout_reward_func/std": 10.630824089050293, + "sampling/importance_sampling_ratio/max": 1.2855048179626465, + "sampling/importance_sampling_ratio/mean": 0.9824115037918091, + "sampling/importance_sampling_ratio/min": 0.7048435807228088, + "sampling/sampling_logp_difference/max": 0.3823585510253906, + "sampling/sampling_logp_difference/mean": 0.010149901732802391, + "step": 377, + "step_time": 37.091166489997704 + }, + { + "clip_ratio/high_max": 0.030701754614710808, + "clip_ratio/high_mean": 0.011025112122297287, + "clip_ratio/low_mean": 0.024379125621635467, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03540423803497106, + "entropy": 0.24686269089579582, + "epoch": 0.00756, + "grad_norm": 0.308444082736969, + "kl": 0.7720872350037098, + "learning_rate": 9.999783402665338e-05, + "loss": -0.0141, + "step": 378, + "step_time": 8.784612373994605 + }, + { + "clip_ratio/high_max": 0.01686507952399552, + "clip_ratio/high_mean": 0.00421626988099888, + "clip_ratio/low_mean": 0.0035807291860692203, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0077969990670681, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1343.0, + "completions/max_terminated_length": 1343.0, + "completions/mean_length": 1210.0625, + "completions/mean_terminated_length": 1210.0625, + "completions/min_length": 720.0, + "completions/min_terminated_length": 720.0, + "entropy": 0.2716317633166909, + "epoch": 0.00758, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6121698021888733, + "kl": 0.925221860408783, + "learning_rate": 9.999782134174711e-05, + "loss": -0.0013, + "num_tokens": 21506045.0, + "reward": 2.2015371322631836, + "reward_std": 15.299311637878418, + "rewards/rollout_reward_func/mean": 2.2015371322631836, + "rewards/rollout_reward_func/std": 15.50017261505127, + "sampling/importance_sampling_ratio/max": 1.400822639465332, + "sampling/importance_sampling_ratio/mean": 0.9883875846862793, + "sampling/importance_sampling_ratio/min": 0.625456690788269, + "sampling/sampling_logp_difference/max": 0.3319031000137329, + "sampling/sampling_logp_difference/mean": 0.011153844185173512, + "step": 379, + "step_time": 36.844649089001905 + }, + { + "clip_ratio/high_max": 0.05834899842739105, + "clip_ratio/high_mean": 0.017242478381376714, + "clip_ratio/low_mean": 0.03576550219440833, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05300798005191609, + "entropy": 0.2637836243957281, + "epoch": 0.0076, + "grad_norm": 0.4454600512981415, + "kl": 0.9274842478334904, + "learning_rate": 9.999780861980607e-05, + "loss": -0.0126, + "step": 380, + "step_time": 9.811575109000842 + }, + { + "clip_ratio/high_max": 0.010620915098115802, + "clip_ratio/high_mean": 0.0026552287745289505, + "clip_ratio/low_mean": 0.0009191176504828036, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003574346425011754, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1341.0, + "completions/max_terminated_length": 1341.0, + "completions/mean_length": 1219.8125, + "completions/mean_terminated_length": 1219.8125, + "completions/min_length": 991.0, + "completions/min_terminated_length": 991.0, + "entropy": 0.2292822152376175, + "epoch": 0.00762, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6078642010688782, + "kl": 0.803357319906354, + "learning_rate": 9.999779586083025e-05, + "loss": 0.004, + "num_tokens": 21635298.0, + "reward": 5.408005714416504, + "reward_std": 9.926593780517578, + "rewards/rollout_reward_func/mean": 5.4080047607421875, + "rewards/rollout_reward_func/std": 11.208430290222168, + "sampling/importance_sampling_ratio/max": 1.2364863157272339, + "sampling/importance_sampling_ratio/mean": 0.997908890247345, + "sampling/importance_sampling_ratio/min": 0.6723216772079468, + "sampling/sampling_logp_difference/max": 0.38260674476623535, + "sampling/sampling_logp_difference/mean": 0.007029087748378515, + "step": 381, + "step_time": 37.91188854600114 + }, + { + "clip_ratio/high_max": 0.05433114105835557, + "clip_ratio/high_mean": 0.014450840826611966, + "clip_ratio/low_mean": 0.02246758935507387, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03691843029810116, + "entropy": 0.21105156652629375, + "epoch": 0.00764, + "grad_norm": 0.3075491189956665, + "kl": 0.9027222413569689, + "learning_rate": 9.999778306481968e-05, + "loss": -0.0043, + "step": 382, + "step_time": 9.498284126000726 + }, + { + "clip_ratio/high_max": 0.0036764706019312143, + "clip_ratio/high_mean": 0.0009191176504828036, + "clip_ratio/low_mean": 0.0008680555620230734, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001787173212505877, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1357.0, + "completions/max_terminated_length": 1357.0, + "completions/mean_length": 1213.765625, + "completions/mean_terminated_length": 1213.765625, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.20513668935745955, + "epoch": 0.00766, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6294713020324707, + "kl": 0.7374063562601805, + "learning_rate": 9.999777023177434e-05, + "loss": 0.0252, + "num_tokens": 21764144.0, + "reward": 8.72990894317627, + "reward_std": 11.312125205993652, + "rewards/rollout_reward_func/mean": 8.729909896850586, + "rewards/rollout_reward_func/std": 11.270212173461914, + "sampling/importance_sampling_ratio/max": 1.667926549911499, + "sampling/importance_sampling_ratio/mean": 1.0118814706802368, + "sampling/importance_sampling_ratio/min": 0.7219305038452148, + "sampling/sampling_logp_difference/max": 0.31063222885131836, + "sampling/sampling_logp_difference/mean": 0.007431398145854473, + "step": 383, + "step_time": 37.184195774003456 + }, + { + "clip_ratio/high_max": 0.024509804090484977, + "clip_ratio/high_mean": 0.006995506584644318, + "clip_ratio/low_mean": 0.034743722644634545, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.041739229462109506, + "entropy": 0.19153737649321556, + "epoch": 0.00768, + "grad_norm": 0.37729939818382263, + "kl": 1.0056524686515331, + "learning_rate": 9.999775736169427e-05, + "loss": 0.0245, + "step": 384, + "step_time": 8.749921898001048 + }, + { + "clip_ratio/high_max": 0.0059523810632526875, + "clip_ratio/high_mean": 0.0014880952658131719, + "clip_ratio/low_mean": 0.004579809028655291, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0060679042944684625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1551.0, + "completions/max_terminated_length": 1551.0, + "completions/mean_length": 1410.3125, + "completions/mean_terminated_length": 1410.3125, + "completions/min_length": 765.0, + "completions/min_terminated_length": 765.0, + "entropy": 0.20708153676241636, + "epoch": 0.0077, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.738293468952179, + "kl": 0.887890812009573, + "learning_rate": 9.99977444545795e-05, + "loss": -0.0628, + "num_tokens": 21905662.0, + "reward": 9.202791213989258, + "reward_std": 15.181166648864746, + "rewards/rollout_reward_func/mean": 9.202792167663574, + "rewards/rollout_reward_func/std": 15.67770767211914, + "sampling/importance_sampling_ratio/max": 1.5441806316375732, + "sampling/importance_sampling_ratio/mean": 0.9917982816696167, + "sampling/importance_sampling_ratio/min": 5.679499839178481e-13, + "sampling/sampling_logp_difference/max": 22.66815948486328, + "sampling/sampling_logp_difference/mean": 0.030492324382066727, + "step": 385, + "step_time": 39.20689014000345 + }, + { + "clip_ratio/high_max": 0.023971861926838756, + "clip_ratio/high_mean": 0.007481060747522861, + "clip_ratio/low_mean": 0.03716492815874517, + "clip_ratio/low_min": 0.0029761905316263437, + "clip_ratio/region_mean": 0.04464598890626803, + "entropy": 0.1879758802242577, + "epoch": 0.00772, + "grad_norm": 0.41567400097846985, + "kl": 0.8819043859839439, + "learning_rate": 9.999773151042999e-05, + "loss": -0.0737, + "step": 386, + "step_time": 10.475744582005063 + }, + { + "clip_ratio/high_max": 0.009424603311344981, + "clip_ratio/high_mean": 0.0023561508278362453, + "clip_ratio/low_mean": 0.0007440476329065859, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0031001984607428312, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1556.0, + "completions/max_terminated_length": 1556.0, + "completions/mean_length": 1433.34375, + "completions/mean_terminated_length": 1433.34375, + "completions/min_length": 1211.0, + "completions/min_terminated_length": 1211.0, + "entropy": 0.15365674067288637, + "epoch": 0.00774, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5962560176849365, + "kl": 0.6632435545325279, + "learning_rate": 9.99977185292458e-05, + "loss": 0.0216, + "num_tokens": 22048591.0, + "reward": 13.268250465393066, + "reward_std": 13.775822639465332, + "rewards/rollout_reward_func/mean": 13.268250465393066, + "rewards/rollout_reward_func/std": 14.63206958770752, + "sampling/importance_sampling_ratio/max": 1.2218079566955566, + "sampling/importance_sampling_ratio/mean": 0.9793609380722046, + "sampling/importance_sampling_ratio/min": 0.6325286626815796, + "sampling/sampling_logp_difference/max": 0.38329482078552246, + "sampling/sampling_logp_difference/mean": 0.0064071910455822945, + "step": 387, + "step_time": 41.232673029999205 + }, + { + "clip_ratio/high_max": 0.05530754057690501, + "clip_ratio/high_mean": 0.01680307579226792, + "clip_ratio/low_mean": 0.014248512219637632, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.031051588244736195, + "entropy": 0.14256418915465474, + "epoch": 0.00776, + "grad_norm": 0.527172863483429, + "kl": 0.646535612642765, + "learning_rate": 9.999770551102692e-05, + "loss": 0.0167, + "step": 388, + "step_time": 10.636822301992652 + }, + { + "clip_ratio/high_max": 0.0031250000465661287, + "clip_ratio/high_mean": 0.0007812500116415322, + "clip_ratio/low_mean": 0.0007440476329065859, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015252976445481181, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1545.0, + "completions/max_terminated_length": 1545.0, + "completions/mean_length": 1429.21875, + "completions/mean_terminated_length": 1429.21875, + "completions/min_length": 1226.0, + "completions/min_terminated_length": 1226.0, + "entropy": 0.14011064730584621, + "epoch": 0.00778, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5465406775474548, + "kl": 0.6449617743492126, + "learning_rate": 9.999769245577337e-05, + "loss": -0.0416, + "num_tokens": 22191273.0, + "reward": 10.615909576416016, + "reward_std": 10.947202682495117, + "rewards/rollout_reward_func/mean": 10.615909576416016, + "rewards/rollout_reward_func/std": 12.735282897949219, + "sampling/importance_sampling_ratio/max": 2.317744016647339, + "sampling/importance_sampling_ratio/mean": 1.0246381759643555, + "sampling/importance_sampling_ratio/min": 0.2836526930332184, + "sampling/sampling_logp_difference/max": 1.3213729858398438, + "sampling/sampling_logp_difference/mean": 0.008526146411895752, + "step": 389, + "step_time": 41.60411787599878 + }, + { + "clip_ratio/high_max": 0.03645833395421505, + "clip_ratio/high_mean": 0.01361607201397419, + "clip_ratio/low_mean": 0.013582785322796553, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02719885722035542, + "entropy": 0.15200490225106478, + "epoch": 0.0078, + "grad_norm": 0.4452749192714691, + "kl": 0.5898908544331789, + "learning_rate": 9.999767936348516e-05, + "loss": -0.05, + "step": 390, + "step_time": 10.0632308130007 + }, + { + "clip_ratio/high_max": 0.0029761905316263437, + "clip_ratio/high_mean": 0.0007440476329065859, + "clip_ratio/low_mean": 0.0014880952658131719, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002232142898719758, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1550.0, + "completions/max_terminated_length": 1550.0, + "completions/mean_length": 1416.0625, + "completions/mean_terminated_length": 1416.0625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "entropy": 0.16107679810374975, + "epoch": 0.00782, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5122284889221191, + "kl": 0.5392248686403036, + "learning_rate": 9.999766623416232e-05, + "loss": -0.0577, + "num_tokens": 22333164.0, + "reward": 14.949935913085938, + "reward_std": 16.67510414123535, + "rewards/rollout_reward_func/mean": 14.949935913085938, + "rewards/rollout_reward_func/std": 18.703474044799805, + "sampling/importance_sampling_ratio/max": 1.4272053241729736, + "sampling/importance_sampling_ratio/mean": 0.9347177743911743, + "sampling/importance_sampling_ratio/min": 0.16998553276062012, + "sampling/sampling_logp_difference/max": 1.3626210689544678, + "sampling/sampling_logp_difference/mean": 0.009718427434563637, + "step": 391, + "step_time": 39.99297312899398 + }, + { + "clip_ratio/high_max": 0.02976190554909408, + "clip_ratio/high_mean": 0.008928571594879031, + "clip_ratio/low_mean": 0.015298011188860983, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.024226582725532353, + "entropy": 0.14390948927029967, + "epoch": 0.00784, + "grad_norm": 0.4105764627456665, + "kl": 0.5607901010662317, + "learning_rate": 9.999765306780482e-05, + "loss": -0.0626, + "step": 392, + "step_time": 10.043341166003302 + }, + { + "clip_ratio/high_max": 0.009077381109818816, + "clip_ratio/high_mean": 0.002269345277454704, + "clip_ratio/low_mean": 0.0007440476329065859, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00301339291036129, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1554.0, + "completions/max_terminated_length": 1554.0, + "completions/mean_length": 1443.640625, + "completions/mean_terminated_length": 1443.640625, + "completions/min_length": 1069.0, + "completions/min_terminated_length": 1069.0, + "entropy": 0.12537508364766836, + "epoch": 0.00786, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7089640498161316, + "kl": 0.9093821812421083, + "learning_rate": 9.99976398644127e-05, + "loss": 0.0186, + "num_tokens": 22476782.0, + "reward": 11.492524147033691, + "reward_std": 15.943157196044922, + "rewards/rollout_reward_func/mean": 11.492524147033691, + "rewards/rollout_reward_func/std": 16.71925163269043, + "sampling/importance_sampling_ratio/max": 1.7644160985946655, + "sampling/importance_sampling_ratio/mean": 0.9914994239807129, + "sampling/importance_sampling_ratio/min": 0.7484045028686523, + "sampling/sampling_logp_difference/max": 0.4207209348678589, + "sampling/sampling_logp_difference/mean": 0.006036281120032072, + "step": 393, + "step_time": 40.15463091899983 + }, + { + "clip_ratio/high_max": 0.01800595293752849, + "clip_ratio/high_mean": 0.005245535809081048, + "clip_ratio/low_mean": 0.018960813991725445, + "clip_ratio/low_min": 0.0029761905316263437, + "clip_ratio/region_mean": 0.024206349917221814, + "entropy": 0.11524984752759337, + "epoch": 0.00788, + "grad_norm": 0.7076042890548706, + "kl": 0.7414491530507803, + "learning_rate": 9.9997626623986e-05, + "loss": 0.0115, + "step": 394, + "step_time": 10.618017185999634 + }, + { + "clip_ratio/high_max": 0.0029761905316263437, + "clip_ratio/high_mean": 0.0007440476329065859, + "clip_ratio/low_mean": 0.0007440476329065859, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014880952658131719, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1563.0, + "completions/max_terminated_length": 1563.0, + "completions/mean_length": 1443.5, + "completions/mean_terminated_length": 1443.5, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "entropy": 0.12252500653266907, + "epoch": 0.0079, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7846469283103943, + "kl": 0.7317866403609514, + "learning_rate": 9.999761334652469e-05, + "loss": 0.0075, + "num_tokens": 22620477.0, + "reward": 11.849661827087402, + "reward_std": 16.187042236328125, + "rewards/rollout_reward_func/mean": 11.849662780761719, + "rewards/rollout_reward_func/std": 17.399803161621094, + "sampling/importance_sampling_ratio/max": 1.4447773694992065, + "sampling/importance_sampling_ratio/mean": 1.0075819492340088, + "sampling/importance_sampling_ratio/min": 0.663360595703125, + "sampling/sampling_logp_difference/max": 0.43144845962524414, + "sampling/sampling_logp_difference/mean": 0.007304108701646328, + "step": 395, + "step_time": 40.574595107005734 + }, + { + "clip_ratio/high_max": 0.033482143422588706, + "clip_ratio/high_mean": 0.011425047181546688, + "clip_ratio/low_mean": 0.01829117111628875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02971621841425076, + "entropy": 0.12393791414797306, + "epoch": 0.00792, + "grad_norm": 0.38290056586265564, + "kl": 0.7317893952131271, + "learning_rate": 9.999760003202881e-05, + "loss": 0.0033, + "step": 396, + "step_time": 10.742739806995814 + }, + { + "clip_ratio/high_max": 0.01205357164144516, + "clip_ratio/high_mean": 0.00301339291036129, + "clip_ratio/low_mean": 0.0007440476329065859, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003757440543267876, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1543.0, + "completions/max_terminated_length": 1543.0, + "completions/mean_length": 1444.578125, + "completions/mean_terminated_length": 1444.578125, + "completions/min_length": 1287.0, + "completions/min_terminated_length": 1287.0, + "entropy": 0.13034009747207165, + "epoch": 0.00794, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7314363121986389, + "kl": 0.6178351659327745, + "learning_rate": 9.999758668049833e-05, + "loss": -0.0157, + "num_tokens": 22764146.0, + "reward": 11.904011726379395, + "reward_std": 15.453010559082031, + "rewards/rollout_reward_func/mean": 11.904010772705078, + "rewards/rollout_reward_func/std": 16.291580200195312, + "sampling/importance_sampling_ratio/max": 1.2977502346038818, + "sampling/importance_sampling_ratio/mean": 0.9704984426498413, + "sampling/importance_sampling_ratio/min": 0.6586284637451172, + "sampling/sampling_logp_difference/max": 0.34184467792510986, + "sampling/sampling_logp_difference/mean": 0.006397986318916082, + "step": 397, + "step_time": 40.79752054799974 + }, + { + "clip_ratio/high_max": 0.043154762824997306, + "clip_ratio/high_mean": 0.014508928987197578, + "clip_ratio/low_mean": 0.025279997498728335, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.039788926660548896, + "entropy": 0.11147738387808204, + "epoch": 0.00796, + "grad_norm": 0.28320473432540894, + "kl": 0.7334012817591429, + "learning_rate": 9.999757329193333e-05, + "loss": -0.021, + "step": 398, + "step_time": 9.331709539997973 + }, + { + "clip_ratio/high_max": 0.0028409091755747795, + "clip_ratio/high_mean": 0.0007102272938936949, + "clip_ratio/low_mean": 0.0007440476329065859, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014542749268002808, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1546.0, + "completions/max_terminated_length": 1546.0, + "completions/mean_length": 1455.65625, + "completions/mean_terminated_length": 1455.65625, + "completions/min_length": 1290.0, + "completions/min_terminated_length": 1290.0, + "entropy": 0.12526550004258752, + "epoch": 0.00798, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7262594103813171, + "kl": 0.6103415302932262, + "learning_rate": 9.999755986633378e-05, + "loss": -0.0318, + "num_tokens": 22908577.0, + "reward": 9.555159568786621, + "reward_std": 12.746781349182129, + "rewards/rollout_reward_func/mean": 9.555160522460938, + "rewards/rollout_reward_func/std": 14.475045204162598, + "sampling/importance_sampling_ratio/max": 1.3095201253890991, + "sampling/importance_sampling_ratio/mean": 0.9779645204544067, + "sampling/importance_sampling_ratio/min": 7.978658610397404e-16, + "sampling/sampling_logp_difference/max": 27.056884765625, + "sampling/sampling_logp_difference/mean": 0.034079719334840775, + "step": 399, + "step_time": 41.13124246700136 + }, + { + "clip_ratio/high_max": 0.033107627648860216, + "clip_ratio/high_mean": 0.010509049869142473, + "clip_ratio/low_mean": 0.02362351247575134, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03413256263593212, + "entropy": 0.1209962465800345, + "epoch": 0.008, + "grad_norm": 0.35216766595840454, + "kl": 0.6524146590381861, + "learning_rate": 9.99975464036997e-05, + "loss": -0.044, + "step": 400, + "step_time": 10.55650918399806 + }, + { + "clip_ratio/high_max": 0.0059523810632526875, + "clip_ratio/high_mean": 0.0014880952658131719, + "clip_ratio/low_mean": 0.0015252976445481181, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00301339291036129, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1555.0, + "completions/max_terminated_length": 1555.0, + "completions/mean_length": 1432.03125, + "completions/mean_terminated_length": 1432.03125, + "completions/min_length": 1248.0, + "completions/min_terminated_length": 1248.0, + "entropy": 0.11135548166930676, + "epoch": 0.00802, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.133269190788269, + "kl": 0.7685734387487173, + "learning_rate": 9.99975329040311e-05, + "loss": 0.0237, + "num_tokens": 23051446.0, + "reward": 10.270038604736328, + "reward_std": 15.682093620300293, + "rewards/rollout_reward_func/mean": 10.270038604736328, + "rewards/rollout_reward_func/std": 16.255008697509766, + "sampling/importance_sampling_ratio/max": 1.4514762163162231, + "sampling/importance_sampling_ratio/mean": 1.0226449966430664, + "sampling/importance_sampling_ratio/min": 0.7271938920021057, + "sampling/sampling_logp_difference/max": 0.45901012420654297, + "sampling/sampling_logp_difference/mean": 0.005241828970611095, + "step": 401, + "step_time": 41.26497857599861 + }, + { + "clip_ratio/high_max": 0.03290043352171779, + "clip_ratio/high_mean": 0.008225108380429447, + "clip_ratio/low_mean": 0.015327381319366395, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023552489699795842, + "entropy": 0.11777450842782855, + "epoch": 0.00804, + "grad_norm": 0.928424060344696, + "kl": 1.0143736563622952, + "learning_rate": 9.999751936732799e-05, + "loss": 0.0269, + "step": 402, + "step_time": 10.720703897995918 + }, + { + "clip_ratio/high_max": 0.008928571594879031, + "clip_ratio/high_mean": 0.0029761905316263437, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0029761905316263437, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1558.0, + "completions/max_terminated_length": 1558.0, + "completions/mean_length": 1402.484375, + "completions/mean_terminated_length": 1402.484375, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "entropy": 0.12977053970098495, + "epoch": 0.00806, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5543701648712158, + "kl": 0.6270178612321615, + "learning_rate": 9.999750579359041e-05, + "loss": 0.0183, + "num_tokens": 23192365.0, + "reward": 10.964115142822266, + "reward_std": 14.9024658203125, + "rewards/rollout_reward_func/mean": 10.964115142822266, + "rewards/rollout_reward_func/std": 15.60954475402832, + "sampling/importance_sampling_ratio/max": 1.254056453704834, + "sampling/importance_sampling_ratio/mean": 0.9866700768470764, + "sampling/importance_sampling_ratio/min": 0.661080539226532, + "sampling/sampling_logp_difference/max": 0.2775760889053345, + "sampling/sampling_logp_difference/mean": 0.005836261436343193, + "step": 403, + "step_time": 40.087825246997454 + }, + { + "clip_ratio/high_max": 0.029910714831203222, + "clip_ratio/high_mean": 0.009676001209300011, + "clip_ratio/low_mean": 0.016021825780626386, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.025697826989926398, + "entropy": 0.13238740153610706, + "epoch": 0.00808, + "grad_norm": 0.44548580050468445, + "kl": 0.718162702396512, + "learning_rate": 9.999749218281836e-05, + "loss": 0.0147, + "step": 404, + "step_time": 9.659750243004964 + }, + { + "clip_ratio/high_max": 0.0029761905316263437, + "clip_ratio/high_mean": 0.0007440476329065859, + "clip_ratio/low_mean": 0.0007440476329065859, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014880952658131719, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1552.0, + "completions/max_terminated_length": 1552.0, + "completions/mean_length": 1440.421875, + "completions/mean_terminated_length": 1440.421875, + "completions/min_length": 1154.0, + "completions/min_terminated_length": 1154.0, + "entropy": 0.12935744831338525, + "epoch": 0.0081, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5480543375015259, + "kl": 0.6046669036149979, + "learning_rate": 9.999747853501184e-05, + "loss": 0.0137, + "num_tokens": 23335798.0, + "reward": 12.202452659606934, + "reward_std": 18.661951065063477, + "rewards/rollout_reward_func/mean": 12.20245361328125, + "rewards/rollout_reward_func/std": 20.890966415405273, + "sampling/importance_sampling_ratio/max": 1.5541430711746216, + "sampling/importance_sampling_ratio/mean": 1.0242815017700195, + "sampling/importance_sampling_ratio/min": 0.6801992058753967, + "sampling/sampling_logp_difference/max": 0.38781797885894775, + "sampling/sampling_logp_difference/mean": 0.006136234849691391, + "step": 405, + "step_time": 40.755317154002114 + }, + { + "clip_ratio/high_max": 0.02380952425301075, + "clip_ratio/high_mean": 0.00744047638727352, + "clip_ratio/low_mean": 0.018129960633814335, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.025570437079295516, + "entropy": 0.12149734795093536, + "epoch": 0.00812, + "grad_norm": 0.26514580845832825, + "kl": 0.64109767973423, + "learning_rate": 9.999746485017087e-05, + "loss": 0.0087, + "step": 406, + "step_time": 10.136832774996947 + }, + { + "clip_ratio/high_max": 0.0059523810632526875, + "clip_ratio/high_mean": 0.0014880952658131719, + "clip_ratio/low_mean": 0.002232142898719758, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0037202381645329297, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1554.0, + "completions/max_terminated_length": 1554.0, + "completions/mean_length": 1405.703125, + "completions/mean_terminated_length": 1405.703125, + "completions/min_length": 666.0, + "completions/min_terminated_length": 666.0, + "entropy": 0.12104977620765567, + "epoch": 0.00814, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35428720712661743, + "kl": 0.6081782300025225, + "learning_rate": 9.999745112829547e-05, + "loss": 0.0047, + "num_tokens": 23476941.0, + "reward": 10.940488815307617, + "reward_std": 14.940820693969727, + "rewards/rollout_reward_func/mean": 10.940488815307617, + "rewards/rollout_reward_func/std": 15.13664436340332, + "sampling/importance_sampling_ratio/max": 1.254475712776184, + "sampling/importance_sampling_ratio/mean": 0.9845165014266968, + "sampling/importance_sampling_ratio/min": 0.6197980642318726, + "sampling/sampling_logp_difference/max": 0.40376973152160645, + "sampling/sampling_logp_difference/mean": 0.00637152511626482, + "step": 407, + "step_time": 40.52080072600438 + }, + { + "clip_ratio/high_max": 0.02380952425301075, + "clip_ratio/high_mean": 0.007440476329065859, + "clip_ratio/low_mean": 0.026450893783476204, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.033891370287165046, + "entropy": 0.11284881783649325, + "epoch": 0.00816, + "grad_norm": 0.26076704263687134, + "kl": 0.6312750466167927, + "learning_rate": 9.999743736938565e-05, + "loss": -0.0013, + "step": 408, + "step_time": 10.76028649699765 + }, + { + "clip_ratio/high_max": 0.0028409091755747795, + "clip_ratio/high_mean": 0.0007102272938936949, + "clip_ratio/low_mean": 0.002232142898719758, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0029423701926134527, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1546.0, + "completions/max_terminated_length": 1546.0, + "completions/mean_length": 1414.90625, + "completions/mean_terminated_length": 1414.90625, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "entropy": 0.11457140510901809, + "epoch": 0.00818, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.668945848941803, + "kl": 0.5923841055482626, + "learning_rate": 9.999742357344142e-05, + "loss": 0.0624, + "num_tokens": 23618723.0, + "reward": 10.537452697753906, + "reward_std": 15.241682052612305, + "rewards/rollout_reward_func/mean": 10.537453651428223, + "rewards/rollout_reward_func/std": 16.505765914916992, + "sampling/importance_sampling_ratio/max": 1.2935158014297485, + "sampling/importance_sampling_ratio/mean": 0.9813590049743652, + "sampling/importance_sampling_ratio/min": 2.974116992179171e-14, + "sampling/sampling_logp_difference/max": 25.953086853027344, + "sampling/sampling_logp_difference/mean": 0.028037957847118378, + "step": 409, + "step_time": 40.691261925003346 + }, + { + "clip_ratio/high_max": 0.04437229549512267, + "clip_ratio/high_mean": 0.011837121448479593, + "clip_ratio/low_mean": 0.016443452972453088, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.028280574886593968, + "entropy": 0.1118474374525249, + "epoch": 0.0082, + "grad_norm": 0.2630373537540436, + "kl": 0.6832827776670456, + "learning_rate": 9.999740974046282e-05, + "loss": 0.0566, + "step": 410, + "step_time": 9.717429660999187 + }, + { + "clip_ratio/high_max": 0.011904762126505375, + "clip_ratio/high_mean": 0.004464285797439516, + "clip_ratio/low_mean": 0.0007440476329065859, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0052083334303461015, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1543.0, + "completions/max_terminated_length": 1543.0, + "completions/mean_length": 1393.609375, + "completions/mean_terminated_length": 1393.609375, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "entropy": 0.1146247279830277, + "epoch": 0.00822, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47182416915893555, + "kl": 0.5627781376242638, + "learning_rate": 9.999739587044981e-05, + "loss": -0.0341, + "num_tokens": 23759122.0, + "reward": 8.971721649169922, + "reward_std": 14.443693161010742, + "rewards/rollout_reward_func/mean": 8.971721649169922, + "rewards/rollout_reward_func/std": 14.68343448638916, + "sampling/importance_sampling_ratio/max": 1.243363857269287, + "sampling/importance_sampling_ratio/mean": 0.9929588437080383, + "sampling/importance_sampling_ratio/min": 0.7046716809272766, + "sampling/sampling_logp_difference/max": 0.35747838020324707, + "sampling/sampling_logp_difference/mean": 0.005684119649231434, + "step": 411, + "step_time": 39.962890398002855 + }, + { + "clip_ratio/high_max": 0.035714286379516125, + "clip_ratio/high_mean": 0.009672619227785617, + "clip_ratio/low_mean": 0.01767113123787567, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02734375116415322, + "entropy": 0.11477407393977046, + "epoch": 0.00824, + "grad_norm": 0.24663475155830383, + "kl": 0.6022106558084488, + "learning_rate": 9.999738196340245e-05, + "loss": -0.0386, + "step": 412, + "step_time": 9.870993509000982 + }, + { + "clip_ratio/high_max": 0.0029761905316263437, + "clip_ratio/high_mean": 0.0007440476329065859, + "clip_ratio/low_mean": 0.002232142898719758, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0029761905316263437, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1563.0, + "completions/max_terminated_length": 1563.0, + "completions/mean_length": 1434.671875, + "completions/mean_terminated_length": 1434.671875, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.11288065044209361, + "epoch": 0.00826, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.38144180178642273, + "kl": 0.7383872698992491, + "learning_rate": 9.999736801932072e-05, + "loss": 0.0133, + "num_tokens": 23902181.0, + "reward": 13.304646492004395, + "reward_std": 20.157991409301758, + "rewards/rollout_reward_func/mean": 13.304647445678711, + "rewards/rollout_reward_func/std": 21.064607620239258, + "sampling/importance_sampling_ratio/max": 1.3603137731552124, + "sampling/importance_sampling_ratio/mean": 1.0158387422561646, + "sampling/importance_sampling_ratio/min": 0.7469893097877502, + "sampling/sampling_logp_difference/max": 0.2501299977302551, + "sampling/sampling_logp_difference/mean": 0.004556077066808939, + "step": 413, + "step_time": 41.25988831600807 + }, + { + "clip_ratio/high_max": 0.017857143422588706, + "clip_ratio/high_mean": 0.0044642858556471765, + "clip_ratio/low_mean": 0.012648809934034944, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01711309573147446, + "entropy": 0.10936349909752607, + "epoch": 0.00828, + "grad_norm": 0.2556546628475189, + "kl": 0.7252329587936401, + "learning_rate": 9.999735403820466e-05, + "loss": 0.0102, + "step": 414, + "step_time": 10.573283408997668 + }, + { + "clip_ratio/high_max": 0.0059523810632526875, + "clip_ratio/high_mean": 0.0014880952658131719, + "clip_ratio/low_mean": 0.0007440476329065859, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002232142898719758, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1562.0, + "completions/max_terminated_length": 1562.0, + "completions/mean_length": 1493.640625, + "completions/mean_terminated_length": 1493.640625, + "completions/min_length": 1359.0, + "completions/min_terminated_length": 1359.0, + "entropy": 0.11034470843151212, + "epoch": 0.0083, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7242380380630493, + "kl": 0.6212767362594604, + "learning_rate": 9.999734002005428e-05, + "loss": -0.0155, + "num_tokens": 24049141.0, + "reward": 9.928826332092285, + "reward_std": 15.976888656616211, + "rewards/rollout_reward_func/mean": 9.928826332092285, + "rewards/rollout_reward_func/std": 16.414718627929688, + "sampling/importance_sampling_ratio/max": 1.3260316848754883, + "sampling/importance_sampling_ratio/mean": 1.0085797309875488, + "sampling/importance_sampling_ratio/min": 0.5519727468490601, + "sampling/sampling_logp_difference/max": 0.5959200859069824, + "sampling/sampling_logp_difference/mean": 0.006388316862285137, + "step": 415, + "step_time": 40.88382640199961 + }, + { + "clip_ratio/high_max": 0.014880952658131719, + "clip_ratio/high_mean": 0.005952381121460348, + "clip_ratio/low_mean": 0.019494048377964646, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.025446429615840316, + "entropy": 0.09857920417562127, + "epoch": 0.00832, + "grad_norm": 0.39599546790122986, + "kl": 0.7278024889528751, + "learning_rate": 9.99973259648696e-05, + "loss": -0.013, + "step": 416, + "step_time": 10.850284384998304 + }, + { + "clip_ratio/high_max": 0.0059523810632526875, + "clip_ratio/high_mean": 0.0014880952658131719, + "clip_ratio/low_mean": 0.0007440476329065859, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002232142898719758, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1541.0, + "completions/max_terminated_length": 1541.0, + "completions/mean_length": 1388.96875, + "completions/mean_terminated_length": 1388.96875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "entropy": 0.1026167522650212, + "epoch": 0.00834, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4579165577888489, + "kl": 0.8239834625273943, + "learning_rate": 9.99973118726506e-05, + "loss": -0.0484, + "num_tokens": 24189178.0, + "reward": 12.621437072753906, + "reward_std": 16.67880630493164, + "rewards/rollout_reward_func/mean": 12.621437072753906, + "rewards/rollout_reward_func/std": 17.352924346923828, + "sampling/importance_sampling_ratio/max": 1.2838397026062012, + "sampling/importance_sampling_ratio/mean": 1.0156192779541016, + "sampling/importance_sampling_ratio/min": 0.6750461459159851, + "sampling/sampling_logp_difference/max": 0.2394113540649414, + "sampling/sampling_logp_difference/mean": 0.004534607753157616, + "step": 417, + "step_time": 39.856104094997136 + }, + { + "clip_ratio/high_max": 0.038690477376803756, + "clip_ratio/high_mean": 0.011904762184713036, + "clip_ratio/low_mean": 0.01116071455180645, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02306547696935013, + "entropy": 0.11136638512834907, + "epoch": 0.00836, + "grad_norm": 0.2160414755344391, + "kl": 0.6315647587180138, + "learning_rate": 9.999729774339733e-05, + "loss": -0.0554, + "step": 418, + "step_time": 9.950184918994637 + }, + { + "clip_ratio/high_max": 0.0028409091755747795, + "clip_ratio/high_mean": 0.0007102272938936949, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007102272938936949, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1545.0, + "completions/max_terminated_length": 1545.0, + "completions/mean_length": 1415.34375, + "completions/mean_terminated_length": 1415.34375, + "completions/min_length": 741.0, + "completions/min_terminated_length": 741.0, + "entropy": 0.12300179339945316, + "epoch": 0.00838, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35927513241767883, + "kl": 0.563910448923707, + "learning_rate": 9.999728357710979e-05, + "loss": -0.0024, + "num_tokens": 24330939.0, + "reward": 10.211483001708984, + "reward_std": 12.243392944335938, + "rewards/rollout_reward_func/mean": 10.211483001708984, + "rewards/rollout_reward_func/std": 12.923269271850586, + "sampling/importance_sampling_ratio/max": 1.561508297920227, + "sampling/importance_sampling_ratio/mean": 0.9852752089500427, + "sampling/importance_sampling_ratio/min": 0.6525661945343018, + "sampling/sampling_logp_difference/max": 0.421316921710968, + "sampling/sampling_logp_difference/mean": 0.005881062708795071, + "step": 419, + "step_time": 40.82010957399871 + }, + { + "clip_ratio/high_max": 0.023403680184856057, + "clip_ratio/high_mean": 0.00801836303435266, + "clip_ratio/low_mean": 0.005332341359462589, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01335070439381525, + "entropy": 0.12413196917623281, + "epoch": 0.0084, + "grad_norm": 0.22808168828487396, + "kl": 0.5641085561364889, + "learning_rate": 9.999726937378799e-05, + "loss": -0.0082, + "step": 420, + "step_time": 9.7226802879959 + }, + { + "clip_ratio/high_max": 0.008928571594879031, + "clip_ratio/high_mean": 0.002232142898719758, + "clip_ratio/low_mean": 0.002232142898719758, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004464285797439516, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1535.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 1441.640625, + "completions/mean_terminated_length": 1441.640625, + "completions/min_length": 864.0, + "completions/min_terminated_length": 864.0, + "entropy": 0.1284960494376719, + "epoch": 0.00842, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.532646656036377, + "kl": 0.7414810676127672, + "learning_rate": 9.999725513343196e-05, + "loss": 0.0034, + "num_tokens": 24474440.0, + "reward": 15.56411361694336, + "reward_std": 16.717456817626953, + "rewards/rollout_reward_func/mean": 15.56411361694336, + "rewards/rollout_reward_func/std": 16.81290626525879, + "sampling/importance_sampling_ratio/max": 1.2900909185409546, + "sampling/importance_sampling_ratio/mean": 1.0089163780212402, + "sampling/importance_sampling_ratio/min": 0.6302499175071716, + "sampling/sampling_logp_difference/max": 0.41839098930358887, + "sampling/sampling_logp_difference/mean": 0.006366787478327751, + "step": 421, + "step_time": 41.64962637100143 + }, + { + "clip_ratio/high_max": 0.020833333721384406, + "clip_ratio/high_mean": 0.0052083334303461015, + "clip_ratio/low_mean": 0.014136905199848115, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.019345238688401878, + "entropy": 0.12450070818886161, + "epoch": 0.00844, + "grad_norm": 0.3132474422454834, + "kl": 0.7047660015523434, + "learning_rate": 9.999724085604169e-05, + "loss": -0.0014, + "step": 422, + "step_time": 10.727001868001025 + }, + { + "clip_ratio/high_max": 0.0029761905316263437, + "clip_ratio/high_mean": 0.0007440476329065859, + "clip_ratio/low_mean": 0.002232142898719758, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0029761905316263437, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1557.0, + "completions/max_terminated_length": 1557.0, + "completions/mean_length": 1452.453125, + "completions/mean_terminated_length": 1452.453125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.11882536578923464, + "epoch": 0.00846, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7803420424461365, + "kl": 0.8879449907690287, + "learning_rate": 9.999722654161722e-05, + "loss": -0.0437, + "num_tokens": 24618707.0, + "reward": 11.537307739257812, + "reward_std": 16.87006187438965, + "rewards/rollout_reward_func/mean": 11.537307739257812, + "rewards/rollout_reward_func/std": 18.111291885375977, + "sampling/importance_sampling_ratio/max": 2.1790900230407715, + "sampling/importance_sampling_ratio/mean": 1.0079734325408936, + "sampling/importance_sampling_ratio/min": 0.6660839319229126, + "sampling/sampling_logp_difference/max": 1.0955865383148193, + "sampling/sampling_logp_difference/mean": 0.0059229484759271145, + "step": 423, + "step_time": 39.62021958500554 + }, + { + "clip_ratio/high_max": 0.02380952425301075, + "clip_ratio/high_mean": 0.006696428696159273, + "clip_ratio/low_mean": 0.015560741710942239, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02225717029068619, + "entropy": 0.12485062563791871, + "epoch": 0.00848, + "grad_norm": 0.31361132860183716, + "kl": 0.7454855944961309, + "learning_rate": 9.999721219015854e-05, + "loss": -0.0541, + "step": 424, + "step_time": 10.1194757400026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0014880952658131719, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014880952658131719, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1537.0, + "completions/max_terminated_length": 1537.0, + "completions/mean_length": 1414.84375, + "completions/mean_terminated_length": 1414.84375, + "completions/min_length": 690.0, + "completions/min_terminated_length": 690.0, + "entropy": 0.1313102599233389, + "epoch": 0.0085, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5788205862045288, + "kl": 0.6796710211783648, + "learning_rate": 9.999719780166567e-05, + "loss": -0.0346, + "num_tokens": 24760444.0, + "reward": 10.583850860595703, + "reward_std": 15.813437461853027, + "rewards/rollout_reward_func/mean": 10.583850860595703, + "rewards/rollout_reward_func/std": 15.782630920410156, + "sampling/importance_sampling_ratio/max": 1.3160440921783447, + "sampling/importance_sampling_ratio/mean": 0.9774030447006226, + "sampling/importance_sampling_ratio/min": 0.7505905628204346, + "sampling/sampling_logp_difference/max": 0.2754938304424286, + "sampling/sampling_logp_difference/mean": 0.00678935507312417, + "step": 425, + "step_time": 41.97174792000442 + }, + { + "clip_ratio/high_max": 0.020833333721384406, + "clip_ratio/high_mean": 0.0052083334303461015, + "clip_ratio/low_mean": 0.017931548063643277, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023139881726820022, + "entropy": 0.13454774813726544, + "epoch": 0.00852, + "grad_norm": 0.24161121249198914, + "kl": 0.6602058243006468, + "learning_rate": 9.999718337613865e-05, + "loss": -0.0446, + "step": 426, + "step_time": 9.663861974999236 + }, + { + "clip_ratio/high_max": 0.008928571594879031, + "clip_ratio/high_mean": 0.002232142898719758, + "clip_ratio/low_mean": 0.0014880952658131719, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0037202381645329297, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1540.0, + "completions/max_terminated_length": 1540.0, + "completions/mean_length": 1432.515625, + "completions/mean_terminated_length": 1432.515625, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 0.1410405244678259, + "epoch": 0.00854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5101809501647949, + "kl": 0.6386174689978361, + "learning_rate": 9.999716891357746e-05, + "loss": 0.0369, + "num_tokens": 24903364.0, + "reward": 11.803701400756836, + "reward_std": 16.973173141479492, + "rewards/rollout_reward_func/mean": 11.803701400756836, + "rewards/rollout_reward_func/std": 17.966468811035156, + "sampling/importance_sampling_ratio/max": 1.7738006114959717, + "sampling/importance_sampling_ratio/mean": 0.995194137096405, + "sampling/importance_sampling_ratio/min": 0.6213434338569641, + "sampling/sampling_logp_difference/max": 0.5084433555603027, + "sampling/sampling_logp_difference/mean": 0.007640195079147816, + "step": 427, + "step_time": 42.486011768000026 + }, + { + "clip_ratio/high_max": 0.02976190554909408, + "clip_ratio/high_mean": 0.01116071455180645, + "clip_ratio/low_mean": 0.012369791802484542, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023530506470706314, + "entropy": 0.14208506979048252, + "epoch": 0.00856, + "grad_norm": 0.2106575071811676, + "kl": 0.6240573097020388, + "learning_rate": 9.999715441398214e-05, + "loss": 0.0308, + "step": 428, + "step_time": 10.646923483993305 + }, + { + "clip_ratio/high_max": 0.0029761905316263437, + "clip_ratio/high_mean": 0.0007440476329065859, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007440476329065859, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1573.0, + "completions/max_terminated_length": 1573.0, + "completions/mean_length": 1438.828125, + "completions/mean_terminated_length": 1438.828125, + "completions/min_length": 1255.0, + "completions/min_terminated_length": 1255.0, + "entropy": 0.14296143036335707, + "epoch": 0.00858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42578843235969543, + "kl": 0.5468210577964783, + "learning_rate": 9.999713987735269e-05, + "loss": 0.0008, + "num_tokens": 25046668.0, + "reward": 12.157367706298828, + "reward_std": 19.82905387878418, + "rewards/rollout_reward_func/mean": 12.157367706298828, + "rewards/rollout_reward_func/std": 20.11625862121582, + "sampling/importance_sampling_ratio/max": 1.1917697191238403, + "sampling/importance_sampling_ratio/mean": 0.988789439201355, + "sampling/importance_sampling_ratio/min": 0.6782960295677185, + "sampling/sampling_logp_difference/max": 0.32637321949005127, + "sampling/sampling_logp_difference/mean": 0.006113000214099884, + "step": 429, + "step_time": 40.68629942800362 + }, + { + "clip_ratio/high_max": 0.023809524485841393, + "clip_ratio/high_mean": 0.007440476503688842, + "clip_ratio/low_mean": 0.01045386923942715, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.017894345801323652, + "entropy": 0.14418638544157147, + "epoch": 0.0086, + "grad_norm": 0.268960177898407, + "kl": 0.5387851055711508, + "learning_rate": 9.999712530368912e-05, + "loss": -0.0055, + "step": 430, + "step_time": 11.072718907002127 + }, + { + "clip_ratio/high_max": 0.0029761905316263437, + "clip_ratio/high_mean": 0.0007440476329065859, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007440476329065859, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1550.0, + "completions/max_terminated_length": 1550.0, + "completions/mean_length": 1429.546875, + "completions/mean_terminated_length": 1429.546875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "entropy": 0.15179488621652126, + "epoch": 0.00862, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7485275864601135, + "kl": 0.5430763624608517, + "learning_rate": 9.999711069299146e-05, + "loss": -0.0808, + "num_tokens": 25189448.0, + "reward": 11.358131408691406, + "reward_std": 17.856586456298828, + "rewards/rollout_reward_func/mean": 11.358131408691406, + "rewards/rollout_reward_func/std": 18.32318878173828, + "sampling/importance_sampling_ratio/max": 1.3345392942428589, + "sampling/importance_sampling_ratio/mean": 1.0265988111495972, + "sampling/importance_sampling_ratio/min": 0.48013654351234436, + "sampling/sampling_logp_difference/max": 0.7489854097366333, + "sampling/sampling_logp_difference/mean": 0.008107547648251057, + "step": 431, + "step_time": 40.926112169998305 + }, + { + "clip_ratio/high_max": 0.0654761919286102, + "clip_ratio/high_mean": 0.02008928614668548, + "clip_ratio/low_mean": 0.017782738606911153, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.037872024811804295, + "entropy": 0.15683973440900445, + "epoch": 0.00864, + "grad_norm": 0.2219485342502594, + "kl": 0.5109246261417866, + "learning_rate": 9.99970960452597e-05, + "loss": -0.0914, + "step": 432, + "step_time": 10.182908312996005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1552.0, + "completions/max_terminated_length": 1552.0, + "completions/mean_length": 1470.21875, + "completions/mean_terminated_length": 1470.21875, + "completions/min_length": 1344.0, + "completions/min_terminated_length": 1344.0, + "entropy": 0.14491091342642903, + "epoch": 0.00866, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46287354826927185, + "kl": 0.5186197776347399, + "learning_rate": 9.999708136049389e-05, + "loss": -0.0113, + "num_tokens": 25334849.0, + "reward": 10.764678955078125, + "reward_std": 13.417325973510742, + "rewards/rollout_reward_func/mean": 10.764678955078125, + "rewards/rollout_reward_func/std": 14.159459114074707, + "sampling/importance_sampling_ratio/max": 1.4547772407531738, + "sampling/importance_sampling_ratio/mean": 1.0081079006195068, + "sampling/importance_sampling_ratio/min": 0.7049920558929443, + "sampling/sampling_logp_difference/max": 0.4709939956665039, + "sampling/sampling_logp_difference/mean": 0.005667536519467831, + "step": 433, + "step_time": 42.1604533589998 + }, + { + "clip_ratio/high_max": 0.0476190485060215, + "clip_ratio/high_mean": 0.014136905199848115, + "clip_ratio/low_mean": 0.01785714365541935, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03199404844781384, + "entropy": 0.15482168877497315, + "epoch": 0.00868, + "grad_norm": 0.2332436740398407, + "kl": 0.5066223796457052, + "learning_rate": 9.9997066638694e-05, + "loss": -0.0183, + "step": 434, + "step_time": 10.119833896998898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1575.0, + "completions/max_terminated_length": 1575.0, + "completions/mean_length": 1436.375, + "completions/mean_terminated_length": 1436.375, + "completions/min_length": 795.0, + "completions/min_terminated_length": 795.0, + "entropy": 0.16640883032232523, + "epoch": 0.0087, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5225183367729187, + "kl": 0.4759600590914488, + "learning_rate": 9.999705187986009e-05, + "loss": 0.0044, + "num_tokens": 25478062.0, + "reward": 11.862446784973145, + "reward_std": 14.980566024780273, + "rewards/rollout_reward_func/mean": 11.862445831298828, + "rewards/rollout_reward_func/std": 15.403722763061523, + "sampling/importance_sampling_ratio/max": 1.3265814781188965, + "sampling/importance_sampling_ratio/mean": 1.0067017078399658, + "sampling/importance_sampling_ratio/min": 0.6984032988548279, + "sampling/sampling_logp_difference/max": 0.3158724308013916, + "sampling/sampling_logp_difference/mean": 0.007522557862102985, + "step": 435, + "step_time": 40.53673548099687 + }, + { + "clip_ratio/high_max": 0.08556547830812633, + "clip_ratio/high_mean": 0.028087798331398517, + "clip_ratio/low_mean": 0.025297619868069887, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05338541802484542, + "entropy": 0.166918208822608, + "epoch": 0.00872, + "grad_norm": 0.5592331886291504, + "kl": 0.46205065958201885, + "learning_rate": 9.999703708399215e-05, + "loss": -0.0001, + "step": 436, + "step_time": 10.790453629004332 + }, + { + "clip_ratio/high_max": 0.017857143189758062, + "clip_ratio/high_mean": 0.004464285797439516, + "clip_ratio/low_mean": 0.0007440476329065859, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0052083334303461015, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1542.0, + "completions/max_terminated_length": 1542.0, + "completions/mean_length": 1430.21875, + "completions/mean_terminated_length": 1430.21875, + "completions/min_length": 1031.0, + "completions/min_terminated_length": 1031.0, + "entropy": 0.1512767318636179, + "epoch": 0.00874, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5887247920036316, + "kl": 0.47883218713104725, + "learning_rate": 9.99970222510902e-05, + "loss": 0.023, + "num_tokens": 25620798.0, + "reward": 10.20716667175293, + "reward_std": 16.14691734313965, + "rewards/rollout_reward_func/mean": 10.20716667175293, + "rewards/rollout_reward_func/std": 17.900371551513672, + "sampling/importance_sampling_ratio/max": 1.2416183948516846, + "sampling/importance_sampling_ratio/mean": 0.9807419776916504, + "sampling/importance_sampling_ratio/min": 0.542736291885376, + "sampling/sampling_logp_difference/max": 0.36902284622192383, + "sampling/sampling_logp_difference/mean": 0.0073195262812078, + "step": 437, + "step_time": 40.27251563699974 + }, + { + "clip_ratio/high_max": 0.059523811331018806, + "clip_ratio/high_mean": 0.02306547691114247, + "clip_ratio/low_mean": 0.03698593232547864, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.06005140976049006, + "entropy": 0.11152059538289905, + "epoch": 0.00876, + "grad_norm": 0.34218233823776245, + "kl": 0.599434606730938, + "learning_rate": 9.999700738115424e-05, + "loss": 0.0208, + "step": 438, + "step_time": 10.141322578992913 + }, + { + "clip_ratio/high_max": 0.008928571594879031, + "clip_ratio/high_mean": 0.002232142898719758, + "clip_ratio/low_mean": 0.0007440476329065859, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0029761905316263437, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1551.0, + "completions/max_terminated_length": 1551.0, + "completions/mean_length": 1448.5625, + "completions/mean_terminated_length": 1448.5625, + "completions/min_length": 1357.0, + "completions/min_terminated_length": 1357.0, + "entropy": 0.09335534879937768, + "epoch": 0.00878, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5081444382667542, + "kl": 0.5331121180206537, + "learning_rate": 9.999699247418432e-05, + "loss": -0.0063, + "num_tokens": 25764758.0, + "reward": 9.246360778808594, + "reward_std": 12.59730339050293, + "rewards/rollout_reward_func/mean": 9.246360778808594, + "rewards/rollout_reward_func/std": 14.430070877075195, + "sampling/importance_sampling_ratio/max": 1.47153902053833, + "sampling/importance_sampling_ratio/mean": 0.9984990358352661, + "sampling/importance_sampling_ratio/min": 0.582763671875, + "sampling/sampling_logp_difference/max": 0.4040945768356323, + "sampling/sampling_logp_difference/mean": 0.0051497891545295715, + "step": 439, + "step_time": 41.78158714499841 + }, + { + "clip_ratio/high_max": 0.020833333721384406, + "clip_ratio/high_mean": 0.0052083334303461015, + "clip_ratio/low_mean": 0.02083333401242271, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.026041667442768812, + "entropy": 0.07112342561595142, + "epoch": 0.0088, + "grad_norm": 0.41764187812805176, + "kl": 0.8426203690469265, + "learning_rate": 9.999697753018041e-05, + "loss": -0.0085, + "step": 440, + "step_time": 10.17990355800066 + }, + { + "clip_ratio/high_max": 0.008928571594879031, + "clip_ratio/high_mean": 0.0029761905316263437, + "clip_ratio/low_mean": 0.002232142898719758, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0052083334303461015, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1552.0, + "completions/max_terminated_length": 1552.0, + "completions/mean_length": 1425.234375, + "completions/mean_terminated_length": 1425.234375, + "completions/min_length": 690.0, + "completions/min_terminated_length": 690.0, + "entropy": 0.07502732030116022, + "epoch": 0.00882, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4730430841445923, + "kl": 0.598696194589138, + "learning_rate": 9.999696254914256e-05, + "loss": -0.0232, + "num_tokens": 25907211.0, + "reward": 12.0460205078125, + "reward_std": 12.864827156066895, + "rewards/rollout_reward_func/mean": 12.0460205078125, + "rewards/rollout_reward_func/std": 13.124265670776367, + "sampling/importance_sampling_ratio/max": 2.117748975753784, + "sampling/importance_sampling_ratio/mean": 0.979032039642334, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.2621982097625732, + "sampling/sampling_logp_difference/mean": 0.006923416629433632, + "step": 441, + "step_time": 40.17172135200235 + }, + { + "clip_ratio/high_max": 0.020833333721384406, + "clip_ratio/high_mean": 0.006696428754366934, + "clip_ratio/low_mean": 0.010491071618162096, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01718750043073669, + "entropy": 0.07272043719422072, + "epoch": 0.00884, + "grad_norm": 0.2536933422088623, + "kl": 0.6081040930002928, + "learning_rate": 9.999694753107076e-05, + "loss": -0.0288, + "step": 442, + "step_time": 10.609227344999454 + }, + { + "clip_ratio/high_max": 0.0029761905316263437, + "clip_ratio/high_mean": 0.0007440476329065859, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007440476329065859, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1551.0, + "completions/max_terminated_length": 1551.0, + "completions/mean_length": 1433.484375, + "completions/mean_terminated_length": 1433.484375, + "completions/min_length": 637.0, + "completions/min_terminated_length": 637.0, + "entropy": 0.08062643301673234, + "epoch": 0.00886, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8294975757598877, + "kl": 0.5611858777701855, + "learning_rate": 9.999693247596505e-05, + "loss": 0.0316, + "num_tokens": 26050176.0, + "reward": 9.822164535522461, + "reward_std": 14.750000953674316, + "rewards/rollout_reward_func/mean": 9.822165489196777, + "rewards/rollout_reward_func/std": 14.6282377243042, + "sampling/importance_sampling_ratio/max": 1.5190024375915527, + "sampling/importance_sampling_ratio/mean": 1.0036146640777588, + "sampling/importance_sampling_ratio/min": 0.7604562640190125, + "sampling/sampling_logp_difference/max": 0.3031894564628601, + "sampling/sampling_logp_difference/mean": 0.004106580279767513, + "step": 443, + "step_time": 40.79559905699534 + }, + { + "clip_ratio/high_max": 0.030257937032729387, + "clip_ratio/high_mean": 0.007564484258182347, + "clip_ratio/low_mean": 0.015591179952025414, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02315566421020776, + "entropy": 0.0824263768736273, + "epoch": 0.00888, + "grad_norm": 0.9322162866592407, + "kl": 0.7285797223448753, + "learning_rate": 9.999691738382544e-05, + "loss": 0.034, + "step": 444, + "step_time": 10.72277228299754 + }, + { + "clip_ratio/high_max": 0.0029761905316263437, + "clip_ratio/high_mean": 0.0007440476329065859, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007440476329065859, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1574.0, + "completions/max_terminated_length": 1574.0, + "completions/mean_length": 1428.421875, + "completions/mean_terminated_length": 1428.421875, + "completions/min_length": 1166.0, + "completions/min_terminated_length": 1166.0, + "entropy": 0.07391244731843472, + "epoch": 0.0089, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5232189893722534, + "kl": 0.6245864983648062, + "learning_rate": 9.999690225465193e-05, + "loss": -0.0215, + "num_tokens": 26192780.0, + "reward": 11.33067512512207, + "reward_std": 15.117729187011719, + "rewards/rollout_reward_func/mean": 11.33067512512207, + "rewards/rollout_reward_func/std": 16.229934692382812, + "sampling/importance_sampling_ratio/max": 1.4000767469406128, + "sampling/importance_sampling_ratio/mean": 1.0228557586669922, + "sampling/importance_sampling_ratio/min": 0.8148965239524841, + "sampling/sampling_logp_difference/max": 0.303769588470459, + "sampling/sampling_logp_difference/mean": 0.003096876898780465, + "step": 445, + "step_time": 39.69306251100352 + }, + { + "clip_ratio/high_max": 0.020833333721384406, + "clip_ratio/high_mean": 0.0059523810632526875, + "clip_ratio/low_mean": 0.011408730410039425, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.017361111531499773, + "entropy": 0.07312626042403281, + "epoch": 0.00892, + "grad_norm": 0.2677549719810486, + "kl": 0.6443136036396027, + "learning_rate": 9.999688708844453e-05, + "loss": -0.0254, + "step": 446, + "step_time": 9.859747254999093 + }, + { + "clip_ratio/high_max": 0.0064484127797186375, + "clip_ratio/high_mean": 0.0023561508278362453, + "clip_ratio/low_mean": 0.0014542749268002808, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003810425754636526, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1503.0, + "completions/max_terminated_length": 1503.0, + "completions/mean_length": 1392.1875, + "completions/mean_terminated_length": 1392.1875, + "completions/min_length": 1207.0, + "completions/min_terminated_length": 1207.0, + "entropy": 0.10388755868189037, + "epoch": 0.00894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44962024688720703, + "kl": 0.5700237862765789, + "learning_rate": 9.999687188520327e-05, + "loss": -0.0085, + "num_tokens": 26333000.0, + "reward": 10.396234512329102, + "reward_std": 12.773336410522461, + "rewards/rollout_reward_func/mean": 10.396234512329102, + "rewards/rollout_reward_func/std": 13.91511058807373, + "sampling/importance_sampling_ratio/max": 1.2538983821868896, + "sampling/importance_sampling_ratio/mean": 1.0106072425842285, + "sampling/importance_sampling_ratio/min": 0.8617662787437439, + "sampling/sampling_logp_difference/max": 0.21103119850158691, + "sampling/sampling_logp_difference/mean": 0.004161643795669079, + "step": 447, + "step_time": 40.40724292899722 + }, + { + "clip_ratio/high_max": 0.017215219675563276, + "clip_ratio/high_mean": 0.005047852551797405, + "clip_ratio/low_mean": 0.011870941845700145, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01691879451391287, + "entropy": 0.10565289529040456, + "epoch": 0.00896, + "grad_norm": 0.3113742470741272, + "kl": 0.5588793251663446, + "learning_rate": 9.999685664492817e-05, + "loss": -0.011, + "step": 448, + "step_time": 9.88399511400712 + }, + { + "clip_ratio/high_max": 0.0029761905316263437, + "clip_ratio/high_mean": 0.0014880952658131719, + "clip_ratio/low_mean": 0.0029761905316263437, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004464285797439516, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1539.0, + "completions/max_terminated_length": 1539.0, + "completions/mean_length": 1425.671875, + "completions/mean_terminated_length": 1425.671875, + "completions/min_length": 1252.0, + "completions/min_terminated_length": 1252.0, + "entropy": 0.08549337997101247, + "epoch": 0.00898, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43871885538101196, + "kl": 0.5197541080415249, + "learning_rate": 9.999684136761923e-05, + "loss": 0.0424, + "num_tokens": 26475423.0, + "reward": 13.137186050415039, + "reward_std": 18.040781021118164, + "rewards/rollout_reward_func/mean": 13.137186050415039, + "rewards/rollout_reward_func/std": 18.348669052124023, + "sampling/importance_sampling_ratio/max": 2.0688071250915527, + "sampling/importance_sampling_ratio/mean": 1.0355302095413208, + "sampling/importance_sampling_ratio/min": 0.7141319513320923, + "sampling/sampling_logp_difference/max": 0.7877845764160156, + "sampling/sampling_logp_difference/mean": 0.004831024445593357, + "step": 449, + "step_time": 40.3097439919984 + }, + { + "clip_ratio/high_max": 0.02380952425301075, + "clip_ratio/high_mean": 0.00744047638727352, + "clip_ratio/low_mean": 0.010349026299081743, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01778950251173228, + "entropy": 0.09008124680258334, + "epoch": 0.009, + "grad_norm": 0.2819499969482422, + "kl": 0.48992327228188515, + "learning_rate": 9.999682605327648e-05, + "loss": 0.0377, + "step": 450, + "step_time": 11.019723427001736 + }, + { + "clip_ratio/high_max": 0.008928571594879031, + "clip_ratio/high_mean": 0.002232142898719758, + "clip_ratio/low_mean": 0.0007440476329065859, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0029761905316263437, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1561.0, + "completions/max_terminated_length": 1561.0, + "completions/mean_length": 1437.046875, + "completions/mean_terminated_length": 1437.046875, + "completions/min_length": 686.0, + "completions/min_terminated_length": 686.0, + "entropy": 0.09430601261556149, + "epoch": 0.00902, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39097362756729126, + "kl": 0.5122922882437706, + "learning_rate": 9.99968107018999e-05, + "loss": -0.0447, + "num_tokens": 26618636.0, + "reward": 10.664965629577637, + "reward_std": 12.413619995117188, + "rewards/rollout_reward_func/mean": 10.664965629577637, + "rewards/rollout_reward_func/std": 12.955881118774414, + "sampling/importance_sampling_ratio/max": 1.1989917755126953, + "sampling/importance_sampling_ratio/mean": 0.9830008745193481, + "sampling/importance_sampling_ratio/min": 0.5060357451438904, + "sampling/sampling_logp_difference/max": 0.3329579830169678, + "sampling/sampling_logp_difference/mean": 0.004485957324504852, + "step": 451, + "step_time": 39.55284725899946 + }, + { + "clip_ratio/high_max": 0.017857143189758062, + "clip_ratio/high_mean": 0.0052083334303461015, + "clip_ratio/low_mean": 0.009709821664728224, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014918155211489648, + "entropy": 0.09947029640898108, + "epoch": 0.00904, + "grad_norm": 0.2647772431373596, + "kl": 0.501507306471467, + "learning_rate": 9.999679531348955e-05, + "loss": -0.0474, + "step": 452, + "step_time": 9.83529582600022 + }, + { + "clip_ratio/high_max": 0.0029761905316263437, + "clip_ratio/high_mean": 0.0007440476329065859, + "clip_ratio/low_mean": 0.0007440476329065859, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014880952658131719, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1576.0, + "completions/max_terminated_length": 1576.0, + "completions/mean_length": 1483.796875, + "completions/mean_terminated_length": 1483.796875, + "completions/min_length": 1354.0, + "completions/min_terminated_length": 1354.0, + "entropy": 0.08577556139789522, + "epoch": 0.00906, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4616855978965759, + "kl": 0.4984573759138584, + "learning_rate": 9.999677988804543e-05, + "loss": 0.0129, + "num_tokens": 26764995.0, + "reward": 12.713988304138184, + "reward_std": 16.157230377197266, + "rewards/rollout_reward_func/mean": 12.713988304138184, + "rewards/rollout_reward_func/std": 17.417678833007812, + "sampling/importance_sampling_ratio/max": 1.2561296224594116, + "sampling/importance_sampling_ratio/mean": 1.0040102005004883, + "sampling/importance_sampling_ratio/min": 0.5851351618766785, + "sampling/sampling_logp_difference/max": 0.335345983505249, + "sampling/sampling_logp_difference/mean": 0.004758521914482117, + "step": 453, + "step_time": 42.06172357400101 + }, + { + "clip_ratio/high_max": 0.014880952658131719, + "clip_ratio/high_mean": 0.004464285797439516, + "clip_ratio/low_mean": 0.015625000465661287, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.020089286321308464, + "entropy": 0.0774516521487385, + "epoch": 0.00908, + "grad_norm": 0.13414981961250305, + "kl": 0.5282110534608364, + "learning_rate": 9.999676442556757e-05, + "loss": 0.0065, + "step": 454, + "step_time": 10.263705699999264 + }, + { + "clip_ratio/high_max": 0.018005952704697847, + "clip_ratio/high_mean": 0.005245535809081048, + "clip_ratio/low_mean": 0.002232142898719758, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007477678707800806, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1557.0, + "completions/max_terminated_length": 1557.0, + "completions/mean_length": 1432.234375, + "completions/mean_terminated_length": 1432.234375, + "completions/min_length": 777.0, + "completions/min_terminated_length": 777.0, + "entropy": 0.0869816429913044, + "epoch": 0.0091, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5139105319976807, + "kl": 0.5019301455467939, + "learning_rate": 9.999674892605595e-05, + "loss": -0.0143, + "num_tokens": 26907877.0, + "reward": 14.470987319946289, + "reward_std": 12.551952362060547, + "rewards/rollout_reward_func/mean": 14.470987319946289, + "rewards/rollout_reward_func/std": 13.231359481811523, + "sampling/importance_sampling_ratio/max": 1.4351580142974854, + "sampling/importance_sampling_ratio/mean": 0.9842495918273926, + "sampling/importance_sampling_ratio/min": 0.7047746181488037, + "sampling/sampling_logp_difference/max": 0.36011219024658203, + "sampling/sampling_logp_difference/mean": 0.005461296532303095, + "step": 455, + "step_time": 41.37734428300246 + }, + { + "clip_ratio/high_max": 0.036011905409395695, + "clip_ratio/high_mean": 0.012723214633297175, + "clip_ratio/low_mean": 0.01116071455180645, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023883929243311286, + "entropy": 0.07720525958575308, + "epoch": 0.00912, + "grad_norm": 0.3397330045700073, + "kl": 0.6114528980106115, + "learning_rate": 9.99967333895106e-05, + "loss": -0.0171, + "step": 456, + "step_time": 10.617095338997387 + }, + { + "clip_ratio/high_max": 0.009077381109818816, + "clip_ratio/high_mean": 0.002269345277454704, + "clip_ratio/low_mean": 0.0037202381645329297, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005989583441987634, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1556.0, + "completions/max_terminated_length": 1556.0, + "completions/mean_length": 1450.328125, + "completions/mean_terminated_length": 1450.328125, + "completions/min_length": 677.0, + "completions/min_terminated_length": 677.0, + "entropy": 0.08630289603024721, + "epoch": 0.00914, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.915941059589386, + "kl": 0.5304882265627384, + "learning_rate": 9.999671781593154e-05, + "loss": -0.0128, + "num_tokens": 27051977.0, + "reward": 12.1441650390625, + "reward_std": 13.508443832397461, + "rewards/rollout_reward_func/mean": 12.1441650390625, + "rewards/rollout_reward_func/std": 14.862476348876953, + "sampling/importance_sampling_ratio/max": 1.8943538665771484, + "sampling/importance_sampling_ratio/mean": 1.0383222103118896, + "sampling/importance_sampling_ratio/min": 0.6029430031776428, + "sampling/sampling_logp_difference/max": 0.5262751579284668, + "sampling/sampling_logp_difference/mean": 0.0063569676131010056, + "step": 457, + "step_time": 40.44359572299618 + }, + { + "clip_ratio/high_max": 0.026785714784637094, + "clip_ratio/high_mean": 0.00889475119765848, + "clip_ratio/low_mean": 0.015625000349245965, + "clip_ratio/low_min": 0.0029761905316263437, + "clip_ratio/region_mean": 0.024519751546904445, + "entropy": 0.07588907447643578, + "epoch": 0.00916, + "grad_norm": 0.3780209422111511, + "kl": 0.5850545484572649, + "learning_rate": 9.999670220531878e-05, + "loss": -0.0142, + "step": 458, + "step_time": 10.81988593099959 + }, + { + "clip_ratio/high_max": 0.0059523810632526875, + "clip_ratio/high_mean": 0.0014880952658131719, + "clip_ratio/low_mean": 0.0007812500116415322, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002269345277454704, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1548.0, + "completions/max_terminated_length": 1548.0, + "completions/mean_length": 1446.8125, + "completions/mean_terminated_length": 1446.8125, + "completions/min_length": 708.0, + "completions/min_terminated_length": 708.0, + "entropy": 0.06712023681029677, + "epoch": 0.00918, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5037431716918945, + "kl": 0.5365529656410217, + "learning_rate": 9.999668655767235e-05, + "loss": -0.0142, + "num_tokens": 27195924.0, + "reward": 12.623528480529785, + "reward_std": 16.375185012817383, + "rewards/rollout_reward_func/mean": 12.623528480529785, + "rewards/rollout_reward_func/std": 17.157840728759766, + "sampling/importance_sampling_ratio/max": 1.4218283891677856, + "sampling/importance_sampling_ratio/mean": 1.0120244026184082, + "sampling/importance_sampling_ratio/min": 0.7264562249183655, + "sampling/sampling_logp_difference/max": 0.36830270290374756, + "sampling/sampling_logp_difference/mean": 0.003537567099556327, + "step": 459, + "step_time": 39.789260978999664 + }, + { + "clip_ratio/high_max": 0.020833333721384406, + "clip_ratio/high_mean": 0.0052083334303461015, + "clip_ratio/low_mean": 0.010230655025225133, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015438988397363573, + "entropy": 0.061492747627198696, + "epoch": 0.0092, + "grad_norm": 0.27846819162368774, + "kl": 0.6263625603169203, + "learning_rate": 9.999667087299225e-05, + "loss": -0.0179, + "step": 460, + "step_time": 10.157873148000363 + }, + { + "clip_ratio/high_max": 0.011904762126505375, + "clip_ratio/high_mean": 0.0037202381645329297, + "clip_ratio/low_mean": 0.0007440476329065859, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004464285797439516, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1553.0, + "completions/max_terminated_length": 1553.0, + "completions/mean_length": 1393.921875, + "completions/mean_terminated_length": 1393.921875, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "entropy": 0.06535098806489259, + "epoch": 0.00922, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6925691366195679, + "kl": 0.5965993329882622, + "learning_rate": 9.99966551512785e-05, + "loss": -0.0133, + "num_tokens": 27336352.0, + "reward": 8.150674819946289, + "reward_std": 15.653514862060547, + "rewards/rollout_reward_func/mean": 8.150674819946289, + "rewards/rollout_reward_func/std": 16.096240997314453, + "sampling/importance_sampling_ratio/max": 1.3934406042099, + "sampling/importance_sampling_ratio/mean": 0.9711774587631226, + "sampling/importance_sampling_ratio/min": 0.3346167504787445, + "sampling/sampling_logp_difference/max": 1.0496406555175781, + "sampling/sampling_logp_difference/mean": 0.005856034811586142, + "step": 461, + "step_time": 41.696121679995485 + }, + { + "clip_ratio/high_max": 0.023958333767950535, + "clip_ratio/high_mean": 0.00673363107489422, + "clip_ratio/low_mean": 0.015252976503688842, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.021986607811413705, + "entropy": 0.0650356519035995, + "epoch": 0.00924, + "grad_norm": 1.2314876317977905, + "kl": 1.7299257963895798, + "learning_rate": 9.999663939253112e-05, + "loss": -0.0022, + "step": 462, + "step_time": 10.117755536000914 + }, + { + "clip_ratio/high_max": 0.009077381109818816, + "clip_ratio/high_mean": 0.002269345277454704, + "clip_ratio/low_mean": 0.0014880952658131719, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003757440543267876, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1543.0, + "completions/max_terminated_length": 1543.0, + "completions/mean_length": 1386.734375, + "completions/mean_terminated_length": 1386.734375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "entropy": 0.08255739836022258, + "epoch": 0.00926, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4534408450126648, + "kl": 0.5308241080492735, + "learning_rate": 9.999662359675012e-05, + "loss": -0.0123, + "num_tokens": 27476234.0, + "reward": 11.247259140014648, + "reward_std": 14.853042602539062, + "rewards/rollout_reward_func/mean": 11.247259140014648, + "rewards/rollout_reward_func/std": 14.736608505249023, + "sampling/importance_sampling_ratio/max": 1.3197416067123413, + "sampling/importance_sampling_ratio/mean": 0.9946113228797913, + "sampling/importance_sampling_ratio/min": 0.7106093764305115, + "sampling/sampling_logp_difference/max": 0.3446381092071533, + "sampling/sampling_logp_difference/mean": 0.00609009200707078, + "step": 463, + "step_time": 40.418589189997874 + }, + { + "clip_ratio/high_max": 0.026934524532407522, + "clip_ratio/high_mean": 0.009709821664728224, + "clip_ratio/low_mean": 0.01640625053551048, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.026116072200238705, + "entropy": 0.09197275433689356, + "epoch": 0.00928, + "grad_norm": 0.46926549077033997, + "kl": 0.5417319964617491, + "learning_rate": 9.999660776393552e-05, + "loss": -0.0111, + "step": 464, + "step_time": 10.364104637999844 + }, + { + "clip_ratio/high_max": 0.0059523810632526875, + "clip_ratio/high_mean": 0.0014880952658131719, + "clip_ratio/low_mean": 0.0014880952658131719, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0029761905316263437, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1538.0, + "completions/max_terminated_length": 1538.0, + "completions/mean_length": 1408.109375, + "completions/mean_terminated_length": 1408.109375, + "completions/min_length": 788.0, + "completions/min_terminated_length": 788.0, + "entropy": 0.0891355937346816, + "epoch": 0.0093, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9044552445411682, + "kl": 0.6998987477272749, + "learning_rate": 9.999659189408731e-05, + "loss": -0.0085, + "num_tokens": 27617505.0, + "reward": 13.215154647827148, + "reward_std": 11.782221794128418, + "rewards/rollout_reward_func/mean": 13.215155601501465, + "rewards/rollout_reward_func/std": 12.105838775634766, + "sampling/importance_sampling_ratio/max": 1.657700777053833, + "sampling/importance_sampling_ratio/mean": 1.0080327987670898, + "sampling/importance_sampling_ratio/min": 0.5086445808410645, + "sampling/sampling_logp_difference/max": 0.6424019932746887, + "sampling/sampling_logp_difference/mean": 0.005240763537585735, + "step": 465, + "step_time": 39.89362095000433 + } + ], + "logging_steps": 1.0, + "max_steps": 100000, + "num_input_tokens_seen": 27617505, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}