| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.11428571428571428, |
| "eval_steps": 500, |
| "global_step": 100, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2700.4271850585938, |
| "cov_mean": -6.0587970438064076e-05, |
| "cov_std": 0.35307812318205833, |
| "entropy": 0.36474609375, |
| "epoch": 0.001142857142857143, |
| "grad_norm": 0.45541471242904663, |
| "kl": 0.0, |
| "learning_rate": 1e-07, |
| "loss": -0.0382, |
| "reward": 0.7604166893288493, |
| "reward_std": 0.4268697127699852, |
| "rewards/accuracy_reward": 0.25000001303851604, |
| "rewards/format_reward": 0.5104166669771075, |
| "step": 1, |
| "w_high_ratio": 0.2200421690940857, |
| "w_low_ratio": 0.03663695091381669, |
| "w_max": 2.1593789756298065, |
| "w_mean": 1.4711343348026276, |
| "w_min": 6.525355682266089e-35, |
| "w_std": 0.2659660503268242 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3127.3958435058594, |
| "cov_mean": -2.155053698515985e-05, |
| "cov_std": 0.310540571808815, |
| "entropy": 0.353515625, |
| "epoch": 0.002285714285714286, |
| "grad_norm": 0.5143813490867615, |
| "kl": 0.0, |
| "learning_rate": 2e-07, |
| "loss": 0.0049, |
| "reward": 0.6458333637565374, |
| "reward_std": 0.4249730706214905, |
| "rewards/accuracy_reward": 0.2812500102445483, |
| "rewards/format_reward": 0.3645833386108279, |
| "step": 2, |
| "w_high_ratio": 0.05183619633316994, |
| "w_low_ratio": 0.036958135198801756, |
| "w_max": 1.8325217366218567, |
| "w_mean": 1.2113382518291473, |
| "w_min": 0.0, |
| "w_std": 0.20957503467798233 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3791.375, |
| "cov_mean": -3.563215068425052e-05, |
| "cov_std": 0.28256653994321823, |
| "entropy": 0.4658203125, |
| "epoch": 0.0034285714285714284, |
| "grad_norm": 0.2701888084411621, |
| "kl": 4.756450653076172e-05, |
| "learning_rate": 3e-07, |
| "loss": 0.0344, |
| "reward": 0.16666667349636555, |
| "reward_std": 0.3025414012372494, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/format_reward": 0.14583333395421505, |
| "step": 3, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.03658500872552395, |
| "w_max": 1.348844289779663, |
| "w_mean": 1.0439709424972534, |
| "w_min": 0.0, |
| "w_std": 0.15747325122356415 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2446.1250610351562, |
| "cov_mean": -2.657165350683499e-05, |
| "cov_std": 0.47042107582092285, |
| "entropy": 0.4052734375, |
| "epoch": 0.004571428571428572, |
| "grad_norm": 0.6580816507339478, |
| "kl": 2.866983413696289e-05, |
| "learning_rate": 4e-07, |
| "loss": -0.0116, |
| "reward": 0.8541666865348816, |
| "reward_std": 0.5623367577791214, |
| "rewards/accuracy_reward": 0.19791667070239782, |
| "rewards/format_reward": 0.6562500074505806, |
| "step": 4, |
| "w_high_ratio": 0.2048901468515396, |
| "w_low_ratio": 0.04687658231705427, |
| "w_max": 2.3084834814071655, |
| "w_mean": 1.5087227523326874, |
| "w_min": 3.5522916070634113e-43, |
| "w_std": 0.31092390790581703 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3562.4166870117188, |
| "cov_mean": 3.653238718470675e-05, |
| "cov_std": 0.539387047290802, |
| "entropy": 0.45458984375, |
| "epoch": 0.005714285714285714, |
| "grad_norm": 0.30956918001174927, |
| "kl": 3.820657730102539e-05, |
| "learning_rate": 5e-07, |
| "loss": 0.0053, |
| "reward": 0.4479166939854622, |
| "reward_std": 0.5839087814092636, |
| "rewards/accuracy_reward": 0.08333333861082792, |
| "rewards/format_reward": 0.3645833507180214, |
| "step": 5, |
| "w_high_ratio": 0.009932879358530045, |
| "w_low_ratio": 0.061708422377705574, |
| "w_max": 1.4947779774665833, |
| "w_mean": 1.13177028298378, |
| "w_min": 0.0, |
| "w_std": 0.2904536984860897 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3346.166748046875, |
| "cov_mean": 2.951182568722288e-05, |
| "cov_std": 0.4695303291082382, |
| "entropy": 0.474609375, |
| "epoch": 0.006857142857142857, |
| "grad_norm": 0.4116966426372528, |
| "kl": 4.678964614868164e-05, |
| "learning_rate": 6e-07, |
| "loss": 0.0655, |
| "reward": 0.40625001303851604, |
| "reward_std": 0.5175340622663498, |
| "rewards/accuracy_reward": 0.09375000186264515, |
| "rewards/format_reward": 0.31250001303851604, |
| "step": 6, |
| "w_high_ratio": 0.09942464530467987, |
| "w_low_ratio": 0.05820021778345108, |
| "w_max": 2.0522369146347046, |
| "w_mean": 1.2698509693145752, |
| "w_min": 6.311469302795941e-40, |
| "w_std": 0.3068386148661375 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3291.197998046875, |
| "cov_mean": -2.080401827697642e-06, |
| "cov_std": 0.5660274773836136, |
| "entropy": 0.38671875, |
| "epoch": 0.008, |
| "grad_norm": 0.4001730680465698, |
| "kl": 2.8431415557861328e-05, |
| "learning_rate": 7e-07, |
| "loss": -0.0874, |
| "reward": 0.9687500298023224, |
| "reward_std": 0.639276884496212, |
| "rewards/accuracy_reward": 0.2812500074505806, |
| "rewards/format_reward": 0.6875000149011612, |
| "step": 7, |
| "w_high_ratio": 0.04338983818888664, |
| "w_low_ratio": 0.05278784967958927, |
| "w_max": 1.6053467988967896, |
| "w_mean": 1.2385202646255493, |
| "w_min": 0.0, |
| "w_std": 0.2744893953204155 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2852.4896240234375, |
| "cov_mean": -2.6475029699213337e-05, |
| "cov_std": 0.24741052091121674, |
| "entropy": 0.349365234375, |
| "epoch": 0.009142857142857144, |
| "grad_norm": 0.28081798553466797, |
| "kl": 2.35140323638916e-05, |
| "learning_rate": 8e-07, |
| "loss": -0.0024, |
| "reward": 0.8750000111758709, |
| "reward_std": 0.3533418998122215, |
| "rewards/accuracy_reward": 0.3854166679084301, |
| "rewards/format_reward": 0.48958334885537624, |
| "step": 8, |
| "w_high_ratio": 0.0625, |
| "w_low_ratio": 0.026329820044338703, |
| "w_max": 1.744232177734375, |
| "w_mean": 1.2852342873811722, |
| "w_min": 0.25, |
| "w_std": 0.13892405480146408 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3371.2708740234375, |
| "cov_mean": -2.0414277514646528e-05, |
| "cov_std": 0.3537435829639435, |
| "entropy": 0.4619140625, |
| "epoch": 0.010285714285714285, |
| "grad_norm": 0.3546924591064453, |
| "kl": 3.835558891296387e-05, |
| "learning_rate": 9e-07, |
| "loss": -0.0341, |
| "reward": 0.3958333432674408, |
| "reward_std": 0.4515319801867008, |
| "rewards/accuracy_reward": 0.09375000186264515, |
| "rewards/format_reward": 0.3020833358168602, |
| "step": 9, |
| "w_high_ratio": 0.07049691677093506, |
| "w_low_ratio": 0.03988973796367645, |
| "w_max": 1.8283900916576385, |
| "w_mean": 1.2170793116092682, |
| "w_min": 9.553366044251431e-29, |
| "w_std": 0.2274811826646328 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2993.8438110351562, |
| "cov_mean": -3.597586055548163e-05, |
| "cov_std": 0.3790554851293564, |
| "entropy": 0.349609375, |
| "epoch": 0.011428571428571429, |
| "grad_norm": 0.4014468193054199, |
| "kl": 3.153085708618164e-05, |
| "learning_rate": 1e-06, |
| "loss": 0.111, |
| "reward": 0.572916679084301, |
| "reward_std": 0.5256113260984421, |
| "rewards/accuracy_reward": 0.15625000651925802, |
| "rewards/format_reward": 0.416666679084301, |
| "step": 10, |
| "w_high_ratio": 0.17509328201413155, |
| "w_low_ratio": 0.04464914742857218, |
| "w_max": 2.3221429884433746, |
| "w_mean": 1.403680145740509, |
| "w_min": 0.0, |
| "w_std": 0.2820280008018017 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3645.3126220703125, |
| "cov_mean": 1.727970106912835e-05, |
| "cov_std": 0.314908966422081, |
| "entropy": 0.3701171875, |
| "epoch": 0.012571428571428572, |
| "grad_norm": 0.40044310688972473, |
| "kl": 3.1620264053344727e-05, |
| "learning_rate": 9.997258721585931e-07, |
| "loss": 0.0585, |
| "reward": 0.25000001303851604, |
| "reward_std": 0.4806990921497345, |
| "rewards/accuracy_reward": 0.09375000279396772, |
| "rewards/format_reward": 0.15625000279396772, |
| "step": 11, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.04341120272874832, |
| "w_max": 1.4488586485385895, |
| "w_mean": 1.097432792186737, |
| "w_min": 4.6695499555262094e-38, |
| "w_std": 0.2005491964519024 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2548.0729370117188, |
| "cov_mean": -8.678332051204052e-05, |
| "cov_std": 0.3924334645271301, |
| "entropy": 0.3896484375, |
| "epoch": 0.013714285714285714, |
| "grad_norm": 0.40460628271102905, |
| "kl": 3.9458274841308594e-05, |
| "learning_rate": 9.989038226169207e-07, |
| "loss": 0.0329, |
| "reward": 0.9479166939854622, |
| "reward_std": 0.4162924438714981, |
| "rewards/accuracy_reward": 0.23958334140479565, |
| "rewards/format_reward": 0.708333358168602, |
| "step": 12, |
| "w_high_ratio": 0.14796987175941467, |
| "w_low_ratio": 0.038647969253361225, |
| "w_max": 2.0233654975891113, |
| "w_mean": 1.479979693889618, |
| "w_min": 0.0, |
| "w_std": 0.2828930839896202 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3180.9896850585938, |
| "cov_mean": -3.604557650760398e-05, |
| "cov_std": 0.29976917430758476, |
| "entropy": 0.39111328125, |
| "epoch": 0.014857142857142857, |
| "grad_norm": 0.42143014073371887, |
| "kl": 2.7954578399658203e-05, |
| "learning_rate": 9.975348529157229e-07, |
| "loss": 0.0007, |
| "reward": 0.5937500298023224, |
| "reward_std": 0.39751993864774704, |
| "rewards/accuracy_reward": 0.18750000558793545, |
| "rewards/format_reward": 0.4062500074505806, |
| "step": 13, |
| "w_high_ratio": 0.13092797622084618, |
| "w_low_ratio": 0.038139537908136845, |
| "w_max": 1.9087003767490387, |
| "w_mean": 1.2740049362182617, |
| "w_min": 0.0, |
| "w_std": 0.20897787064313889 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3071.760498046875, |
| "cov_mean": -1.3722287803830113e-06, |
| "cov_std": 0.3737764284014702, |
| "entropy": 0.37744140625, |
| "epoch": 0.016, |
| "grad_norm": 0.5302906632423401, |
| "kl": 2.9087066650390625e-05, |
| "learning_rate": 9.956206309337066e-07, |
| "loss": 0.0182, |
| "reward": 0.541666679084301, |
| "reward_std": 0.4254928454756737, |
| "rewards/accuracy_reward": 0.15625000279396772, |
| "rewards/format_reward": 0.3854166679084301, |
| "step": 14, |
| "w_high_ratio": 0.13122042268514633, |
| "w_low_ratio": 0.04646214470267296, |
| "w_max": 2.057934284210205, |
| "w_mean": 1.2967519462108612, |
| "w_min": 6.977258874336181e-23, |
| "w_std": 0.279865525662899 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2915.2396240234375, |
| "cov_mean": -2.9782687306578737e-05, |
| "cov_std": 0.3060881793498993, |
| "entropy": 0.3681640625, |
| "epoch": 0.017142857142857144, |
| "grad_norm": 0.471722811460495, |
| "kl": 3.0308961868286133e-05, |
| "learning_rate": 9.931634888554935e-07, |
| "loss": 0.0075, |
| "reward": 0.6145833432674408, |
| "reward_std": 0.3603988029062748, |
| "rewards/accuracy_reward": 0.19791666977107525, |
| "rewards/format_reward": 0.4166666679084301, |
| "step": 15, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.03737350553274155, |
| "w_max": 1.5459995865821838, |
| "w_mean": 1.177234023809433, |
| "w_min": 0.0, |
| "w_std": 0.20952722802758217 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3857.7084350585938, |
| "cov_mean": -3.300000025774352e-05, |
| "cov_std": 0.18836934491991997, |
| "entropy": 0.45751953125, |
| "epoch": 0.018285714285714287, |
| "grad_norm": 0.31818071007728577, |
| "kl": 3.904104232788086e-05, |
| "learning_rate": 9.901664203302124e-07, |
| "loss": 0.0556, |
| "reward": 0.13541666977107525, |
| "reward_std": 0.249445378780365, |
| "rewards/accuracy_reward": 0.052083334885537624, |
| "rewards/format_reward": 0.0833333358168602, |
| "step": 16, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.02626894786953926, |
| "w_max": 1.197378009557724, |
| "w_mean": 1.0219765603542328, |
| "w_min": 0.25, |
| "w_std": 0.10555266216397285 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2503.479232788086, |
| "cov_mean": 5.234984382695984e-05, |
| "cov_std": 0.34602249413728714, |
| "entropy": 0.43408203125, |
| "epoch": 0.019428571428571427, |
| "grad_norm": 0.43525052070617676, |
| "kl": 5.2034854888916016e-05, |
| "learning_rate": 9.866330768241983e-07, |
| "loss": 0.0179, |
| "reward": 0.7604167014360428, |
| "reward_std": 0.4241996556520462, |
| "rewards/accuracy_reward": 0.1770833432674408, |
| "rewards/format_reward": 0.5833333432674408, |
| "step": 17, |
| "w_high_ratio": 0.13831434771418571, |
| "w_low_ratio": 0.039523204788565636, |
| "w_max": 2.07596218585968, |
| "w_mean": 1.3631863296031952, |
| "w_min": 0.25, |
| "w_std": 0.22783420607447624 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3050.8438110351562, |
| "cov_mean": -8.538075144315371e-05, |
| "cov_std": 0.34513213485479355, |
| "entropy": 0.36328125, |
| "epoch": 0.02057142857142857, |
| "grad_norm": 0.3079332113265991, |
| "kl": 5.075335502624512e-05, |
| "learning_rate": 9.825677631722435e-07, |
| "loss": 0.005, |
| "reward": 0.5833333432674408, |
| "reward_std": 0.4453107975423336, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/format_reward": 0.4583333469927311, |
| "step": 18, |
| "w_high_ratio": 0.0583355538547039, |
| "w_low_ratio": 0.038819507928565145, |
| "w_max": 1.7474263310432434, |
| "w_mean": 1.2030333578586578, |
| "w_min": 1.0509738482436128e-45, |
| "w_std": 0.20471886917948723 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3198.7500610351562, |
| "cov_mean": 6.975242376938695e-05, |
| "cov_std": 0.4591265842318535, |
| "entropy": 0.3994140625, |
| "epoch": 0.021714285714285714, |
| "grad_norm": 0.44368991255760193, |
| "kl": 5.0961971282958984e-05, |
| "learning_rate": 9.779754323328192e-07, |
| "loss": -0.0137, |
| "reward": 0.8541666865348816, |
| "reward_std": 0.6421672403812408, |
| "rewards/accuracy_reward": 0.3750000074505806, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 19, |
| "w_high_ratio": 0.07967927679419518, |
| "w_low_ratio": 0.055658016353845596, |
| "w_max": 1.7943021953105927, |
| "w_mean": 1.2339626252651215, |
| "w_min": 0.0, |
| "w_std": 0.28908008337020874 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2531.0208740234375, |
| "cov_mean": 3.258255492255557e-05, |
| "cov_std": 0.3959212973713875, |
| "entropy": 0.3134765625, |
| "epoch": 0.022857142857142857, |
| "grad_norm": 0.42710381746292114, |
| "kl": 7.474422454833984e-05, |
| "learning_rate": 9.728616793536587e-07, |
| "loss": -0.0142, |
| "reward": 0.9479166865348816, |
| "reward_std": 0.4626789018511772, |
| "rewards/accuracy_reward": 0.260416679084301, |
| "rewards/format_reward": 0.6875000149011612, |
| "step": 20, |
| "w_high_ratio": 0.20212292298674583, |
| "w_low_ratio": 0.040643465239554644, |
| "w_max": 2.211318254470825, |
| "w_mean": 1.4521130919456482, |
| "w_min": 0.0, |
| "w_std": 0.25691715627908707 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2890.541778564453, |
| "cov_mean": 1.6164100088644773e-05, |
| "cov_std": 0.2818361334502697, |
| "entropy": 0.421875, |
| "epoch": 0.024, |
| "grad_norm": 0.8042517900466919, |
| "kl": 0.0001793205738067627, |
| "learning_rate": 9.672327345550543e-07, |
| "loss": 0.073, |
| "reward": 0.697916692122817, |
| "reward_std": 0.3805258348584175, |
| "rewards/accuracy_reward": 0.22916667070239782, |
| "rewards/format_reward": 0.4687500102445483, |
| "step": 21, |
| "w_high_ratio": 0.1653403341770172, |
| "w_low_ratio": 0.034426179714500904, |
| "w_max": 2.0890542566776276, |
| "w_mean": 1.4488303065299988, |
| "w_min": 0.25, |
| "w_std": 0.26479026675224304 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2023.8750610351562, |
| "cov_mean": -3.984599959494517e-05, |
| "cov_std": 0.3324627988040447, |
| "entropy": 0.41162109375, |
| "epoch": 0.025142857142857144, |
| "grad_norm": 0.43941012024879456, |
| "kl": 0.0001862645149230957, |
| "learning_rate": 9.610954559391704e-07, |
| "loss": 0.0018, |
| "reward": 1.0416666865348816, |
| "reward_std": 0.32273583114147186, |
| "rewards/accuracy_reward": 0.2708333386108279, |
| "rewards/format_reward": 0.7708333730697632, |
| "step": 22, |
| "w_high_ratio": 0.16147570684552193, |
| "w_low_ratio": 0.026547667337581515, |
| "w_max": 2.171365201473236, |
| "w_mean": 1.5571591556072235, |
| "w_min": 2.1019476964872256e-45, |
| "w_std": 0.16451371088624 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2749.6146850585938, |
| "cov_mean": -4.101629156139097e-05, |
| "cov_std": 0.25967343151569366, |
| "entropy": 0.37939453125, |
| "epoch": 0.026285714285714287, |
| "grad_norm": 0.37253376841545105, |
| "kl": 0.00014030933380126953, |
| "learning_rate": 9.54457320834625e-07, |
| "loss": -0.0678, |
| "reward": 0.5625000223517418, |
| "reward_std": 0.3055335730314255, |
| "rewards/accuracy_reward": 0.11458333861082792, |
| "rewards/format_reward": 0.4479166679084301, |
| "step": 23, |
| "w_high_ratio": 0.14202075079083443, |
| "w_low_ratio": 0.028185136150568724, |
| "w_max": 2.1473127901554108, |
| "w_mean": 1.35337632894516, |
| "w_min": 1.3966908548442706e-38, |
| "w_std": 0.21078352630138397 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3073.7813110351562, |
| "cov_mean": -2.623773912091565e-05, |
| "cov_std": 0.5731624215841293, |
| "entropy": 0.365234375, |
| "epoch": 0.027428571428571427, |
| "grad_norm": 0.6742011904716492, |
| "kl": 0.00015079975128173828, |
| "learning_rate": 9.473264167865171e-07, |
| "loss": -0.0491, |
| "reward": 0.7916666939854622, |
| "reward_std": 0.7121171355247498, |
| "rewards/accuracy_reward": 0.2812500074505806, |
| "rewards/format_reward": 0.510416679084301, |
| "step": 24, |
| "w_high_ratio": 0.15864675119519234, |
| "w_low_ratio": 0.057481554336845875, |
| "w_max": 2.1011292338371277, |
| "w_mean": 1.4009218215942383, |
| "w_min": 0.0, |
| "w_std": 0.3654456064105034 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3008.9271850585938, |
| "cov_mean": -5.06913784192875e-05, |
| "cov_std": 0.360674187541008, |
| "entropy": 0.43896484375, |
| "epoch": 0.02857142857142857, |
| "grad_norm": 0.6551496982574463, |
| "kl": 0.00020998716354370117, |
| "learning_rate": 9.397114317029974e-07, |
| "loss": 0.0466, |
| "reward": 0.5208333432674408, |
| "reward_std": 0.4276355504989624, |
| "rewards/accuracy_reward": 0.1354166679084301, |
| "rewards/format_reward": 0.3854166865348816, |
| "step": 25, |
| "w_high_ratio": 0.10193426162004471, |
| "w_low_ratio": 0.04761309362947941, |
| "w_max": 2.144305258989334, |
| "w_mean": 1.3412529230117798, |
| "w_min": 0.25, |
| "w_std": 0.2752615138888359 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3288.4896240234375, |
| "cov_mean": 1.0494537946215132e-05, |
| "cov_std": 0.2694205194711685, |
| "entropy": 0.423828125, |
| "epoch": 0.029714285714285714, |
| "grad_norm": 0.2771300673484802, |
| "kl": 4.094839096069336e-05, |
| "learning_rate": 9.316216432703916e-07, |
| "loss": -0.0308, |
| "reward": 0.6875000298023224, |
| "reward_std": 0.3060605004429817, |
| "rewards/accuracy_reward": 0.2395833358168602, |
| "rewards/format_reward": 0.447916679084301, |
| "step": 26, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.026628307532519102, |
| "w_max": 1.6005243062973022, |
| "w_mean": 1.1837812960147858, |
| "w_min": 0.0, |
| "w_std": 0.15812482312321663 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3304.0625610351562, |
| "cov_mean": 2.6395198347017867e-05, |
| "cov_std": 0.47998297959566116, |
| "entropy": 0.43408203125, |
| "epoch": 0.030857142857142857, |
| "grad_norm": 0.3476252257823944, |
| "kl": 0.0001595616340637207, |
| "learning_rate": 9.230669076497687e-07, |
| "loss": -0.0198, |
| "reward": 0.614583358168602, |
| "reward_std": 0.5712436102330685, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/format_reward": 0.4479166716337204, |
| "step": 27, |
| "w_high_ratio": 0.12017197906970978, |
| "w_low_ratio": 0.05561595968902111, |
| "w_max": 1.8565902709960938, |
| "w_mean": 1.2779352962970734, |
| "w_min": 0.0, |
| "w_std": 0.2663590759038925 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3088.8854370117188, |
| "cov_mean": -3.223641306249192e-05, |
| "cov_std": 0.42785073816776276, |
| "entropy": 0.4013671875, |
| "epoch": 0.032, |
| "grad_norm": 0.33925580978393555, |
| "kl": 0.00015425682067871094, |
| "learning_rate": 9.140576474687263e-07, |
| "loss": 0.0176, |
| "reward": 0.739583358168602, |
| "reward_std": 0.5647517889738083, |
| "rewards/accuracy_reward": 0.29166667722165585, |
| "rewards/format_reward": 0.4479166865348816, |
| "step": 28, |
| "w_high_ratio": 0.049459055066108704, |
| "w_low_ratio": 0.046367804519832134, |
| "w_max": 1.811535805463791, |
| "w_mean": 1.2396393418312073, |
| "w_min": 0.0, |
| "w_std": 0.264127716422081 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3654.9375610351562, |
| "cov_mean": -5.409401546785375e-05, |
| "cov_std": 0.37247660756111145, |
| "entropy": 0.4443359375, |
| "epoch": 0.03314285714285714, |
| "grad_norm": 0.3806890547275543, |
| "kl": 0.00038933753967285156, |
| "learning_rate": 9.046048391230247e-07, |
| "loss": 0.0781, |
| "reward": 0.28125, |
| "reward_std": 0.4390515610575676, |
| "rewards/accuracy_reward": 0.09375000279396772, |
| "rewards/format_reward": 0.18750000558793545, |
| "step": 29, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.04866650328040123, |
| "w_max": 1.3806794583797455, |
| "w_mean": 1.0922435522079468, |
| "w_min": 0.25, |
| "w_std": 0.21911596134305 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3306.5938110351562, |
| "cov_mean": -2.3586735551361926e-05, |
| "cov_std": 0.4439524784684181, |
| "entropy": 0.37841796875, |
| "epoch": 0.03428571428571429, |
| "grad_norm": 0.49297624826431274, |
| "kl": 0.0008985996246337891, |
| "learning_rate": 8.9471999940354e-07, |
| "loss": -0.0256, |
| "reward": 0.6458333544433117, |
| "reward_std": 0.5611635595560074, |
| "rewards/accuracy_reward": 0.2187500037252903, |
| "rewards/format_reward": 0.4270833395421505, |
| "step": 30, |
| "w_high_ratio": 0.06216667778789997, |
| "w_low_ratio": 0.05258181784301996, |
| "w_max": 1.9480324983596802, |
| "w_mean": 1.2739951610565186, |
| "w_min": 4.3700652660890127e-35, |
| "w_std": 0.28470994904637337 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3249.072998046875, |
| "cov_mean": 7.066663692967268e-05, |
| "cov_std": 0.31172432936728, |
| "entropy": 0.38232421875, |
| "epoch": 0.03542857142857143, |
| "grad_norm": 0.41590920090675354, |
| "kl": 0.0011307001113891602, |
| "learning_rate": 8.844151714648274e-07, |
| "loss": -0.009, |
| "reward": 0.541666679084301, |
| "reward_std": 0.48244282230734825, |
| "rewards/accuracy_reward": 0.19791667442768812, |
| "rewards/format_reward": 0.34375000558793545, |
| "step": 31, |
| "w_high_ratio": 0.09930127486586571, |
| "w_low_ratio": 0.04017635714262724, |
| "w_max": 2.09058153629303, |
| "w_mean": 1.2682196497917175, |
| "w_min": 0.0, |
| "w_std": 0.2415708377957344 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3569.0626220703125, |
| "cov_mean": -2.6002062440966256e-06, |
| "cov_std": 0.3952501490712166, |
| "entropy": 0.43212890625, |
| "epoch": 0.036571428571428574, |
| "grad_norm": 1.6052252054214478, |
| "kl": 0.03867650032043457, |
| "learning_rate": 8.737029101523929e-07, |
| "loss": -0.0005, |
| "reward": 0.6979166865348816, |
| "reward_std": 0.5698855072259903, |
| "rewards/accuracy_reward": 0.3333333395421505, |
| "rewards/format_reward": 0.3645833432674408, |
| "step": 32, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.04428828274831176, |
| "w_max": 1.4899851083755493, |
| "w_mean": 1.1257199943065643, |
| "w_min": 0.0, |
| "w_std": 0.2084966115653515 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3602.0313110351562, |
| "cov_mean": 3.6438897495827405e-05, |
| "cov_std": 0.35359735041856766, |
| "entropy": 0.36865234375, |
| "epoch": 0.037714285714285714, |
| "grad_norm": 0.4356963038444519, |
| "kl": 0.006441354751586914, |
| "learning_rate": 8.625962667065487e-07, |
| "loss": 0.0063, |
| "reward": 0.6041666977107525, |
| "reward_std": 0.5775354653596878, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/format_reward": 0.35416667722165585, |
| "step": 33, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.04679631860926747, |
| "w_max": 1.4279894530773163, |
| "w_mean": 1.114120066165924, |
| "w_min": 1.083308810307908e-39, |
| "w_std": 0.1986728459596634 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2617.0834045410156, |
| "cov_mean": 6.101907820266206e-05, |
| "cov_std": 0.4060870446264744, |
| "entropy": 0.42529296875, |
| "epoch": 0.038857142857142854, |
| "grad_norm": 0.6856685876846313, |
| "kl": 0.0007038116455078125, |
| "learning_rate": 8.511087728614862e-07, |
| "loss": -0.0313, |
| "reward": 0.9583333432674408, |
| "reward_std": 0.4463158957660198, |
| "rewards/accuracy_reward": 0.40625001303851604, |
| "rewards/format_reward": 0.5520833535119891, |
| "step": 34, |
| "w_high_ratio": 0.24574057757854462, |
| "w_low_ratio": 0.03856370970606804, |
| "w_max": 2.4257175028324127, |
| "w_mean": 1.564720779657364, |
| "w_min": 9.308517412847608e-40, |
| "w_std": 0.33169806748628616 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3259.072998046875, |
| "cov_mean": 6.144623966974905e-06, |
| "cov_std": 0.4504075199365616, |
| "entropy": 0.41455078125, |
| "epoch": 0.04, |
| "grad_norm": 0.41887158155441284, |
| "kl": 0.0009140968322753906, |
| "learning_rate": 8.392544243589427e-07, |
| "loss": 0.0587, |
| "reward": 0.5312500111758709, |
| "reward_std": 0.6110180467367172, |
| "rewards/accuracy_reward": 0.1875000037252903, |
| "rewards/format_reward": 0.34375001303851604, |
| "step": 35, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.055781897623091936, |
| "w_max": 1.4624531269073486, |
| "w_mean": 1.1014132499694824, |
| "w_min": 5.8279178943564365e-36, |
| "w_std": 0.2577386908233166 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3602.3438110351562, |
| "cov_mean": 2.4346391001017764e-05, |
| "cov_std": 0.21211356669664383, |
| "entropy": 0.49267578125, |
| "epoch": 0.04114285714285714, |
| "grad_norm": 0.5710666179656982, |
| "kl": 0.0008481144905090332, |
| "learning_rate": 8.270476638965461e-07, |
| "loss": -0.0327, |
| "reward": 0.19791667442768812, |
| "reward_std": 0.16615793853998184, |
| "rewards/accuracy_reward": 0.010416666977107525, |
| "rewards/format_reward": 0.18750001024454832, |
| "step": 36, |
| "w_high_ratio": 0.04688615724444389, |
| "w_low_ratio": 0.019336079712957144, |
| "w_max": 1.5947359800338745, |
| "w_mean": 1.155133068561554, |
| "w_min": 0.25, |
| "w_std": 0.13783840090036392 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3618.604248046875, |
| "cov_mean": 3.312654916953761e-05, |
| "cov_std": 0.2396765574812889, |
| "entropy": 0.41943359375, |
| "epoch": 0.04228571428571429, |
| "grad_norm": 0.35426977276802063, |
| "kl": 0.0006122589111328125, |
| "learning_rate": 8.145033635316128e-07, |
| "loss": -0.0109, |
| "reward": 0.19791667256504297, |
| "reward_std": 0.22218847274780273, |
| "rewards/accuracy_reward": 0.010416666977107525, |
| "rewards/format_reward": 0.18750000558793545, |
| "step": 37, |
| "w_high_ratio": 0.004357387777417898, |
| "w_low_ratio": 0.026776093989610672, |
| "w_max": 1.527068942785263, |
| "w_mean": 1.1089930832386017, |
| "w_min": 0.0, |
| "w_std": 0.156088937073946 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3681.0625610351562, |
| "cov_mean": -1.270252869289834e-05, |
| "cov_std": 0.19421957433223724, |
| "entropy": 0.44921875, |
| "epoch": 0.04342857142857143, |
| "grad_norm": 0.24127696454524994, |
| "kl": 0.00042241811752319336, |
| "learning_rate": 8.01636806561836e-07, |
| "loss": 0.0018, |
| "reward": 0.25, |
| "reward_std": 0.287552148103714, |
| "rewards/accuracy_reward": 0.11458333861082792, |
| "rewards/format_reward": 0.13541666977107525, |
| "step": 38, |
| "w_high_ratio": 0.041594721376895905, |
| "w_low_ratio": 0.02351229265332222, |
| "w_max": 1.402332603931427, |
| "w_mean": 1.097372442483902, |
| "w_min": 0.5, |
| "w_std": 0.1229349672794342 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3038.5626220703125, |
| "cov_mean": 2.8159271096228622e-05, |
| "cov_std": 0.31503692269325256, |
| "entropy": 0.364013671875, |
| "epoch": 0.044571428571428574, |
| "grad_norm": 0.2635081708431244, |
| "kl": 0.001262664794921875, |
| "learning_rate": 7.884636689049422e-07, |
| "loss": -0.0133, |
| "reward": 0.7291666865348816, |
| "reward_std": 0.26209891587495804, |
| "rewards/accuracy_reward": 0.2291666716337204, |
| "rewards/format_reward": 0.5000000149011612, |
| "step": 39, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.027915622107684612, |
| "w_max": 1.4430480003356934, |
| "w_mean": 1.1478222012519836, |
| "w_min": 0.25, |
| "w_std": 0.14645230770111084 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2716.6876220703125, |
| "cov_mean": 1.541716937936144e-05, |
| "cov_std": 0.37162280082702637, |
| "entropy": 0.36669921875, |
| "epoch": 0.045714285714285714, |
| "grad_norm": 0.27566489577293396, |
| "kl": 0.0022230148315429688, |
| "learning_rate": 7.75e-07, |
| "loss": -0.0203, |
| "reward": 0.729166679084301, |
| "reward_std": 0.3497198149561882, |
| "rewards/accuracy_reward": 0.15625000093132257, |
| "rewards/format_reward": 0.5729166716337204, |
| "step": 40, |
| "w_high_ratio": 0.05738469213247299, |
| "w_low_ratio": 0.03804673533886671, |
| "w_max": 1.8830105662345886, |
| "w_mean": 1.3182978928089142, |
| "w_min": 0.0, |
| "w_std": 0.2204515039920807 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3265.8334350585938, |
| "cov_mean": 4.474487604966271e-05, |
| "cov_std": 0.3514489606022835, |
| "entropy": 0.38134765625, |
| "epoch": 0.046857142857142854, |
| "grad_norm": 0.25198379158973694, |
| "kl": 0.00047135353088378906, |
| "learning_rate": 7.612622032536507e-07, |
| "loss": -0.0348, |
| "reward": 0.5625000204890966, |
| "reward_std": 0.5001779943704605, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/format_reward": 0.41666669212281704, |
| "step": 41, |
| "w_high_ratio": 0.060201918706297874, |
| "w_low_ratio": 0.03864650521427393, |
| "w_max": 1.765150785446167, |
| "w_mean": 1.2053866684436798, |
| "w_min": 1.0444519155498174e-26, |
| "w_std": 0.20880188420414925 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3039.260498046875, |
| "cov_mean": 4.992843923901091e-05, |
| "cov_std": 0.12326429784297943, |
| "entropy": 0.4560546875, |
| "epoch": 0.048, |
| "grad_norm": 0.10436006635427475, |
| "kl": 0.0007390975952148438, |
| "learning_rate": 7.472670160550848e-07, |
| "loss": 0.0037, |
| "reward": 0.3333333358168602, |
| "reward_std": 0.15885811299085617, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/format_reward": 0.31250000186264515, |
| "step": 42, |
| "w_high_ratio": 0.0625, |
| "w_low_ratio": 0.016482737846672535, |
| "w_max": 1.6183056831359863, |
| "w_mean": 1.1592219173908234, |
| "w_min": 0.5, |
| "w_std": 0.06949653849005699 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3322.5208740234375, |
| "cov_mean": 1.5148519764807133e-05, |
| "cov_std": 0.3199145011603832, |
| "entropy": 0.39794921875, |
| "epoch": 0.04914285714285714, |
| "grad_norm": 0.22395218908786774, |
| "kl": 0.0014238357543945312, |
| "learning_rate": 7.330314893841101e-07, |
| "loss": 0.0251, |
| "reward": 0.45833334140479565, |
| "reward_std": 0.34082313999533653, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/format_reward": 0.2916666716337204, |
| "step": 43, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.03815819416195154, |
| "w_max": 1.3987224996089935, |
| "w_mean": 1.0974957346916199, |
| "w_min": 0.0, |
| "w_std": 0.18725593388080597 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2873.947967529297, |
| "cov_mean": 1.1262121915933676e-05, |
| "cov_std": 0.3456302881240845, |
| "entropy": 0.37548828125, |
| "epoch": 0.05028571428571429, |
| "grad_norm": 0.570351243019104, |
| "kl": 0.0011093616485595703, |
| "learning_rate": 7.185729670371604e-07, |
| "loss": -0.0802, |
| "reward": 0.7500000447034836, |
| "reward_std": 0.31945212185382843, |
| "rewards/accuracy_reward": 0.2812500074505806, |
| "rewards/format_reward": 0.4687500074505806, |
| "step": 44, |
| "w_high_ratio": 0.15513932332396507, |
| "w_low_ratio": 0.028022687416523695, |
| "w_max": 2.1701363921165466, |
| "w_mean": 1.4684883952140808, |
| "w_min": 0.25, |
| "w_std": 0.245870441198349 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3659.1355590820312, |
| "cov_mean": -1.3919307093601674e-05, |
| "cov_std": 0.3560608774423599, |
| "entropy": 0.4130859375, |
| "epoch": 0.05142857142857143, |
| "grad_norm": 0.33030757308006287, |
| "kl": 0.0011370182037353516, |
| "learning_rate": 7.039090644965509e-07, |
| "loss": 0.0303, |
| "reward": 0.416666679084301, |
| "reward_std": 0.5311296693980694, |
| "rewards/accuracy_reward": 0.16666667349636555, |
| "rewards/format_reward": 0.25, |
| "step": 45, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.04757110681384802, |
| "w_max": 1.4556901454925537, |
| "w_mean": 1.08749720454216, |
| "w_min": 0.0, |
| "w_std": 0.21380594745278358 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3486.7916870117188, |
| "cov_mean": 2.1816805514163207e-05, |
| "cov_std": 0.2602475844323635, |
| "entropy": 0.4794921875, |
| "epoch": 0.052571428571428575, |
| "grad_norm": 0.28776484727859497, |
| "kl": 0.0007176399230957031, |
| "learning_rate": 6.890576474687263e-07, |
| "loss": 0.058, |
| "reward": 0.23958334233611822, |
| "reward_std": 0.2835810258984566, |
| "rewards/accuracy_reward": 0.031250000931322575, |
| "rewards/format_reward": 0.20833333488553762, |
| "step": 46, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.03342714952304959, |
| "w_max": 1.2925868034362793, |
| "w_mean": 1.0403871834278107, |
| "w_min": 1.838248673476915e-29, |
| "w_std": 0.14680924825370312 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3093.4479370117188, |
| "cov_mean": -4.72289493700373e-05, |
| "cov_std": 0.39596526324748993, |
| "entropy": 0.3857421875, |
| "epoch": 0.053714285714285714, |
| "grad_norm": 0.3468870520591736, |
| "kl": 0.0010945796966552734, |
| "learning_rate": 6.740368101176495e-07, |
| "loss": -0.064, |
| "reward": 0.84375, |
| "reward_std": 0.5307980924844742, |
| "rewards/accuracy_reward": 0.322916679084301, |
| "rewards/format_reward": 0.5208333432674408, |
| "step": 47, |
| "w_high_ratio": 0.10172786563634872, |
| "w_low_ratio": 0.03984384797513485, |
| "w_max": 1.9562607407569885, |
| "w_mean": 1.385090559720993, |
| "w_min": 0.25, |
| "w_std": 0.24371833354234695 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2902.979278564453, |
| "cov_mean": -2.4175317776098382e-05, |
| "cov_std": 0.4257803037762642, |
| "entropy": 0.388671875, |
| "epoch": 0.054857142857142854, |
| "grad_norm": 0.5910794734954834, |
| "kl": 0.002681732177734375, |
| "learning_rate": 6.588648530198504e-07, |
| "loss": -0.0371, |
| "reward": 0.6562500074505806, |
| "reward_std": 0.493436336517334, |
| "rewards/accuracy_reward": 0.22916666697710752, |
| "rewards/format_reward": 0.42708334140479565, |
| "step": 48, |
| "w_high_ratio": 0.05750561133027077, |
| "w_low_ratio": 0.0480266478843987, |
| "w_max": 1.9184067249298096, |
| "w_mean": 1.2389512956142426, |
| "w_min": 9.458764634192515e-45, |
| "w_std": 0.2734759133309126 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2592.4271545410156, |
| "cov_mean": -5.175127171241911e-06, |
| "cov_std": 0.3634071573615074, |
| "entropy": 0.36669921875, |
| "epoch": 0.056, |
| "grad_norm": 0.33904796838760376, |
| "kl": 0.006507396697998047, |
| "learning_rate": 6.435602608679916e-07, |
| "loss": 0.0033, |
| "reward": 0.8437500447034836, |
| "reward_std": 0.5799632221460342, |
| "rewards/accuracy_reward": 0.27083333767950535, |
| "rewards/format_reward": 0.5729166865348816, |
| "step": 49, |
| "w_high_ratio": 0.0624999962747097, |
| "w_low_ratio": 0.04148435592651367, |
| "w_max": 1.7326014041900635, |
| "w_mean": 1.239928662776947, |
| "w_min": 4.925564102101732e-43, |
| "w_std": 0.23503416404128075 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3253.1355590820312, |
| "cov_mean": -2.146356928278692e-05, |
| "cov_std": 0.2279180847108364, |
| "entropy": 0.35498046875, |
| "epoch": 0.05714285714285714, |
| "grad_norm": 0.15721456706523895, |
| "kl": 0.0008764266967773438, |
| "learning_rate": 6.281416799501187e-07, |
| "loss": -0.0086, |
| "reward": 0.635416679084301, |
| "reward_std": 0.3225880041718483, |
| "rewards/accuracy_reward": 0.2708333386108279, |
| "rewards/format_reward": 0.3645833395421505, |
| "step": 50, |
| "w_high_ratio": 0.0625, |
| "w_low_ratio": 0.028236051555722952, |
| "w_max": 1.5944055318832397, |
| "w_mean": 1.1758275628089905, |
| "w_min": 0.25, |
| "w_std": 0.12848308496177197 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2558.1043090820312, |
| "cov_mean": 5.439032975118607e-05, |
| "cov_std": 0.29480236768722534, |
| "entropy": 0.43408203125, |
| "epoch": 0.05828571428571429, |
| "grad_norm": 0.38427427411079407, |
| "kl": 0.00363922119140625, |
| "learning_rate": 6.126278954320294e-07, |
| "loss": 0.0141, |
| "reward": 0.6666666865348816, |
| "reward_std": 0.2830107621848583, |
| "rewards/accuracy_reward": 0.13541666977107525, |
| "rewards/format_reward": 0.53125, |
| "step": 51, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.029488239903002977, |
| "w_max": 1.5800293982028961, |
| "w_mean": 1.1582573056221008, |
| "w_min": 0.25, |
| "w_std": 0.16206533834338188 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3121.510467529297, |
| "cov_mean": 4.514171632763464e-05, |
| "cov_std": 0.3024504631757736, |
| "entropy": 0.38525390625, |
| "epoch": 0.05942857142857143, |
| "grad_norm": 0.38342365622520447, |
| "kl": 0.0027284622192382812, |
| "learning_rate": 5.97037808470444e-07, |
| "loss": -0.0158, |
| "reward": 0.6666666772216558, |
| "reward_std": 0.48630647361278534, |
| "rewards/accuracy_reward": 0.2500000074505806, |
| "rewards/format_reward": 0.41666668467223644, |
| "step": 52, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.048233418725430965, |
| "w_max": 1.4871755540370941, |
| "w_mean": 1.1300460696220398, |
| "w_min": 0.0, |
| "w_std": 0.221306212246418 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3089.6146850585938, |
| "cov_mean": 1.5540852473350242e-05, |
| "cov_std": 0.4309914745390415, |
| "entropy": 0.42138671875, |
| "epoch": 0.060571428571428575, |
| "grad_norm": 0.32499778270721436, |
| "kl": 0.001051187515258789, |
| "learning_rate": 5.813904131848564e-07, |
| "loss": -0.0334, |
| "reward": 0.9062500447034836, |
| "reward_std": 0.615619845688343, |
| "rewards/accuracy_reward": 0.3020833432674408, |
| "rewards/format_reward": 0.6041666865348816, |
| "step": 53, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.05247905198484659, |
| "w_max": 1.4815186858177185, |
| "w_mean": 1.1437698602676392, |
| "w_min": 0.0, |
| "w_std": 0.2556675784289837 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2948.479248046875, |
| "cov_mean": 5.7689636832947144e-05, |
| "cov_std": 0.4967670738697052, |
| "entropy": 0.3662109375, |
| "epoch": 0.061714285714285715, |
| "grad_norm": 0.5084431767463684, |
| "kl": 0.0007784366607666016, |
| "learning_rate": 5.657047735161255e-07, |
| "loss": -0.0313, |
| "reward": 1.0208333730697632, |
| "reward_std": 0.6375530436635017, |
| "rewards/accuracy_reward": 0.4375000149011612, |
| "rewards/format_reward": 0.5833333507180214, |
| "step": 54, |
| "w_high_ratio": 0.12949026003479958, |
| "w_low_ratio": 0.03830569516867399, |
| "w_max": 1.9918950200080872, |
| "w_mean": 1.3933196365833282, |
| "w_min": 0.0, |
| "w_std": 0.25901878997683525 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3389.4376220703125, |
| "cov_mean": 1.1409799526518327e-05, |
| "cov_std": 0.2976246848702431, |
| "entropy": 0.40380859375, |
| "epoch": 0.06285714285714286, |
| "grad_norm": 0.2319922149181366, |
| "kl": 0.0006909370422363281, |
| "learning_rate": 5.5e-07, |
| "loss": 0.0036, |
| "reward": 0.614583358168602, |
| "reward_std": 0.4451694190502167, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/format_reward": 0.3645833432674408, |
| "step": 55, |
| "w_high_ratio": 0.008611755445599556, |
| "w_low_ratio": 0.038189588114619255, |
| "w_max": 1.4636406004428864, |
| "w_mean": 1.113456517457962, |
| "w_min": 0.25, |
| "w_std": 0.1765221506357193 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3142.2188720703125, |
| "cov_mean": -1.977225656446535e-05, |
| "cov_std": 0.4043290466070175, |
| "entropy": 0.36962890625, |
| "epoch": 0.064, |
| "grad_norm": 0.5777710676193237, |
| "kl": 0.003372669219970703, |
| "learning_rate": 5.342952264838747e-07, |
| "loss": -0.0624, |
| "reward": 0.666666679084301, |
| "reward_std": 0.36768075451254845, |
| "rewards/accuracy_reward": 0.2083333432674408, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 56, |
| "w_high_ratio": 0.02573772892355919, |
| "w_low_ratio": 0.0384283890016377, |
| "w_max": 1.6343314349651337, |
| "w_mean": 1.174754112958908, |
| "w_min": 0.0, |
| "w_std": 0.2166101150214672 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3526.041748046875, |
| "cov_mean": 5.059504655946512e-05, |
| "cov_std": 0.19530736654996872, |
| "entropy": 0.3095703125, |
| "epoch": 0.06514285714285714, |
| "grad_norm": 0.14592108130455017, |
| "kl": 0.0003554821014404297, |
| "learning_rate": 5.186095868151436e-07, |
| "loss": 0.0068, |
| "reward": 0.4270833432674408, |
| "reward_std": 0.2942384257912636, |
| "rewards/accuracy_reward": 0.11458333861082792, |
| "rewards/format_reward": 0.3125, |
| "step": 57, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.024141859263181686, |
| "w_max": 1.4007738828659058, |
| "w_mean": 1.1122069656848907, |
| "w_min": 0.25, |
| "w_std": 0.10743825510144234 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2448.125030517578, |
| "cov_mean": 2.2156835257192142e-05, |
| "cov_std": 0.41437317430973053, |
| "entropy": 0.3251953125, |
| "epoch": 0.06628571428571428, |
| "grad_norm": 0.31742021441459656, |
| "kl": 0.003879547119140625, |
| "learning_rate": 5.02962191529556e-07, |
| "loss": -0.0288, |
| "reward": 1.0625000223517418, |
| "reward_std": 0.4887799397110939, |
| "rewards/accuracy_reward": 0.3437500037252903, |
| "rewards/format_reward": 0.7187500074505806, |
| "step": 58, |
| "w_high_ratio": 0.0625, |
| "w_low_ratio": 0.03917268430814147, |
| "w_max": 1.9121226966381073, |
| "w_mean": 1.4166812300682068, |
| "w_min": 0.0, |
| "w_std": 0.25635192170739174 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3166.6458740234375, |
| "cov_mean": -4.421618541528005e-06, |
| "cov_std": 0.23622582852840424, |
| "entropy": 0.33251953125, |
| "epoch": 0.06742857142857143, |
| "grad_norm": 0.5350203514099121, |
| "kl": 0.0004105567932128906, |
| "learning_rate": 4.873721045679706e-07, |
| "loss": 0.0178, |
| "reward": 0.572916672565043, |
| "reward_std": 0.32521966844797134, |
| "rewards/accuracy_reward": 0.20833333861082792, |
| "rewards/format_reward": 0.36458334140479565, |
| "step": 59, |
| "w_high_ratio": 0.09398643299937248, |
| "w_low_ratio": 0.029231622349470854, |
| "w_max": 1.7523704767227173, |
| "w_mean": 1.1871490776538849, |
| "w_min": 0.0, |
| "w_std": 0.18543793261051178 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3290.9688720703125, |
| "cov_mean": 2.4174340069293976e-05, |
| "cov_std": 0.2900906167924404, |
| "entropy": 0.37353515625, |
| "epoch": 0.06857142857142857, |
| "grad_norm": 0.3470577299594879, |
| "kl": 0.002052783966064453, |
| "learning_rate": 4.7185832004988133e-07, |
| "loss": 0.0597, |
| "reward": 0.4895833395421505, |
| "reward_std": 0.39402854442596436, |
| "rewards/accuracy_reward": 0.1041666679084301, |
| "rewards/format_reward": 0.38541667722165585, |
| "step": 60, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.03313248883932829, |
| "w_max": 1.4663892686367035, |
| "w_mean": 1.140129953622818, |
| "w_min": 0.0, |
| "w_std": 0.1620137356221676 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3175.916748046875, |
| "cov_mean": 4.250624078849796e-05, |
| "cov_std": 0.4415072202682495, |
| "entropy": 0.3564453125, |
| "epoch": 0.06971428571428571, |
| "grad_norm": 0.3775484263896942, |
| "kl": 0.0009458065032958984, |
| "learning_rate": 4.5643973913200837e-07, |
| "loss": -0.0396, |
| "reward": 0.791666679084301, |
| "reward_std": 0.4833543188869953, |
| "rewards/accuracy_reward": 0.2500000074505806, |
| "rewards/format_reward": 0.541666679084301, |
| "step": 61, |
| "w_high_ratio": 0.04859437793493271, |
| "w_low_ratio": 0.03715647594071925, |
| "w_max": 1.7687608003616333, |
| "w_mean": 1.2712468802928925, |
| "w_min": 0.0, |
| "w_std": 0.24916893057525158 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2722.5313110351562, |
| "cov_mean": 3.7332507645260193e-06, |
| "cov_std": 0.3699860963970423, |
| "entropy": 0.30859375, |
| "epoch": 0.07085714285714285, |
| "grad_norm": 1.0045063495635986, |
| "kl": 0.02886199951171875, |
| "learning_rate": 4.4113514698014953e-07, |
| "loss": 0.0303, |
| "reward": 0.8750000298023224, |
| "reward_std": 0.5333737134933472, |
| "rewards/accuracy_reward": 0.29166667722165585, |
| "rewards/format_reward": 0.583333358168602, |
| "step": 62, |
| "w_high_ratio": 0.06600858364254236, |
| "w_low_ratio": 0.04690547380596399, |
| "w_max": 1.9925485253334045, |
| "w_mean": 1.3004825711250305, |
| "w_min": 0.0, |
| "w_std": 0.25962154380977154 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2446.8959045410156, |
| "cov_mean": -3.0669682018924505e-05, |
| "cov_std": 0.44857871532440186, |
| "entropy": 0.3876953125, |
| "epoch": 0.072, |
| "grad_norm": 0.4270949065685272, |
| "kl": 0.001964569091796875, |
| "learning_rate": 4.2596318988235037e-07, |
| "loss": -0.0266, |
| "reward": 1.0208333879709244, |
| "reward_std": 0.5118110477924347, |
| "rewards/accuracy_reward": 0.3229166781529784, |
| "rewards/format_reward": 0.6979166716337204, |
| "step": 63, |
| "w_high_ratio": 0.24508000910282135, |
| "w_low_ratio": 0.04229559004306793, |
| "w_max": 2.53433358669281, |
| "w_mean": 1.5855962336063385, |
| "w_min": 3.1441053104750004e-38, |
| "w_std": 0.3284341022372246 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3022.9896240234375, |
| "cov_mean": -4.004061929663294e-05, |
| "cov_std": 0.24233128875494003, |
| "entropy": 0.376953125, |
| "epoch": 0.07314285714285715, |
| "grad_norm": 0.22661255300045013, |
| "kl": 0.0009641647338867188, |
| "learning_rate": 4.1094235253127374e-07, |
| "loss": 0.0441, |
| "reward": 0.6666666977107525, |
| "reward_std": 0.42464151978492737, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/format_reward": 0.41666668467223644, |
| "step": 64, |
| "w_high_ratio": 0.05784625560045242, |
| "w_low_ratio": 0.024026920087635517, |
| "w_max": 1.9166311621665955, |
| "w_mean": 1.3068864345550537, |
| "w_min": 0.0, |
| "w_std": 0.15533896535634995 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2987.9584350585938, |
| "cov_mean": -1.1828518722722947e-05, |
| "cov_std": 0.3229193612933159, |
| "entropy": 0.33544921875, |
| "epoch": 0.07428571428571429, |
| "grad_norm": 0.20347988605499268, |
| "kl": 0.0009045600891113281, |
| "learning_rate": 3.9609093550344907e-07, |
| "loss": 0.0257, |
| "reward": 0.7187500037252903, |
| "reward_std": 0.4016053378582001, |
| "rewards/accuracy_reward": 0.2500000027939677, |
| "rewards/format_reward": 0.46875, |
| "step": 65, |
| "w_high_ratio": 0.0625, |
| "w_low_ratio": 0.04217576887458563, |
| "w_max": 1.5320913791656494, |
| "w_mean": 1.177913784980774, |
| "w_min": 0.0, |
| "w_std": 0.17242734879255295 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2277.3542709350586, |
| "cov_mean": 2.5952657381367317e-05, |
| "cov_std": 0.2567654103040695, |
| "entropy": 0.30517578125, |
| "epoch": 0.07542857142857143, |
| "grad_norm": 0.39114055037498474, |
| "kl": 0.0011463165283203125, |
| "learning_rate": 3.8142703296283953e-07, |
| "loss": 0.0283, |
| "reward": 0.8750000111758709, |
| "reward_std": 0.32678278163075447, |
| "rewards/accuracy_reward": 0.35416667349636555, |
| "rewards/format_reward": 0.520833333954215, |
| "step": 66, |
| "w_high_ratio": 0.050385382026433945, |
| "w_low_ratio": 0.036725505255162716, |
| "w_max": 1.6868340969085693, |
| "w_mean": 1.2045796811580658, |
| "w_min": 4.016504513755072e-38, |
| "w_std": 0.18733475357294083 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3692.8646240234375, |
| "cov_mean": -1.3762917205895064e-05, |
| "cov_std": 0.1253320723772049, |
| "entropy": 0.3369140625, |
| "epoch": 0.07657142857142857, |
| "grad_norm": 0.09913324564695358, |
| "kl": 0.0007886886596679688, |
| "learning_rate": 3.6696851061588994e-07, |
| "loss": -0.0047, |
| "reward": 0.1979166716337204, |
| "reward_std": 0.14884886890649796, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.1666666716337204, |
| "step": 67, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.014627222903072834, |
| "w_max": 1.3194924890995026, |
| "w_mean": 1.0943252593278885, |
| "w_min": 0.5, |
| "w_std": 0.08329889550805092 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2192.4896240234375, |
| "cov_mean": 5.210197195992805e-05, |
| "cov_std": 0.2974618822336197, |
| "entropy": 0.3623046875, |
| "epoch": 0.07771428571428571, |
| "grad_norm": 0.48533204197883606, |
| "kl": 0.004261016845703125, |
| "learning_rate": 3.5273298394491515e-07, |
| "loss": -0.0773, |
| "reward": 0.864583358168602, |
| "reward_std": 0.3760298416018486, |
| "rewards/accuracy_reward": 0.2395833432674408, |
| "rewards/format_reward": 0.625, |
| "step": 68, |
| "w_high_ratio": 0.2266840934753418, |
| "w_low_ratio": 0.028733241837471724, |
| "w_max": 2.714290827512741, |
| "w_mean": 1.5712151527404785, |
| "w_min": 0.25, |
| "w_std": 0.22248514741659164 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2785.791748046875, |
| "cov_mean": 5.550627793127205e-05, |
| "cov_std": 0.24297186359763145, |
| "entropy": 0.435546875, |
| "epoch": 0.07885714285714286, |
| "grad_norm": 0.4455502927303314, |
| "kl": 0.003231048583984375, |
| "learning_rate": 3.387377967463493e-07, |
| "loss": -0.0721, |
| "reward": 0.4791666716337204, |
| "reward_std": 0.22700205445289612, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.4479166716337204, |
| "step": 69, |
| "w_high_ratio": 0.18034584820270538, |
| "w_low_ratio": 0.019336777739226818, |
| "w_max": 2.187383383512497, |
| "w_mean": 1.3382967710494995, |
| "w_min": 0.25, |
| "w_std": 0.14556573703885078 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3339.760498046875, |
| "cov_mean": 3.0603625077674224e-05, |
| "cov_std": 0.3574352115392685, |
| "entropy": 0.35400390625, |
| "epoch": 0.08, |
| "grad_norm": 0.4004529118537903, |
| "kl": 0.0030617713928222656, |
| "learning_rate": 3.250000000000001e-07, |
| "loss": 0.0435, |
| "reward": 0.48958334513008595, |
| "reward_std": 0.448788546025753, |
| "rewards/accuracy_reward": 0.1145833395421505, |
| "rewards/format_reward": 0.37500001676380634, |
| "step": 70, |
| "w_high_ratio": 0.056590817868709564, |
| "w_low_ratio": 0.04363738652318716, |
| "w_max": 1.621414452791214, |
| "w_mean": 1.1405479907989502, |
| "w_min": 0.0, |
| "w_std": 0.21908994019031525 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2994.0834197998047, |
| "cov_mean": 5.412803147919476e-06, |
| "cov_std": 0.24315955862402916, |
| "entropy": 0.39501953125, |
| "epoch": 0.08114285714285714, |
| "grad_norm": 0.43441104888916016, |
| "kl": 0.003234386444091797, |
| "learning_rate": 3.115363310950578e-07, |
| "loss": 0.0297, |
| "reward": 0.4791666669771075, |
| "reward_std": 0.2685965895652771, |
| "rewards/accuracy_reward": 0.1458333395421505, |
| "rewards/format_reward": 0.3333333460614085, |
| "step": 71, |
| "w_high_ratio": 0.05637816712260246, |
| "w_low_ratio": 0.024559800047427416, |
| "w_max": 1.8640311062335968, |
| "w_mean": 1.2403113842010498, |
| "w_min": 0.25, |
| "w_std": 0.18912436068058014 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3187.5313110351562, |
| "cov_mean": -5.872446490684524e-05, |
| "cov_std": 0.47990038990974426, |
| "entropy": 0.45556640625, |
| "epoch": 0.08228571428571428, |
| "grad_norm": 0.49201899766921997, |
| "kl": 0.0031557083129882812, |
| "learning_rate": 2.9836319343816397e-07, |
| "loss": -0.0603, |
| "reward": 0.510416679084301, |
| "reward_std": 0.46190567314624786, |
| "rewards/accuracy_reward": 0.07291666697710752, |
| "rewards/format_reward": 0.4375000074505806, |
| "step": 72, |
| "w_high_ratio": 0.13975006341934204, |
| "w_low_ratio": 0.04982540290802717, |
| "w_max": 1.821915477514267, |
| "w_mean": 1.327934205532074, |
| "w_min": 0.0, |
| "w_std": 0.2968912795186043 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3788.3125610351562, |
| "cov_mean": -2.8033528451487655e-05, |
| "cov_std": 0.27588948607444763, |
| "entropy": 0.44091796875, |
| "epoch": 0.08342857142857144, |
| "grad_norm": 0.1494845449924469, |
| "kl": 0.00031256675720214844, |
| "learning_rate": 2.854966364683872e-07, |
| "loss": -0.0024, |
| "reward": 0.3541666716337204, |
| "reward_std": 0.38621756434440613, |
| "rewards/accuracy_reward": 0.1354166716337204, |
| "rewards/format_reward": 0.2187500037252903, |
| "step": 73, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.03520650416612625, |
| "w_max": 1.26780503988266, |
| "w_mean": 1.0412998497486115, |
| "w_min": 0.25, |
| "w_std": 0.1442563608288765 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3536.3751220703125, |
| "cov_mean": 5.950678314547986e-05, |
| "cov_std": 0.23448628932237625, |
| "entropy": 0.3876953125, |
| "epoch": 0.08457142857142858, |
| "grad_norm": 0.26546525955200195, |
| "kl": 0.0013303756713867188, |
| "learning_rate": 2.729523361034538e-07, |
| "loss": 0.0267, |
| "reward": 0.4791666865348816, |
| "reward_std": 0.3228915072977543, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.2604166716337204, |
| "step": 74, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.030950261279940605, |
| "w_max": 1.3889389336109161, |
| "w_mean": 1.0708437263965607, |
| "w_min": 0.0, |
| "w_std": 0.1411808580160141 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3248.729248046875, |
| "cov_mean": -2.3449571926903445e-05, |
| "cov_std": 0.35559114813804626, |
| "entropy": 0.35546875, |
| "epoch": 0.08571428571428572, |
| "grad_norm": 0.23657207190990448, |
| "kl": 0.0014653205871582031, |
| "learning_rate": 2.6074557564105724e-07, |
| "loss": 0.0671, |
| "reward": 0.5625000111758709, |
| "reward_std": 0.37235569953918457, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.3750000037252903, |
| "step": 75, |
| "w_high_ratio": 0.05818156525492668, |
| "w_low_ratio": 0.04433906823396683, |
| "w_max": 1.794895738363266, |
| "w_mean": 1.1899544298648834, |
| "w_min": 0.25, |
| "w_std": 0.21789918839931488 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3163.4583740234375, |
| "cov_mean": 1.617619454918895e-05, |
| "cov_std": 0.1999940201640129, |
| "entropy": 0.37890625, |
| "epoch": 0.08685714285714285, |
| "grad_norm": 0.1549980193376541, |
| "kl": 0.00043952465057373047, |
| "learning_rate": 2.488912271385139e-07, |
| "loss": 0.016, |
| "reward": 0.4479166865348816, |
| "reward_std": 0.24646350741386414, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.4166666828095913, |
| "step": 76, |
| "w_high_ratio": 0.0625, |
| "w_low_ratio": 0.027174705173820257, |
| "w_max": 1.7661568522453308, |
| "w_mean": 1.2089157700538635, |
| "w_min": 0.25, |
| "w_std": 0.11429934948682785 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3343.1146850585938, |
| "cov_mean": -4.351784809841774e-06, |
| "cov_std": 0.24954789131879807, |
| "entropy": 0.38623046875, |
| "epoch": 0.088, |
| "grad_norm": 0.16160623729228973, |
| "kl": 0.0005016326904296875, |
| "learning_rate": 2.374037332934512e-07, |
| "loss": -0.0028, |
| "reward": 0.5416666977107525, |
| "reward_std": 0.3281986638903618, |
| "rewards/accuracy_reward": 0.15625001024454832, |
| "rewards/format_reward": 0.3854166828095913, |
| "step": 77, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.025709405075758696, |
| "w_max": 1.4693324863910675, |
| "w_mean": 1.1720669269561768, |
| "w_min": 0.25, |
| "w_std": 0.13362640514969826 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3459.5625610351562, |
| "cov_mean": 1.626165885681985e-05, |
| "cov_std": 0.48687614500522614, |
| "entropy": 0.37744140625, |
| "epoch": 0.08914285714285715, |
| "grad_norm": 0.24848157167434692, |
| "kl": 0.0005955696105957031, |
| "learning_rate": 2.2629708984760706e-07, |
| "loss": -0.0075, |
| "reward": 0.604166679084301, |
| "reward_std": 0.6587165221571922, |
| "rewards/accuracy_reward": 0.23958334140479565, |
| "rewards/format_reward": 0.3645833395421505, |
| "step": 78, |
| "w_high_ratio": 0.04009601101279259, |
| "w_low_ratio": 0.051254406571388245, |
| "w_max": 1.6542562246322632, |
| "w_mean": 1.174688458442688, |
| "w_min": 0.0, |
| "w_std": 0.27206049114465714 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2496.4479370117188, |
| "cov_mean": 4.12471272284165e-05, |
| "cov_std": 0.26665735617280006, |
| "entropy": 0.3173828125, |
| "epoch": 0.09028571428571429, |
| "grad_norm": 0.3005116581916809, |
| "kl": 0.0013518333435058594, |
| "learning_rate": 2.1558482853517253e-07, |
| "loss": -0.0126, |
| "reward": 0.8854166865348816, |
| "reward_std": 0.3675435855984688, |
| "rewards/accuracy_reward": 0.2604166744276881, |
| "rewards/format_reward": 0.6250000149011612, |
| "step": 79, |
| "w_high_ratio": 0.10870501399040222, |
| "w_low_ratio": 0.03222281183116138, |
| "w_max": 1.7319224178791046, |
| "w_mean": 1.2745553255081177, |
| "w_min": 0.0, |
| "w_std": 0.17969628423452377 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3547.4063110351562, |
| "cov_mean": -3.7483160667761695e-05, |
| "cov_std": 0.2408691681921482, |
| "entropy": 0.4462890625, |
| "epoch": 0.09142857142857143, |
| "grad_norm": 0.18163201212882996, |
| "kl": 0.001146554946899414, |
| "learning_rate": 2.0528000059645995e-07, |
| "loss": 0.0002, |
| "reward": 0.447916679084301, |
| "reward_std": 0.4143633097410202, |
| "rewards/accuracy_reward": 0.14583333674818277, |
| "rewards/format_reward": 0.30208333395421505, |
| "step": 80, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.028170655015856028, |
| "w_max": 1.4094052910804749, |
| "w_mean": 1.0949373543262482, |
| "w_min": 0.25, |
| "w_std": 0.12403910420835018 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3345.6771850585938, |
| "cov_mean": 5.422214962891303e-05, |
| "cov_std": 0.2975510209798813, |
| "entropy": 0.50634765625, |
| "epoch": 0.09257142857142857, |
| "grad_norm": 0.23768211901187897, |
| "kl": 0.0026841163635253906, |
| "learning_rate": 1.9539516087697517e-07, |
| "loss": 0.0359, |
| "reward": 0.4375, |
| "reward_std": 0.29862353205680847, |
| "rewards/accuracy_reward": 0.09375000558793545, |
| "rewards/format_reward": 0.3437500149011612, |
| "step": 81, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.039988940581679344, |
| "w_max": 1.4104009568691254, |
| "w_mean": 1.1443687975406647, |
| "w_min": 0.25, |
| "w_std": 0.15021733939647675 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3128.5313110351562, |
| "cov_mean": -4.930557446414241e-05, |
| "cov_std": 0.1716586910188198, |
| "entropy": 0.40576171875, |
| "epoch": 0.09371428571428571, |
| "grad_norm": 0.2130168080329895, |
| "kl": 0.0011749267578125, |
| "learning_rate": 1.8594235253127372e-07, |
| "loss": -0.0095, |
| "reward": 0.645833358168602, |
| "reward_std": 0.28174424916505814, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/format_reward": 0.3958333358168602, |
| "step": 82, |
| "w_high_ratio": 0.0625, |
| "w_low_ratio": 0.020149634685367346, |
| "w_max": 1.7989980578422546, |
| "w_mean": 1.2575880885124207, |
| "w_min": 0.25, |
| "w_std": 0.11121102049946785 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3006.2188110351562, |
| "cov_mean": -4.109572182642296e-05, |
| "cov_std": 0.26048484444618225, |
| "entropy": 0.41064453125, |
| "epoch": 0.09485714285714286, |
| "grad_norm": 0.342207670211792, |
| "kl": 0.002028226852416992, |
| "learning_rate": 1.7693309235023127e-07, |
| "loss": -0.0346, |
| "reward": 0.5625000149011612, |
| "reward_std": 0.40875787287950516, |
| "rewards/accuracy_reward": 0.19791667256504297, |
| "rewards/format_reward": 0.3645833469927311, |
| "step": 83, |
| "w_high_ratio": 0.054204463958740234, |
| "w_low_ratio": 0.02545588812790811, |
| "w_max": 1.758713960647583, |
| "w_mean": 1.165531873703003, |
| "w_min": 1.1802768922825628e-26, |
| "w_std": 0.13860907219350338 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3249.6770935058594, |
| "cov_mean": 1.03536285678274e-06, |
| "cov_std": 0.3258565291762352, |
| "entropy": 0.40576171875, |
| "epoch": 0.096, |
| "grad_norm": 0.3563970625400543, |
| "kl": 0.0010137557983398438, |
| "learning_rate": 1.6837835672960831e-07, |
| "loss": -0.0069, |
| "reward": 0.6979166865348816, |
| "reward_std": 0.44399677217006683, |
| "rewards/accuracy_reward": 0.2916666716337204, |
| "rewards/format_reward": 0.4062500149011612, |
| "step": 84, |
| "w_high_ratio": 0.0561169758439064, |
| "w_low_ratio": 0.032409061677753925, |
| "w_max": 1.6246945261955261, |
| "w_mean": 1.1986894607543945, |
| "w_min": 0.25, |
| "w_std": 0.20476746186614037 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3284.145965576172, |
| "cov_mean": -2.360550899993541e-05, |
| "cov_std": 0.322089783847332, |
| "entropy": 0.313232421875, |
| "epoch": 0.09714285714285714, |
| "grad_norm": 0.2012510597705841, |
| "kl": 0.00048732757568359375, |
| "learning_rate": 1.6028856829700258e-07, |
| "loss": 0.022, |
| "reward": 0.6041666939854622, |
| "reward_std": 0.5810144916176796, |
| "rewards/accuracy_reward": 0.1875000074505806, |
| "rewards/format_reward": 0.4166666753590107, |
| "step": 85, |
| "w_high_ratio": 0.04291652888059616, |
| "w_low_ratio": 0.044021588284522295, |
| "w_max": 1.5220162570476532, |
| "w_mean": 1.2036064565181732, |
| "w_min": 0.0, |
| "w_std": 0.2122751884162426 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3284.5208740234375, |
| "cov_mean": 7.192745897555142e-05, |
| "cov_std": 0.2986150402575731, |
| "entropy": 0.4326171875, |
| "epoch": 0.09828571428571428, |
| "grad_norm": 0.47124311327934265, |
| "kl": 0.00186920166015625, |
| "learning_rate": 1.5267358321348285e-07, |
| "loss": -0.0661, |
| "reward": 0.541666679084301, |
| "reward_std": 0.26891910284757614, |
| "rewards/accuracy_reward": 0.15625000279396772, |
| "rewards/format_reward": 0.385416679084301, |
| "step": 86, |
| "w_high_ratio": 0.14706559106707573, |
| "w_low_ratio": 0.02547268127091229, |
| "w_max": 2.1021605730056763, |
| "w_mean": 1.3188546895980835, |
| "w_min": 0.25, |
| "w_std": 0.20756208524107933 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2947.7188110351562, |
| "cov_mean": -3.2274874229187844e-05, |
| "cov_std": 0.491459421813488, |
| "entropy": 0.478515625, |
| "epoch": 0.09942857142857142, |
| "grad_norm": 0.4571603536605835, |
| "kl": 0.00186920166015625, |
| "learning_rate": 1.4554267916537495e-07, |
| "loss": -0.0175, |
| "reward": 0.7604167014360428, |
| "reward_std": 0.5320771858096123, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.5729166939854622, |
| "step": 87, |
| "w_high_ratio": 0.09944025427103043, |
| "w_low_ratio": 0.051899916026741266, |
| "w_max": 2.0812322199344635, |
| "w_mean": 1.3910081684589386, |
| "w_min": 0.0, |
| "w_std": 0.28863297775387764 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2671.010528564453, |
| "cov_mean": 8.175054426828865e-05, |
| "cov_std": 0.5737064629793167, |
| "entropy": 0.42578125, |
| "epoch": 0.10057142857142858, |
| "grad_norm": 0.8297140598297119, |
| "kl": 0.006494998931884766, |
| "learning_rate": 1.3890454406082956e-07, |
| "loss": -0.0247, |
| "reward": 1.052083358168602, |
| "reward_std": 0.6991286426782608, |
| "rewards/accuracy_reward": 0.385416679084301, |
| "rewards/format_reward": 0.6666666865348816, |
| "step": 88, |
| "w_high_ratio": 0.19003082811832428, |
| "w_low_ratio": 0.0572223337367177, |
| "w_max": 2.4647006690502167, |
| "w_mean": 1.4986785650253296, |
| "w_min": 0.0, |
| "w_std": 0.39150019735097885 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3554.2291870117188, |
| "cov_mean": -3.470801129878964e-05, |
| "cov_std": 0.33541250973939896, |
| "entropy": 0.4052734375, |
| "epoch": 0.10171428571428572, |
| "grad_norm": 0.35713109374046326, |
| "kl": 0.0019817352294921875, |
| "learning_rate": 1.3276726544494571e-07, |
| "loss": -0.0037, |
| "reward": 0.5625000074505806, |
| "reward_std": 0.5584763810038567, |
| "rewards/accuracy_reward": 0.2291666716337204, |
| "rewards/format_reward": 0.3333333358168602, |
| "step": 89, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.046938784420490265, |
| "w_max": 1.4760373830795288, |
| "w_mean": 1.1144923567771912, |
| "w_min": 4.203895392974451e-45, |
| "w_std": 0.22363057732582092 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2691.072998046875, |
| "cov_mean": 3.220158714611898e-05, |
| "cov_std": 0.29605602473020554, |
| "entropy": 0.5078125, |
| "epoch": 0.10285714285714286, |
| "grad_norm": 0.3885078728199005, |
| "kl": 0.0037512779235839844, |
| "learning_rate": 1.2713832064634125e-07, |
| "loss": 0.0576, |
| "reward": 0.5833333432674408, |
| "reward_std": 0.29030610620975494, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/format_reward": 0.5208333535119891, |
| "step": 90, |
| "w_high_ratio": 0.029189134016633034, |
| "w_low_ratio": 0.02723024832084775, |
| "w_max": 1.7544832229614258, |
| "w_mean": 1.3031582236289978, |
| "w_min": 0.25, |
| "w_std": 0.1653207652270794 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3429.2813110351562, |
| "cov_mean": 4.58851766893531e-06, |
| "cov_std": 0.255037359893322, |
| "entropy": 0.44970703125, |
| "epoch": 0.104, |
| "grad_norm": 0.21901822090148926, |
| "kl": 0.0016050338745117188, |
| "learning_rate": 1.220245676671809e-07, |
| "loss": 0.0086, |
| "reward": 0.4791666679084301, |
| "reward_std": 0.42312975227832794, |
| "rewards/accuracy_reward": 0.14583334140479565, |
| "rewards/format_reward": 0.33333334140479565, |
| "step": 91, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.03593640075996518, |
| "w_max": 1.3688502609729767, |
| "w_mean": 1.1163062751293182, |
| "w_min": 0.0, |
| "w_std": 0.16501911543309689 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2883.354217529297, |
| "cov_mean": 0.00012286239780223696, |
| "cov_std": 0.3810538575053215, |
| "entropy": 0.35205078125, |
| "epoch": 0.10514285714285715, |
| "grad_norm": 0.3391018509864807, |
| "kl": 0.0031414031982421875, |
| "learning_rate": 1.1743223682775649e-07, |
| "loss": 0.0203, |
| "reward": 0.6979167014360428, |
| "reward_std": 0.44955648854374886, |
| "rewards/accuracy_reward": 0.18750001024454832, |
| "rewards/format_reward": 0.5104166716337204, |
| "step": 92, |
| "w_high_ratio": 0.04653368145227432, |
| "w_low_ratio": 0.04205216746777296, |
| "w_max": 1.9255772531032562, |
| "w_mean": 1.2982309758663177, |
| "w_min": 1.1786321383436036e-41, |
| "w_std": 0.2336835414171219 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3868.2708740234375, |
| "cov_mean": 6.7976218360854546e-06, |
| "cov_std": 0.18175788596272469, |
| "entropy": 0.52001953125, |
| "epoch": 0.10628571428571429, |
| "grad_norm": 0.20475780963897705, |
| "kl": 0.0019414424896240234, |
| "learning_rate": 1.1336692317580158e-07, |
| "loss": 0.0439, |
| "reward": 0.07291666883975267, |
| "reward_std": 0.16979892551898956, |
| "rewards/accuracy_reward": 0.010416666977107525, |
| "rewards/format_reward": 0.06250000186264515, |
| "step": 93, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.026896574534475803, |
| "w_max": 1.2091633975505829, |
| "w_mean": 1.0145649313926697, |
| "w_min": 0.25, |
| "w_std": 0.10182364657521248 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3365.0313720703125, |
| "cov_mean": 5.4338164773071185e-05, |
| "cov_std": 0.3005314916372299, |
| "entropy": 0.498046875, |
| "epoch": 0.10742857142857143, |
| "grad_norm": 0.2650594711303711, |
| "kl": 0.0022263526916503906, |
| "learning_rate": 1.0983357966978745e-07, |
| "loss": -0.0027, |
| "reward": 0.40625001303851604, |
| "reward_std": 0.3088777959346771, |
| "rewards/accuracy_reward": 0.13541666977107525, |
| "rewards/format_reward": 0.27083333395421505, |
| "step": 94, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.03483147523365915, |
| "w_max": 1.5337306559085846, |
| "w_mean": 1.0965670943260193, |
| "w_min": 3.503246160812043e-46, |
| "w_std": 0.16744443587958813 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3801.90625, |
| "cov_mean": 3.249865312682232e-05, |
| "cov_std": 0.22550634294748306, |
| "entropy": 0.39990234375, |
| "epoch": 0.10857142857142857, |
| "grad_norm": 0.1532547026872635, |
| "kl": 0.00028705596923828125, |
| "learning_rate": 1.068365111445064e-07, |
| "loss": 0.0152, |
| "reward": 0.2395833395421505, |
| "reward_std": 0.36024561524391174, |
| "rewards/accuracy_reward": 0.07291666883975267, |
| "rewards/format_reward": 0.16666666977107525, |
| "step": 95, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.027520990930497646, |
| "w_max": 1.2406704127788544, |
| "w_mean": 1.0429764091968536, |
| "w_min": 0.25, |
| "w_std": 0.12879234366118908 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2939.229248046875, |
| "cov_mean": -0.00012694882570940536, |
| "cov_std": 0.42605962604284286, |
| "entropy": 0.35693359375, |
| "epoch": 0.10971428571428571, |
| "grad_norm": 0.46688124537467957, |
| "kl": 0.0020439624786376953, |
| "learning_rate": 1.0437936906629334e-07, |
| "loss": -0.018, |
| "reward": 0.7500000298023224, |
| "reward_std": 0.5337796807289124, |
| "rewards/accuracy_reward": 0.260416679084301, |
| "rewards/format_reward": 0.4895833432674408, |
| "step": 96, |
| "w_high_ratio": 0.09955519065260887, |
| "w_low_ratio": 0.04101596772670746, |
| "w_max": 2.1674250662326813, |
| "w_mean": 1.3317046463489532, |
| "w_min": 0.0, |
| "w_std": 0.2793598398566246 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3480.8334350585938, |
| "cov_mean": -2.861599477910204e-05, |
| "cov_std": 0.2684932127594948, |
| "entropy": 0.400390625, |
| "epoch": 0.11085714285714286, |
| "grad_norm": 0.336283415555954, |
| "kl": 0.0008549690246582031, |
| "learning_rate": 1.0246514708427701e-07, |
| "loss": -0.0136, |
| "reward": 0.5416666716337204, |
| "reward_std": 0.3185878023505211, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.3229166716337204, |
| "step": 97, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.03224400524049997, |
| "w_max": 1.3480738699436188, |
| "w_mean": 1.129571795463562, |
| "w_min": 0.25, |
| "w_std": 0.1670757606625557 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3187.0418090820312, |
| "cov_mean": -3.800686135946307e-05, |
| "cov_std": 0.3548770062625408, |
| "entropy": 0.37158203125, |
| "epoch": 0.112, |
| "grad_norm": 0.4635946452617645, |
| "kl": 0.0006475448608398438, |
| "learning_rate": 1.0109617738307911e-07, |
| "loss": -0.0451, |
| "reward": 0.5312500149011612, |
| "reward_std": 0.2919851318001747, |
| "rewards/accuracy_reward": 0.13541666977107525, |
| "rewards/format_reward": 0.3958333432674408, |
| "step": 98, |
| "w_high_ratio": 0.08355391025543213, |
| "w_low_ratio": 0.03265182231552899, |
| "w_max": 1.7274979948997498, |
| "w_mean": 1.2217411994934082, |
| "w_min": 0.0, |
| "w_std": 0.2073941007256508 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3087.635467529297, |
| "cov_mean": 6.88847137553239e-06, |
| "cov_std": 0.1545543149113655, |
| "entropy": 0.34375, |
| "epoch": 0.11314285714285714, |
| "grad_norm": 0.336643785238266, |
| "kl": 0.001453399658203125, |
| "learning_rate": 1.002741278414069e-07, |
| "loss": 0.0319, |
| "reward": 0.5000000204890966, |
| "reward_std": 0.22891659289598465, |
| "rewards/accuracy_reward": 0.19791667442768812, |
| "rewards/format_reward": 0.3020833348855376, |
| "step": 99, |
| "w_high_ratio": 0.06669171899557114, |
| "w_low_ratio": 0.019805304240435362, |
| "w_max": 1.3878345787525177, |
| "w_mean": 1.1877047568559647, |
| "w_min": 0.25, |
| "w_std": 0.10978816263377666 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 2950.5938110351562, |
| "cov_mean": 4.911199903290253e-05, |
| "cov_std": 0.4131714701652527, |
| "entropy": 0.34521484375, |
| "epoch": 0.11428571428571428, |
| "grad_norm": 0.6010158061981201, |
| "kl": 0.0022754669189453125, |
| "learning_rate": 1e-07, |
| "loss": -0.0995, |
| "reward": 0.8750000223517418, |
| "reward_std": 0.5578841716051102, |
| "rewards/accuracy_reward": 0.3541666865348816, |
| "rewards/format_reward": 0.5208333507180214, |
| "step": 100, |
| "w_high_ratio": 0.19728600606322289, |
| "w_low_ratio": 0.03664776147343218, |
| "w_max": 2.3017463386058807, |
| "w_mean": 1.389022558927536, |
| "w_min": 4.2088412759042857e-39, |
| "w_std": 0.2764138747006655 |
| }, |
| { |
| "epoch": 0.11428571428571428, |
| "step": 100, |
| "total_flos": 0.0, |
| "train_loss": -0.0008640377339906991, |
| "train_runtime": 8354.3803, |
| "train_samples_per_second": 1.149, |
| "train_steps_per_second": 0.012 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 100, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 6, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|