| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.2547121752419766, |
| "eval_steps": 500, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 321.578125, |
| "epoch": 0.0005094243504839531, |
| "grad_norm": 21.497011168465292, |
| "kl": 0.0, |
| "learning_rate": 9.997452878247579e-07, |
| "loss": -0.0, |
| "reward": -0.492842435836792, |
| "reward_std": 0.7784243226051331, |
| "rewards/accuracy_reward": -0.4125000238418579, |
| "rewards/cosine_rewards": -0.08018936403095722, |
| "rewards/format_reward": 0.0, |
| "rewards/repetition_rewards": -0.0001530575300421333, |
| "step": 1 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 211.796875, |
| "epoch": 0.0010188487009679063, |
| "grad_norm": 8.570878529351686, |
| "kl": 0.00115203857421875, |
| "learning_rate": 9.99490575649516e-07, |
| "loss": 0.0, |
| "reward": -0.2021125927567482, |
| "reward_std": 0.686398446559906, |
| "rewards/accuracy_reward": -0.18437501601874828, |
| "rewards/cosine_rewards": -0.01752197090536356, |
| "rewards/format_reward": 0.0, |
| "rewards/repetition_rewards": -0.00021561131143243983, |
| "step": 2 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 242.640625, |
| "epoch": 0.0015282730514518594, |
| "grad_norm": 7.698910727869972, |
| "kl": 0.0014190673828125, |
| "learning_rate": 9.99235863474274e-07, |
| "loss": 0.0001, |
| "reward": -0.6304773092269897, |
| "reward_std": 0.5950716435909271, |
| "rewards/accuracy_reward": -0.6093750298023224, |
| "rewards/cosine_rewards": -0.03664374351501465, |
| "rewards/format_reward": 0.015625, |
| "rewards/repetition_rewards": -8.355615136679262e-05, |
| "step": 3 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 192.765625, |
| "epoch": 0.0020376974019358125, |
| "grad_norm": 8.264023776311538, |
| "kl": 0.00258636474609375, |
| "learning_rate": 9.98981151299032e-07, |
| "loss": 0.0001, |
| "reward": -0.4020528346300125, |
| "reward_std": 0.7227448225021362, |
| "rewards/accuracy_reward": -0.38750001788139343, |
| "rewards/cosine_rewards": -0.014348747674375772, |
| "rewards/format_reward": 0.0, |
| "rewards/repetition_rewards": -0.00020408956333994865, |
| "step": 4 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 199.953125, |
| "epoch": 0.0025471217524197657, |
| "grad_norm": 9.41735274952485, |
| "kl": 0.00286865234375, |
| "learning_rate": 9.9872643912379e-07, |
| "loss": 0.0001, |
| "reward": -0.45950669050216675, |
| "reward_std": 0.6219092607498169, |
| "rewards/accuracy_reward": -0.4343750476837158, |
| "rewards/cosine_rewards": -0.02503613755106926, |
| "rewards/format_reward": 0.0, |
| "rewards/repetition_rewards": -9.553764903103001e-05, |
| "step": 5 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 197.9375, |
| "epoch": 0.003056546102903719, |
| "grad_norm": 12.944765767909546, |
| "kl": 0.008697509765625, |
| "learning_rate": 9.984717269485481e-07, |
| "loss": 0.0003, |
| "reward": -0.42242346704006195, |
| "reward_std": 0.6794147342443466, |
| "rewards/accuracy_reward": -0.40937504172325134, |
| "rewards/cosine_rewards": -0.028209966607391834, |
| "rewards/format_reward": 0.015625, |
| "rewards/repetition_rewards": -0.0004634863289538771, |
| "step": 6 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 131.859375, |
| "epoch": 0.003565970453387672, |
| "grad_norm": 10.259430825273313, |
| "kl": 0.013763427734375, |
| "learning_rate": 9.98217014773306e-07, |
| "loss": 0.0005, |
| "reward": -0.33318234980106354, |
| "reward_std": 0.7437820434570312, |
| "rewards/accuracy_reward": -0.35625000298023224, |
| "rewards/cosine_rewards": -0.00804880098439753, |
| "rewards/format_reward": 0.03125, |
| "rewards/repetition_rewards": -0.00013354701513890177, |
| "step": 7 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 138.0625, |
| "epoch": 0.004075394803871625, |
| "grad_norm": 8.664940595308508, |
| "kl": 0.01800537109375, |
| "learning_rate": 9.979623025980642e-07, |
| "loss": 0.0007, |
| "reward": -0.3353596553206444, |
| "reward_std": 0.7424190640449524, |
| "rewards/accuracy_reward": -0.32500002533197403, |
| "rewards/cosine_rewards": -0.010187382809817791, |
| "rewards/format_reward": 0.0, |
| "rewards/repetition_rewards": -0.00017226976342499256, |
| "step": 8 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 130.609375, |
| "epoch": 0.004584819154355578, |
| "grad_norm": 12.906962146752678, |
| "kl": 0.013641357421875, |
| "learning_rate": 9.977075904228221e-07, |
| "loss": 0.0005, |
| "reward": -0.5576262176036835, |
| "reward_std": 0.38936011493206024, |
| "rewards/accuracy_reward": -0.546875, |
| "rewards/cosine_rewards": -0.010545612312853336, |
| "rewards/format_reward": 0.0, |
| "rewards/repetition_rewards": -0.00020559210679493845, |
| "step": 9 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 118.375, |
| "epoch": 0.005094243504839531, |
| "grad_norm": 12.675664846435772, |
| "kl": 0.014129638671875, |
| "learning_rate": 9.974528782475803e-07, |
| "loss": 0.0006, |
| "reward": -0.5825353264808655, |
| "reward_std": 0.32141495356336236, |
| "rewards/accuracy_reward": -0.5750000178813934, |
| "rewards/cosine_rewards": -0.0075353041756898165, |
| "rewards/format_reward": 0.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 10 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 116.5625, |
| "epoch": 0.0056036678553234845, |
| "grad_norm": 83.14378275688269, |
| "kl": 0.011932373046875, |
| "learning_rate": 9.971981660723382e-07, |
| "loss": 0.0005, |
| "reward": -0.4973638355731964, |
| "reward_std": 0.6479763090610504, |
| "rewards/accuracy_reward": -0.4906250536441803, |
| "rewards/cosine_rewards": -0.006738818949088454, |
| "rewards/format_reward": 0.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 11 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 122.5, |
| "epoch": 0.006113092205807438, |
| "grad_norm": 10.015051037156322, |
| "kl": 0.01776123046875, |
| "learning_rate": 9.969434538970963e-07, |
| "loss": 0.0007, |
| "reward": -0.5842953324317932, |
| "reward_std": 0.3923248201608658, |
| "rewards/accuracy_reward": -0.5750000029802322, |
| "rewards/cosine_rewards": -0.009295305702835321, |
| "rewards/format_reward": 0.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 12 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 113.984375, |
| "epoch": 0.006622516556291391, |
| "grad_norm": 11.394446741932766, |
| "kl": 0.018157958984375, |
| "learning_rate": 9.966887417218542e-07, |
| "loss": 0.0007, |
| "reward": -0.5545713007450104, |
| "reward_std": 0.5603736639022827, |
| "rewards/accuracy_reward": -0.5468750298023224, |
| "rewards/cosine_rewards": -0.007696274435147643, |
| "rewards/format_reward": 0.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 13 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.8125, |
| "epoch": 0.007131940906775344, |
| "grad_norm": 11.774338615514537, |
| "kl": 0.017730712890625, |
| "learning_rate": 9.964340295466124e-07, |
| "loss": 0.0007, |
| "reward": -0.24103393778204918, |
| "reward_std": 0.770084798336029, |
| "rewards/accuracy_reward": -0.23750002682209015, |
| "rewards/cosine_rewards": -0.0035339330206625164, |
| "rewards/format_reward": 0.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 14 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 99.5, |
| "epoch": 0.007641365257259297, |
| "grad_norm": 12.461454822945, |
| "kl": 0.01995849609375, |
| "learning_rate": 9.961793173713703e-07, |
| "loss": 0.0008, |
| "reward": -0.7055607736110687, |
| "reward_std": 0.2303236834704876, |
| "rewards/accuracy_reward": -0.7156250178813934, |
| "rewards/cosine_rewards": -0.005560769001021981, |
| "rewards/format_reward": 0.015625, |
| "rewards/repetition_rewards": 0.0, |
| "step": 15 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 101.53125, |
| "epoch": 0.00815078960774325, |
| "grad_norm": 16.951183982865736, |
| "kl": 0.0206298828125, |
| "learning_rate": 9.959246051961282e-07, |
| "loss": 0.0008, |
| "reward": -0.3540929928421974, |
| "reward_std": 0.7245323657989502, |
| "rewards/accuracy_reward": -0.3500000238418579, |
| "rewards/cosine_rewards": -0.004092983668670058, |
| "rewards/format_reward": 0.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 16 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 96.578125, |
| "epoch": 0.008660213958227204, |
| "grad_norm": 9.458837096460513, |
| "kl": 0.025634765625, |
| "learning_rate": 9.956698930208864e-07, |
| "loss": 0.001, |
| "reward": -0.36599001288414, |
| "reward_std": 0.6569808125495911, |
| "rewards/accuracy_reward": -0.37812502682209015, |
| "rewards/cosine_rewards": -0.003489995375275612, |
| "rewards/format_reward": 0.015625, |
| "rewards/repetition_rewards": 0.0, |
| "step": 17 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 96.625, |
| "epoch": 0.009169638308711156, |
| "grad_norm": 11.937426620152417, |
| "kl": 0.02801513671875, |
| "learning_rate": 9.954151808456443e-07, |
| "loss": 0.0011, |
| "reward": -0.40710097551345825, |
| "reward_std": 0.7412720322608948, |
| "rewards/accuracy_reward": -0.43437501788139343, |
| "rewards/cosine_rewards": -0.003975986503064632, |
| "rewards/format_reward": 0.03125, |
| "rewards/repetition_rewards": 0.0, |
| "step": 18 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 97.21875, |
| "epoch": 0.00967906265919511, |
| "grad_norm": 12.317962197180934, |
| "kl": 0.03466796875, |
| "learning_rate": 9.951604686704024e-07, |
| "loss": 0.0014, |
| "reward": -0.25013431906700134, |
| "reward_std": 0.7123757898807526, |
| "rewards/accuracy_reward": -0.32500001788139343, |
| "rewards/cosine_rewards": -0.003259307239204645, |
| "rewards/format_reward": 0.078125, |
| "rewards/repetition_rewards": 0.0, |
| "step": 19 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 97.109375, |
| "epoch": 0.010188487009679063, |
| "grad_norm": 24.27751022595849, |
| "kl": 0.037109375, |
| "learning_rate": 9.949057564951603e-07, |
| "loss": 0.0015, |
| "reward": -0.2632312625646591, |
| "reward_std": 0.6930468529462814, |
| "rewards/accuracy_reward": -0.4624999985098839, |
| "rewards/cosine_rewards": -0.003856247873045504, |
| "rewards/format_reward": 0.203125, |
| "rewards/repetition_rewards": 0.0, |
| "step": 20 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.03125, |
| "epoch": 0.010697911360163017, |
| "grad_norm": 12.508736780907405, |
| "kl": 0.053955078125, |
| "learning_rate": 9.946510443199185e-07, |
| "loss": 0.0022, |
| "reward": -0.010567170567810535, |
| "reward_std": 0.7874742448329926, |
| "rewards/accuracy_reward": -0.4125000238418579, |
| "rewards/cosine_rewards": -0.004317150334827602, |
| "rewards/format_reward": 0.40625, |
| "rewards/repetition_rewards": 0.0, |
| "step": 21 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 113.390625, |
| "epoch": 0.011207335710646969, |
| "grad_norm": 10.983519481785477, |
| "kl": 0.073974609375, |
| "learning_rate": 9.943963321446764e-07, |
| "loss": 0.003, |
| "reward": 0.5529356598854065, |
| "reward_std": 0.9540310502052307, |
| "rewards/accuracy_reward": -0.2093750163912773, |
| "rewards/cosine_rewards": -0.003314302652142942, |
| "rewards/format_reward": 0.765625, |
| "rewards/repetition_rewards": 0.0, |
| "step": 22 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 113.796875, |
| "epoch": 0.011716760061130923, |
| "grad_norm": 59.13650095831239, |
| "kl": 0.084716796875, |
| "learning_rate": 9.941416199694345e-07, |
| "loss": 0.0034, |
| "reward": 0.49799469113349915, |
| "reward_std": 0.6547213792800903, |
| "rewards/accuracy_reward": -0.43437501788139343, |
| "rewards/cosine_rewards": -0.005130313569679856, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 23 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 114.34375, |
| "epoch": 0.012226184411614875, |
| "grad_norm": 33.85321217925331, |
| "kl": 0.078369140625, |
| "learning_rate": 9.938869077941925e-07, |
| "loss": 0.0031, |
| "reward": 0.5440552532672882, |
| "reward_std": 0.4689805209636688, |
| "rewards/accuracy_reward": -0.43437501788139343, |
| "rewards/cosine_rewards": -0.005944762844592333, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 24 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 99.078125, |
| "epoch": 0.01273560876209883, |
| "grad_norm": 27.614415529764607, |
| "kl": 0.23876953125, |
| "learning_rate": 9.936321956189506e-07, |
| "loss": 0.0096, |
| "reward": 0.319291889667511, |
| "reward_std": 0.2991320895962417, |
| "rewards/accuracy_reward": -0.659375011920929, |
| "rewards/cosine_rewards": -0.005708091426640749, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 25 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 21.09375, |
| "epoch": 0.013245033112582781, |
| "grad_norm": 78.27860464199816, |
| "kl": 0.810546875, |
| "learning_rate": 9.933774834437085e-07, |
| "loss": 0.0324, |
| "reward": 0.758573591709137, |
| "reward_std": 0.8151377141475677, |
| "rewards/accuracy_reward": -0.24062500894069672, |
| "rewards/cosine_rewards": -0.0008013773494894849, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 26 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 18.75, |
| "epoch": 0.013754457463066735, |
| "grad_norm": 15.792726008582374, |
| "kl": 0.876953125, |
| "learning_rate": 9.931227712684667e-07, |
| "loss": 0.0351, |
| "reward": 0.5177058726549149, |
| "reward_std": 0.6054319739341736, |
| "rewards/accuracy_reward": -0.46562501788139343, |
| "rewards/cosine_rewards": -0.000942649960052222, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.000101461038866546, |
| "step": 27 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 15.15625, |
| "epoch": 0.014263881813550688, |
| "grad_norm": 28.538085950991302, |
| "kl": 0.853515625, |
| "learning_rate": 9.928680590932246e-07, |
| "loss": 0.0342, |
| "reward": 0.30591557919979095, |
| "reward_std": 0.3449897766113281, |
| "rewards/accuracy_reward": -0.6625000238418579, |
| "rewards/cosine_rewards": -0.00033439824983361177, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 28 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 13.1875, |
| "epoch": 0.014773306164034642, |
| "grad_norm": 19.672166251005525, |
| "kl": 0.939453125, |
| "learning_rate": 9.926133469179825e-07, |
| "loss": 0.0375, |
| "reward": 0.4091247171163559, |
| "reward_std": 0.46140581369400024, |
| "rewards/accuracy_reward": -0.5750000476837158, |
| "rewards/cosine_rewards": -0.0002502501738490537, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 29 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 16.171875, |
| "epoch": 0.015282730514518594, |
| "grad_norm": 35.96869326683099, |
| "kl": 1.416015625, |
| "learning_rate": 9.923586347427406e-07, |
| "loss": 0.0566, |
| "reward": 0.5554585456848145, |
| "reward_std": 0.7011753022670746, |
| "rewards/accuracy_reward": -0.3812499940395355, |
| "rewards/cosine_rewards": -0.0007914370798971504, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 30 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 16.15625, |
| "epoch": 0.015792154865002548, |
| "grad_norm": 24.745191247130563, |
| "kl": 1.01171875, |
| "learning_rate": 9.921039225674986e-07, |
| "loss": 0.0405, |
| "reward": 0.6306657046079636, |
| "reward_std": 0.7620185613632202, |
| "rewards/accuracy_reward": -0.32187502086162567, |
| "rewards/cosine_rewards": -0.0005842609098181129, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": 0.0, |
| "step": 31 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 39.140625, |
| "epoch": 0.0163015792154865, |
| "grad_norm": 12.709104159306316, |
| "kl": 0.7265625, |
| "learning_rate": 9.918492103922567e-07, |
| "loss": 0.0291, |
| "reward": 0.38606902956962585, |
| "reward_std": 0.8792209327220917, |
| "rewards/accuracy_reward": -0.39375001192092896, |
| "rewards/cosine_rewards": -0.001430943259038031, |
| "rewards/format_reward": 0.78125, |
| "rewards/repetition_rewards": 0.0, |
| "step": 32 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 15.03125, |
| "epoch": 0.016811003565970453, |
| "grad_norm": 17.976804747397264, |
| "kl": 0.904296875, |
| "learning_rate": 9.915944982170146e-07, |
| "loss": 0.0361, |
| "reward": 0.4996982365846634, |
| "reward_std": 0.7649624943733215, |
| "rewards/accuracy_reward": -0.4687500149011612, |
| "rewards/cosine_rewards": -0.00030174180574249476, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 33 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 22.609375, |
| "epoch": 0.017320427916454408, |
| "grad_norm": 39.58286880123024, |
| "kl": 0.8828125, |
| "learning_rate": 9.913397860417728e-07, |
| "loss": 0.0353, |
| "reward": 0.4207390695810318, |
| "reward_std": 0.8459209501743317, |
| "rewards/accuracy_reward": -0.4375000298023224, |
| "rewards/cosine_rewards": -0.0011358977280906402, |
| "rewards/format_reward": 0.859375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 34 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 13.328125, |
| "epoch": 0.01782985226693836, |
| "grad_norm": 19.17123986238676, |
| "kl": 0.955078125, |
| "learning_rate": 9.910850738665307e-07, |
| "loss": 0.0383, |
| "reward": 0.474868506193161, |
| "reward_std": 0.6449769139289856, |
| "rewards/accuracy_reward": -0.49375005066394806, |
| "rewards/cosine_rewards": -0.0001314536166319158, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 35 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 24.328125, |
| "epoch": 0.018339276617422313, |
| "grad_norm": 23.5658227799872, |
| "kl": 0.95703125, |
| "learning_rate": 9.908303616912888e-07, |
| "loss": 0.0382, |
| "reward": 0.4700201153755188, |
| "reward_std": 0.7454200983047485, |
| "rewards/accuracy_reward": -0.41875001788139343, |
| "rewards/cosine_rewards": -0.0018548529915278777, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": 0.0, |
| "step": 36 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 13.921875, |
| "epoch": 0.018848700967906265, |
| "grad_norm": 11.872294898362206, |
| "kl": 1.001953125, |
| "learning_rate": 9.905756495160467e-07, |
| "loss": 0.0401, |
| "reward": 0.5029261708259583, |
| "reward_std": 0.7742039263248444, |
| "rewards/accuracy_reward": -0.4343750327825546, |
| "rewards/cosine_rewards": -0.0001988118929148186, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 37 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 16.40625, |
| "epoch": 0.01935812531839022, |
| "grad_norm": 14.39070050703297, |
| "kl": 0.978515625, |
| "learning_rate": 9.903209373408049e-07, |
| "loss": 0.0391, |
| "reward": 0.4616774320602417, |
| "reward_std": 0.7915183901786804, |
| "rewards/accuracy_reward": -0.4125000238418579, |
| "rewards/cosine_rewards": -0.000822544090624433, |
| "rewards/format_reward": 0.875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 38 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 22.1875, |
| "epoch": 0.019867549668874173, |
| "grad_norm": 9.423514240680602, |
| "kl": 0.9375, |
| "learning_rate": 9.900662251655628e-07, |
| "loss": 0.0376, |
| "reward": 0.5430571883916855, |
| "reward_std": 0.5502887666225433, |
| "rewards/accuracy_reward": -0.4062500447034836, |
| "rewards/cosine_rewards": -0.003600762978749117, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.00021701389050576836, |
| "step": 39 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 12.4375, |
| "epoch": 0.020376974019358125, |
| "grad_norm": 25.806979588800377, |
| "kl": 0.9140625, |
| "learning_rate": 9.89811512990321e-07, |
| "loss": 0.0366, |
| "reward": 0.503069132566452, |
| "reward_std": 0.6060213148593903, |
| "rewards/accuracy_reward": -0.4656250327825546, |
| "rewards/cosine_rewards": -5.582944686466362e-05, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 40 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 12.46875, |
| "epoch": 0.020886398369842078, |
| "grad_norm": 20.104239930601235, |
| "kl": 0.9296875, |
| "learning_rate": 9.895568008150789e-07, |
| "loss": 0.0372, |
| "reward": 0.631201758980751, |
| "reward_std": 0.7148115336894989, |
| "rewards/accuracy_reward": -0.35312502086162567, |
| "rewards/cosine_rewards": -4.822800292458851e-05, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 41 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 12.828125, |
| "epoch": 0.021395822720326033, |
| "grad_norm": 7.720832433302504, |
| "kl": 0.841796875, |
| "learning_rate": 9.89302088639837e-07, |
| "loss": 0.0336, |
| "reward": 0.5936954319477081, |
| "reward_std": 0.4961870163679123, |
| "rewards/accuracy_reward": -0.4062500298023224, |
| "rewards/cosine_rewards": -5.458852319861762e-05, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 42 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 12.984375, |
| "epoch": 0.021905247070809986, |
| "grad_norm": 9.831243087089065, |
| "kl": 0.76953125, |
| "learning_rate": 9.89047376464595e-07, |
| "loss": 0.0308, |
| "reward": 0.6499472558498383, |
| "reward_std": 0.7755721807479858, |
| "rewards/accuracy_reward": -0.3500000238418579, |
| "rewards/cosine_rewards": -5.273178430797998e-05, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 43 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 13.0, |
| "epoch": 0.022414671421293938, |
| "grad_norm": 15.405185965961632, |
| "kl": 0.79296875, |
| "learning_rate": 9.88792664289353e-07, |
| "loss": 0.0318, |
| "reward": 0.8749629557132721, |
| "reward_std": 0.8532125055789948, |
| "rewards/accuracy_reward": -0.1250000149011612, |
| "rewards/cosine_rewards": -3.706023017002735e-05, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 44 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 13.0, |
| "epoch": 0.02292409577177789, |
| "grad_norm": 73.99173140679622, |
| "kl": 0.814453125, |
| "learning_rate": 9.88537952114111e-07, |
| "loss": 0.0326, |
| "reward": 0.8468359708786011, |
| "reward_std": 0.6202812939882278, |
| "rewards/accuracy_reward": -0.15312501415610313, |
| "rewards/cosine_rewards": -3.904559889633674e-05, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 45 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 13.09375, |
| "epoch": 0.023433520122261846, |
| "grad_norm": 57.01467559353994, |
| "kl": 0.802734375, |
| "learning_rate": 9.882832399388691e-07, |
| "loss": 0.0321, |
| "reward": 0.7187013626098633, |
| "reward_std": 0.7317405939102173, |
| "rewards/accuracy_reward": -0.26562502793967724, |
| "rewards/cosine_rewards": -4.8641444664099254e-05, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 46 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 14.8125, |
| "epoch": 0.023942944472745798, |
| "grad_norm": 49.94239079179156, |
| "kl": 0.8125, |
| "learning_rate": 9.88028527763627e-07, |
| "loss": 0.0325, |
| "reward": 0.7904289066791534, |
| "reward_std": 0.6411640644073486, |
| "rewards/accuracy_reward": -0.2093750163912773, |
| "rewards/cosine_rewards": -0.00019609702576417476, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 47 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 13.984375, |
| "epoch": 0.02445236882322975, |
| "grad_norm": 29.421172213478044, |
| "kl": 0.8046875, |
| "learning_rate": 9.877738155883852e-07, |
| "loss": 0.0322, |
| "reward": 0.7342777252197266, |
| "reward_std": 0.3429698422551155, |
| "rewards/accuracy_reward": -0.2656249850988388, |
| "rewards/cosine_rewards": -9.731029422255233e-05, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 48 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 13.0, |
| "epoch": 0.024961793173713703, |
| "grad_norm": 30.98665699286148, |
| "kl": 0.86328125, |
| "learning_rate": 9.87519103413143e-07, |
| "loss": 0.0346, |
| "reward": 1.0437248945236206, |
| "reward_std": 0.6164620369672775, |
| "rewards/accuracy_reward": 0.04374997317790985, |
| "rewards/cosine_rewards": -2.5148013037323835e-05, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 49 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 13.0, |
| "epoch": 0.02547121752419766, |
| "grad_norm": 25.805195350433394, |
| "kl": 0.787109375, |
| "learning_rate": 9.872643912379012e-07, |
| "loss": 0.0315, |
| "reward": 0.6499470472335815, |
| "reward_std": 0.4753982424736023, |
| "rewards/accuracy_reward": -0.3500000238418579, |
| "rewards/cosine_rewards": -5.2943185437470675e-05, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 50 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 13.0, |
| "epoch": 0.02598064187468161, |
| "grad_norm": 60.37753917289171, |
| "kl": 0.865234375, |
| "learning_rate": 9.870096790626592e-07, |
| "loss": 0.0347, |
| "reward": 1.0999788641929626, |
| "reward_std": 0.716822475194931, |
| "rewards/accuracy_reward": 0.09999999590218067, |
| "rewards/cosine_rewards": -2.117727399308933e-05, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 51 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 13.0, |
| "epoch": 0.026490066225165563, |
| "grad_norm": 60.29300770886218, |
| "kl": 0.859375, |
| "learning_rate": 9.867549668874173e-07, |
| "loss": 0.0343, |
| "reward": 1.3249947428703308, |
| "reward_std": 0.6325759440660477, |
| "rewards/accuracy_reward": 0.32499997690320015, |
| "rewards/cosine_rewards": -5.294318725646008e-06, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 52 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 14.859375, |
| "epoch": 0.026999490575649515, |
| "grad_norm": 33.49731491963465, |
| "kl": 0.96484375, |
| "learning_rate": 9.865002547121752e-07, |
| "loss": 0.0386, |
| "reward": 0.6497911810874939, |
| "reward_std": 0.23335448652505875, |
| "rewards/accuracy_reward": -0.3500000163912773, |
| "rewards/cosine_rewards": -0.00020882973694824614, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 53 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 13.0, |
| "epoch": 0.02750891492613347, |
| "grad_norm": 21.106152145400298, |
| "kl": 0.85546875, |
| "learning_rate": 9.862455425369333e-07, |
| "loss": 0.0342, |
| "reward": 1.3812487125396729, |
| "reward_std": 0.26327238231897354, |
| "rewards/accuracy_reward": 0.3812499940395355, |
| "rewards/cosine_rewards": -1.3235799087851774e-06, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 54 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 13.0, |
| "epoch": 0.028018339276617423, |
| "grad_norm": 50.8468456202969, |
| "kl": 0.767578125, |
| "learning_rate": 9.859908303616913e-07, |
| "loss": 0.0307, |
| "reward": 1.493756651878357, |
| "reward_std": 0.3182205259799957, |
| "rewards/accuracy_reward": 0.4937499910593033, |
| "rewards/cosine_rewards": 6.617898179683834e-06, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 55 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 13.0, |
| "epoch": 0.028527763627101375, |
| "grad_norm": 45.22229728431362, |
| "kl": 0.833984375, |
| "learning_rate": 9.857361181864494e-07, |
| "loss": 0.0334, |
| "reward": 0.9030899405479431, |
| "reward_std": 0.2386654019355774, |
| "rewards/accuracy_reward": -0.09687501937150955, |
| "rewards/cosine_rewards": -3.507485962472856e-05, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 56 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 13.0, |
| "epoch": 0.029037187977585328, |
| "grad_norm": 251.48941881136554, |
| "kl": 0.828125, |
| "learning_rate": 9.854814060112073e-07, |
| "loss": 0.0331, |
| "reward": 1.5781376361846924, |
| "reward_std": 0.3039933070540428, |
| "rewards/accuracy_reward": 0.5781249701976776, |
| "rewards/cosine_rewards": 1.2574006632348755e-05, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 57 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 14.453125, |
| "epoch": 0.029546612328069283, |
| "grad_norm": 36.77436472817121, |
| "kl": 0.939453125, |
| "learning_rate": 9.852266938359653e-07, |
| "loss": 0.0376, |
| "reward": 1.334273636341095, |
| "reward_std": 0.34448733925819397, |
| "rewards/accuracy_reward": 0.34999997913837433, |
| "rewards/cosine_rewards": -0.00010142281280423049, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 58 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 14.4375, |
| "epoch": 0.030056036678553236, |
| "grad_norm": 46.54068675299219, |
| "kl": 0.89453125, |
| "learning_rate": 9.849719816607234e-07, |
| "loss": 0.0358, |
| "reward": 0.9967500269412994, |
| "reward_std": 0.4488208740949631, |
| "rewards/accuracy_reward": 0.012499993667006493, |
| "rewards/cosine_rewards": -0.00012501747096393956, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 59 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 13.0, |
| "epoch": 0.030565461029037188, |
| "grad_norm": 63.09725081890949, |
| "kl": 0.837890625, |
| "learning_rate": 9.847172694854813e-07, |
| "loss": 0.0335, |
| "reward": 0.9874708652496338, |
| "reward_std": 0.33707569539546967, |
| "rewards/accuracy_reward": -0.012500002980232239, |
| "rewards/cosine_rewards": -2.9118752991053043e-05, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 60 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 13.0, |
| "epoch": 0.03107488537952114, |
| "grad_norm": 105.45023488529014, |
| "kl": 0.8359375, |
| "learning_rate": 9.844625573102394e-07, |
| "loss": 0.0334, |
| "reward": 1.1281058490276337, |
| "reward_std": 0.3039932996034622, |
| "rewards/accuracy_reward": 0.12812498584389687, |
| "rewards/cosine_rewards": -1.919190435728524e-05, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 61 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 14.640625, |
| "epoch": 0.031584309730005096, |
| "grad_norm": 85.12929156788996, |
| "kl": 0.84375, |
| "learning_rate": 9.842078451349974e-07, |
| "loss": 0.0337, |
| "reward": 0.7748350501060486, |
| "reward_std": 0.5448895841836929, |
| "rewards/accuracy_reward": -0.2093750238418579, |
| "rewards/cosine_rewards": -0.00016493651855853386, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 62 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 19.078125, |
| "epoch": 0.032093734080489045, |
| "grad_norm": 8.426167428667783, |
| "kl": 0.814453125, |
| "learning_rate": 9.839531329597555e-07, |
| "loss": 0.0326, |
| "reward": 0.874523401260376, |
| "reward_std": 0.0010828198865056038, |
| "rewards/accuracy_reward": -0.1250000149011612, |
| "rewards/cosine_rewards": -0.0004766158472193638, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 63 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 19.34375, |
| "epoch": 0.032603158430973, |
| "grad_norm": 86.47647424288853, |
| "kl": 0.810546875, |
| "learning_rate": 9.836984207845134e-07, |
| "loss": 0.0324, |
| "reward": 1.6625866889953613, |
| "reward_std": 0.19662056118249893, |
| "rewards/accuracy_reward": 0.6625000238418579, |
| "rewards/cosine_rewards": 8.658922160975635e-05, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 64 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 17.6875, |
| "epoch": 0.033112582781456956, |
| "grad_norm": 59.22678939682069, |
| "kl": 0.86328125, |
| "learning_rate": 9.834437086092716e-07, |
| "loss": 0.0345, |
| "reward": 0.915763258934021, |
| "reward_std": 0.082692209049128, |
| "rewards/accuracy_reward": -0.06875000894069672, |
| "rewards/cosine_rewards": 0.00013823993504047394, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 65 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 24.671875, |
| "epoch": 0.033622007131940905, |
| "grad_norm": 170.96252285475958, |
| "kl": 0.7734375, |
| "learning_rate": 9.831889964340295e-07, |
| "loss": 0.0309, |
| "reward": 1.2965829372406006, |
| "reward_std": 0.32569222897291183, |
| "rewards/accuracy_reward": 0.2968750074505806, |
| "rewards/cosine_rewards": -0.0002921203849837184, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 66 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 18.890625, |
| "epoch": 0.03413143148242486, |
| "grad_norm": 302.02752287161115, |
| "kl": 0.84765625, |
| "learning_rate": 9.829342842587876e-07, |
| "loss": 0.0339, |
| "reward": 1.2968038320541382, |
| "reward_std": 0.27610647678375244, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/cosine_rewards": -7.123823161236942e-05, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 67 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 22.984375, |
| "epoch": 0.034640855832908816, |
| "grad_norm": 617.6786375620823, |
| "kl": 0.77734375, |
| "learning_rate": 9.826795720835456e-07, |
| "loss": 0.0311, |
| "reward": 1.4656760096549988, |
| "reward_std": 0.2886117473244667, |
| "rewards/accuracy_reward": 0.46562498807907104, |
| "rewards/cosine_rewards": 5.1008202717639506e-05, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 68 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 24.34375, |
| "epoch": 0.035150280183392765, |
| "grad_norm": 29.79960642054238, |
| "kl": 0.728515625, |
| "learning_rate": 9.824248599083037e-07, |
| "loss": 0.0292, |
| "reward": 1.309334635734558, |
| "reward_std": 0.20424916595220566, |
| "rewards/accuracy_reward": 0.32500000298023224, |
| "rewards/cosine_rewards": -4.041045031044632e-05, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 69 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 23.90625, |
| "epoch": 0.03565970453387672, |
| "grad_norm": 91.00871807242451, |
| "kl": 0.744140625, |
| "learning_rate": 9.821701477330616e-07, |
| "loss": 0.0298, |
| "reward": 1.2686043679714203, |
| "reward_std": 0.10558865318307653, |
| "rewards/accuracy_reward": 0.26874998211860657, |
| "rewards/cosine_rewards": -0.00014566810568794608, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 70 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 36.328125, |
| "epoch": 0.03616912888436067, |
| "grad_norm": 159.05100875041754, |
| "kl": 0.765625, |
| "learning_rate": 9.819154355578195e-07, |
| "loss": 0.0306, |
| "reward": 1.2812767028808594, |
| "reward_std": 0.6231541335582733, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/cosine_rewards": 2.6669338694773614e-05, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 71 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 38.4375, |
| "epoch": 0.036678553234844626, |
| "grad_norm": 97.83490373613579, |
| "kl": 0.666015625, |
| "learning_rate": 9.816607233825777e-07, |
| "loss": 0.0266, |
| "reward": 1.647216558456421, |
| "reward_std": 0.32463081181049347, |
| "rewards/accuracy_reward": 0.6625000089406967, |
| "rewards/cosine_rewards": 0.00034145097015425563, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 72 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 40.046875, |
| "epoch": 0.03718797758532858, |
| "grad_norm": 88.5872574021467, |
| "kl": 0.630859375, |
| "learning_rate": 9.814060112073356e-07, |
| "loss": 0.0253, |
| "reward": 1.7599374055862427, |
| "reward_std": 0.3454015702009201, |
| "rewards/accuracy_reward": 0.7750000059604645, |
| "rewards/cosine_rewards": 0.0005623315373668447, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 73 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 60.140625, |
| "epoch": 0.03769740193581253, |
| "grad_norm": 16.49274288281998, |
| "kl": 0.501953125, |
| "learning_rate": 9.811512990320937e-07, |
| "loss": 0.0201, |
| "reward": 1.7728378772735596, |
| "reward_std": 0.2738931328058243, |
| "rewards/accuracy_reward": 0.8031250238418579, |
| "rewards/cosine_rewards": 0.0009629083215259016, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 74 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 100.9375, |
| "epoch": 0.038206826286296486, |
| "grad_norm": 18.283090697254373, |
| "kl": 0.18359375, |
| "learning_rate": 9.808965868568517e-07, |
| "loss": 0.0074, |
| "reward": 1.437682330608368, |
| "reward_std": 0.19817885756492615, |
| "rewards/accuracy_reward": 0.4375000149011612, |
| "rewards/cosine_rewards": 0.00018233060836791992, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 75 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.34375, |
| "epoch": 0.03871625063678044, |
| "grad_norm": 21.656183371701722, |
| "kl": 0.13330078125, |
| "learning_rate": 9.806418746816098e-07, |
| "loss": 0.0054, |
| "reward": 1.2524056434631348, |
| "reward_std": 0.14976192265748978, |
| "rewards/accuracy_reward": 0.26874999701976776, |
| "rewards/cosine_rewards": -0.0007193188357632607, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 76 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.5, |
| "epoch": 0.03922567498726439, |
| "grad_norm": 12.450187143986858, |
| "kl": 0.1328125, |
| "learning_rate": 9.803871625063677e-07, |
| "loss": 0.0053, |
| "reward": 1.535181999206543, |
| "reward_std": 0.04510992762516253, |
| "rewards/accuracy_reward": 0.5499999970197678, |
| "rewards/cosine_rewards": 0.0008070359472185373, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 77 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.96875, |
| "epoch": 0.039735099337748346, |
| "grad_norm": 96.22924405795379, |
| "kl": 0.12744140625, |
| "learning_rate": 9.801324503311258e-07, |
| "loss": 0.0051, |
| "reward": 1.4221445322036743, |
| "reward_std": 0.5387175530195236, |
| "rewards/accuracy_reward": 0.4374999850988388, |
| "rewards/cosine_rewards": 0.0002695363436941989, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 78 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 117.46875, |
| "epoch": 0.040244523688232295, |
| "grad_norm": 22.356744283314942, |
| "kl": 0.12451171875, |
| "learning_rate": 9.798777381558838e-07, |
| "loss": 0.005, |
| "reward": 0.9280500411987305, |
| "reward_std": 0.3032594621181488, |
| "rewards/accuracy_reward": -0.06875001452863216, |
| "rewards/cosine_rewards": -0.003199932281859219, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 79 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.09375, |
| "epoch": 0.04075394803871625, |
| "grad_norm": 11.587196398677966, |
| "kl": 0.12353515625, |
| "learning_rate": 9.79623025980642e-07, |
| "loss": 0.0049, |
| "reward": 1.0698014497756958, |
| "reward_std": 0.306557297706604, |
| "rewards/accuracy_reward": 0.07187498360872269, |
| "rewards/cosine_rewards": -0.0020735373545903713, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 80 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 119.359375, |
| "epoch": 0.041263372389200206, |
| "grad_norm": 15.552459183086345, |
| "kl": 0.115234375, |
| "learning_rate": 9.793683138053998e-07, |
| "loss": 0.0046, |
| "reward": 1.902881920337677, |
| "reward_std": 0.2866080105304718, |
| "rewards/accuracy_reward": 0.9156250059604645, |
| "rewards/cosine_rewards": 0.0028820185689255595, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 81 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 121.28125, |
| "epoch": 0.041772796739684155, |
| "grad_norm": 21.56424441435487, |
| "kl": 0.110107421875, |
| "learning_rate": 9.79113601630158e-07, |
| "loss": 0.0044, |
| "reward": 1.2677271366119385, |
| "reward_std": 0.10610348492627963, |
| "rewards/accuracy_reward": 0.26874999701976776, |
| "rewards/cosine_rewards": -0.0010228125611320138, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 82 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 115.484375, |
| "epoch": 0.04228222109016811, |
| "grad_norm": 11.018514876954287, |
| "kl": 0.125244140625, |
| "learning_rate": 9.788588894549159e-07, |
| "loss": 0.005, |
| "reward": 1.2675296068191528, |
| "reward_std": 0.16161296842619777, |
| "rewards/accuracy_reward": 0.26874999701976776, |
| "rewards/cosine_rewards": -0.0012203185469843447, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 83 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 119.078125, |
| "epoch": 0.04279164544065207, |
| "grad_norm": 20.603407343348504, |
| "kl": 0.1083984375, |
| "learning_rate": 9.78604177279674e-07, |
| "loss": 0.0043, |
| "reward": 1.1548139452934265, |
| "reward_std": 0.537171483039856, |
| "rewards/accuracy_reward": 0.1562499925494194, |
| "rewards/cosine_rewards": -0.0014360386412590742, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 84 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 122.640625, |
| "epoch": 0.043301069791136015, |
| "grad_norm": 60.83530697951784, |
| "kl": 0.18359375, |
| "learning_rate": 9.78349465104432e-07, |
| "loss": 0.0073, |
| "reward": 1.5197246074676514, |
| "reward_std": 0.5150813460350037, |
| "rewards/accuracy_reward": 0.518750011920929, |
| "rewards/cosine_rewards": 0.000974582158960402, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 85 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 121.015625, |
| "epoch": 0.04381049414161997, |
| "grad_norm": 13.721540678774238, |
| "kl": 0.12451171875, |
| "learning_rate": 9.780947529291899e-07, |
| "loss": 0.005, |
| "reward": 1.1832407712936401, |
| "reward_std": 0.18641822785139084, |
| "rewards/accuracy_reward": 0.18437500298023224, |
| "rewards/cosine_rewards": -0.001134182559326291, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 86 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 119.578125, |
| "epoch": 0.04431991849210392, |
| "grad_norm": 215.40977191184217, |
| "kl": 0.115478515625, |
| "learning_rate": 9.77840040753948e-07, |
| "loss": 0.0046, |
| "reward": 1.2237018644809723, |
| "reward_std": 0.23010382801294327, |
| "rewards/accuracy_reward": 0.24062499776482582, |
| "rewards/cosine_rewards": -0.0012981001054868102, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 87 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 127.125, |
| "epoch": 0.044829342842587876, |
| "grad_norm": 11.570945117677702, |
| "kl": 0.110595703125, |
| "learning_rate": 9.77585328578706e-07, |
| "loss": 0.0044, |
| "reward": 1.5510605573654175, |
| "reward_std": 0.0015471973456442356, |
| "rewards/accuracy_reward": 0.550000011920929, |
| "rewards/cosine_rewards": 0.001060541602782905, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 88 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 129.953125, |
| "epoch": 0.04533876719307183, |
| "grad_norm": 9.451343704964001, |
| "kl": 0.104248046875, |
| "learning_rate": 9.77330616403464e-07, |
| "loss": 0.0042, |
| "reward": 1.5073344111442566, |
| "reward_std": 0.35981758683919907, |
| "rewards/accuracy_reward": 0.5218750052154064, |
| "rewards/cosine_rewards": 0.0010844313073903322, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 89 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 129.71875, |
| "epoch": 0.04584819154355578, |
| "grad_norm": 16.032910799390205, |
| "kl": 0.094970703125, |
| "learning_rate": 9.77075904228222e-07, |
| "loss": 0.0038, |
| "reward": 1.919905662536621, |
| "reward_std": 0.24129686888772994, |
| "rewards/accuracy_reward": 0.9156250059604645, |
| "rewards/cosine_rewards": 0.0042806623969227076, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 90 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 130.390625, |
| "epoch": 0.046357615894039736, |
| "grad_norm": 21.086614900693085, |
| "kl": 0.101806640625, |
| "learning_rate": 9.768211920529801e-07, |
| "loss": 0.0041, |
| "reward": 1.5919697284698486, |
| "reward_std": 0.2428576573729515, |
| "rewards/accuracy_reward": 0.6062500029802322, |
| "rewards/cosine_rewards": 0.0013447333476506174, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 91 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 142.734375, |
| "epoch": 0.04686704024452369, |
| "grad_norm": 8.093804874468816, |
| "kl": 0.095458984375, |
| "learning_rate": 9.76566479877738e-07, |
| "loss": 0.0038, |
| "reward": 1.6935226917266846, |
| "reward_std": 0.1869470328092575, |
| "rewards/accuracy_reward": 0.690625011920929, |
| "rewards/cosine_rewards": 0.0028977063193451613, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 92 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 143.203125, |
| "epoch": 0.04737646459500764, |
| "grad_norm": 10.624707519768712, |
| "kl": 0.099609375, |
| "learning_rate": 9.763117677024962e-07, |
| "loss": 0.004, |
| "reward": 1.4030739068984985, |
| "reward_std": 0.3680836334824562, |
| "rewards/accuracy_reward": 0.43437500298023224, |
| "rewards/cosine_rewards": -5.1158247515559196e-05, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 93 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 150.90625, |
| "epoch": 0.047885888945491596, |
| "grad_norm": 25.156466743552734, |
| "kl": 0.101318359375, |
| "learning_rate": 9.760570555272541e-07, |
| "loss": 0.0041, |
| "reward": 1.5800000429153442, |
| "reward_std": 0.5084549486637115, |
| "rewards/accuracy_reward": 0.578125, |
| "rewards/cosine_rewards": 0.0018750545859802514, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 94 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 148.46875, |
| "epoch": 0.048395313295975545, |
| "grad_norm": 12.684719084813777, |
| "kl": 0.10205078125, |
| "learning_rate": 9.758023433520122e-07, |
| "loss": 0.0041, |
| "reward": 1.5234779119491577, |
| "reward_std": 0.18682076036930084, |
| "rewards/accuracy_reward": 0.5218749940395355, |
| "rewards/cosine_rewards": 0.0016028713434934616, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 95 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 161.109375, |
| "epoch": 0.0489047376464595, |
| "grad_norm": 13.854458043447748, |
| "kl": 0.108154296875, |
| "learning_rate": 9.755476311767702e-07, |
| "loss": 0.0043, |
| "reward": 1.6655999422073364, |
| "reward_std": 0.4286635220050812, |
| "rewards/accuracy_reward": 0.6624999791383743, |
| "rewards/cosine_rewards": 0.0030998505535535514, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 96 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 167.609375, |
| "epoch": 0.049414161996943456, |
| "grad_norm": 15.632405914543359, |
| "kl": 0.098388671875, |
| "learning_rate": 9.752929190015283e-07, |
| "loss": 0.0039, |
| "reward": 1.1535860896110535, |
| "reward_std": 0.36188751459121704, |
| "rewards/accuracy_reward": 0.1562499888241291, |
| "rewards/cosine_rewards": -0.002663849270902574, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 97 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 168.453125, |
| "epoch": 0.049923586347427405, |
| "grad_norm": 8.747829969093605, |
| "kl": 0.112060546875, |
| "learning_rate": 9.750382068262862e-07, |
| "loss": 0.0045, |
| "reward": 1.3531205654144287, |
| "reward_std": 0.18947682529687881, |
| "rewards/accuracy_reward": 0.3531249985098839, |
| "rewards/cosine_rewards": -4.528439603745937e-06, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 98 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 168.71875, |
| "epoch": 0.05043301069791136, |
| "grad_norm": 12.880266278774922, |
| "kl": 0.112060546875, |
| "learning_rate": 9.747834946510442e-07, |
| "loss": 0.0045, |
| "reward": 1.619386613368988, |
| "reward_std": 0.5798123776912689, |
| "rewards/accuracy_reward": 0.6624999940395355, |
| "rewards/cosine_rewards": 0.0037616335321217775, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": 0.0, |
| "step": 99 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 173.859375, |
| "epoch": 0.05094243504839532, |
| "grad_norm": 18.637994264229288, |
| "kl": 0.109619140625, |
| "learning_rate": 9.745287824758023e-07, |
| "loss": 0.0044, |
| "reward": 1.448248565196991, |
| "reward_std": 0.4085986465215683, |
| "rewards/accuracy_reward": 0.4624999687075615, |
| "rewards/cosine_rewards": 0.0013735336251556873, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 100 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 175.375, |
| "epoch": 0.051451859398879266, |
| "grad_norm": 26.647397602965935, |
| "kl": 0.110107421875, |
| "learning_rate": 9.742740703005602e-07, |
| "loss": 0.0044, |
| "reward": 1.0668614506721497, |
| "reward_std": 0.35026729106903076, |
| "rewards/accuracy_reward": 0.07187499292194843, |
| "rewards/cosine_rewards": -0.004894306650385261, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -0.00011927480954909697, |
| "step": 101 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 174.0, |
| "epoch": 0.05196128374936322, |
| "grad_norm": 12.612763516820905, |
| "kl": 0.112060546875, |
| "learning_rate": 9.740193581253183e-07, |
| "loss": 0.0045, |
| "reward": 1.4354371428489685, |
| "reward_std": 0.20804932340979576, |
| "rewards/accuracy_reward": 0.46562499552965164, |
| "rewards/cosine_rewards": 0.0010621265973895788, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 102 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 176.984375, |
| "epoch": 0.05247070809984717, |
| "grad_norm": 19.00540852037756, |
| "kl": 0.116455078125, |
| "learning_rate": 9.737646459500763e-07, |
| "loss": 0.0047, |
| "reward": 1.0997494161128998, |
| "reward_std": 0.5674505531787872, |
| "rewards/accuracy_reward": 0.1499999761581421, |
| "rewards/cosine_rewards": -0.003250634763389826, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0001250000059371814, |
| "step": 103 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 187.3125, |
| "epoch": 0.052980132450331126, |
| "grad_norm": 8.993773014446798, |
| "kl": 0.115478515625, |
| "learning_rate": 9.735099337748344e-07, |
| "loss": 0.0046, |
| "reward": 1.547185480594635, |
| "reward_std": 0.5772347450256348, |
| "rewards/accuracy_reward": 0.5750000029802322, |
| "rewards/cosine_rewards": 0.0034354651579633355, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 104 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 182.09375, |
| "epoch": 0.05348955680081508, |
| "grad_norm": 21.83379704072219, |
| "kl": 0.11279296875, |
| "learning_rate": 9.732552215995923e-07, |
| "loss": 0.0045, |
| "reward": 0.9665651321411133, |
| "reward_std": 0.19240357726812363, |
| "rewards/accuracy_reward": -0.012500010430812836, |
| "rewards/cosine_rewards": -0.005309856729581952, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 105 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 184.296875, |
| "epoch": 0.05399898115129903, |
| "grad_norm": 15.74936299058057, |
| "kl": 0.1240234375, |
| "learning_rate": 9.730005094243505e-07, |
| "loss": 0.005, |
| "reward": 0.8526512682437897, |
| "reward_std": 0.45641621947288513, |
| "rewards/accuracy_reward": -0.1250000149011612, |
| "rewards/cosine_rewards": -0.006723731989040971, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 106 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 178.203125, |
| "epoch": 0.054508405501782986, |
| "grad_norm": 7.621263779056561, |
| "kl": 0.116455078125, |
| "learning_rate": 9.727457972491084e-07, |
| "loss": 0.0047, |
| "reward": 1.4213617444038391, |
| "reward_std": 0.42073580622673035, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/cosine_rewards": -0.0005132523947395384, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 107 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 180.234375, |
| "epoch": 0.05501782985226694, |
| "grad_norm": 14.098307573524853, |
| "kl": 0.119140625, |
| "learning_rate": 9.724910850738665e-07, |
| "loss": 0.0048, |
| "reward": 1.1232723593711853, |
| "reward_std": 0.45567604154348373, |
| "rewards/accuracy_reward": 0.12812499329447746, |
| "rewards/cosine_rewards": -0.004852580255828798, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 108 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 183.328125, |
| "epoch": 0.05552725420275089, |
| "grad_norm": 10.937546247684887, |
| "kl": 0.18994140625, |
| "learning_rate": 9.722363728986245e-07, |
| "loss": 0.0076, |
| "reward": 1.8381596803665161, |
| "reward_std": 0.28626738488674164, |
| "rewards/accuracy_reward": 0.831250011920929, |
| "rewards/cosine_rewards": 0.006909639807417989, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 109 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 174.375, |
| "epoch": 0.056036678553234846, |
| "grad_norm": 16.445027328915916, |
| "kl": 0.11181640625, |
| "learning_rate": 9.719816607233826e-07, |
| "loss": 0.0045, |
| "reward": 1.2096136808395386, |
| "reward_std": 0.36066293716430664, |
| "rewards/accuracy_reward": 0.21249999105930328, |
| "rewards/cosine_rewards": -0.002886334084905684, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 110 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 168.765625, |
| "epoch": 0.056546102903718795, |
| "grad_norm": 18.874204475299454, |
| "kl": 0.106689453125, |
| "learning_rate": 9.717269485481405e-07, |
| "loss": 0.0043, |
| "reward": 1.3519207835197449, |
| "reward_std": 0.08376272046007216, |
| "rewards/accuracy_reward": 0.3531250059604645, |
| "rewards/cosine_rewards": -0.0012042350135743618, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 111 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 157.796875, |
| "epoch": 0.05705552725420275, |
| "grad_norm": 9.104959106458736, |
| "kl": 0.121337890625, |
| "learning_rate": 9.714722363728986e-07, |
| "loss": 0.0049, |
| "reward": 1.381228744983673, |
| "reward_std": 0.16323383897542953, |
| "rewards/accuracy_reward": 0.3812500238418579, |
| "rewards/cosine_rewards": -2.1282234229147434e-05, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 112 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 162.171875, |
| "epoch": 0.05756495160468671, |
| "grad_norm": 13.28611466805594, |
| "kl": 0.10888671875, |
| "learning_rate": 9.712175241976566e-07, |
| "loss": 0.0044, |
| "reward": 1.3366525173187256, |
| "reward_std": 0.28694501193240285, |
| "rewards/accuracy_reward": 0.3531249985098839, |
| "rewards/cosine_rewards": -0.0008475282229483128, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 113 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 159.4375, |
| "epoch": 0.058074375955170655, |
| "grad_norm": 20.0350592953945, |
| "kl": 0.107666015625, |
| "learning_rate": 9.709628120224145e-07, |
| "loss": 0.0043, |
| "reward": 1.4107850790023804, |
| "reward_std": 0.18804995715618134, |
| "rewards/accuracy_reward": 0.40937501937150955, |
| "rewards/cosine_rewards": 0.001410042867064476, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 114 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 159.6875, |
| "epoch": 0.05858380030565461, |
| "grad_norm": 9.526099983425397, |
| "kl": 0.10595703125, |
| "learning_rate": 9.707080998471726e-07, |
| "loss": 0.0042, |
| "reward": 1.4221826791763306, |
| "reward_std": 0.2918977811932564, |
| "rewards/accuracy_reward": 0.4374999925494194, |
| "rewards/cosine_rewards": 0.00030758429784327745, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 115 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 157.984375, |
| "epoch": 0.05909322465613857, |
| "grad_norm": 14.046929728525855, |
| "kl": 0.11572265625, |
| "learning_rate": 9.704533876719306e-07, |
| "loss": 0.0046, |
| "reward": 1.2389479279518127, |
| "reward_std": 0.4534989148378372, |
| "rewards/accuracy_reward": 0.24062498658895493, |
| "rewards/cosine_rewards": -0.00167706364300102, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 116 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 167.390625, |
| "epoch": 0.059602649006622516, |
| "grad_norm": 12.733865198375515, |
| "kl": 0.105224609375, |
| "learning_rate": 9.701986754966887e-07, |
| "loss": 0.0042, |
| "reward": 1.0669120252132416, |
| "reward_std": 0.319850392639637, |
| "rewards/accuracy_reward": 0.07187498360872269, |
| "rewards/cosine_rewards": -0.004963014740496874, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 117 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 156.140625, |
| "epoch": 0.06011207335710647, |
| "grad_norm": 11.670513012812588, |
| "kl": 0.093505859375, |
| "learning_rate": 9.699439633214466e-07, |
| "loss": 0.0038, |
| "reward": 1.665140986442566, |
| "reward_std": 0.12403370253741741, |
| "rewards/accuracy_reward": 0.6624999940395355, |
| "rewards/cosine_rewards": 0.0026409668498672545, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 118 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 155.828125, |
| "epoch": 0.06062149770759042, |
| "grad_norm": 28.993909689663674, |
| "kl": 0.1015625, |
| "learning_rate": 9.696892511462047e-07, |
| "loss": 0.0041, |
| "reward": 1.0673952102661133, |
| "reward_std": 0.30907338857650757, |
| "rewards/accuracy_reward": 0.07187499105930328, |
| "rewards/cosine_rewards": -0.004479756113141775, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 119 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 154.40625, |
| "epoch": 0.061130922058074376, |
| "grad_norm": 27.685530130702478, |
| "kl": 0.104736328125, |
| "learning_rate": 9.694345389709627e-07, |
| "loss": 0.0042, |
| "reward": 1.4373126029968262, |
| "reward_std": 0.21315501490607858, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/cosine_rewards": -0.0001873411238193512, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 120 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 152.0625, |
| "epoch": 0.06164034640855833, |
| "grad_norm": 12.6569610895899, |
| "kl": 0.121826171875, |
| "learning_rate": 9.691798267957208e-07, |
| "loss": 0.0049, |
| "reward": 1.3240773677825928, |
| "reward_std": 0.26775629818439484, |
| "rewards/accuracy_reward": 0.32499999552965164, |
| "rewards/cosine_rewards": -0.0009226472466252744, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 121 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 151.78125, |
| "epoch": 0.06214977075904228, |
| "grad_norm": 11.557058687556623, |
| "kl": 0.103759765625, |
| "learning_rate": 9.689251146204787e-07, |
| "loss": 0.0041, |
| "reward": 1.7503631114959717, |
| "reward_std": 0.08287379238754511, |
| "rewards/accuracy_reward": 0.7468750178813934, |
| "rewards/cosine_rewards": 0.0034881452447734773, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 122 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 158.578125, |
| "epoch": 0.06265919510952624, |
| "grad_norm": 19.676894670897823, |
| "kl": 0.1025390625, |
| "learning_rate": 9.686704024452369e-07, |
| "loss": 0.0041, |
| "reward": 1.3520426154136658, |
| "reward_std": 0.24371477961540222, |
| "rewards/accuracy_reward": 0.3531249761581421, |
| "rewards/cosine_rewards": -0.0010823981137946248, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 123 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 167.484375, |
| "epoch": 0.06316861946001019, |
| "grad_norm": 7.7934253879172894, |
| "kl": 0.10107421875, |
| "learning_rate": 9.684156902699948e-07, |
| "loss": 0.004, |
| "reward": 1.4387494623661041, |
| "reward_std": 0.26880691200494766, |
| "rewards/accuracy_reward": 0.4374999888241291, |
| "rewards/cosine_rewards": 0.0015281732194125652, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -0.000278731546131894, |
| "step": 124 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 160.671875, |
| "epoch": 0.06367804381049415, |
| "grad_norm": 14.376740722468174, |
| "kl": 0.1064453125, |
| "learning_rate": 9.68160978094753e-07, |
| "loss": 0.0043, |
| "reward": 1.2107464671134949, |
| "reward_std": 0.20096861571073532, |
| "rewards/accuracy_reward": 0.21249999292194843, |
| "rewards/cosine_rewards": -0.0017535560764372349, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 125 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 158.25, |
| "epoch": 0.06418746816097809, |
| "grad_norm": 24.20355073697258, |
| "kl": 0.10693359375, |
| "learning_rate": 9.679062659195109e-07, |
| "loss": 0.0043, |
| "reward": 1.09614896774292, |
| "reward_std": 0.4781967103481293, |
| "rewards/accuracy_reward": 0.09999998658895493, |
| "rewards/cosine_rewards": -0.0038510175654664636, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 126 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 165.203125, |
| "epoch": 0.06469689251146205, |
| "grad_norm": 8.56510891629326, |
| "kl": 0.111083984375, |
| "learning_rate": 9.676515537442688e-07, |
| "loss": 0.0044, |
| "reward": 1.5521512031555176, |
| "reward_std": 0.46633191406726837, |
| "rewards/accuracy_reward": 0.550000011920929, |
| "rewards/cosine_rewards": 0.002151212247554213, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 127 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 164.421875, |
| "epoch": 0.065206316861946, |
| "grad_norm": 13.379306876395301, |
| "kl": 0.120849609375, |
| "learning_rate": 9.67396841569027e-07, |
| "loss": 0.0048, |
| "reward": 1.722820222377777, |
| "reward_std": 0.32213538885116577, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/cosine_rewards": 0.004070190014317632, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 128 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 173.703125, |
| "epoch": 0.06571574121242996, |
| "grad_norm": 153.40445361752842, |
| "kl": 0.1162109375, |
| "learning_rate": 9.67142129393785e-07, |
| "loss": 0.0047, |
| "reward": 1.2643532752990723, |
| "reward_std": 0.23417328391224146, |
| "rewards/accuracy_reward": 0.2656250037252903, |
| "rewards/cosine_rewards": -0.0012717264471575618, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 129 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 168.5625, |
| "epoch": 0.06622516556291391, |
| "grad_norm": 9.275290295194138, |
| "kl": 0.10302734375, |
| "learning_rate": 9.66887417218543e-07, |
| "loss": 0.0041, |
| "reward": 1.0960015654563904, |
| "reward_std": 0.21426187455654144, |
| "rewards/accuracy_reward": 0.09999999403953552, |
| "rewards/cosine_rewards": -0.003998432832304388, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 130 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 176.265625, |
| "epoch": 0.06673458991339785, |
| "grad_norm": 19.81840956306445, |
| "kl": 0.105224609375, |
| "learning_rate": 9.66632705043301e-07, |
| "loss": 0.0042, |
| "reward": 1.6365814805030823, |
| "reward_std": 0.20723329484462738, |
| "rewards/accuracy_reward": 0.6343750059604645, |
| "rewards/cosine_rewards": 0.0022064344957470894, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 131 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 174.234375, |
| "epoch": 0.06724401426388181, |
| "grad_norm": 8.864924483945437, |
| "kl": 0.108642578125, |
| "learning_rate": 9.66377992868059e-07, |
| "loss": 0.0044, |
| "reward": 1.3077268600463867, |
| "reward_std": 0.3278057724237442, |
| "rewards/accuracy_reward": 0.32500000670552254, |
| "rewards/cosine_rewards": -0.0012397709069773555, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.00040839536814019084, |
| "step": 132 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 175.6875, |
| "epoch": 0.06775343861436577, |
| "grad_norm": 14.31694516925494, |
| "kl": 0.112060546875, |
| "learning_rate": 9.661232806928172e-07, |
| "loss": 0.0045, |
| "reward": 1.3927981853485107, |
| "reward_std": 0.3101032227277756, |
| "rewards/accuracy_reward": 0.40937499701976776, |
| "rewards/cosine_rewards": -0.0007292817026609555, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.00022258506942307577, |
| "step": 133 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 174.515625, |
| "epoch": 0.06826286296484972, |
| "grad_norm": 13.206286933876525, |
| "kl": 0.110107421875, |
| "learning_rate": 9.65868568517575e-07, |
| "loss": 0.0044, |
| "reward": 1.4952284097671509, |
| "reward_std": 0.16513758851215243, |
| "rewards/accuracy_reward": 0.4937499910593033, |
| "rewards/cosine_rewards": 0.0014784452505409718, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 134 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 188.15625, |
| "epoch": 0.06877228731533368, |
| "grad_norm": 8.196723017225267, |
| "kl": 0.110107421875, |
| "learning_rate": 9.656138563423332e-07, |
| "loss": 0.0044, |
| "reward": 1.3523318767547607, |
| "reward_std": 0.19119784235954285, |
| "rewards/accuracy_reward": 0.3531249985098839, |
| "rewards/cosine_rewards": -0.0007931197178550065, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 135 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 184.375, |
| "epoch": 0.06928171166581763, |
| "grad_norm": 7.96588213615428, |
| "kl": 0.10400390625, |
| "learning_rate": 9.653591441670911e-07, |
| "loss": 0.0042, |
| "reward": 1.3809208273887634, |
| "reward_std": 0.16492938250303268, |
| "rewards/accuracy_reward": 0.3812500238418579, |
| "rewards/cosine_rewards": -0.00032906350679695606, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 136 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 179.78125, |
| "epoch": 0.06979113601630157, |
| "grad_norm": 15.45343619628501, |
| "kl": 0.1142578125, |
| "learning_rate": 9.651044319918493e-07, |
| "loss": 0.0046, |
| "reward": 1.5364066362380981, |
| "reward_std": 0.3309681713581085, |
| "rewards/accuracy_reward": 0.550000011920929, |
| "rewards/cosine_rewards": 0.002031611278653145, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 137 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 191.484375, |
| "epoch": 0.07030056036678553, |
| "grad_norm": 10.658123621710919, |
| "kl": 0.1142578125, |
| "learning_rate": 9.648497198166072e-07, |
| "loss": 0.0046, |
| "reward": 1.5228744149208069, |
| "reward_std": 0.08533496968448162, |
| "rewards/accuracy_reward": 0.5218750089406967, |
| "rewards/cosine_rewards": 0.0011002181563526392, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -0.00010080645006382838, |
| "step": 138 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 205.734375, |
| "epoch": 0.07080998471726949, |
| "grad_norm": 13.847520067525743, |
| "kl": 0.118408203125, |
| "learning_rate": 9.645950076413653e-07, |
| "loss": 0.0047, |
| "reward": 0.6919489502906799, |
| "reward_std": 0.29361478984355927, |
| "rewards/accuracy_reward": -0.29375001788139343, |
| "rewards/cosine_rewards": -0.014301038347184658, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 139 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 187.6875, |
| "epoch": 0.07131940906775344, |
| "grad_norm": 32.08146525993247, |
| "kl": 0.115234375, |
| "learning_rate": 9.643402954661233e-07, |
| "loss": 0.0046, |
| "reward": 1.3814507126808167, |
| "reward_std": 0.10938079445622861, |
| "rewards/accuracy_reward": 0.3812499940395355, |
| "rewards/cosine_rewards": 0.00031310925260186195, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -0.00011241007450735196, |
| "step": 140 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 201.890625, |
| "epoch": 0.0718288334182374, |
| "grad_norm": 13.580365765026054, |
| "kl": 0.12158203125, |
| "learning_rate": 9.640855832908814e-07, |
| "loss": 0.0049, |
| "reward": 1.2904618978500366, |
| "reward_std": 0.09482555650174618, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/cosine_rewards": -0.006186658749356866, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -0.00022644927958026528, |
| "step": 141 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 196.5, |
| "epoch": 0.07233825776872134, |
| "grad_norm": 24.285612348709225, |
| "kl": 0.113525390625, |
| "learning_rate": 9.638308711156393e-07, |
| "loss": 0.0045, |
| "reward": 1.4387189745903015, |
| "reward_std": 0.30863603949546814, |
| "rewards/accuracy_reward": 0.4374999888241291, |
| "rewards/cosine_rewards": 0.0012189627159386873, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 142 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 197.5, |
| "epoch": 0.0728476821192053, |
| "grad_norm": 22.094925488371874, |
| "kl": 0.11669921875, |
| "learning_rate": 9.635761589403972e-07, |
| "loss": 0.0047, |
| "reward": 1.495344638824463, |
| "reward_std": 0.46879828721284866, |
| "rewards/accuracy_reward": 0.4937499761581421, |
| "rewards/cosine_rewards": 0.0015945886261761189, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 143 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 195.125, |
| "epoch": 0.07335710646968925, |
| "grad_norm": 7.954348731599845, |
| "kl": 0.1259765625, |
| "learning_rate": 9.633214467651554e-07, |
| "loss": 0.005, |
| "reward": 1.5794875025749207, |
| "reward_std": 0.2703954949975014, |
| "rewards/accuracy_reward": 0.6062500178813934, |
| "rewards/cosine_rewards": 0.0044874417362734675, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 144 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 203.234375, |
| "epoch": 0.0738665308201732, |
| "grad_norm": 10.240352920236468, |
| "kl": 0.1240234375, |
| "learning_rate": 9.630667345899133e-07, |
| "loss": 0.005, |
| "reward": 1.323024868965149, |
| "reward_std": 0.3642221838235855, |
| "rewards/accuracy_reward": 0.32499999552965164, |
| "rewards/cosine_rewards": -0.0019751336076296866, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 145 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 210.96875, |
| "epoch": 0.07437595517065716, |
| "grad_norm": 7.757169620409243, |
| "kl": 0.1318359375, |
| "learning_rate": 9.628120224146714e-07, |
| "loss": 0.0053, |
| "reward": 1.4797114729881287, |
| "reward_std": 0.40076301991939545, |
| "rewards/accuracy_reward": 0.4937500078231096, |
| "rewards/cosine_rewards": 0.001799287972971797, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.00021284000831656158, |
| "step": 146 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 203.015625, |
| "epoch": 0.0748853795211411, |
| "grad_norm": 9.791138031684504, |
| "kl": 0.1123046875, |
| "learning_rate": 9.625573102394294e-07, |
| "loss": 0.0045, |
| "reward": 1.5510019659996033, |
| "reward_std": 0.27960680425167084, |
| "rewards/accuracy_reward": 0.5781249850988388, |
| "rewards/cosine_rewards": 0.0043075907160528, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0001806358341127634, |
| "step": 147 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 211.6875, |
| "epoch": 0.07539480387162506, |
| "grad_norm": 7.852116556950569, |
| "kl": 0.12060546875, |
| "learning_rate": 9.623025980641875e-07, |
| "loss": 0.0048, |
| "reward": 1.2943141460418701, |
| "reward_std": 0.43064263463020325, |
| "rewards/accuracy_reward": 0.2968749701976776, |
| "rewards/cosine_rewards": -0.0023121244739741087, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -0.0002487746678525582, |
| "step": 148 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 220.8125, |
| "epoch": 0.07590422822210902, |
| "grad_norm": 19.19937094751421, |
| "kl": 0.1240234375, |
| "learning_rate": 9.620478858889454e-07, |
| "loss": 0.005, |
| "reward": 1.8400413990020752, |
| "reward_std": 0.39075249433517456, |
| "rewards/accuracy_reward": 0.8593749701976776, |
| "rewards/cosine_rewards": 0.011916308663785458, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 149 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 219.890625, |
| "epoch": 0.07641365257259297, |
| "grad_norm": 17.532821778348946, |
| "kl": 0.1376953125, |
| "learning_rate": 9.617931737137036e-07, |
| "loss": 0.0055, |
| "reward": 1.5498095750808716, |
| "reward_std": 0.29365313798189163, |
| "rewards/accuracy_reward": 0.5781250298023224, |
| "rewards/cosine_rewards": 0.0030243303044699132, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -8.979885024018586e-05, |
| "step": 150 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 215.3125, |
| "epoch": 0.07692307692307693, |
| "grad_norm": 8.653969191960044, |
| "kl": 0.120361328125, |
| "learning_rate": 9.615384615384615e-07, |
| "loss": 0.0048, |
| "reward": 1.2203205227851868, |
| "reward_std": 0.5703159868717194, |
| "rewards/accuracy_reward": 0.24062499403953552, |
| "rewards/cosine_rewards": -0.004613903176505119, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -6.565126386703923e-05, |
| "step": 151 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 214.484375, |
| "epoch": 0.07743250127356088, |
| "grad_norm": 56.921468540439704, |
| "kl": 0.119873046875, |
| "learning_rate": 9.612837493632196e-07, |
| "loss": 0.0048, |
| "reward": 1.2310086488723755, |
| "reward_std": 0.41925153136253357, |
| "rewards/accuracy_reward": 0.2656249925494194, |
| "rewards/cosine_rewards": -0.00321156473364681, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.00015470296784769744, |
| "step": 152 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 219.0, |
| "epoch": 0.07794192562404482, |
| "grad_norm": 9.400079372239615, |
| "kl": 0.107666015625, |
| "learning_rate": 9.610290371879775e-07, |
| "loss": 0.0043, |
| "reward": 1.6124141216278076, |
| "reward_std": 0.487982913851738, |
| "rewards/accuracy_reward": 0.606249988079071, |
| "rewards/cosine_rewards": 0.006320342654362321, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -0.00015624999650754035, |
| "step": 153 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 214.90625, |
| "epoch": 0.07845134997452878, |
| "grad_norm": 16.320123460893743, |
| "kl": 0.125732421875, |
| "learning_rate": 9.607743250127357e-07, |
| "loss": 0.005, |
| "reward": 1.5100122094154358, |
| "reward_std": 0.4279818534851074, |
| "rewards/accuracy_reward": 0.5218749791383743, |
| "rewards/cosine_rewards": 0.0037621970986947417, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 154 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 226.125, |
| "epoch": 0.07896077432501274, |
| "grad_norm": 12.345482711993489, |
| "kl": 0.163330078125, |
| "learning_rate": 9.605196128374936e-07, |
| "loss": 0.0065, |
| "reward": 0.9318991005420685, |
| "reward_std": 0.23740804940462112, |
| "rewards/accuracy_reward": -0.04062497615814209, |
| "rewards/cosine_rewards": -0.011850890005007386, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 155 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 223.671875, |
| "epoch": 0.07947019867549669, |
| "grad_norm": 6.991107442023172, |
| "kl": 0.1240234375, |
| "learning_rate": 9.602649006622515e-07, |
| "loss": 0.005, |
| "reward": 0.9476701319217682, |
| "reward_std": 0.33865927904844284, |
| "rewards/accuracy_reward": -0.04062502086162567, |
| "rewards/cosine_rewards": -0.011704806645866483, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 156 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 239.46875, |
| "epoch": 0.07997962302598065, |
| "grad_norm": 11.299778697394407, |
| "kl": 0.117919921875, |
| "learning_rate": 9.600101884870097e-07, |
| "loss": 0.0047, |
| "reward": 1.3649136424064636, |
| "reward_std": 0.42557042837142944, |
| "rewards/accuracy_reward": 0.3812499940395355, |
| "rewards/cosine_rewards": -0.0007113651372492313, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 157 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 231.015625, |
| "epoch": 0.08048904737646459, |
| "grad_norm": 16.322542283478924, |
| "kl": 0.12255859375, |
| "learning_rate": 9.597554763117676e-07, |
| "loss": 0.0049, |
| "reward": 1.3941306471824646, |
| "reward_std": 0.4155275672674179, |
| "rewards/accuracy_reward": 0.40937498211860657, |
| "rewards/cosine_rewards": 0.000547687232028693, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.00016711230273358524, |
| "step": 158 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 213.3125, |
| "epoch": 0.08099847172694855, |
| "grad_norm": 6.393101239111367, |
| "kl": 0.11865234375, |
| "learning_rate": 9.595007641365257e-07, |
| "loss": 0.0047, |
| "reward": 1.2930153012275696, |
| "reward_std": 0.2976529533043504, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/cosine_rewards": -0.0038597104139626026, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 159 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 215.125, |
| "epoch": 0.0815078960774325, |
| "grad_norm": 15.2053157125375, |
| "kl": 0.119140625, |
| "learning_rate": 9.592460519612836e-07, |
| "loss": 0.0048, |
| "reward": 1.2102863192558289, |
| "reward_std": 0.43368688225746155, |
| "rewards/accuracy_reward": 0.2124999836087227, |
| "rewards/cosine_rewards": -0.0022136420011520386, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 160 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 209.109375, |
| "epoch": 0.08201732042791646, |
| "grad_norm": 8.230345044429809, |
| "kl": 0.11376953125, |
| "learning_rate": 9.589913397860418e-07, |
| "loss": 0.0045, |
| "reward": 1.5247125625610352, |
| "reward_std": 0.313697911798954, |
| "rewards/accuracy_reward": 0.5218749791383743, |
| "rewards/cosine_rewards": 0.0028375727706588805, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 161 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 208.421875, |
| "epoch": 0.08252674477840041, |
| "grad_norm": 7.440050437747031, |
| "kl": 0.132568359375, |
| "learning_rate": 9.587366276107997e-07, |
| "loss": 0.0053, |
| "reward": 1.4958758354187012, |
| "reward_std": 0.2720055654644966, |
| "rewards/accuracy_reward": 0.4937499761581421, |
| "rewards/cosine_rewards": 0.002125886792782694, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 162 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 201.125, |
| "epoch": 0.08303616912888435, |
| "grad_norm": 44.81125981696038, |
| "kl": 0.119384765625, |
| "learning_rate": 9.584819154355578e-07, |
| "loss": 0.0048, |
| "reward": 1.5242316722869873, |
| "reward_std": 0.6014019548892975, |
| "rewards/accuracy_reward": 0.5218749940395355, |
| "rewards/cosine_rewards": 0.0023566827294416726, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 163 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 207.046875, |
| "epoch": 0.08354559347936831, |
| "grad_norm": 13.052733655899614, |
| "kl": 0.119140625, |
| "learning_rate": 9.582272032603158e-07, |
| "loss": 0.0048, |
| "reward": 1.6689130067825317, |
| "reward_std": 0.2884200101252645, |
| "rewards/accuracy_reward": 0.6624999940395355, |
| "rewards/cosine_rewards": 0.0064131125109270215, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 164 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 204.53125, |
| "epoch": 0.08405501782985227, |
| "grad_norm": 45.85990240379708, |
| "kl": 0.455078125, |
| "learning_rate": 9.57972491085074e-07, |
| "loss": 0.0181, |
| "reward": 1.72576242685318, |
| "reward_std": 0.48727013170719147, |
| "rewards/accuracy_reward": 0.7187499701976776, |
| "rewards/cosine_rewards": 0.007012464571744204, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 165 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 203.890625, |
| "epoch": 0.08456444218033622, |
| "grad_norm": 62.93710363981597, |
| "kl": 0.117431640625, |
| "learning_rate": 9.577177789098318e-07, |
| "loss": 0.0047, |
| "reward": 0.9781621694564819, |
| "reward_std": 0.20786645263433456, |
| "rewards/accuracy_reward": -0.012500008568167686, |
| "rewards/cosine_rewards": -0.009250549599528313, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -8.729050023248419e-05, |
| "step": 166 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 201.109375, |
| "epoch": 0.08507386653082018, |
| "grad_norm": 11.046432411232663, |
| "kl": 0.13134765625, |
| "learning_rate": 9.5746306673459e-07, |
| "loss": 0.0052, |
| "reward": 1.3762089014053345, |
| "reward_std": 0.32985249161720276, |
| "rewards/accuracy_reward": 0.37812500819563866, |
| "rewards/cosine_rewards": -0.0019161199452355504, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 167 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 201.578125, |
| "epoch": 0.08558329088130413, |
| "grad_norm": 5.950394983602238, |
| "kl": 0.11279296875, |
| "learning_rate": 9.572083545593479e-07, |
| "loss": 0.0045, |
| "reward": 1.0190700888633728, |
| "reward_std": 0.6297826766967773, |
| "rewards/accuracy_reward": 0.04062497615814209, |
| "rewards/cosine_rewards": -0.005929919425398111, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 168 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 204.71875, |
| "epoch": 0.08609271523178808, |
| "grad_norm": 17.640294707452878, |
| "kl": 0.11572265625, |
| "learning_rate": 9.56953642384106e-07, |
| "loss": 0.0046, |
| "reward": 0.9798631221055984, |
| "reward_std": 0.20544240390881896, |
| "rewards/accuracy_reward": -0.012500017881393433, |
| "rewards/cosine_rewards": -0.007636879570782185, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 169 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 190.34375, |
| "epoch": 0.08660213958227203, |
| "grad_norm": 7.762505201079191, |
| "kl": 0.112548828125, |
| "learning_rate": 9.56698930208864e-07, |
| "loss": 0.0045, |
| "reward": 1.152494490146637, |
| "reward_std": 0.30975981056690216, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/cosine_rewards": -0.0037555836606770754, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 170 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 194.875, |
| "epoch": 0.08711156393275599, |
| "grad_norm": 11.506198147130426, |
| "kl": 0.111083984375, |
| "learning_rate": 9.564442180336219e-07, |
| "loss": 0.0045, |
| "reward": 0.9780029058456421, |
| "reward_std": 0.6867689490318298, |
| "rewards/accuracy_reward": -0.012500010430812836, |
| "rewards/cosine_rewards": -0.009497055783867836, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 171 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 189.125, |
| "epoch": 0.08762098828323994, |
| "grad_norm": 11.819517524957538, |
| "kl": 0.10546875, |
| "learning_rate": 9.5618950585838e-07, |
| "loss": 0.0042, |
| "reward": 1.3241556882858276, |
| "reward_std": 0.3773365914821625, |
| "rewards/accuracy_reward": 0.32499998807907104, |
| "rewards/cosine_rewards": -0.0008443233091384172, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 172 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 182.34375, |
| "epoch": 0.0881304126337239, |
| "grad_norm": 8.416955648144409, |
| "kl": 0.116943359375, |
| "learning_rate": 9.55934793683138e-07, |
| "loss": 0.0047, |
| "reward": 1.6661878824234009, |
| "reward_std": 0.20251824986189604, |
| "rewards/accuracy_reward": 0.6624999940395355, |
| "rewards/cosine_rewards": 0.003687863936647773, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 173 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 191.921875, |
| "epoch": 0.08863983698420784, |
| "grad_norm": 7.131693684668596, |
| "kl": 0.12060546875, |
| "learning_rate": 9.55680081507896e-07, |
| "loss": 0.0048, |
| "reward": 1.0765551328659058, |
| "reward_std": 0.3972722738981247, |
| "rewards/accuracy_reward": 0.09999998845160007, |
| "rewards/cosine_rewards": -0.007819817401468754, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 174 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 181.75, |
| "epoch": 0.0891492613346918, |
| "grad_norm": 11.781793838489648, |
| "kl": 0.11083984375, |
| "learning_rate": 9.55425369332654e-07, |
| "loss": 0.0044, |
| "reward": 1.5515506863594055, |
| "reward_std": 0.3071342632174492, |
| "rewards/accuracy_reward": 0.5500000268220901, |
| "rewards/cosine_rewards": 0.0015506702475249767, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 175 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 175.609375, |
| "epoch": 0.08965868568517575, |
| "grad_norm": 40.566147710572594, |
| "kl": 0.109130859375, |
| "learning_rate": 9.551706571574121e-07, |
| "loss": 0.0044, |
| "reward": 1.5527549982070923, |
| "reward_std": 0.39126846194267273, |
| "rewards/accuracy_reward": 0.5499999895691872, |
| "rewards/cosine_rewards": 0.0027549704536795616, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 176 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 178.53125, |
| "epoch": 0.09016811003565971, |
| "grad_norm": 12.817541000164553, |
| "kl": 0.10595703125, |
| "learning_rate": 9.5491594498217e-07, |
| "loss": 0.0042, |
| "reward": 1.9808745980262756, |
| "reward_std": 0.08480274910107255, |
| "rewards/accuracy_reward": 0.971875011920929, |
| "rewards/cosine_rewards": 0.009283588267862797, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -0.0002840909000951797, |
| "step": 177 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 186.75, |
| "epoch": 0.09067753438614366, |
| "grad_norm": 7.914504626786531, |
| "kl": 0.103759765625, |
| "learning_rate": 9.546612328069282e-07, |
| "loss": 0.0041, |
| "reward": 1.5245178937911987, |
| "reward_std": 0.34963520616292953, |
| "rewards/accuracy_reward": 0.5218750089406967, |
| "rewards/cosine_rewards": 0.0026429439894855022, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 178 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 188.9375, |
| "epoch": 0.0911869587366276, |
| "grad_norm": 8.759963126963402, |
| "kl": 0.13037109375, |
| "learning_rate": 9.544065206316861e-07, |
| "loss": 0.0052, |
| "reward": 1.638785481452942, |
| "reward_std": 0.2284149518236518, |
| "rewards/accuracy_reward": 0.6343750059604645, |
| "rewards/cosine_rewards": 0.0044105148408561945, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 179 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 197.90625, |
| "epoch": 0.09169638308711156, |
| "grad_norm": 6.418887244829745, |
| "kl": 0.116943359375, |
| "learning_rate": 9.541518084564442e-07, |
| "loss": 0.0047, |
| "reward": 1.3810052275657654, |
| "reward_std": 0.4032685235142708, |
| "rewards/accuracy_reward": 0.3812499940395355, |
| "rewards/cosine_rewards": -0.0001616678200662136, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -8.311169949593022e-05, |
| "step": 180 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 186.078125, |
| "epoch": 0.09220580743759552, |
| "grad_norm": 6.304857280986285, |
| "kl": 0.12890625, |
| "learning_rate": 9.538970962812022e-07, |
| "loss": 0.0051, |
| "reward": 1.2630045115947723, |
| "reward_std": 0.17365956178400666, |
| "rewards/accuracy_reward": 0.2656250037252903, |
| "rewards/cosine_rewards": -0.002620481769554317, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 181 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 190.265625, |
| "epoch": 0.09271523178807947, |
| "grad_norm": 11.35970580998742, |
| "kl": 0.11181640625, |
| "learning_rate": 9.536423841059602e-07, |
| "loss": 0.0045, |
| "reward": 1.6366259455680847, |
| "reward_std": 0.2085256204009056, |
| "rewards/accuracy_reward": 0.6343750059604645, |
| "rewards/cosine_rewards": 0.002349784132093191, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -9.889240755001083e-05, |
| "step": 182 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 195.96875, |
| "epoch": 0.09322465613856343, |
| "grad_norm": 33.125698049494765, |
| "kl": 0.118408203125, |
| "learning_rate": 9.533876719307182e-07, |
| "loss": 0.0048, |
| "reward": 1.553468942642212, |
| "reward_std": 0.16592675540596247, |
| "rewards/accuracy_reward": 0.550000011920929, |
| "rewards/cosine_rewards": 0.003468883689492941, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": 0.0, |
| "step": 183 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 205.171875, |
| "epoch": 0.09373408048904738, |
| "grad_norm": 15.866784423418947, |
| "kl": 0.115234375, |
| "learning_rate": 9.531329597554763e-07, |
| "loss": 0.0046, |
| "reward": 1.1207141280174255, |
| "reward_std": 0.19428733736276627, |
| "rewards/accuracy_reward": 0.12812499701976776, |
| "rewards/cosine_rewards": -0.007209272123873234, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -0.00020161290012765676, |
| "step": 184 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 205.640625, |
| "epoch": 0.09424350483953133, |
| "grad_norm": 20.352317051449248, |
| "kl": 0.3115234375, |
| "learning_rate": 9.528782475802343e-07, |
| "loss": 0.0124, |
| "reward": 1.6525439023971558, |
| "reward_std": 0.38362888991832733, |
| "rewards/accuracy_reward": 0.6625000238418579, |
| "rewards/cosine_rewards": 0.005779681145213544, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.00011081559932790697, |
| "step": 185 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 201.234375, |
| "epoch": 0.09475292919001528, |
| "grad_norm": 9.691736527471202, |
| "kl": 0.124755859375, |
| "learning_rate": 9.526235354049923e-07, |
| "loss": 0.005, |
| "reward": 0.9626118838787079, |
| "reward_std": 0.40054861456155777, |
| "rewards/accuracy_reward": -0.015625011175870895, |
| "rewards/cosine_rewards": -0.006138101452961564, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 186 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 223.578125, |
| "epoch": 0.09526235354049924, |
| "grad_norm": 9.437447186660206, |
| "kl": 0.123046875, |
| "learning_rate": 9.523688232297503e-07, |
| "loss": 0.0049, |
| "reward": 1.569740116596222, |
| "reward_std": 0.1397167220711708, |
| "rewards/accuracy_reward": 0.5781250298023224, |
| "rewards/cosine_rewards": 0.007240177597850561, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 187 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 233.625, |
| "epoch": 0.09577177789098319, |
| "grad_norm": 14.943967064784786, |
| "kl": 0.15087890625, |
| "learning_rate": 9.521141110545084e-07, |
| "loss": 0.006, |
| "reward": 1.0288785099983215, |
| "reward_std": 0.2980290725827217, |
| "rewards/accuracy_reward": 0.07187499105930328, |
| "rewards/cosine_rewards": -0.011638639261946082, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.00010775862028822303, |
| "step": 188 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 213.921875, |
| "epoch": 0.09628120224146715, |
| "grad_norm": 15.000304282598657, |
| "kl": 0.12744140625, |
| "learning_rate": 9.518593988792664e-07, |
| "loss": 0.0051, |
| "reward": 1.3486477732658386, |
| "reward_std": 0.30507488548755646, |
| "rewards/accuracy_reward": 0.34999997913837433, |
| "rewards/cosine_rewards": -0.0010681524872779846, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -0.0002840909000951797, |
| "step": 189 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 212.96875, |
| "epoch": 0.09679062659195109, |
| "grad_norm": 5.941418391788204, |
| "kl": 0.13916015625, |
| "learning_rate": 9.516046867040244e-07, |
| "loss": 0.0056, |
| "reward": 1.6924657821655273, |
| "reward_std": 0.3630830645561218, |
| "rewards/accuracy_reward": 0.7187499850988388, |
| "rewards/cosine_rewards": 0.004965720232576132, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 190 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 230.421875, |
| "epoch": 0.09730005094243505, |
| "grad_norm": 7.708020896616392, |
| "kl": 0.14453125, |
| "learning_rate": 9.513499745287824e-07, |
| "loss": 0.0058, |
| "reward": 1.250920683145523, |
| "reward_std": 0.4801155626773834, |
| "rewards/accuracy_reward": 0.2968749776482582, |
| "rewards/cosine_rewards": 0.0009206933900713921, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": 0.0, |
| "step": 191 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 247.3125, |
| "epoch": 0.097809475292919, |
| "grad_norm": 13.916574397802314, |
| "kl": 0.13671875, |
| "learning_rate": 9.510952623535404e-07, |
| "loss": 0.0055, |
| "reward": 1.129820704460144, |
| "reward_std": 0.717576265335083, |
| "rewards/accuracy_reward": 0.21249999850988388, |
| "rewards/cosine_rewards": -0.0045542995212599635, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 192 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 244.21875, |
| "epoch": 0.09831889964340296, |
| "grad_norm": 5.201799859391353, |
| "kl": 0.13427734375, |
| "learning_rate": 9.508405501782984e-07, |
| "loss": 0.0054, |
| "reward": 1.3024629950523376, |
| "reward_std": 0.4307016432285309, |
| "rewards/accuracy_reward": 0.37812498956918716, |
| "rewards/cosine_rewards": 0.0024630045518279076, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 193 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 232.203125, |
| "epoch": 0.09882832399388691, |
| "grad_norm": 9.365246983313067, |
| "kl": 0.12939453125, |
| "learning_rate": 9.505858380030564e-07, |
| "loss": 0.0052, |
| "reward": 0.7090668827295303, |
| "reward_std": 0.5288920998573303, |
| "rewards/accuracy_reward": -0.23750002309679985, |
| "rewards/cosine_rewards": -0.022183137945830822, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 194 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 225.8125, |
| "epoch": 0.09933774834437085, |
| "grad_norm": 12.601377217707842, |
| "kl": 0.14501953125, |
| "learning_rate": 9.503311258278145e-07, |
| "loss": 0.0058, |
| "reward": 1.413894236087799, |
| "reward_std": 0.7254346013069153, |
| "rewards/accuracy_reward": 0.5218749940395355, |
| "rewards/cosine_rewards": 0.001518724486231804, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -0.00012443749437807128, |
| "step": 195 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 226.1875, |
| "epoch": 0.09984717269485481, |
| "grad_norm": 6.45339117400833, |
| "kl": 0.1376953125, |
| "learning_rate": 9.500764136525725e-07, |
| "loss": 0.0055, |
| "reward": 1.5756230354309082, |
| "reward_std": 0.48759835958480835, |
| "rewards/accuracy_reward": 0.6624999940395355, |
| "rewards/cosine_rewards": 0.007316130446270108, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": -0.0004430353583302349, |
| "step": 196 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 227.796875, |
| "epoch": 0.10035659704533877, |
| "grad_norm": 11.359096408502863, |
| "kl": 0.2138671875, |
| "learning_rate": 9.498217014773305e-07, |
| "loss": 0.0086, |
| "reward": 1.2498727440834045, |
| "reward_std": 0.5983296632766724, |
| "rewards/accuracy_reward": 0.37812499701976776, |
| "rewards/cosine_rewards": -0.0031815596157684922, |
| "rewards/format_reward": 0.875, |
| "rewards/repetition_rewards": -7.070136052789167e-05, |
| "step": 197 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 232.234375, |
| "epoch": 0.10086602139582272, |
| "grad_norm": 13.920508776432966, |
| "kl": 0.12255859375, |
| "learning_rate": 9.495669893020886e-07, |
| "loss": 0.0049, |
| "reward": 0.6862081587314606, |
| "reward_std": 0.7485357820987701, |
| "rewards/accuracy_reward": -0.2656250223517418, |
| "rewards/cosine_rewards": -0.01686593284830451, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -5.089576370664872e-05, |
| "step": 198 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 205.140625, |
| "epoch": 0.10137544574630668, |
| "grad_norm": 8.671114765474545, |
| "kl": 0.124267578125, |
| "learning_rate": 9.493122771268466e-07, |
| "loss": 0.005, |
| "reward": 1.1818422079086304, |
| "reward_std": 0.5584293901920319, |
| "rewards/accuracy_reward": 0.29375000298023224, |
| "rewards/cosine_rewards": -0.002258662148960866, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -0.0002741228090599179, |
| "step": 199 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 216.875, |
| "epoch": 0.10188487009679063, |
| "grad_norm": 10.078840755345926, |
| "kl": 0.126708984375, |
| "learning_rate": 9.490575649516046e-07, |
| "loss": 0.0051, |
| "reward": 1.3181660771369934, |
| "reward_std": 0.6019489467144012, |
| "rewards/accuracy_reward": 0.40937498211860657, |
| "rewards/cosine_rewards": 0.002541057765483856, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": 0.0, |
| "step": 200 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 199.515625, |
| "epoch": 0.10239429444727458, |
| "grad_norm": 5.555372749575627, |
| "kl": 0.130859375, |
| "learning_rate": 9.488028527763627e-07, |
| "loss": 0.0052, |
| "reward": 1.5905040502548218, |
| "reward_std": 0.41512130200862885, |
| "rewards/accuracy_reward": 0.6624999940395355, |
| "rewards/cosine_rewards": 0.006129102781414986, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 201 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 235.03125, |
| "epoch": 0.10290371879775853, |
| "grad_norm": 10.052848791937414, |
| "kl": 0.12890625, |
| "learning_rate": 9.485481406011207e-07, |
| "loss": 0.0051, |
| "reward": 1.1239948272705078, |
| "reward_std": 0.9119550585746765, |
| "rewards/accuracy_reward": 0.26875001192092896, |
| "rewards/cosine_rewards": -0.004009488970041275, |
| "rewards/format_reward": 0.859375, |
| "rewards/repetition_rewards": -0.00012065636838087812, |
| "step": 202 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 243.5, |
| "epoch": 0.10341314314824249, |
| "grad_norm": 11.024825341246498, |
| "kl": 0.115966796875, |
| "learning_rate": 9.482934284258787e-07, |
| "loss": 0.0046, |
| "reward": 0.9447762966156006, |
| "reward_std": 0.800986647605896, |
| "rewards/accuracy_reward": 0.12812498584389687, |
| "rewards/cosine_rewards": -0.011282204184681177, |
| "rewards/format_reward": 0.828125, |
| "rewards/repetition_rewards": -0.00019145716942148283, |
| "step": 203 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 232.375, |
| "epoch": 0.10392256749872644, |
| "grad_norm": 7.842670844504004, |
| "kl": 0.118408203125, |
| "learning_rate": 9.480387162506367e-07, |
| "loss": 0.0047, |
| "reward": 1.2975149750709534, |
| "reward_std": 0.6046717762947083, |
| "rewards/accuracy_reward": 0.40937497094273567, |
| "rewards/cosine_rewards": -0.0023666354827582836, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -0.00011837121564894915, |
| "step": 204 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 224.234375, |
| "epoch": 0.1044319918492104, |
| "grad_norm": 20.462186918743633, |
| "kl": 0.125732421875, |
| "learning_rate": 9.477840040753947e-07, |
| "loss": 0.005, |
| "reward": 1.0097321271896362, |
| "reward_std": 0.4872446656227112, |
| "rewards/accuracy_reward": 0.1250000149011612, |
| "rewards/cosine_rewards": -0.005829372443258762, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -6.351625779643655e-05, |
| "step": 205 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 224.328125, |
| "epoch": 0.10494141619969434, |
| "grad_norm": 18.578745554695768, |
| "kl": 0.1298828125, |
| "learning_rate": 9.475292919001527e-07, |
| "loss": 0.0052, |
| "reward": 0.8584832549095154, |
| "reward_std": 0.5744369626045227, |
| "rewards/accuracy_reward": 0.040624991059303284, |
| "rewards/cosine_rewards": -0.010225818026810884, |
| "rewards/format_reward": 0.828125, |
| "rewards/repetition_rewards": -4.101049853488803e-05, |
| "step": 206 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 209.234375, |
| "epoch": 0.1054508405501783, |
| "grad_norm": 20.30723423804259, |
| "kl": 0.11376953125, |
| "learning_rate": 9.472745797249107e-07, |
| "loss": 0.0045, |
| "reward": 1.0917281210422516, |
| "reward_std": 0.4984763488173485, |
| "rewards/accuracy_reward": 0.24062497913837433, |
| "rewards/cosine_rewards": -0.008111415430903435, |
| "rewards/format_reward": 0.859375, |
| "rewards/repetition_rewards": -0.0001604560275154654, |
| "step": 207 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 202.28125, |
| "epoch": 0.10596026490066225, |
| "grad_norm": 12.813170138938949, |
| "kl": 0.130859375, |
| "learning_rate": 9.470198675496688e-07, |
| "loss": 0.0052, |
| "reward": 1.258288562297821, |
| "reward_std": 0.4630318433046341, |
| "rewards/accuracy_reward": 0.3812500238418579, |
| "rewards/cosine_rewards": 0.0022279657423496246, |
| "rewards/format_reward": 0.875, |
| "rewards/repetition_rewards": -0.00018934992840513587, |
| "step": 208 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 199.0, |
| "epoch": 0.10646968925114621, |
| "grad_norm": 10.281274330223528, |
| "kl": 0.15185546875, |
| "learning_rate": 9.467651553744268e-07, |
| "loss": 0.0061, |
| "reward": 1.2817729711532593, |
| "reward_std": 0.4518425017595291, |
| "rewards/accuracy_reward": 0.40937501937150955, |
| "rewards/cosine_rewards": -0.0026020415825769305, |
| "rewards/format_reward": 0.875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 209 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 191.90625, |
| "epoch": 0.10697911360163016, |
| "grad_norm": 5.991490542584776, |
| "kl": 0.118896484375, |
| "learning_rate": 9.465104431991848e-07, |
| "loss": 0.0047, |
| "reward": 1.537351131439209, |
| "reward_std": 0.5478895753622055, |
| "rewards/accuracy_reward": 0.6875000149011612, |
| "rewards/cosine_rewards": 0.00610114517621696, |
| "rewards/format_reward": 0.84375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 210 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 217.015625, |
| "epoch": 0.1074885379521141, |
| "grad_norm": 60.00536554673066, |
| "kl": 0.115966796875, |
| "learning_rate": 9.462557310239428e-07, |
| "loss": 0.0046, |
| "reward": 0.7785031795501709, |
| "reward_std": 0.42832519114017487, |
| "rewards/accuracy_reward": -0.1250000223517418, |
| "rewards/cosine_rewards": -0.018284045159816742, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -8.778089977568015e-05, |
| "step": 211 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 205.21875, |
| "epoch": 0.10799796230259806, |
| "grad_norm": 21.72003474650607, |
| "kl": 0.1220703125, |
| "learning_rate": 9.460010188487009e-07, |
| "loss": 0.0049, |
| "reward": 1.0551989674568176, |
| "reward_std": 0.46487441658973694, |
| "rewards/accuracy_reward": 0.18437497317790985, |
| "rewards/cosine_rewards": -0.004139983095228672, |
| "rewards/format_reward": 0.875, |
| "rewards/repetition_rewards": -3.600230411393568e-05, |
| "step": 212 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 237.609375, |
| "epoch": 0.10850738665308202, |
| "grad_norm": 13.593112035734373, |
| "kl": 0.118896484375, |
| "learning_rate": 9.457463066734589e-07, |
| "loss": 0.0048, |
| "reward": 1.4092811346054077, |
| "reward_std": 0.6851305663585663, |
| "rewards/accuracy_reward": 0.6343750059604645, |
| "rewards/cosine_rewards": 0.009402429801411927, |
| "rewards/format_reward": 0.765625, |
| "rewards/repetition_rewards": -0.00012127523950766772, |
| "step": 213 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 266.65625, |
| "epoch": 0.10901681100356597, |
| "grad_norm": 102.50188740817565, |
| "kl": 0.114013671875, |
| "learning_rate": 9.45491594498217e-07, |
| "loss": 0.0046, |
| "reward": 1.3722986578941345, |
| "reward_std": 0.5870523750782013, |
| "rewards/accuracy_reward": 0.5218750089406967, |
| "rewards/cosine_rewards": -0.008539619389921427, |
| "rewards/format_reward": 0.859375, |
| "rewards/repetition_rewards": -0.000411735316447448, |
| "step": 214 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 264.796875, |
| "epoch": 0.10952623535404993, |
| "grad_norm": 17.000809347446246, |
| "kl": 0.114013671875, |
| "learning_rate": 9.452368823229751e-07, |
| "loss": 0.0046, |
| "reward": 1.182218611240387, |
| "reward_std": 0.6600647866725922, |
| "rewards/accuracy_reward": 0.265625, |
| "rewards/cosine_rewards": -0.02086095977574587, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -4.542151145869866e-05, |
| "step": 215 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 305.796875, |
| "epoch": 0.11003565970453388, |
| "grad_norm": 21.15698667263886, |
| "kl": 0.107666015625, |
| "learning_rate": 9.449821701477331e-07, |
| "loss": 0.0043, |
| "reward": 1.1834356784820557, |
| "reward_std": 0.686463937163353, |
| "rewards/accuracy_reward": 0.2968749925494194, |
| "rewards/cosine_rewards": -0.019689313136041164, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": 0.0, |
| "step": 216 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 400.296875, |
| "epoch": 0.11054508405501783, |
| "grad_norm": 7.9150578217370535, |
| "kl": 0.09814453125, |
| "learning_rate": 9.447274579724911e-07, |
| "loss": 0.0039, |
| "reward": 1.0662736892700195, |
| "reward_std": 0.8222787380218506, |
| "rewards/accuracy_reward": 0.18437499552965164, |
| "rewards/cosine_rewards": -0.0396728478372097, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.00030338978831423447, |
| "step": 217 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 345.671875, |
| "epoch": 0.11105450840550178, |
| "grad_norm": 10.397918174345362, |
| "kl": 0.1357421875, |
| "learning_rate": 9.444727457972492e-07, |
| "loss": 0.0054, |
| "reward": 1.6620882153511047, |
| "reward_std": 0.6701975017786026, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/cosine_rewards": 0.021556629799306393, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -9.343791316496208e-05, |
| "step": 218 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 361.984375, |
| "epoch": 0.11156393275598574, |
| "grad_norm": 16.664810091620872, |
| "kl": 0.09619140625, |
| "learning_rate": 9.442180336220072e-07, |
| "loss": 0.0038, |
| "reward": 0.6033791899681091, |
| "reward_std": 0.47761378437280655, |
| "rewards/accuracy_reward": -0.2656250149011612, |
| "rewards/cosine_rewards": -0.0521757323294878, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.0006950152310309932, |
| "step": 219 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 359.359375, |
| "epoch": 0.11207335710646969, |
| "grad_norm": 6.030171938320145, |
| "kl": 0.09423828125, |
| "learning_rate": 9.439633214467651e-07, |
| "loss": 0.0038, |
| "reward": 1.0732125043869019, |
| "reward_std": 0.532948449254036, |
| "rewards/accuracy_reward": 0.18437499180436134, |
| "rewards/cosine_rewards": -0.017268475145101547, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": -0.00014401252064999426, |
| "step": 220 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 467.921875, |
| "epoch": 0.11258278145695365, |
| "grad_norm": 13.578372073106125, |
| "kl": 0.08740234375, |
| "learning_rate": 9.437086092715231e-07, |
| "loss": 0.0035, |
| "reward": 1.089949607849121, |
| "reward_std": 0.7014666199684143, |
| "rewards/accuracy_reward": 0.18437499180436134, |
| "rewards/cosine_rewards": -0.04696316970512271, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0005872593028470874, |
| "step": 221 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 295.15625, |
| "epoch": 0.11309220580743759, |
| "grad_norm": 9.243544591708364, |
| "kl": 0.099609375, |
| "learning_rate": 9.434538970962812e-07, |
| "loss": 0.004, |
| "reward": 1.25474151968956, |
| "reward_std": 0.42495501041412354, |
| "rewards/accuracy_reward": 0.2968750074505806, |
| "rewards/cosine_rewards": -0.010176160372793674, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0007073541928548366, |
| "step": 222 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 327.9375, |
| "epoch": 0.11360163015792155, |
| "grad_norm": 11.854145682727308, |
| "kl": 0.09423828125, |
| "learning_rate": 9.431991849210392e-07, |
| "loss": 0.0038, |
| "reward": 1.336020827293396, |
| "reward_std": 0.5929334163665771, |
| "rewards/accuracy_reward": 0.3812499977648258, |
| "rewards/cosine_rewards": 0.0018316814675927162, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0001858295945567079, |
| "step": 223 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 224.109375, |
| "epoch": 0.1141110545084055, |
| "grad_norm": 9.830726171785136, |
| "kl": 0.13330078125, |
| "learning_rate": 9.429444727457972e-07, |
| "loss": 0.0053, |
| "reward": 0.9984832406044006, |
| "reward_std": 0.45606285333633423, |
| "rewards/accuracy_reward": 0.043749988079071045, |
| "rewards/cosine_rewards": -0.014016739558428526, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 224 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 213.640625, |
| "epoch": 0.11462047885888946, |
| "grad_norm": 9.176338530222994, |
| "kl": 0.115234375, |
| "learning_rate": 9.426897605705553e-07, |
| "loss": 0.0046, |
| "reward": 1.2019062638282776, |
| "reward_std": 0.7189642786979675, |
| "rewards/accuracy_reward": 0.29374999925494194, |
| "rewards/cosine_rewards": -0.01371871994342655, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 225 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 199.71875, |
| "epoch": 0.11512990320937341, |
| "grad_norm": 11.349158424596924, |
| "kl": 0.110107421875, |
| "learning_rate": 9.424350483953133e-07, |
| "loss": 0.0044, |
| "reward": 1.3144216537475586, |
| "reward_std": 0.4914311468601227, |
| "rewards/accuracy_reward": 0.32499998807907104, |
| "rewards/cosine_rewards": 0.005097148037748411, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -5.056634472566657e-05, |
| "step": 226 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 232.390625, |
| "epoch": 0.11563932755985736, |
| "grad_norm": 8.61981591225416, |
| "kl": 0.105712890625, |
| "learning_rate": 9.421803362200713e-07, |
| "loss": 0.0042, |
| "reward": 1.064522534608841, |
| "reward_std": 0.3509945422410965, |
| "rewards/accuracy_reward": 0.15312501788139343, |
| "rewards/cosine_rewards": -0.010458544362336397, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -1.8939394067274407e-05, |
| "step": 227 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 197.75, |
| "epoch": 0.11614875191034131, |
| "grad_norm": 12.524545820572607, |
| "kl": 0.106689453125, |
| "learning_rate": 9.419256240448294e-07, |
| "loss": 0.0043, |
| "reward": 1.3913479149341583, |
| "reward_std": 0.2862061709165573, |
| "rewards/accuracy_reward": 0.40937498956918716, |
| "rewards/cosine_rewards": -0.0021625147201120853, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.00023957982193678617, |
| "step": 228 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 193.5, |
| "epoch": 0.11665817626082527, |
| "grad_norm": 17.83927533701907, |
| "kl": 0.13232421875, |
| "learning_rate": 9.416709118695874e-07, |
| "loss": 0.0053, |
| "reward": 1.5334136486053467, |
| "reward_std": 0.45667168498039246, |
| "rewards/accuracy_reward": 0.6062500029802322, |
| "rewards/cosine_rewards": 0.005288586835376918, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 229 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 247.671875, |
| "epoch": 0.11716760061130922, |
| "grad_norm": 23.81656431934888, |
| "kl": 0.108642578125, |
| "learning_rate": 9.414161996943454e-07, |
| "loss": 0.0043, |
| "reward": 1.1264008283615112, |
| "reward_std": 0.6892756521701813, |
| "rewards/accuracy_reward": 0.21249999105930328, |
| "rewards/cosine_rewards": -0.02350334101356566, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -9.586199303157628e-05, |
| "step": 230 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 262.59375, |
| "epoch": 0.11767702496179318, |
| "grad_norm": 28.467466655884312, |
| "kl": 0.12109375, |
| "learning_rate": 9.411614875191034e-07, |
| "loss": 0.0048, |
| "reward": 1.4944193363189697, |
| "reward_std": 0.3786798119544983, |
| "rewards/accuracy_reward": 0.518750011920929, |
| "rewards/cosine_rewards": 0.0069862306118011475, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -6.69164874125272e-05, |
| "step": 231 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 248.40625, |
| "epoch": 0.11818644931227713, |
| "grad_norm": 9.248494107592876, |
| "kl": 0.12060546875, |
| "learning_rate": 9.409067753438615e-07, |
| "loss": 0.0048, |
| "reward": 1.2941021919250488, |
| "reward_std": 0.5259552597999573, |
| "rewards/accuracy_reward": 0.3749999776482582, |
| "rewards/cosine_rewards": 0.012875130865722895, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": -2.2944199372432195e-05, |
| "step": 232 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 264.15625, |
| "epoch": 0.11869587366276108, |
| "grad_norm": 14.194855529249866, |
| "kl": 0.107666015625, |
| "learning_rate": 9.406520631686195e-07, |
| "loss": 0.0043, |
| "reward": 1.4532509446144104, |
| "reward_std": 0.47306837141513824, |
| "rewards/accuracy_reward": 0.46562501788139343, |
| "rewards/cosine_rewards": 0.003419560845941305, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.00016866176156327128, |
| "step": 233 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 376.875, |
| "epoch": 0.11920529801324503, |
| "grad_norm": 24.881210594835412, |
| "kl": 0.0986328125, |
| "learning_rate": 9.403973509933774e-07, |
| "loss": 0.0039, |
| "reward": 0.9971878528594971, |
| "reward_std": 0.8903799057006836, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/cosine_rewards": -0.018273995257914066, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.00016315293032675982, |
| "step": 234 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 432.28125, |
| "epoch": 0.11971472236372899, |
| "grad_norm": 4.4996951306965975, |
| "kl": 0.08544921875, |
| "learning_rate": 9.401426388181355e-07, |
| "loss": 0.0034, |
| "reward": 1.3378186225891113, |
| "reward_std": 0.8512288331985474, |
| "rewards/accuracy_reward": 0.4593750089406967, |
| "rewards/cosine_rewards": -0.027322867885231972, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": -0.000483501615235582, |
| "step": 235 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 465.765625, |
| "epoch": 0.12022414671421294, |
| "grad_norm": 5.402908709299499, |
| "kl": 0.080322265625, |
| "learning_rate": 9.398879266428935e-07, |
| "loss": 0.0032, |
| "reward": 1.4937435388565063, |
| "reward_std": 0.35082364082336426, |
| "rewards/accuracy_reward": 0.5499999821186066, |
| "rewards/cosine_rewards": -0.024816589895635843, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0001899001763376873, |
| "step": 236 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 431.796875, |
| "epoch": 0.1207335710646969, |
| "grad_norm": 11.178072528468537, |
| "kl": 0.0947265625, |
| "learning_rate": 9.396332144676515e-07, |
| "loss": 0.0038, |
| "reward": 1.1548867225646973, |
| "reward_std": 0.8218154907226562, |
| "rewards/accuracy_reward": 0.23749998956918716, |
| "rewards/cosine_rewards": -0.0042152018286287785, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.0002730985652306117, |
| "step": 237 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 506.640625, |
| "epoch": 0.12124299541518084, |
| "grad_norm": 3.9623481852584983, |
| "kl": 0.078857421875, |
| "learning_rate": 9.393785022924095e-07, |
| "loss": 0.0032, |
| "reward": 1.2667301297187805, |
| "reward_std": 0.8781076371669769, |
| "rewards/accuracy_reward": 0.40937498211860657, |
| "rewards/cosine_rewards": -0.04850983805954456, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": -0.0003849874483421445, |
| "step": 238 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 443.515625, |
| "epoch": 0.1217524197656648, |
| "grad_norm": 5.742693044990549, |
| "kl": 0.094482421875, |
| "learning_rate": 9.391237901171676e-07, |
| "loss": 0.0038, |
| "reward": 0.6565631031990051, |
| "reward_std": 0.7225559949874878, |
| "rewards/accuracy_reward": -0.15312501043081284, |
| "rewards/cosine_rewards": -0.08046763762831688, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -0.00046927113726269454, |
| "step": 239 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 347.265625, |
| "epoch": 0.12226184411614875, |
| "grad_norm": 7.476762059683771, |
| "kl": 0.08984375, |
| "learning_rate": 9.388690779419256e-07, |
| "loss": 0.0036, |
| "reward": 1.2646641731262207, |
| "reward_std": 0.35142165422439575, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/cosine_rewards": -0.016350463964045048, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.00023539320682175457, |
| "step": 240 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 241.125, |
| "epoch": 0.12277126846663271, |
| "grad_norm": 7.077545719384053, |
| "kl": 0.101318359375, |
| "learning_rate": 9.386143657666836e-07, |
| "loss": 0.0041, |
| "reward": 0.9919856488704681, |
| "reward_std": 0.6525652855634689, |
| "rewards/accuracy_reward": 0.043749988079071045, |
| "rewards/cosine_rewards": -0.020295456051826477, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.00021890102652832866, |
| "step": 241 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 240.765625, |
| "epoch": 0.12328069281711666, |
| "grad_norm": 8.441107538365218, |
| "kl": 0.1044921875, |
| "learning_rate": 9.383596535914417e-07, |
| "loss": 0.0042, |
| "reward": 1.616421401500702, |
| "reward_std": 0.3022947758436203, |
| "rewards/accuracy_reward": 0.6593749970197678, |
| "rewards/cosine_rewards": 0.004121019504964352, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.00019959894416388124, |
| "step": 242 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 175.796875, |
| "epoch": 0.1237901171676006, |
| "grad_norm": 9.380134906229868, |
| "kl": 0.11083984375, |
| "learning_rate": 9.381049414161997e-07, |
| "loss": 0.0044, |
| "reward": 1.3176445960998535, |
| "reward_std": 0.3997122645378113, |
| "rewards/accuracy_reward": 0.32499998807907104, |
| "rewards/cosine_rewards": -0.0073250585701316595, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -3.028100763913244e-05, |
| "step": 243 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 220.53125, |
| "epoch": 0.12429954151808456, |
| "grad_norm": 5.4472083052212765, |
| "kl": 0.109375, |
| "learning_rate": 9.378502292409577e-07, |
| "loss": 0.0044, |
| "reward": 1.6422365307807922, |
| "reward_std": 0.2728146519511938, |
| "rewards/accuracy_reward": 0.6624999940395355, |
| "rewards/cosine_rewards": 0.01141381449997425, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.00042724609375, |
| "step": 244 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 248.828125, |
| "epoch": 0.12480896586856852, |
| "grad_norm": 8.442731808091501, |
| "kl": 0.1083984375, |
| "learning_rate": 9.375955170657157e-07, |
| "loss": 0.0043, |
| "reward": 1.3157773613929749, |
| "reward_std": 0.4320952445268631, |
| "rewards/accuracy_reward": 0.3531249985098839, |
| "rewards/cosine_rewards": 0.009643017314374447, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.00011561772407731041, |
| "step": 245 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 222.6875, |
| "epoch": 0.12531839021905247, |
| "grad_norm": 14.294535330077375, |
| "kl": 0.1171875, |
| "learning_rate": 9.373408048904738e-07, |
| "loss": 0.0047, |
| "reward": 1.331631362438202, |
| "reward_std": 0.4216170907020569, |
| "rewards/accuracy_reward": 0.3531249985098839, |
| "rewards/cosine_rewards": 0.00982090923935175, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -6.454958565882407e-05, |
| "step": 246 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 219.984375, |
| "epoch": 0.12582781456953643, |
| "grad_norm": 9.55932148178992, |
| "kl": 0.108642578125, |
| "learning_rate": 9.370860927152318e-07, |
| "loss": 0.0043, |
| "reward": 1.340530276298523, |
| "reward_std": 0.4534093588590622, |
| "rewards/accuracy_reward": 0.3531250096857548, |
| "rewards/cosine_rewards": 0.003091069171205163, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -6.0797665355494246e-05, |
| "step": 247 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 325.234375, |
| "epoch": 0.12633723892002038, |
| "grad_norm": 17.80951371391722, |
| "kl": 0.110595703125, |
| "learning_rate": 9.368313805399897e-07, |
| "loss": 0.0044, |
| "reward": 1.2751246690750122, |
| "reward_std": 0.520209550857544, |
| "rewards/accuracy_reward": 0.2968749850988388, |
| "rewards/cosine_rewards": 0.010451191570609808, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0009515111669315957, |
| "step": 248 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 366.625, |
| "epoch": 0.12684666327050434, |
| "grad_norm": 12.086775377390172, |
| "kl": 0.111083984375, |
| "learning_rate": 9.365766683647478e-07, |
| "loss": 0.0044, |
| "reward": 0.8402246385812759, |
| "reward_std": 0.6177513003349304, |
| "rewards/accuracy_reward": -0.04062502086162567, |
| "rewards/cosine_rewards": -0.04077841015532613, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.0002469850951456465, |
| "step": 249 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 449.125, |
| "epoch": 0.1273560876209883, |
| "grad_norm": 11.748922331365623, |
| "kl": 0.0869140625, |
| "learning_rate": 9.363219561895058e-07, |
| "loss": 0.0035, |
| "reward": 1.7429784536361694, |
| "reward_std": 0.6669142842292786, |
| "rewards/accuracy_reward": 0.746874988079071, |
| "rewards/cosine_rewards": 0.02791230659931898, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.000558776329853572, |
| "step": 250 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 523.34375, |
| "epoch": 0.12786551197147222, |
| "grad_norm": 5.635036318066103, |
| "kl": 0.073974609375, |
| "learning_rate": 9.360672440142638e-07, |
| "loss": 0.003, |
| "reward": 1.3756027221679688, |
| "reward_std": 0.3430413454771042, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/cosine_rewards": -0.01460547186434269, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0004167625156696886, |
| "step": 251 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 583.78125, |
| "epoch": 0.12837493632195618, |
| "grad_norm": 3.7721855945126013, |
| "kl": 0.071533203125, |
| "learning_rate": 9.358125318390219e-07, |
| "loss": 0.0029, |
| "reward": 1.1054343283176422, |
| "reward_std": 0.9506143927574158, |
| "rewards/accuracy_reward": 0.23749998211860657, |
| "rewards/cosine_rewards": -0.06930245459079742, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.00026325164799345657, |
| "step": 252 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 691.8125, |
| "epoch": 0.12888436067244013, |
| "grad_norm": 4.515905173052182, |
| "kl": 0.06494140625, |
| "learning_rate": 9.355578196637799e-07, |
| "loss": 0.0026, |
| "reward": 1.1395662426948547, |
| "reward_std": 1.164560616016388, |
| "rewards/accuracy_reward": 0.24062499403953552, |
| "rewards/cosine_rewards": -0.06932513415813446, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0004836731095565483, |
| "step": 253 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 750.703125, |
| "epoch": 0.1293937850229241, |
| "grad_norm": 3.759822074251192, |
| "kl": 0.0609130859375, |
| "learning_rate": 9.353031074885379e-07, |
| "loss": 0.0024, |
| "reward": 1.3212904930114746, |
| "reward_std": 0.9738726019859314, |
| "rewards/accuracy_reward": 0.4625000059604645, |
| "rewards/cosine_rewards": -0.03070250153541565, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -0.0011320026533212513, |
| "step": 254 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 604.40625, |
| "epoch": 0.12990320937340805, |
| "grad_norm": 4.697641184697458, |
| "kl": 0.082763671875, |
| "learning_rate": 9.350483953132959e-07, |
| "loss": 0.0033, |
| "reward": 1.1115484535694122, |
| "reward_std": 0.7478219866752625, |
| "rewards/accuracy_reward": 0.23125000298023224, |
| "rewards/cosine_rewards": -0.05675292294472456, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.00044860908383270726, |
| "step": 255 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 635.84375, |
| "epoch": 0.130412633723892, |
| "grad_norm": 4.028619362240205, |
| "kl": 0.09765625, |
| "learning_rate": 9.34793683138054e-07, |
| "loss": 0.0039, |
| "reward": 1.457118034362793, |
| "reward_std": 0.788001298904419, |
| "rewards/accuracy_reward": 0.5218749791383743, |
| "rewards/cosine_rewards": -0.0015127966180443764, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0007440973713528365, |
| "step": 256 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 589.40625, |
| "epoch": 0.13092205807437596, |
| "grad_norm": 5.63417318829876, |
| "kl": 0.07275390625, |
| "learning_rate": 9.34538970962812e-07, |
| "loss": 0.0029, |
| "reward": 1.4154019951820374, |
| "reward_std": 0.815990686416626, |
| "rewards/accuracy_reward": 0.518750011920929, |
| "rewards/cosine_rewards": -0.02476619742810726, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.00045683811185881495, |
| "step": 257 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 676.984375, |
| "epoch": 0.1314314824248599, |
| "grad_norm": 6.021417699991294, |
| "kl": 0.065673828125, |
| "learning_rate": 9.3428425878757e-07, |
| "loss": 0.0026, |
| "reward": 0.6927553117275238, |
| "reward_std": 0.8777336776256561, |
| "rewards/accuracy_reward": -0.0781250074505806, |
| "rewards/cosine_rewards": -0.15041033178567886, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.0005843567778356373, |
| "step": 258 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 555.203125, |
| "epoch": 0.13194090677534387, |
| "grad_norm": 8.306056976533926, |
| "kl": 0.082763671875, |
| "learning_rate": 9.340295466123281e-07, |
| "loss": 0.0033, |
| "reward": 1.2112269699573517, |
| "reward_std": 0.9674933552742004, |
| "rewards/accuracy_reward": 0.43437499552965164, |
| "rewards/cosine_rewards": -0.08218972198665142, |
| "rewards/format_reward": 0.859375, |
| "rewards/repetition_rewards": -0.0003333477216074243, |
| "step": 259 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 694.484375, |
| "epoch": 0.13245033112582782, |
| "grad_norm": 6.232070420425315, |
| "kl": 0.06689453125, |
| "learning_rate": 9.337748344370861e-07, |
| "loss": 0.0027, |
| "reward": 1.0117461681365967, |
| "reward_std": 0.7802118062973022, |
| "rewards/accuracy_reward": 0.21249999478459358, |
| "rewards/cosine_rewards": -0.09079772233963013, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -0.0005810301227029413, |
| "step": 260 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 543.5625, |
| "epoch": 0.13295975547631178, |
| "grad_norm": 6.334798789966088, |
| "kl": 0.08544921875, |
| "learning_rate": 9.335201222618441e-07, |
| "loss": 0.0034, |
| "reward": 1.0542153716087341, |
| "reward_std": 0.8291297852993011, |
| "rewards/accuracy_reward": 0.18124999105930328, |
| "rewards/cosine_rewards": -0.032575659453868866, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": -0.0007089868013281375, |
| "step": 261 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 356.015625, |
| "epoch": 0.1334691798267957, |
| "grad_norm": 10.460916199726386, |
| "kl": 0.098388671875, |
| "learning_rate": 9.33265410086602e-07, |
| "loss": 0.0039, |
| "reward": 0.6651052087545395, |
| "reward_std": 0.9073293209075928, |
| "rewards/accuracy_reward": -0.09687501192092896, |
| "rewards/cosine_rewards": -0.03463773522526026, |
| "rewards/format_reward": 0.796875, |
| "rewards/repetition_rewards": -0.000257108491496183, |
| "step": 262 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 345.109375, |
| "epoch": 0.13397860417727966, |
| "grad_norm": 105.36035227056938, |
| "kl": 0.10546875, |
| "learning_rate": 9.330106979113601e-07, |
| "loss": 0.0042, |
| "reward": 1.6959076523780823, |
| "reward_std": 0.6433850526809692, |
| "rewards/accuracy_reward": 0.7374999523162842, |
| "rewards/cosine_rewards": 0.036790573969483376, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.0002579164138296619, |
| "step": 263 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 225.671875, |
| "epoch": 0.13448802852776362, |
| "grad_norm": 12.269079038415155, |
| "kl": 0.1044921875, |
| "learning_rate": 9.327559857361181e-07, |
| "loss": 0.0042, |
| "reward": 1.3304521441459656, |
| "reward_std": 0.7337057292461395, |
| "rewards/accuracy_reward": 0.40312500298023224, |
| "rewards/cosine_rewards": -0.009734044317156076, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0004387954395497218, |
| "step": 264 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 191.421875, |
| "epoch": 0.13499745287824758, |
| "grad_norm": 5.0495702848173805, |
| "kl": 0.12451171875, |
| "learning_rate": 9.325012735608761e-07, |
| "loss": 0.005, |
| "reward": 1.5114508867263794, |
| "reward_std": 0.4991532266139984, |
| "rewards/accuracy_reward": 0.6031249910593033, |
| "rewards/cosine_rewards": 0.0021946561755612493, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": -0.0001188212918350473, |
| "step": 265 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 212.890625, |
| "epoch": 0.13550687722873153, |
| "grad_norm": 8.380147213080475, |
| "kl": 0.11376953125, |
| "learning_rate": 9.322465613856342e-07, |
| "loss": 0.0046, |
| "reward": 1.3185867071151733, |
| "reward_std": 0.5081266015768051, |
| "rewards/accuracy_reward": 0.37812499701976776, |
| "rewards/cosine_rewards": 0.0029779861215502024, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -1.6225338185904548e-05, |
| "step": 266 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 163.4375, |
| "epoch": 0.1360163015792155, |
| "grad_norm": 6.7670554711392725, |
| "kl": 0.1259765625, |
| "learning_rate": 9.319918492103922e-07, |
| "loss": 0.005, |
| "reward": 1.917210876941681, |
| "reward_std": 0.2323581874370575, |
| "rewards/accuracy_reward": 0.96875, |
| "rewards/cosine_rewards": 0.011114767286926508, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.00015385003644041717, |
| "step": 267 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 156.0, |
| "epoch": 0.13652572592969944, |
| "grad_norm": 7.285618988190272, |
| "kl": 0.119873046875, |
| "learning_rate": 9.317371370351502e-07, |
| "loss": 0.0048, |
| "reward": 1.2626032829284668, |
| "reward_std": 0.6921159029006958, |
| "rewards/accuracy_reward": 0.34687499701976776, |
| "rewards/cosine_rewards": -0.006146675441414118, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 268 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 152.953125, |
| "epoch": 0.1370351502801834, |
| "grad_norm": 10.577916333197056, |
| "kl": 0.140625, |
| "learning_rate": 9.314824248599083e-07, |
| "loss": 0.0056, |
| "reward": 1.2036974430084229, |
| "reward_std": 0.5991593599319458, |
| "rewards/accuracy_reward": 0.2968749850988388, |
| "rewards/cosine_rewards": 0.0007108037825673819, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": -0.0001382743357680738, |
| "step": 269 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 158.34375, |
| "epoch": 0.13754457463066735, |
| "grad_norm": 14.293416719573916, |
| "kl": 0.1201171875, |
| "learning_rate": 9.312277126846663e-07, |
| "loss": 0.0048, |
| "reward": 1.2185573279857635, |
| "reward_std": 0.43015679717063904, |
| "rewards/accuracy_reward": 0.24062500149011612, |
| "rewards/cosine_rewards": -0.006263321032747626, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.00017934850984602235, |
| "step": 270 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 166.46875, |
| "epoch": 0.1380539989811513, |
| "grad_norm": 9.393354579467495, |
| "kl": 0.1240234375, |
| "learning_rate": 9.309730005094243e-07, |
| "loss": 0.005, |
| "reward": 1.5472444295883179, |
| "reward_std": 0.5279964953660965, |
| "rewards/accuracy_reward": 0.606249988079071, |
| "rewards/cosine_rewards": 0.003494387026876211, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 271 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 149.609375, |
| "epoch": 0.13856342333163527, |
| "grad_norm": 7.200843784037537, |
| "kl": 0.117431640625, |
| "learning_rate": 9.307182883341823e-07, |
| "loss": 0.0047, |
| "reward": 1.3769221901893616, |
| "reward_std": 0.4806235730648041, |
| "rewards/accuracy_reward": 0.40937501937150955, |
| "rewards/cosine_rewards": -0.0011419787188060582, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -6.0797665355494246e-05, |
| "step": 272 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 143.828125, |
| "epoch": 0.1390728476821192, |
| "grad_norm": 11.003082967675637, |
| "kl": 0.18359375, |
| "learning_rate": 9.304635761589404e-07, |
| "loss": 0.0073, |
| "reward": 1.3800683617591858, |
| "reward_std": 0.4095611423254013, |
| "rewards/accuracy_reward": 0.40937498211860657, |
| "rewards/cosine_rewards": 0.0019433526322245598, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": 0.0, |
| "step": 273 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 144.453125, |
| "epoch": 0.13958227203260315, |
| "grad_norm": 6.9562429559567285, |
| "kl": 0.130859375, |
| "learning_rate": 9.302088639836984e-07, |
| "loss": 0.0052, |
| "reward": 1.424567699432373, |
| "reward_std": 0.2551300157792866, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/cosine_rewards": 0.0028896235453430563, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0001969077275134623, |
| "step": 274 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 149.296875, |
| "epoch": 0.1400916963830871, |
| "grad_norm": 9.330566123712217, |
| "kl": 0.1240234375, |
| "learning_rate": 9.299541518084564e-07, |
| "loss": 0.005, |
| "reward": 1.2650930285453796, |
| "reward_std": 0.42955365777015686, |
| "rewards/accuracy_reward": 0.32499999552965164, |
| "rewards/cosine_rewards": 0.0025930306874215603, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 275 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 207.75, |
| "epoch": 0.14060112073357106, |
| "grad_norm": 8.495983071057866, |
| "kl": 0.11962890625, |
| "learning_rate": 9.296994396332144e-07, |
| "loss": 0.0048, |
| "reward": 1.8627826571464539, |
| "reward_std": 0.2839447557926178, |
| "rewards/accuracy_reward": 0.859375, |
| "rewards/cosine_rewards": 0.019193909130990505, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0001612851265235804, |
| "step": 276 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 162.515625, |
| "epoch": 0.14111054508405502, |
| "grad_norm": 12.855580556594866, |
| "kl": 0.14306640625, |
| "learning_rate": 9.294447274579724e-07, |
| "loss": 0.0057, |
| "reward": 1.5162805318832397, |
| "reward_std": 0.6588033437728882, |
| "rewards/accuracy_reward": 0.6343750059604645, |
| "rewards/cosine_rewards": -0.008702149149030447, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -1.7208149074576795e-05, |
| "step": 277 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 281.484375, |
| "epoch": 0.14161996943453897, |
| "grad_norm": 10.357622467045976, |
| "kl": 0.102783203125, |
| "learning_rate": 9.291900152827304e-07, |
| "loss": 0.0041, |
| "reward": 1.1203789710998535, |
| "reward_std": 0.6814777851104736, |
| "rewards/accuracy_reward": 0.17812500894069672, |
| "rewards/cosine_rewards": -0.010577938985079527, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0002930604387074709, |
| "step": 278 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 235.390625, |
| "epoch": 0.14212939378502293, |
| "grad_norm": 23.483530147678458, |
| "kl": 0.114013671875, |
| "learning_rate": 9.289353031074884e-07, |
| "loss": 0.0046, |
| "reward": 1.3522316813468933, |
| "reward_std": 0.28182537853717804, |
| "rewards/accuracy_reward": 0.3812500238418579, |
| "rewards/cosine_rewards": 0.0023158364929258823, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -8.418447396252304e-05, |
| "step": 279 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 271.96875, |
| "epoch": 0.14263881813550688, |
| "grad_norm": 5.804128123317947, |
| "kl": 0.109619140625, |
| "learning_rate": 9.286805909322465e-07, |
| "loss": 0.0044, |
| "reward": 1.2828457355499268, |
| "reward_std": 0.5574119389057159, |
| "rewards/accuracy_reward": 0.3500000163912773, |
| "rewards/cosine_rewards": -0.004654169548302889, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": 0.0, |
| "step": 280 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 346.984375, |
| "epoch": 0.14314824248599084, |
| "grad_norm": 9.062411976412948, |
| "kl": 0.09130859375, |
| "learning_rate": 9.284258787570045e-07, |
| "loss": 0.0037, |
| "reward": 1.9385767579078674, |
| "reward_std": 0.3151838555932045, |
| "rewards/accuracy_reward": 0.9437500238418579, |
| "rewards/cosine_rewards": 0.04203657992184162, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0003348248792462982, |
| "step": 281 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 395.875, |
| "epoch": 0.1436576668364748, |
| "grad_norm": 8.221976106115973, |
| "kl": 0.104248046875, |
| "learning_rate": 9.281711665817625e-07, |
| "loss": 0.0042, |
| "reward": 1.323907494544983, |
| "reward_std": 0.6098371148109436, |
| "rewards/accuracy_reward": 0.40312501788139343, |
| "rewards/cosine_rewards": 0.014727211673744023, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": -0.00019470852021186147, |
| "step": 282 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 603.21875, |
| "epoch": 0.14416709118695872, |
| "grad_norm": 7.812929807725803, |
| "kl": 0.084228515625, |
| "learning_rate": 9.279164544065206e-07, |
| "loss": 0.0034, |
| "reward": 1.3660696744918823, |
| "reward_std": 0.6207956671714783, |
| "rewards/accuracy_reward": 0.46562500298023224, |
| "rewards/cosine_rewards": -0.005538210505619645, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": -0.0002670584217412397, |
| "step": 283 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 525.6875, |
| "epoch": 0.14467651553744268, |
| "grad_norm": 8.491615227525342, |
| "kl": 0.08056640625, |
| "learning_rate": 9.276617422312786e-07, |
| "loss": 0.0032, |
| "reward": 1.3353699743747711, |
| "reward_std": 0.5946642160415649, |
| "rewards/accuracy_reward": 0.40937500447034836, |
| "rewards/cosine_rewards": -0.02690817415714264, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0002218634108430706, |
| "step": 284 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 599.984375, |
| "epoch": 0.14518593988792663, |
| "grad_norm": 16.68462964732329, |
| "kl": 0.077880859375, |
| "learning_rate": 9.274070300560366e-07, |
| "loss": 0.0031, |
| "reward": 0.9605185687541962, |
| "reward_std": 0.7793702185153961, |
| "rewards/accuracy_reward": 0.09999999031424522, |
| "rewards/cosine_rewards": -0.06100003980100155, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.0003564156068023294, |
| "step": 285 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 649.3125, |
| "epoch": 0.1456953642384106, |
| "grad_norm": 10.597989534798653, |
| "kl": 0.068115234375, |
| "learning_rate": 9.271523178807946e-07, |
| "loss": 0.0027, |
| "reward": 1.1927469968795776, |
| "reward_std": 1.0099957585334778, |
| "rewards/accuracy_reward": 0.34999997913837433, |
| "rewards/cosine_rewards": -0.04736426845192909, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -0.0005137407861184329, |
| "step": 286 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 621.234375, |
| "epoch": 0.14620478858889455, |
| "grad_norm": 5.399955402557674, |
| "kl": 0.072265625, |
| "learning_rate": 9.268976057055527e-07, |
| "loss": 0.0029, |
| "reward": 0.821646511554718, |
| "reward_std": 0.9464232325553894, |
| "rewards/accuracy_reward": 0.03749999776482582, |
| "rewards/cosine_rewards": -0.10573448240756989, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -0.0007440397967002355, |
| "step": 287 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 646.796875, |
| "epoch": 0.1467142129393785, |
| "grad_norm": 5.9108976297695355, |
| "kl": 0.075439453125, |
| "learning_rate": 9.266428935303107e-07, |
| "loss": 0.003, |
| "reward": 1.8053097128868103, |
| "reward_std": 0.5278272330760956, |
| "rewards/accuracy_reward": 0.7749999761581421, |
| "rewards/cosine_rewards": 0.061956772580742836, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0003969733224948868, |
| "step": 288 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 628.484375, |
| "epoch": 0.14722363728986246, |
| "grad_norm": 4.280094122642851, |
| "kl": 0.0692138671875, |
| "learning_rate": 9.263881813550687e-07, |
| "loss": 0.0028, |
| "reward": 0.7580513060092926, |
| "reward_std": 0.9215057492256165, |
| "rewards/accuracy_reward": -0.04062502086162567, |
| "rewards/cosine_rewards": -0.1223737820982933, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.0008249446109402925, |
| "step": 289 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 697.03125, |
| "epoch": 0.1477330616403464, |
| "grad_norm": 4.704795726585343, |
| "kl": 0.068359375, |
| "learning_rate": 9.261334691798267e-07, |
| "loss": 0.0027, |
| "reward": 1.0915009379386902, |
| "reward_std": 0.6004486382007599, |
| "rewards/accuracy_reward": 0.21249999105930328, |
| "rewards/cosine_rewards": -0.05759305879473686, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0009060115553438663, |
| "step": 290 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 739.375, |
| "epoch": 0.14824248599083037, |
| "grad_norm": 5.468968593755717, |
| "kl": 0.065185546875, |
| "learning_rate": 9.258787570045847e-07, |
| "loss": 0.0026, |
| "reward": 1.328648567199707, |
| "reward_std": 0.8502229452133179, |
| "rewards/accuracy_reward": 0.40312500298023224, |
| "rewards/cosine_rewards": -0.027191368862986565, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.00041003923979587853, |
| "step": 291 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 800.390625, |
| "epoch": 0.14875191034131433, |
| "grad_norm": 2.644913201802289, |
| "kl": 0.07861328125, |
| "learning_rate": 9.256240448293427e-07, |
| "loss": 0.0031, |
| "reward": 1.5775163769721985, |
| "reward_std": 0.6978716552257538, |
| "rewards/accuracy_reward": 0.6562500149011612, |
| "rewards/cosine_rewards": 0.031187113374471664, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -0.0005458263913169503, |
| "step": 292 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 969.328125, |
| "epoch": 0.14926133469179828, |
| "grad_norm": 3.858694298808609, |
| "kl": 0.0548095703125, |
| "learning_rate": 9.253693326541008e-07, |
| "loss": 0.0022, |
| "reward": 0.39561687409877777, |
| "reward_std": 1.1356619894504547, |
| "rewards/accuracy_reward": -0.1625000238418579, |
| "rewards/cosine_rewards": -0.23805859684944153, |
| "rewards/format_reward": 0.796875, |
| "rewards/repetition_rewards": -0.0006995665607973933, |
| "step": 293 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1044.703125, |
| "epoch": 0.1497707590422822, |
| "grad_norm": 2.0660275837501643, |
| "kl": 0.0902099609375, |
| "learning_rate": 9.251146204788588e-07, |
| "loss": 0.0036, |
| "reward": 1.0626700818538666, |
| "reward_std": 1.1662874221801758, |
| "rewards/accuracy_reward": 0.3531249985098839, |
| "rewards/cosine_rewards": -0.055325835943222046, |
| "rewards/format_reward": 0.765625, |
| "rewards/repetition_rewards": -0.0007540385995525867, |
| "step": 294 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 934.28125, |
| "epoch": 0.15028018339276616, |
| "grad_norm": 7.548230617974183, |
| "kl": 0.0538330078125, |
| "learning_rate": 9.248599083036168e-07, |
| "loss": 0.0022, |
| "reward": 1.2535955309867859, |
| "reward_std": 1.0525287985801697, |
| "rewards/accuracy_reward": 0.3750000223517418, |
| "rewards/cosine_rewards": -0.04287016252055764, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.00040922046173363924, |
| "step": 295 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 790.09375, |
| "epoch": 0.15078960774325012, |
| "grad_norm": 3.7401563979919654, |
| "kl": 0.0584716796875, |
| "learning_rate": 9.246051961283748e-07, |
| "loss": 0.0023, |
| "reward": 1.1489249467849731, |
| "reward_std": 0.5376773178577423, |
| "rewards/accuracy_reward": 0.2937499899417162, |
| "rewards/cosine_rewards": -0.08189126010984182, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0004337812424637377, |
| "step": 296 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 836.375, |
| "epoch": 0.15129903209373408, |
| "grad_norm": 2.8153067499507105, |
| "kl": 0.0618896484375, |
| "learning_rate": 9.243504839531329e-07, |
| "loss": 0.0025, |
| "reward": 1.3525272011756897, |
| "reward_std": 0.8126451969146729, |
| "rewards/accuracy_reward": 0.4906250238418579, |
| "rewards/cosine_rewards": -0.012518584728240967, |
| "rewards/format_reward": 0.875, |
| "rewards/repetition_rewards": -0.000579186889808625, |
| "step": 297 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 890.625, |
| "epoch": 0.15180845644421803, |
| "grad_norm": 5.316949650260697, |
| "kl": 0.0552978515625, |
| "learning_rate": 9.240957717778909e-07, |
| "loss": 0.0022, |
| "reward": 1.2640092372894287, |
| "reward_std": 0.8870376944541931, |
| "rewards/accuracy_reward": 0.4062499850988388, |
| "rewards/cosine_rewards": -0.0009442958980798721, |
| "rewards/format_reward": 0.859375, |
| "rewards/repetition_rewards": -0.0006714609917253256, |
| "step": 298 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 813.953125, |
| "epoch": 0.152317880794702, |
| "grad_norm": 3.8825478721674953, |
| "kl": 0.0574951171875, |
| "learning_rate": 9.23841059602649e-07, |
| "loss": 0.0023, |
| "reward": 1.2717376947402954, |
| "reward_std": 0.830648809671402, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/cosine_rewards": -0.03994514420628548, |
| "rewards/format_reward": 0.875, |
| "rewards/repetition_rewards": -0.0008170758956111968, |
| "step": 299 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 752.140625, |
| "epoch": 0.15282730514518594, |
| "grad_norm": 5.06920521582769, |
| "kl": 0.059814453125, |
| "learning_rate": 9.235863474274071e-07, |
| "loss": 0.0024, |
| "reward": 1.1217154264450073, |
| "reward_std": 0.8524642586708069, |
| "rewards/accuracy_reward": 0.24062498286366463, |
| "rewards/cosine_rewards": -0.04011305421590805, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.0006714656192343682, |
| "step": 300 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 684.625, |
| "epoch": 0.1533367294956699, |
| "grad_norm": 8.555507127767159, |
| "kl": 0.0672607421875, |
| "learning_rate": 9.233316352521651e-07, |
| "loss": 0.0027, |
| "reward": 1.1471417546272278, |
| "reward_std": 0.7909112870693207, |
| "rewards/accuracy_reward": 0.2656249962747097, |
| "rewards/cosine_rewards": -0.039835451170802116, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.0005227623041719198, |
| "step": 301 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 667.78125, |
| "epoch": 0.15384615384615385, |
| "grad_norm": 2.9040824114504877, |
| "kl": 0.064697265625, |
| "learning_rate": 9.230769230769231e-07, |
| "loss": 0.0026, |
| "reward": 0.9261243343353271, |
| "reward_std": 0.668161928653717, |
| "rewards/accuracy_reward": 0.1281249988824129, |
| "rewards/cosine_rewards": -0.06077958270907402, |
| "rewards/format_reward": 0.859375, |
| "rewards/repetition_rewards": -0.0005960852140560746, |
| "step": 302 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 702.484375, |
| "epoch": 0.1543555781966378, |
| "grad_norm": 4.4461209381275655, |
| "kl": 0.06298828125, |
| "learning_rate": 9.228222109016812e-07, |
| "loss": 0.0025, |
| "reward": 1.506935715675354, |
| "reward_std": 0.6653757691383362, |
| "rewards/accuracy_reward": 0.5468749850988388, |
| "rewards/cosine_rewards": 0.03871871158480644, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.0005329845298547298, |
| "step": 303 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 633.140625, |
| "epoch": 0.15486500254712177, |
| "grad_norm": 3.9254091150548933, |
| "kl": 0.069091796875, |
| "learning_rate": 9.225674987264391e-07, |
| "loss": 0.0028, |
| "reward": 1.3886016011238098, |
| "reward_std": 0.9017740190029144, |
| "rewards/accuracy_reward": 0.5749999731779099, |
| "rewards/cosine_rewards": -0.02933959849178791, |
| "rewards/format_reward": 0.84375, |
| "rewards/repetition_rewards": -0.000808820070233196, |
| "step": 304 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 648.625, |
| "epoch": 0.1553744268976057, |
| "grad_norm": 6.070022774209878, |
| "kl": 0.068115234375, |
| "learning_rate": 9.223127865511971e-07, |
| "loss": 0.0027, |
| "reward": 1.6925800442695618, |
| "reward_std": 0.6231902837753296, |
| "rewards/accuracy_reward": 0.6625000238418579, |
| "rewards/cosine_rewards": 0.06164960749447346, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0003195497556589544, |
| "step": 305 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 614.15625, |
| "epoch": 0.15588385124808965, |
| "grad_norm": 11.091865062468658, |
| "kl": 0.317138671875, |
| "learning_rate": 9.220580743759551e-07, |
| "loss": 0.0127, |
| "reward": 1.5423057079315186, |
| "reward_std": 0.3847469687461853, |
| "rewards/accuracy_reward": 0.5468749962747097, |
| "rewards/cosine_rewards": 0.05880427733063698, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0008736126183066517, |
| "step": 306 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 612.578125, |
| "epoch": 0.1563932755985736, |
| "grad_norm": 3.103459103182676, |
| "kl": 0.0673828125, |
| "learning_rate": 9.218033622007132e-07, |
| "loss": 0.0027, |
| "reward": 1.6744784712791443, |
| "reward_std": 0.659433513879776, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/cosine_rewards": 0.06587037723511457, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.0007668640464544296, |
| "step": 307 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 657.265625, |
| "epoch": 0.15690269994905756, |
| "grad_norm": 4.50781660421839, |
| "kl": 0.068115234375, |
| "learning_rate": 9.215486500254712e-07, |
| "loss": 0.0027, |
| "reward": 1.145881563425064, |
| "reward_std": 1.0458006858825684, |
| "rewards/accuracy_reward": 0.34062499180436134, |
| "rewards/cosine_rewards": -0.03727734461426735, |
| "rewards/format_reward": 0.84375, |
| "rewards/repetition_rewards": -0.0012161528575234115, |
| "step": 308 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 771.5625, |
| "epoch": 0.15741212429954152, |
| "grad_norm": 6.920330329994176, |
| "kl": 0.064208984375, |
| "learning_rate": 9.212939378502292e-07, |
| "loss": 0.0026, |
| "reward": 0.8615269958972931, |
| "reward_std": 0.9165626764297485, |
| "rewards/accuracy_reward": 0.140625, |
| "rewards/cosine_rewards": -0.07544910162687302, |
| "rewards/format_reward": 0.796875, |
| "rewards/repetition_rewards": -0.0005239159800112247, |
| "step": 309 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 758.96875, |
| "epoch": 0.15792154865002547, |
| "grad_norm": 11.706103376756111, |
| "kl": 0.056396484375, |
| "learning_rate": 9.210392256749873e-07, |
| "loss": 0.0023, |
| "reward": 1.567901074886322, |
| "reward_std": 1.1157508492469788, |
| "rewards/accuracy_reward": 0.6437499821186066, |
| "rewards/cosine_rewards": 0.08178849518299103, |
| "rewards/format_reward": 0.84375, |
| "rewards/repetition_rewards": -0.0013874000869691372, |
| "step": 310 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 694.390625, |
| "epoch": 0.15843097300050943, |
| "grad_norm": 4.42128730127276, |
| "kl": 0.062255859375, |
| "learning_rate": 9.207845134997453e-07, |
| "loss": 0.0025, |
| "reward": 0.944963201880455, |
| "reward_std": 0.9125352203845978, |
| "rewards/accuracy_reward": 0.16249998658895493, |
| "rewards/cosine_rewards": -0.04496639594435692, |
| "rewards/format_reward": 0.828125, |
| "rewards/repetition_rewards": -0.0006953877746127546, |
| "step": 311 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 812.46875, |
| "epoch": 0.15894039735099338, |
| "grad_norm": 5.888648418898334, |
| "kl": 0.0587158203125, |
| "learning_rate": 9.205298013245033e-07, |
| "loss": 0.0023, |
| "reward": 0.6825668215751648, |
| "reward_std": 1.0514086484909058, |
| "rewards/accuracy_reward": 0.04999999701976776, |
| "rewards/cosine_rewards": -0.13240730948746204, |
| "rewards/format_reward": 0.765625, |
| "rewards/repetition_rewards": -0.0006508340884465724, |
| "step": 312 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 703.703125, |
| "epoch": 0.15944982170147734, |
| "grad_norm": 5.201587660434957, |
| "kl": 0.0626220703125, |
| "learning_rate": 9.202750891492613e-07, |
| "loss": 0.0025, |
| "reward": 0.849999725818634, |
| "reward_std": 1.2490254640579224, |
| "rewards/accuracy_reward": 0.16249999590218067, |
| "rewards/cosine_rewards": -0.04634671099483967, |
| "rewards/format_reward": 0.734375, |
| "rewards/repetition_rewards": -0.0005285786173772067, |
| "step": 313 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 732.84375, |
| "epoch": 0.1599592460519613, |
| "grad_norm": 41.79369545195822, |
| "kl": 0.0654296875, |
| "learning_rate": 9.200203769740194e-07, |
| "loss": 0.0026, |
| "reward": 1.359117031097412, |
| "reward_std": 1.1281075477600098, |
| "rewards/accuracy_reward": 0.49687501788139343, |
| "rewards/cosine_rewards": 0.06599474605172873, |
| "rewards/format_reward": 0.796875, |
| "rewards/repetition_rewards": -0.0006276974454522133, |
| "step": 314 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 633.609375, |
| "epoch": 0.16046867040244522, |
| "grad_norm": 5.659472242303819, |
| "kl": 0.090087890625, |
| "learning_rate": 9.197656647987774e-07, |
| "loss": 0.0036, |
| "reward": 1.149334043264389, |
| "reward_std": 1.1551178693771362, |
| "rewards/accuracy_reward": 0.3593749962747097, |
| "rewards/cosine_rewards": 0.025284748524427414, |
| "rewards/format_reward": 0.765625, |
| "rewards/repetition_rewards": -0.0009507373906672001, |
| "step": 315 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 704.09375, |
| "epoch": 0.16097809475292918, |
| "grad_norm": 5.702114455603425, |
| "kl": 0.071044921875, |
| "learning_rate": 9.195109526235354e-07, |
| "loss": 0.0028, |
| "reward": 1.3667227029800415, |
| "reward_std": 0.6237545907497406, |
| "rewards/accuracy_reward": 0.4031249713152647, |
| "rewards/cosine_rewards": 0.01146969199180603, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0009969472303055227, |
| "step": 316 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 615.984375, |
| "epoch": 0.16148751910341314, |
| "grad_norm": 7.41926766137556, |
| "kl": 0.072998046875, |
| "learning_rate": 9.192562404482935e-07, |
| "loss": 0.0029, |
| "reward": 1.2655977010726929, |
| "reward_std": 0.7071200311183929, |
| "rewards/accuracy_reward": 0.37187498807907104, |
| "rewards/cosine_rewards": -0.0119027029722929, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": -0.00062458252068609, |
| "step": 317 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 655.65625, |
| "epoch": 0.1619969434538971, |
| "grad_norm": 6.476802877604566, |
| "kl": 0.072265625, |
| "learning_rate": 9.190015282730514e-07, |
| "loss": 0.0029, |
| "reward": 1.4926868677139282, |
| "reward_std": 0.5651115030050278, |
| "rewards/accuracy_reward": 0.4906250089406967, |
| "rewards/cosine_rewards": 0.05030408315360546, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0013672530185431242, |
| "step": 318 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 694.4375, |
| "epoch": 0.16250636780438105, |
| "grad_norm": 5.651262358287829, |
| "kl": 0.078125, |
| "learning_rate": 9.187468160978094e-07, |
| "loss": 0.0031, |
| "reward": 1.6467930674552917, |
| "reward_std": 0.6130897700786591, |
| "rewards/accuracy_reward": 0.6343749761581421, |
| "rewards/cosine_rewards": 0.060116952285170555, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0008239042945206165, |
| "step": 319 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 627.1875, |
| "epoch": 0.163015792154865, |
| "grad_norm": 8.752414113774137, |
| "kl": 0.087890625, |
| "learning_rate": 9.184921039225674e-07, |
| "loss": 0.0035, |
| "reward": 1.2824658155441284, |
| "reward_std": 0.6804981231689453, |
| "rewards/accuracy_reward": 0.4281250089406967, |
| "rewards/cosine_rewards": -0.004211767576634884, |
| "rewards/format_reward": 0.859375, |
| "rewards/repetition_rewards": -0.0008223777404054999, |
| "step": 320 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.421875, |
| "epoch": 0.16352521650534896, |
| "grad_norm": 24.555397071153298, |
| "kl": 0.10791015625, |
| "learning_rate": 9.182373917473255e-07, |
| "loss": 0.0043, |
| "reward": 1.5703404545783997, |
| "reward_std": 0.6466428339481354, |
| "rewards/accuracy_reward": 0.5781249850988388, |
| "rewards/cosine_rewards": 0.023975687101483345, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0005103159819555003, |
| "step": 321 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 679.453125, |
| "epoch": 0.1640346408558329, |
| "grad_norm": 10.65571265954387, |
| "kl": 0.0751953125, |
| "learning_rate": 9.179826795720835e-07, |
| "loss": 0.003, |
| "reward": 1.6510714292526245, |
| "reward_std": 1.0072646141052246, |
| "rewards/accuracy_reward": 0.7062499523162842, |
| "rewards/cosine_rewards": 0.0703657679259777, |
| "rewards/format_reward": 0.875, |
| "rewards/repetition_rewards": -0.0005443187110358849, |
| "step": 322 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 815.84375, |
| "epoch": 0.16454406520631687, |
| "grad_norm": 3.0291641455139042, |
| "kl": 0.0577392578125, |
| "learning_rate": 9.177279673968415e-07, |
| "loss": 0.0023, |
| "reward": 0.7281904220581055, |
| "reward_std": 0.7364227771759033, |
| "rewards/accuracy_reward": -0.07187500596046448, |
| "rewards/cosine_rewards": -0.15170371532440186, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0013558552600443363, |
| "step": 323 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 644.84375, |
| "epoch": 0.16505348955680083, |
| "grad_norm": 4.950572350987556, |
| "kl": 0.081787109375, |
| "learning_rate": 9.174732552215996e-07, |
| "loss": 0.0033, |
| "reward": 1.5456467270851135, |
| "reward_std": 0.3990190625190735, |
| "rewards/accuracy_reward": 0.5750000178813934, |
| "rewards/cosine_rewards": 0.03384638950228691, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.000699635551427491, |
| "step": 324 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 692.96875, |
| "epoch": 0.16556291390728478, |
| "grad_norm": 5.615028773473959, |
| "kl": 0.0675048828125, |
| "learning_rate": 9.172185430463576e-07, |
| "loss": 0.0027, |
| "reward": 1.4643962979316711, |
| "reward_std": 0.538501039147377, |
| "rewards/accuracy_reward": 0.4906250089406967, |
| "rewards/cosine_rewards": 0.021510865539312363, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.000864640751387924, |
| "step": 325 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 748.71875, |
| "epoch": 0.1660723382577687, |
| "grad_norm": 12.207464085563803, |
| "kl": 0.071533203125, |
| "learning_rate": 9.169638308711156e-07, |
| "loss": 0.0029, |
| "reward": 1.1908642947673798, |
| "reward_std": 0.8150831162929535, |
| "rewards/accuracy_reward": 0.3156250100582838, |
| "rewards/cosine_rewards": 0.0008599106222391129, |
| "rewards/format_reward": 0.875, |
| "rewards/repetition_rewards": -0.000620643695583567, |
| "step": 326 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 692.828125, |
| "epoch": 0.16658176260825266, |
| "grad_norm": 4.633110149052728, |
| "kl": 0.067626953125, |
| "learning_rate": 9.167091186958737e-07, |
| "loss": 0.0027, |
| "reward": 1.3975687623023987, |
| "reward_std": 0.6602180898189545, |
| "rewards/accuracy_reward": 0.40937499701976776, |
| "rewards/cosine_rewards": 0.020020989701151848, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0005772198055638, |
| "step": 327 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 886.234375, |
| "epoch": 0.16709118695873662, |
| "grad_norm": 11.480819900317456, |
| "kl": 0.0567626953125, |
| "learning_rate": 9.164544065206317e-07, |
| "loss": 0.0023, |
| "reward": 1.3322511315345764, |
| "reward_std": 0.7808408439159393, |
| "rewards/accuracy_reward": 0.3812499865889549, |
| "rewards/cosine_rewards": -0.0007541030645370483, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0013698027469217777, |
| "step": 328 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 869.515625, |
| "epoch": 0.16760061130922058, |
| "grad_norm": 7.997306620734284, |
| "kl": 0.0577392578125, |
| "learning_rate": 9.161996943453897e-07, |
| "loss": 0.0023, |
| "reward": 1.1871361136436462, |
| "reward_std": 0.9155566692352295, |
| "rewards/accuracy_reward": 0.3218750059604645, |
| "rewards/cosine_rewards": -0.0396097619086504, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": -0.0013791794190183282, |
| "step": 329 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 879.125, |
| "epoch": 0.16811003565970453, |
| "grad_norm": 3.8086916601970175, |
| "kl": 0.05810546875, |
| "learning_rate": 9.159449821701477e-07, |
| "loss": 0.0023, |
| "reward": 1.3816418051719666, |
| "reward_std": 0.8457719385623932, |
| "rewards/accuracy_reward": 0.43437500298023224, |
| "rewards/cosine_rewards": 0.026831649709492922, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.0014398820349015296, |
| "step": 330 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1077.125, |
| "epoch": 0.1686194600101885, |
| "grad_norm": 3.3635095209387, |
| "kl": 0.05029296875, |
| "learning_rate": 9.156902699949058e-07, |
| "loss": 0.002, |
| "reward": 1.4233552813529968, |
| "reward_std": 0.8923040926456451, |
| "rewards/accuracy_reward": 0.5718750357627869, |
| "rewards/cosine_rewards": 0.05618499033153057, |
| "rewards/format_reward": 0.796875, |
| "rewards/repetition_rewards": -0.0015796992811374366, |
| "step": 331 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1025.625, |
| "epoch": 0.16912888436067244, |
| "grad_norm": 2.6795836472345886, |
| "kl": 0.053955078125, |
| "learning_rate": 9.154355578196637e-07, |
| "loss": 0.0022, |
| "reward": 1.5009884238243103, |
| "reward_std": 0.7616147696971893, |
| "rewards/accuracy_reward": 0.46562496945261955, |
| "rewards/cosine_rewards": 0.08306753821671009, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0008290903642773628, |
| "step": 332 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1130.390625, |
| "epoch": 0.1696383087111564, |
| "grad_norm": 2.8897374461041068, |
| "kl": 0.05615234375, |
| "learning_rate": 9.151808456444217e-07, |
| "loss": 0.0022, |
| "reward": 0.9568201899528503, |
| "reward_std": 0.883324146270752, |
| "rewards/accuracy_reward": 0.18437499552965164, |
| "rewards/cosine_rewards": -0.14675537310540676, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.0026744193164631724, |
| "step": 333 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1140.796875, |
| "epoch": 0.17014773306164035, |
| "grad_norm": 3.24780768226595, |
| "kl": 0.053955078125, |
| "learning_rate": 9.149261334691798e-07, |
| "loss": 0.0022, |
| "reward": 0.4763996750116348, |
| "reward_std": 1.3050541877746582, |
| "rewards/accuracy_reward": -0.07187501713633537, |
| "rewards/cosine_rewards": -0.26269275695085526, |
| "rewards/format_reward": 0.8125, |
| "rewards/repetition_rewards": -0.0015325736021623015, |
| "step": 334 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1162.21875, |
| "epoch": 0.1706571574121243, |
| "grad_norm": 7.96370989509143, |
| "kl": 0.0509033203125, |
| "learning_rate": 9.146714212939378e-07, |
| "loss": 0.002, |
| "reward": 1.0168579816818237, |
| "reward_std": 1.0622537732124329, |
| "rewards/accuracy_reward": 0.23749998211860657, |
| "rewards/cosine_rewards": -0.06289426982402802, |
| "rewards/format_reward": 0.84375, |
| "rewards/repetition_rewards": -0.0014976929523982108, |
| "step": 335 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1223.59375, |
| "epoch": 0.17116658176260827, |
| "grad_norm": 5.774523253643062, |
| "kl": 0.083251953125, |
| "learning_rate": 9.144167091186958e-07, |
| "loss": 0.0033, |
| "reward": 0.9260146915912628, |
| "reward_std": 1.3471828699111938, |
| "rewards/accuracy_reward": 0.26249998807907104, |
| "rewards/cosine_rewards": -0.11641103774309158, |
| "rewards/format_reward": 0.78125, |
| "rewards/repetition_rewards": -0.0013242715504020452, |
| "step": 336 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1069.484375, |
| "epoch": 0.1716760061130922, |
| "grad_norm": 7.732047491992381, |
| "kl": 0.0555419921875, |
| "learning_rate": 9.141619969434538e-07, |
| "loss": 0.0022, |
| "reward": 1.0389263331890106, |
| "reward_std": 0.9250738620758057, |
| "rewards/accuracy_reward": 0.20937499403953552, |
| "rewards/cosine_rewards": -0.09034883230924606, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.001974849379621446, |
| "step": 337 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 846.234375, |
| "epoch": 0.17218543046357615, |
| "grad_norm": 6.146520612031918, |
| "kl": 0.06689453125, |
| "learning_rate": 9.139072847682119e-07, |
| "loss": 0.0027, |
| "reward": 1.5287657380104065, |
| "reward_std": 0.7281034886837006, |
| "rewards/accuracy_reward": 0.5218749940395355, |
| "rewards/cosine_rewards": 0.055092147551476955, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0013264745939522982, |
| "step": 338 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 878.59375, |
| "epoch": 0.1726948548140601, |
| "grad_norm": 5.859040533770109, |
| "kl": 0.059814453125, |
| "learning_rate": 9.136525725929699e-07, |
| "loss": 0.0024, |
| "reward": 1.309591829776764, |
| "reward_std": 0.8282720148563385, |
| "rewards/accuracy_reward": 0.3781249839812517, |
| "rewards/cosine_rewards": 0.02607971802353859, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": -0.0008628710638731718, |
| "step": 339 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 706.234375, |
| "epoch": 0.17320427916454406, |
| "grad_norm": 4.068095402441544, |
| "kl": 0.066162109375, |
| "learning_rate": 9.133978604177279e-07, |
| "loss": 0.0026, |
| "reward": 1.1101016998291016, |
| "reward_std": 0.7019257247447968, |
| "rewards/accuracy_reward": 0.20624998956918716, |
| "rewards/cosine_rewards": -0.03270102944225073, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0009472573874518275, |
| "step": 340 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 782.09375, |
| "epoch": 0.17371370351502802, |
| "grad_norm": 8.092723935778833, |
| "kl": 0.07080078125, |
| "learning_rate": 9.13143148242486e-07, |
| "loss": 0.0028, |
| "reward": 1.3624014258384705, |
| "reward_std": 0.6876442432403564, |
| "rewards/accuracy_reward": 0.40937499701976776, |
| "rewards/cosine_rewards": 0.0012194328010082245, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0013180217938497663, |
| "step": 341 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 725.109375, |
| "epoch": 0.17422312786551197, |
| "grad_norm": 8.756999335905311, |
| "kl": 0.130126953125, |
| "learning_rate": 9.12888436067244e-07, |
| "loss": 0.0052, |
| "reward": 1.1084296703338623, |
| "reward_std": 1.0551597476005554, |
| "rewards/accuracy_reward": 0.2343750037252903, |
| "rewards/cosine_rewards": -0.062263866886496544, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0011814486351795495, |
| "step": 342 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 664.5625, |
| "epoch": 0.17473255221599593, |
| "grad_norm": 4.670760280414413, |
| "kl": 0.07275390625, |
| "learning_rate": 9.12633723892002e-07, |
| "loss": 0.0029, |
| "reward": 1.379169523715973, |
| "reward_std": 0.6884946823120117, |
| "rewards/accuracy_reward": 0.40937499701976776, |
| "rewards/cosine_rewards": -0.014050468802452087, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0005300141347106546, |
| "step": 343 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 667.359375, |
| "epoch": 0.17524197656647988, |
| "grad_norm": 27.098795999332967, |
| "kl": 0.08056640625, |
| "learning_rate": 9.123790117167601e-07, |
| "loss": 0.0032, |
| "reward": 1.6130830645561218, |
| "reward_std": 0.44565099477767944, |
| "rewards/accuracy_reward": 0.5781249701976776, |
| "rewards/cosine_rewards": 0.051279583014547825, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0006965193606447428, |
| "step": 344 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 654.65625, |
| "epoch": 0.17575140091696384, |
| "grad_norm": 10.133659329005896, |
| "kl": 0.075439453125, |
| "learning_rate": 9.121242995415181e-07, |
| "loss": 0.003, |
| "reward": 1.6888669729232788, |
| "reward_std": 0.506424754858017, |
| "rewards/accuracy_reward": 0.690625011920929, |
| "rewards/cosine_rewards": 0.06190674379467964, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0011647465871647, |
| "step": 345 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.6875, |
| "epoch": 0.1762608252674478, |
| "grad_norm": 5.129606334927334, |
| "kl": 0.07958984375, |
| "learning_rate": 9.11869587366276e-07, |
| "loss": 0.0032, |
| "reward": 1.2593636512756348, |
| "reward_std": 0.41098763048648834, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/cosine_rewards": -0.005569446831941605, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0006919201114214957, |
| "step": 346 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 696.46875, |
| "epoch": 0.17677024961793172, |
| "grad_norm": 11.616711482358948, |
| "kl": 0.074462890625, |
| "learning_rate": 9.11614875191034e-07, |
| "loss": 0.003, |
| "reward": 1.4469356536865234, |
| "reward_std": 0.6090122163295746, |
| "rewards/accuracy_reward": 0.46562500298023224, |
| "rewards/cosine_rewards": -0.0018481542356312275, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.001216164615470916, |
| "step": 347 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 660.0, |
| "epoch": 0.17727967396841568, |
| "grad_norm": 16.70982931235053, |
| "kl": 0.092041015625, |
| "learning_rate": 9.113601630157921e-07, |
| "loss": 0.0037, |
| "reward": 1.3860605359077454, |
| "reward_std": 0.5821886360645294, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/cosine_rewards": 0.01206381805241108, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0010032225982286036, |
| "step": 348 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 819.25, |
| "epoch": 0.17778909831889964, |
| "grad_norm": 10.312325223777075, |
| "kl": 0.0694580078125, |
| "learning_rate": 9.111054508405501e-07, |
| "loss": 0.0028, |
| "reward": 1.3597615957260132, |
| "reward_std": 0.5677385032176971, |
| "rewards/accuracy_reward": 0.4375000074505806, |
| "rewards/cosine_rewards": 0.0019306838512420654, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.0015441215364262462, |
| "step": 349 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 816.125, |
| "epoch": 0.1782985226693836, |
| "grad_norm": 3.8656926374469416, |
| "kl": 0.07080078125, |
| "learning_rate": 9.108507386653081e-07, |
| "loss": 0.0028, |
| "reward": 1.1428874135017395, |
| "reward_std": 0.40452495217323303, |
| "rewards/accuracy_reward": 0.21249999105930328, |
| "rewards/cosine_rewards": -0.05335182696580887, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0006357444362947717, |
| "step": 350 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 839.09375, |
| "epoch": 0.17880794701986755, |
| "grad_norm": 10.545025641089767, |
| "kl": 0.062744140625, |
| "learning_rate": 9.105960264900662e-07, |
| "loss": 0.0025, |
| "reward": 1.440682828426361, |
| "reward_std": 0.7122917473316193, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/cosine_rewards": 0.004494791850447655, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -0.001311894680839032, |
| "step": 351 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 745.34375, |
| "epoch": 0.1793173713703515, |
| "grad_norm": 5.236449549563, |
| "kl": 0.081787109375, |
| "learning_rate": 9.103413143148242e-07, |
| "loss": 0.0033, |
| "reward": 1.7106852531433105, |
| "reward_std": 0.4475601017475128, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/cosine_rewards": 0.07085046917200089, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0007901439967099577, |
| "step": 352 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 782.65625, |
| "epoch": 0.17982679572083546, |
| "grad_norm": 4.40390803368756, |
| "kl": 0.07568359375, |
| "learning_rate": 9.100866021395822e-07, |
| "loss": 0.003, |
| "reward": 1.321226179599762, |
| "reward_std": 0.5729265064001083, |
| "rewards/accuracy_reward": 0.3812500238418579, |
| "rewards/cosine_rewards": -0.04316529631614685, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.001233478484209627, |
| "step": 353 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 911.4375, |
| "epoch": 0.18033622007131941, |
| "grad_norm": 4.585368026245459, |
| "kl": 0.083740234375, |
| "learning_rate": 9.098318899643402e-07, |
| "loss": 0.0034, |
| "reward": 1.2610972821712494, |
| "reward_std": 0.5936008393764496, |
| "rewards/accuracy_reward": 0.3812499828636646, |
| "rewards/cosine_rewards": -0.02527322620153427, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": -0.0011294231517240405, |
| "step": 354 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 961.59375, |
| "epoch": 0.18084564442180337, |
| "grad_norm": 7.408796362493787, |
| "kl": 0.0693359375, |
| "learning_rate": 9.095771777890983e-07, |
| "loss": 0.0028, |
| "reward": 1.2509925812482834, |
| "reward_std": 0.5566798448562622, |
| "rewards/accuracy_reward": 0.3499999940395355, |
| "rewards/cosine_rewards": -0.034961797297000885, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.001545542269013822, |
| "step": 355 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 875.421875, |
| "epoch": 0.18135506877228733, |
| "grad_norm": 6.46350391356739, |
| "kl": 0.08251953125, |
| "learning_rate": 9.093224656138563e-07, |
| "loss": 0.0033, |
| "reward": 1.1181039810180664, |
| "reward_std": 0.674926146864891, |
| "rewards/accuracy_reward": 0.23749998956918716, |
| "rewards/cosine_rewards": -0.05567748658359051, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0012184783699922264, |
| "step": 356 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1032.265625, |
| "epoch": 0.18186449312277128, |
| "grad_norm": 6.831510020206218, |
| "kl": 0.0609130859375, |
| "learning_rate": 9.090677534386143e-07, |
| "loss": 0.0024, |
| "reward": 1.559360921382904, |
| "reward_std": 0.6407117247581482, |
| "rewards/accuracy_reward": 0.518750011920929, |
| "rewards/cosine_rewards": 0.057725198566913605, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0014893330517224967, |
| "step": 357 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1343.859375, |
| "epoch": 0.1823739174732552, |
| "grad_norm": 4.523279934522272, |
| "kl": 0.05419921875, |
| "learning_rate": 9.088130412633724e-07, |
| "loss": 0.0022, |
| "reward": 1.3108936548233032, |
| "reward_std": 1.3749122023582458, |
| "rewards/accuracy_reward": 0.4843749850988388, |
| "rewards/cosine_rewards": -0.015468426048755646, |
| "rewards/format_reward": 0.84375, |
| "rewards/repetition_rewards": -0.0017629386857151985, |
| "step": 358 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1283.875, |
| "epoch": 0.18288334182373917, |
| "grad_norm": 2.9278358224956733, |
| "kl": 0.046875, |
| "learning_rate": 9.085583290881304e-07, |
| "loss": 0.0019, |
| "reward": 0.899000346660614, |
| "reward_std": 1.2357721328735352, |
| "rewards/accuracy_reward": 0.20000001043081284, |
| "rewards/cosine_rewards": -0.12764177471399307, |
| "rewards/format_reward": 0.828125, |
| "rewards/repetition_rewards": -0.0014829274150542915, |
| "step": 359 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1351.765625, |
| "epoch": 0.18339276617422312, |
| "grad_norm": 5.747269025294299, |
| "kl": 0.05029296875, |
| "learning_rate": 9.083036169128883e-07, |
| "loss": 0.002, |
| "reward": 0.685440868139267, |
| "reward_std": 1.0775729417800903, |
| "rewards/accuracy_reward": 0.062499986961483955, |
| "rewards/cosine_rewards": -0.23488027602434158, |
| "rewards/format_reward": 0.859375, |
| "rewards/repetition_rewards": -0.0015538162551820278, |
| "step": 360 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1328.359375, |
| "epoch": 0.18390219052470708, |
| "grad_norm": 5.5666444239146, |
| "kl": 0.046630859375, |
| "learning_rate": 9.080489047376463e-07, |
| "loss": 0.0019, |
| "reward": 1.440912902355194, |
| "reward_std": 1.3688839673995972, |
| "rewards/accuracy_reward": 0.546875, |
| "rewards/cosine_rewards": 0.05198000371456146, |
| "rewards/format_reward": 0.84375, |
| "rewards/repetition_rewards": -0.0016921277856454253, |
| "step": 361 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1222.359375, |
| "epoch": 0.18441161487519103, |
| "grad_norm": 3.8469994374841496, |
| "kl": 0.063720703125, |
| "learning_rate": 9.077941925624044e-07, |
| "loss": 0.0025, |
| "reward": 1.2567678689956665, |
| "reward_std": 1.0096549689769745, |
| "rewards/accuracy_reward": 0.3531249761581421, |
| "rewards/cosine_rewards": -0.06352230161428452, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.001584898098371923, |
| "step": 362 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1071.90625, |
| "epoch": 0.184921039225675, |
| "grad_norm": 10.748025785151507, |
| "kl": 0.080810546875, |
| "learning_rate": 9.075394803871624e-07, |
| "loss": 0.0032, |
| "reward": 1.4773434400558472, |
| "reward_std": 0.7611989676952362, |
| "rewards/accuracy_reward": 0.518750011920929, |
| "rewards/cosine_rewards": 0.022458821535110474, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0013653661007992923, |
| "step": 363 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1068.84375, |
| "epoch": 0.18543046357615894, |
| "grad_norm": 6.961531569684862, |
| "kl": 0.0966796875, |
| "learning_rate": 9.072847682119204e-07, |
| "loss": 0.0039, |
| "reward": 1.3342331051826477, |
| "reward_std": 0.9612607657909393, |
| "rewards/accuracy_reward": 0.4906249940395355, |
| "rewards/cosine_rewards": -0.029875734820961952, |
| "rewards/format_reward": 0.875, |
| "rewards/repetition_rewards": -0.0015161921037361026, |
| "step": 364 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1151.765625, |
| "epoch": 0.1859398879266429, |
| "grad_norm": 4.787495962824196, |
| "kl": 0.0526123046875, |
| "learning_rate": 9.070300560366785e-07, |
| "loss": 0.0021, |
| "reward": 0.44851796329021454, |
| "reward_std": 0.6482652425765991, |
| "rewards/accuracy_reward": -0.18125002831220627, |
| "rewards/cosine_rewards": -0.3225611299276352, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0007959024223964661, |
| "step": 365 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1087.453125, |
| "epoch": 0.18644931227712686, |
| "grad_norm": 3.5882611622656704, |
| "kl": 0.05517578125, |
| "learning_rate": 9.067753438614365e-07, |
| "loss": 0.0022, |
| "reward": 1.0280417203903198, |
| "reward_std": 0.8365518152713776, |
| "rewards/accuracy_reward": 0.2093750163912773, |
| "rewards/cosine_rewards": -0.10154062137007713, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.0016676230588927865, |
| "step": 366 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 921.3125, |
| "epoch": 0.1869587366276108, |
| "grad_norm": 10.03430669886217, |
| "kl": 0.07080078125, |
| "learning_rate": 9.065206316861945e-07, |
| "loss": 0.0028, |
| "reward": 1.1769609451293945, |
| "reward_std": 0.880241334438324, |
| "rewards/accuracy_reward": 0.2656249925494194, |
| "rewards/cosine_rewards": -0.04036855325102806, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0014205531333573163, |
| "step": 367 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 841.6875, |
| "epoch": 0.18746816097809477, |
| "grad_norm": 27.23508906311087, |
| "kl": 0.07861328125, |
| "learning_rate": 9.062659195109526e-07, |
| "loss": 0.0031, |
| "reward": 1.685433030128479, |
| "reward_std": 0.49864277243614197, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/cosine_rewards": 0.1084844060242176, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -0.0011764070368371904, |
| "step": 368 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 737.03125, |
| "epoch": 0.1879775853285787, |
| "grad_norm": 18.41490109995637, |
| "kl": 0.08740234375, |
| "learning_rate": 9.060112073357106e-07, |
| "loss": 0.0035, |
| "reward": 1.3734083771705627, |
| "reward_std": 0.4119359850883484, |
| "rewards/accuracy_reward": 0.37812499701976776, |
| "rewards/cosine_rewards": 0.01141296117566526, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.000504543146234937, |
| "step": 369 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 707.796875, |
| "epoch": 0.18848700967906265, |
| "grad_norm": 35.58611363543668, |
| "kl": 0.084716796875, |
| "learning_rate": 9.057564951604686e-07, |
| "loss": 0.0034, |
| "reward": 1.6782256960868835, |
| "reward_std": 0.5156250298023224, |
| "rewards/accuracy_reward": 0.6343749761581421, |
| "rewards/cosine_rewards": 0.07835755217820406, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0032568235765211284, |
| "step": 370 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.015625, |
| "epoch": 0.1889964340295466, |
| "grad_norm": 7.982503076191807, |
| "kl": 0.086669921875, |
| "learning_rate": 9.055017829852266e-07, |
| "loss": 0.0035, |
| "reward": 1.760904848575592, |
| "reward_std": 0.49070215225219727, |
| "rewards/accuracy_reward": 0.690625011920929, |
| "rewards/cosine_rewards": 0.0864610131829977, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0005561279249377549, |
| "step": 371 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 716.359375, |
| "epoch": 0.18950585838003056, |
| "grad_norm": 10.914576943859945, |
| "kl": 0.077880859375, |
| "learning_rate": 9.052470708099847e-07, |
| "loss": 0.0031, |
| "reward": 1.9701185822486877, |
| "reward_std": 0.40086938440799713, |
| "rewards/accuracy_reward": 0.831250011920929, |
| "rewards/cosine_rewards": 0.1397455483675003, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -0.0008769762353040278, |
| "step": 372 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 738.546875, |
| "epoch": 0.19001528273051452, |
| "grad_norm": 6.034059111318338, |
| "kl": 0.08544921875, |
| "learning_rate": 9.049923586347427e-07, |
| "loss": 0.0034, |
| "reward": 1.8058127164840698, |
| "reward_std": 0.41330619156360626, |
| "rewards/accuracy_reward": 0.7468750178813934, |
| "rewards/cosine_rewards": 0.1067701168358326, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0009574841533321887, |
| "step": 373 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 728.0, |
| "epoch": 0.19052470708099847, |
| "grad_norm": 8.532753416320839, |
| "kl": 0.07861328125, |
| "learning_rate": 9.047376464595006e-07, |
| "loss": 0.0031, |
| "reward": 1.0842646658420563, |
| "reward_std": 0.44039003551006317, |
| "rewards/accuracy_reward": 0.15312500298023224, |
| "rewards/cosine_rewards": -0.052397772669792175, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0008375749748665839, |
| "step": 374 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 804.21875, |
| "epoch": 0.19103413143148243, |
| "grad_norm": 7.4599789196405375, |
| "kl": 0.078125, |
| "learning_rate": 9.044829342842587e-07, |
| "loss": 0.0031, |
| "reward": 0.974018394947052, |
| "reward_std": 0.5849625766277313, |
| "rewards/accuracy_reward": 0.09999999403953552, |
| "rewards/cosine_rewards": -0.10963174607604742, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0007248484616866335, |
| "step": 375 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 767.84375, |
| "epoch": 0.19154355578196638, |
| "grad_norm": 7.073657321582574, |
| "kl": 0.0703125, |
| "learning_rate": 9.042282221090167e-07, |
| "loss": 0.0028, |
| "reward": 0.914261519908905, |
| "reward_std": 0.7159627079963684, |
| "rewards/accuracy_reward": 0.09999998658895493, |
| "rewards/cosine_rewards": -0.13804471492767334, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0008187246276065707, |
| "step": 376 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 898.65625, |
| "epoch": 0.19205298013245034, |
| "grad_norm": 5.152812978669099, |
| "kl": 0.060791015625, |
| "learning_rate": 9.039735099337747e-07, |
| "loss": 0.0024, |
| "reward": 1.3070060014724731, |
| "reward_std": 0.5369542390108109, |
| "rewards/accuracy_reward": 0.3531250059604645, |
| "rewards/cosine_rewards": 0.01811320334672928, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0017322039348073304, |
| "step": 377 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 824.65625, |
| "epoch": 0.1925624044829343, |
| "grad_norm": 4.4264228290413055, |
| "kl": 0.071044921875, |
| "learning_rate": 9.037187977585327e-07, |
| "loss": 0.0028, |
| "reward": 1.9969289302825928, |
| "reward_std": 0.36238182336091995, |
| "rewards/accuracy_reward": 0.887499988079071, |
| "rewards/cosine_rewards": 0.1572401076555252, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0009361990523757413, |
| "step": 378 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1041.234375, |
| "epoch": 0.19307182883341822, |
| "grad_norm": 2.966764644317296, |
| "kl": 0.0531005859375, |
| "learning_rate": 9.034640855832908e-07, |
| "loss": 0.0021, |
| "reward": 1.9203879237174988, |
| "reward_std": 0.6297050192952156, |
| "rewards/accuracy_reward": 0.831250011920929, |
| "rewards/cosine_rewards": 0.1525670364499092, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0009290309972129762, |
| "step": 379 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1066.578125, |
| "epoch": 0.19358125318390218, |
| "grad_norm": 6.45607299399273, |
| "kl": 0.0604248046875, |
| "learning_rate": 9.032093734080488e-07, |
| "loss": 0.0024, |
| "reward": 1.5978580713272095, |
| "reward_std": 0.7550583779811859, |
| "rewards/accuracy_reward": 0.546875, |
| "rewards/cosine_rewards": 0.0831909030675888, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0009578557801432908, |
| "step": 380 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1159.84375, |
| "epoch": 0.19409067753438614, |
| "grad_norm": 15.218618980246333, |
| "kl": 0.0557861328125, |
| "learning_rate": 9.029546612328068e-07, |
| "loss": 0.0022, |
| "reward": 1.495898723602295, |
| "reward_std": 0.8071758449077606, |
| "rewards/accuracy_reward": 0.5187499970197678, |
| "rewards/cosine_rewards": 0.04102367162704468, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0013749129138886929, |
| "step": 381 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1371.671875, |
| "epoch": 0.1946001018848701, |
| "grad_norm": 2.5671889777891415, |
| "kl": 0.0416259765625, |
| "learning_rate": 9.026999490575649e-07, |
| "loss": 0.0017, |
| "reward": 1.4654145240783691, |
| "reward_std": 0.9137448668479919, |
| "rewards/accuracy_reward": 0.5468749925494194, |
| "rewards/cosine_rewards": 0.029642254114151, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -0.0017276888247579336, |
| "step": 382 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1303.015625, |
| "epoch": 0.19510952623535405, |
| "grad_norm": 3.460987727073742, |
| "kl": 0.0421142578125, |
| "learning_rate": 9.024452368823229e-07, |
| "loss": 0.0017, |
| "reward": 1.348323106765747, |
| "reward_std": 0.42551596462726593, |
| "rewards/accuracy_reward": 0.40937499701976776, |
| "rewards/cosine_rewards": 0.0025482475757598877, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0011001455131918192, |
| "step": 383 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1507.828125, |
| "epoch": 0.195618950585838, |
| "grad_norm": 2.4731324069110165, |
| "kl": 0.040771484375, |
| "learning_rate": 9.021905247070809e-07, |
| "loss": 0.0016, |
| "reward": 1.2840899229049683, |
| "reward_std": 1.3389369249343872, |
| "rewards/accuracy_reward": 0.43437498807907104, |
| "rewards/cosine_rewards": 0.007216873578727245, |
| "rewards/format_reward": 0.84375, |
| "rewards/repetition_rewards": -0.0012519625015556812, |
| "step": 384 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1392.5625, |
| "epoch": 0.19612837493632196, |
| "grad_norm": 3.9847528096417464, |
| "kl": 0.0401611328125, |
| "learning_rate": 9.019358125318391e-07, |
| "loss": 0.0016, |
| "reward": 0.952269122004509, |
| "reward_std": 1.110903412103653, |
| "rewards/accuracy_reward": 0.21249999478459358, |
| "rewards/cosine_rewards": -0.16534814983606339, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": -0.001132699428126216, |
| "step": 385 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1509.515625, |
| "epoch": 0.19663779928680591, |
| "grad_norm": 1.3917929153423205, |
| "kl": 0.0386962890625, |
| "learning_rate": 9.016811003565971e-07, |
| "loss": 0.0015, |
| "reward": 1.3951207399368286, |
| "reward_std": 1.3895853757858276, |
| "rewards/accuracy_reward": 0.49062497913837433, |
| "rewards/cosine_rewards": 0.030975546687841415, |
| "rewards/format_reward": 0.875, |
| "rewards/repetition_rewards": -0.0014798620832152665, |
| "step": 386 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1453.75, |
| "epoch": 0.19714722363728987, |
| "grad_norm": 4.5123123100553215, |
| "kl": 0.040283203125, |
| "learning_rate": 9.014263881813551e-07, |
| "loss": 0.0016, |
| "reward": 1.0250075459480286, |
| "reward_std": 1.1121925115585327, |
| "rewards/accuracy_reward": 0.2656249925494194, |
| "rewards/cosine_rewards": -0.06678299978375435, |
| "rewards/format_reward": 0.828125, |
| "rewards/repetition_rewards": -0.001959475106559694, |
| "step": 387 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1502.953125, |
| "epoch": 0.19765664798777383, |
| "grad_norm": 6.3687413391252665, |
| "kl": 0.0384521484375, |
| "learning_rate": 9.011716760061131e-07, |
| "loss": 0.0015, |
| "reward": 0.6134699061512947, |
| "reward_std": 0.8420631885528564, |
| "rewards/accuracy_reward": 0.015624985098838806, |
| "rewards/cosine_rewards": -0.2909963075071573, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -0.0017838198109529912, |
| "step": 388 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1398.71875, |
| "epoch": 0.19816607233825778, |
| "grad_norm": 3.252398002176852, |
| "kl": 0.04052734375, |
| "learning_rate": 9.009169638308711e-07, |
| "loss": 0.0016, |
| "reward": 0.6156338006258011, |
| "reward_std": 1.171474575996399, |
| "rewards/accuracy_reward": -0.012500010430812836, |
| "rewards/cosine_rewards": -0.2769355773925781, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": -0.0011806105903815478, |
| "step": 389 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1356.796875, |
| "epoch": 0.1986754966887417, |
| "grad_norm": 2.9564973631467386, |
| "kl": 0.0411376953125, |
| "learning_rate": 9.006622516556291e-07, |
| "loss": 0.0016, |
| "reward": 1.4809187650680542, |
| "reward_std": 0.4241075813770294, |
| "rewards/accuracy_reward": 0.4656249899417162, |
| "rewards/cosine_rewards": 0.0635819137096405, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0014131638454273343, |
| "step": 390 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1314.8125, |
| "epoch": 0.19918492103922567, |
| "grad_norm": 1.9868954496027869, |
| "kl": 0.040283203125, |
| "learning_rate": 9.004075394803871e-07, |
| "loss": 0.0016, |
| "reward": 0.3289404660463333, |
| "reward_std": 0.6832451522350311, |
| "rewards/accuracy_reward": -0.23750004172325134, |
| "rewards/cosine_rewards": -0.38559940457344055, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0010851426632143557, |
| "step": 391 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1211.65625, |
| "epoch": 0.19969434538970962, |
| "grad_norm": 2.3384012636409524, |
| "kl": 0.0426025390625, |
| "learning_rate": 9.001528273051452e-07, |
| "loss": 0.0017, |
| "reward": 1.7431849241256714, |
| "reward_std": 0.5287438631057739, |
| "rewards/accuracy_reward": 0.6624999940395355, |
| "rewards/cosine_rewards": 0.0974309928715229, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0011210814118385315, |
| "step": 392 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1199.40625, |
| "epoch": 0.20020376974019358, |
| "grad_norm": 8.037383660003005, |
| "kl": 0.0426025390625, |
| "learning_rate": 8.998981151299032e-07, |
| "loss": 0.0017, |
| "reward": 1.205706238746643, |
| "reward_std": 0.5482289791107178, |
| "rewards/accuracy_reward": 0.2968749925494194, |
| "rewards/cosine_rewards": -0.09018014371395111, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -0.0009886454208754003, |
| "step": 393 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1215.25, |
| "epoch": 0.20071319409067753, |
| "grad_norm": 2.7015176132022205, |
| "kl": 0.04150390625, |
| "learning_rate": 8.996434029546612e-07, |
| "loss": 0.0017, |
| "reward": 1.3461086750030518, |
| "reward_std": 0.36276355385780334, |
| "rewards/accuracy_reward": 0.3812499940395355, |
| "rewards/cosine_rewards": -0.033333455212414265, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -0.0018078879220411181, |
| "step": 394 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1148.140625, |
| "epoch": 0.2012226184411615, |
| "grad_norm": 2.4525739585224064, |
| "kl": 0.0447998046875, |
| "learning_rate": 8.993886907794193e-07, |
| "loss": 0.0018, |
| "reward": 1.6304560899734497, |
| "reward_std": 0.6783818304538727, |
| "rewards/accuracy_reward": 0.5781249850988388, |
| "rewards/cosine_rewards": 0.0690329410135746, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0010768624488264322, |
| "step": 395 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1234.03125, |
| "epoch": 0.20173204279164544, |
| "grad_norm": 2.620518407503657, |
| "kl": 0.0426025390625, |
| "learning_rate": 8.991339786041773e-07, |
| "loss": 0.0017, |
| "reward": 1.0580366849899292, |
| "reward_std": 0.45367684960365295, |
| "rewards/accuracy_reward": 0.18437499552965164, |
| "rewards/cosine_rewards": -0.09430436789989471, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0007839706668164581, |
| "step": 396 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1255.140625, |
| "epoch": 0.2022414671421294, |
| "grad_norm": 2.848324792333859, |
| "kl": 0.0416259765625, |
| "learning_rate": 8.988792664289353e-07, |
| "loss": 0.0017, |
| "reward": 1.396336853504181, |
| "reward_std": 0.6851004362106323, |
| "rewards/accuracy_reward": 0.40937498584389687, |
| "rewards/cosine_rewards": 0.003251887857913971, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.00066499671083875, |
| "step": 397 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1243.234375, |
| "epoch": 0.20275089149261336, |
| "grad_norm": 2.5122988909394457, |
| "kl": 0.04150390625, |
| "learning_rate": 8.986245542536933e-07, |
| "loss": 0.0017, |
| "reward": 2.053937077522278, |
| "reward_std": 0.5187530070543289, |
| "rewards/accuracy_reward": 0.8312499821186066, |
| "rewards/cosine_rewards": 0.22372649610042572, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -0.0010393889679107815, |
| "step": 398 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1395.28125, |
| "epoch": 0.2032603158430973, |
| "grad_norm": 8.131421667160394, |
| "kl": 0.039306640625, |
| "learning_rate": 8.983698420784514e-07, |
| "loss": 0.0016, |
| "reward": 1.9118317365646362, |
| "reward_std": 0.3381110727787018, |
| "rewards/accuracy_reward": 0.7187500149011612, |
| "rewards/cosine_rewards": 0.19487697072327137, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -0.001795282296370715, |
| "step": 399 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1477.328125, |
| "epoch": 0.20376974019358127, |
| "grad_norm": 2.5663546513961992, |
| "kl": 0.0489501953125, |
| "learning_rate": 8.981151299032094e-07, |
| "loss": 0.002, |
| "reward": 0.616385743021965, |
| "reward_std": 0.5365406274795532, |
| "rewards/accuracy_reward": -0.012500017881393433, |
| "rewards/cosine_rewards": -0.27611421793699265, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": -0.0012499869335442781, |
| "step": 400 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1641.53125, |
| "epoch": 0.2042791645440652, |
| "grad_norm": 2.3476982545455947, |
| "kl": 0.0382080078125, |
| "learning_rate": 8.978604177279674e-07, |
| "loss": 0.0015, |
| "reward": 0.38990160822868347, |
| "reward_std": 1.22097048163414, |
| "rewards/accuracy_reward": -0.06875001266598701, |
| "rewards/cosine_rewards": -0.352715402841568, |
| "rewards/format_reward": 0.8125, |
| "rewards/repetition_rewards": -0.0011329837725497782, |
| "step": 401 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1740.96875, |
| "epoch": 0.20478858889454915, |
| "grad_norm": 1.6789909982175664, |
| "kl": 0.036376953125, |
| "learning_rate": 8.976057055527255e-07, |
| "loss": 0.0015, |
| "reward": 0.7690124660730362, |
| "reward_std": 1.7883394956588745, |
| "rewards/accuracy_reward": 0.24062499403953552, |
| "rewards/cosine_rewards": -0.15724666975438595, |
| "rewards/format_reward": 0.6875, |
| "rewards/repetition_rewards": -0.001865879981778562, |
| "step": 402 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1715.625, |
| "epoch": 0.2052980132450331, |
| "grad_norm": 1.732486740072958, |
| "kl": 0.035400390625, |
| "learning_rate": 8.973509933774834e-07, |
| "loss": 0.0014, |
| "reward": 0.6791011095046997, |
| "reward_std": 1.0334843397140503, |
| "rewards/accuracy_reward": 0.1249999925494194, |
| "rewards/cosine_rewards": -0.21049801260232925, |
| "rewards/format_reward": 0.765625, |
| "rewards/repetition_rewards": -0.0010258048423565924, |
| "step": 403 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1585.046875, |
| "epoch": 0.20580743759551706, |
| "grad_norm": 1.6162362158227377, |
| "kl": 0.037109375, |
| "learning_rate": 8.970962812022414e-07, |
| "loss": 0.0015, |
| "reward": 0.9881232976913452, |
| "reward_std": 1.0253838300704956, |
| "rewards/accuracy_reward": 0.24062499403953552, |
| "rewards/cosine_rewards": -0.12615075334906578, |
| "rewards/format_reward": 0.875, |
| "rewards/repetition_rewards": -0.001350913429632783, |
| "step": 404 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1497.734375, |
| "epoch": 0.20631686194600102, |
| "grad_norm": 5.362930427796704, |
| "kl": 0.039306640625, |
| "learning_rate": 8.968415690269994e-07, |
| "loss": 0.0016, |
| "reward": 1.5187935531139374, |
| "reward_std": 0.5071015954017639, |
| "rewards/accuracy_reward": 0.5218749940395355, |
| "rewards/cosine_rewards": 0.07592727243900299, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.0008836896740831435, |
| "step": 405 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1471.890625, |
| "epoch": 0.20682628629648497, |
| "grad_norm": 2.5474971754837896, |
| "kl": 0.0374755859375, |
| "learning_rate": 8.965868568517575e-07, |
| "loss": 0.0015, |
| "reward": 1.7093470096588135, |
| "reward_std": 0.26929083466529846, |
| "rewards/accuracy_reward": 0.6062499955296516, |
| "rewards/cosine_rewards": 0.13566255569458008, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0013155650231055915, |
| "step": 406 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1517.765625, |
| "epoch": 0.20733571064696893, |
| "grad_norm": 2.3954440093211695, |
| "kl": 0.0372314453125, |
| "learning_rate": 8.963321446765155e-07, |
| "loss": 0.0015, |
| "reward": 1.6693125367164612, |
| "reward_std": 0.8508188724517822, |
| "rewards/accuracy_reward": 0.5781250149011612, |
| "rewards/cosine_rewards": 0.12336396798491478, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.000926460576010868, |
| "step": 407 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1427.40625, |
| "epoch": 0.20784513499745289, |
| "grad_norm": 4.487502302070771, |
| "kl": 0.037109375, |
| "learning_rate": 8.960774325012735e-07, |
| "loss": 0.0015, |
| "reward": 1.6373254656791687, |
| "reward_std": 0.37433764338493347, |
| "rewards/accuracy_reward": 0.550000011920929, |
| "rewards/cosine_rewards": 0.11931294947862625, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0007375250570476055, |
| "step": 408 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1500.875, |
| "epoch": 0.20835455934793684, |
| "grad_norm": 5.555469475832445, |
| "kl": 0.0374755859375, |
| "learning_rate": 8.958227203260316e-07, |
| "loss": 0.0015, |
| "reward": 1.398006021976471, |
| "reward_std": 1.336867332458496, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/cosine_rewards": 0.02401774376630783, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.001011726533761248, |
| "step": 409 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1434.859375, |
| "epoch": 0.2088639836984208, |
| "grad_norm": 3.6143044040934105, |
| "kl": 0.0435791015625, |
| "learning_rate": 8.955680081507896e-07, |
| "loss": 0.0017, |
| "reward": 1.618862271308899, |
| "reward_std": 0.7050271332263947, |
| "rewards/accuracy_reward": 0.5468750074505806, |
| "rewards/cosine_rewards": 0.1038745865225792, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.000637321179965511, |
| "step": 410 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1506.828125, |
| "epoch": 0.20937340804890472, |
| "grad_norm": 3.854404990598997, |
| "kl": 0.0361328125, |
| "learning_rate": 8.953132959755476e-07, |
| "loss": 0.0014, |
| "reward": 1.6651726961135864, |
| "reward_std": 0.45976050198078156, |
| "rewards/accuracy_reward": 0.5781250074505806, |
| "rewards/cosine_rewards": 0.11916181445121765, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0008640679297968745, |
| "step": 411 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1526.96875, |
| "epoch": 0.20988283239938868, |
| "grad_norm": 2.3422021364641736, |
| "kl": 0.03662109375, |
| "learning_rate": 8.950585838003057e-07, |
| "loss": 0.0015, |
| "reward": 0.6352521181106567, |
| "reward_std": 1.1320685744285583, |
| "rewards/accuracy_reward": -0.012500017881393433, |
| "rewards/cosine_rewards": -0.28875819593667984, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0009897005802486092, |
| "step": 412 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1510.03125, |
| "epoch": 0.21039225674987264, |
| "grad_norm": 2.1794044547587275, |
| "kl": 0.0567626953125, |
| "learning_rate": 8.948038716250637e-07, |
| "loss": 0.0023, |
| "reward": 1.4266446828842163, |
| "reward_std": 0.8459653854370117, |
| "rewards/accuracy_reward": 0.4624999910593033, |
| "rewards/cosine_rewards": 0.07424483820796013, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -0.0007251804636325687, |
| "step": 413 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1505.8125, |
| "epoch": 0.2109016811003566, |
| "grad_norm": 2.019375037035424, |
| "kl": 0.042236328125, |
| "learning_rate": 8.945491594498217e-07, |
| "loss": 0.0017, |
| "reward": 1.4379878044128418, |
| "reward_std": 0.6174334287643433, |
| "rewards/accuracy_reward": 0.4374999888241291, |
| "rewards/cosine_rewards": 0.04812653362751007, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0007637535745743662, |
| "step": 414 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1476.9375, |
| "epoch": 0.21141110545084055, |
| "grad_norm": 2.647595808512136, |
| "kl": 0.041259765625, |
| "learning_rate": 8.942944472745797e-07, |
| "loss": 0.0016, |
| "reward": 0.9988905191421509, |
| "reward_std": 0.6921209692955017, |
| "rewards/accuracy_reward": 0.20937499403953552, |
| "rewards/cosine_rewards": -0.1462814100086689, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0017031602037604898, |
| "step": 415 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1508.578125, |
| "epoch": 0.2119205298013245, |
| "grad_norm": 2.6566052420282933, |
| "kl": 0.03466796875, |
| "learning_rate": 8.940397350993378e-07, |
| "loss": 0.0014, |
| "reward": 1.211571991443634, |
| "reward_std": 1.0560529828071594, |
| "rewards/accuracy_reward": 0.32499998807907104, |
| "rewards/cosine_rewards": -0.0497976616024971, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.00113033052184619, |
| "step": 416 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1513.40625, |
| "epoch": 0.21242995415180846, |
| "grad_norm": 2.084079343038072, |
| "kl": 0.0411376953125, |
| "learning_rate": 8.937850229240957e-07, |
| "loss": 0.0016, |
| "reward": 0.5206416845321655, |
| "reward_std": 0.49498558044433594, |
| "rewards/accuracy_reward": -0.09687501192092896, |
| "rewards/cosine_rewards": -0.36537329852581024, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0014850463485345244, |
| "step": 417 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1506.921875, |
| "epoch": 0.21293937850229241, |
| "grad_norm": 1.7101934155068432, |
| "kl": 0.036865234375, |
| "learning_rate": 8.935303107488537e-07, |
| "loss": 0.0015, |
| "reward": 1.16130793094635, |
| "reward_std": 0.738935075700283, |
| "rewards/accuracy_reward": 0.2968749850988388, |
| "rewards/cosine_rewards": -0.08809526264667511, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0005968308250885457, |
| "step": 418 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1452.859375, |
| "epoch": 0.21344880285277637, |
| "grad_norm": 2.6364264984634236, |
| "kl": 0.037109375, |
| "learning_rate": 8.932755985736118e-07, |
| "loss": 0.0015, |
| "reward": 1.4882609844207764, |
| "reward_std": 0.6527669131755829, |
| "rewards/accuracy_reward": 0.4937499836087227, |
| "rewards/cosine_rewards": 0.042041175067424774, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0006552368577104062, |
| "step": 419 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1425.0, |
| "epoch": 0.21395822720326033, |
| "grad_norm": 22.24294483419425, |
| "kl": 0.0374755859375, |
| "learning_rate": 8.930208863983698e-07, |
| "loss": 0.0015, |
| "reward": 1.5828353762626648, |
| "reward_std": 0.6265529096126556, |
| "rewards/accuracy_reward": 0.5468749850988388, |
| "rewards/cosine_rewards": 0.08372939098626375, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0008939505496528, |
| "step": 420 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1396.171875, |
| "epoch": 0.21446765155374428, |
| "grad_norm": 2.8168572000468366, |
| "kl": 0.049560546875, |
| "learning_rate": 8.927661742231278e-07, |
| "loss": 0.002, |
| "reward": 1.6206639409065247, |
| "reward_std": 0.5450826287269592, |
| "rewards/accuracy_reward": 0.546875, |
| "rewards/cosine_rewards": 0.12136101722717285, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0006969515234231949, |
| "step": 421 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1395.9375, |
| "epoch": 0.2149770759042282, |
| "grad_norm": 1.840733711397487, |
| "kl": 0.0379638671875, |
| "learning_rate": 8.925114620478858e-07, |
| "loss": 0.0015, |
| "reward": 1.8798171877861023, |
| "reward_std": 0.5979900360107422, |
| "rewards/accuracy_reward": 0.690625011920929, |
| "rewards/cosine_rewards": 0.18991604819893837, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -0.0007238158723339438, |
| "step": 422 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1503.703125, |
| "epoch": 0.21548650025471217, |
| "grad_norm": 2.327429653832842, |
| "kl": 0.0377197265625, |
| "learning_rate": 8.922567498726439e-07, |
| "loss": 0.0015, |
| "reward": 1.1887712478637695, |
| "reward_std": 0.615043044090271, |
| "rewards/accuracy_reward": 0.2968749850988388, |
| "rewards/cosine_rewards": -0.09194361418485641, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0005351053987396881, |
| "step": 423 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1528.765625, |
| "epoch": 0.21599592460519612, |
| "grad_norm": 3.1639646848610017, |
| "kl": 0.0347900390625, |
| "learning_rate": 8.920020376974019e-07, |
| "loss": 0.0014, |
| "reward": 1.1957539916038513, |
| "reward_std": 1.2394747734069824, |
| "rewards/accuracy_reward": 0.3531249985098839, |
| "rewards/cosine_rewards": -0.03115752711892128, |
| "rewards/format_reward": 0.875, |
| "rewards/repetition_rewards": -0.001213467272464186, |
| "step": 424 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1674.765625, |
| "epoch": 0.21650534895568008, |
| "grad_norm": 2.5043711144165126, |
| "kl": 0.0338134765625, |
| "learning_rate": 8.917473255221599e-07, |
| "loss": 0.0014, |
| "reward": 1.1731443107128143, |
| "reward_std": 0.8068048655986786, |
| "rewards/accuracy_reward": 0.3218749836087227, |
| "rewards/cosine_rewards": -0.038288604468107224, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -0.0010670205520000309, |
| "step": 425 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1664.5, |
| "epoch": 0.21701477330616403, |
| "grad_norm": 3.6500533940846327, |
| "kl": 0.03515625, |
| "learning_rate": 8.91492613346918e-07, |
| "loss": 0.0014, |
| "reward": 0.6680706441402435, |
| "reward_std": 1.1470927596092224, |
| "rewards/accuracy_reward": 0.012499995529651642, |
| "rewards/cosine_rewards": -0.28094063699245453, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.000988698098808527, |
| "step": 426 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1733.359375, |
| "epoch": 0.217524197656648, |
| "grad_norm": 1.7264860203774577, |
| "kl": 0.033203125, |
| "learning_rate": 8.91237901171676e-07, |
| "loss": 0.0013, |
| "reward": 1.151515543460846, |
| "reward_std": 1.0065627694129944, |
| "rewards/accuracy_reward": 0.37812499701976776, |
| "rewards/cosine_rewards": -0.02256488800048828, |
| "rewards/format_reward": 0.796875, |
| "rewards/repetition_rewards": -0.0009195689344778657, |
| "step": 427 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1738.171875, |
| "epoch": 0.21803362200713194, |
| "grad_norm": 1.8028678441679502, |
| "kl": 0.033447265625, |
| "learning_rate": 8.90983188996434e-07, |
| "loss": 0.0013, |
| "reward": 0.33622707426548004, |
| "reward_std": 1.554500699043274, |
| "rewards/accuracy_reward": -0.046875011175870895, |
| "rewards/cosine_rewards": -0.35029861330986023, |
| "rewards/format_reward": 0.734375, |
| "rewards/repetition_rewards": -0.0009743365517351776, |
| "step": 428 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1710.0, |
| "epoch": 0.2185430463576159, |
| "grad_norm": 1.7315216890300187, |
| "kl": 0.0386962890625, |
| "learning_rate": 8.90728476821192e-07, |
| "loss": 0.0015, |
| "reward": 1.2531213760375977, |
| "reward_std": 1.7619973421096802, |
| "rewards/accuracy_reward": 0.4624999910593033, |
| "rewards/cosine_rewards": 0.010814379900693893, |
| "rewards/format_reward": 0.78125, |
| "rewards/repetition_rewards": -0.0014429978909902275, |
| "step": 429 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1681.5625, |
| "epoch": 0.21905247070809986, |
| "grad_norm": 1.448581293364632, |
| "kl": 0.0350341796875, |
| "learning_rate": 8.904737646459501e-07, |
| "loss": 0.0014, |
| "reward": 0.5936174094676971, |
| "reward_std": 1.1982838213443756, |
| "rewards/accuracy_reward": 0.015624940395355225, |
| "rewards/cosine_rewards": -0.2807646095752716, |
| "rewards/format_reward": 0.859375, |
| "rewards/repetition_rewards": -0.0006179730116855353, |
| "step": 430 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1480.1875, |
| "epoch": 0.2195618950585838, |
| "grad_norm": 4.830088485887878, |
| "kl": 0.0394287109375, |
| "learning_rate": 8.90219052470708e-07, |
| "loss": 0.0016, |
| "reward": 1.1779060363769531, |
| "reward_std": 1.0625053942203522, |
| "rewards/accuracy_reward": 0.31562499701976776, |
| "rewards/cosine_rewards": -0.05869085341691971, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.0009031399386003613, |
| "step": 431 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1440.796875, |
| "epoch": 0.22007131940906777, |
| "grad_norm": 2.4405157931321124, |
| "kl": 0.037109375, |
| "learning_rate": 8.89964340295466e-07, |
| "loss": 0.0015, |
| "reward": 0.9002698361873627, |
| "reward_std": 0.7880153059959412, |
| "rewards/accuracy_reward": 0.09999998845160007, |
| "rewards/cosine_rewards": -0.18298358470201492, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0011215846752747893, |
| "step": 432 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1348.515625, |
| "epoch": 0.2205807437595517, |
| "grad_norm": 2.1326390000811806, |
| "kl": 0.0418701171875, |
| "learning_rate": 8.897096281202241e-07, |
| "loss": 0.0017, |
| "reward": 0.7409723997116089, |
| "reward_std": 0.7918355762958527, |
| "rewards/accuracy_reward": 0.015624990686774254, |
| "rewards/cosine_rewards": -0.21155225485563278, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.000600404484430328, |
| "step": 433 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1298.125, |
| "epoch": 0.22109016811003565, |
| "grad_norm": 3.815337891096848, |
| "kl": 0.0418701171875, |
| "learning_rate": 8.894549159449821e-07, |
| "loss": 0.0017, |
| "reward": 1.8587952256202698, |
| "reward_std": 0.6939655542373657, |
| "rewards/accuracy_reward": 0.7187499701976776, |
| "rewards/cosine_rewards": 0.1717987135052681, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0005034840432927012, |
| "step": 434 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1156.0, |
| "epoch": 0.2215995924605196, |
| "grad_norm": 4.479065196602373, |
| "kl": 0.0440673828125, |
| "learning_rate": 8.892002037697401e-07, |
| "loss": 0.0018, |
| "reward": 1.4347090125083923, |
| "reward_std": 0.3772214949131012, |
| "rewards/accuracy_reward": 0.43749997206032276, |
| "rewards/cosine_rewards": -0.0022302046418190002, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -0.0005607931379927322, |
| "step": 435 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1158.46875, |
| "epoch": 0.22210901681100356, |
| "grad_norm": 4.149290240827553, |
| "kl": 0.0455322265625, |
| "learning_rate": 8.889454915944982e-07, |
| "loss": 0.0018, |
| "reward": 1.0905642956495285, |
| "reward_std": 0.5234603583812714, |
| "rewards/accuracy_reward": 0.2124999761581421, |
| "rewards/cosine_rewards": -0.10600101202726364, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.00030972264357842505, |
| "step": 436 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1058.734375, |
| "epoch": 0.22261844116148752, |
| "grad_norm": 6.6780990750455205, |
| "kl": 0.046630859375, |
| "learning_rate": 8.886907794192562e-07, |
| "loss": 0.0019, |
| "reward": 0.9731817841529846, |
| "reward_std": 0.8772869110107422, |
| "rewards/accuracy_reward": 0.09687498956918716, |
| "rewards/cosine_rewards": -0.09189720638096333, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0005459659732878208, |
| "step": 437 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1056.5, |
| "epoch": 0.22312786551197147, |
| "grad_norm": 2.737447142182153, |
| "kl": 0.044189453125, |
| "learning_rate": 8.884360672440142e-07, |
| "loss": 0.0018, |
| "reward": 1.1522070169448853, |
| "reward_std": 0.6963326930999756, |
| "rewards/accuracy_reward": 0.24062499403953552, |
| "rewards/cosine_rewards": -0.07206200063228607, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0007310137443710119, |
| "step": 438 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1072.171875, |
| "epoch": 0.22363728986245543, |
| "grad_norm": 2.6298041678582953, |
| "kl": 0.046875, |
| "learning_rate": 8.881813550687722e-07, |
| "loss": 0.0019, |
| "reward": 1.4510762691497803, |
| "reward_std": 0.5045955777168274, |
| "rewards/accuracy_reward": 0.49375002086162567, |
| "rewards/cosine_rewards": 0.020084097981452942, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0002578186395112425, |
| "step": 439 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1094.546875, |
| "epoch": 0.22414671421293939, |
| "grad_norm": 2.047145097208152, |
| "kl": 0.0438232421875, |
| "learning_rate": 8.879266428935303e-07, |
| "loss": 0.0018, |
| "reward": 1.5189008712768555, |
| "reward_std": 0.34239334613084793, |
| "rewards/accuracy_reward": 0.4906250089406967, |
| "rewards/cosine_rewards": 0.0758383758366108, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0006874670943943784, |
| "step": 440 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1083.09375, |
| "epoch": 0.22465613856342334, |
| "grad_norm": 2.966919929118076, |
| "kl": 0.0457763671875, |
| "learning_rate": 8.876719307182883e-07, |
| "loss": 0.0018, |
| "reward": 1.2056291699409485, |
| "reward_std": 0.828714907169342, |
| "rewards/accuracy_reward": 0.29687498696148396, |
| "rewards/cosine_rewards": -0.043857116252183914, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0005137350672157481, |
| "step": 441 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1103.375, |
| "epoch": 0.2251655629139073, |
| "grad_norm": 3.389083005059361, |
| "kl": 0.042724609375, |
| "learning_rate": 8.874172185430463e-07, |
| "loss": 0.0017, |
| "reward": 1.483572542667389, |
| "reward_std": 0.5207121074199677, |
| "rewards/accuracy_reward": 0.4624999910593033, |
| "rewards/cosine_rewards": 0.053015733137726784, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0006932187097845599, |
| "step": 442 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1217.421875, |
| "epoch": 0.22567498726439122, |
| "grad_norm": 1.8084576051013326, |
| "kl": 0.0426025390625, |
| "learning_rate": 8.871625063678044e-07, |
| "loss": 0.0017, |
| "reward": 1.604416847229004, |
| "reward_std": 0.68864506483078, |
| "rewards/accuracy_reward": 0.578125, |
| "rewards/cosine_rewards": 0.10491618514060974, |
| "rewards/format_reward": 0.921875, |
| "rewards/repetition_rewards": -0.0004992800822947174, |
| "step": 443 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1285.484375, |
| "epoch": 0.22618441161487518, |
| "grad_norm": 20.129612374292474, |
| "kl": 0.042236328125, |
| "learning_rate": 8.869077941925624e-07, |
| "loss": 0.0017, |
| "reward": 1.7994786500930786, |
| "reward_std": 0.3120774105191231, |
| "rewards/accuracy_reward": 0.6624999940395355, |
| "rewards/cosine_rewards": 0.15342308580875397, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0008193884277716279, |
| "step": 444 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1370.578125, |
| "epoch": 0.22669383596535914, |
| "grad_norm": 3.234143174544009, |
| "kl": 0.0433349609375, |
| "learning_rate": 8.866530820173203e-07, |
| "loss": 0.0017, |
| "reward": 1.3398171067237854, |
| "reward_std": 0.7532171607017517, |
| "rewards/accuracy_reward": 0.3812500014901161, |
| "rewards/cosine_rewards": -0.0252380333840847, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0005697726446669549, |
| "step": 445 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1474.875, |
| "epoch": 0.2272032603158431, |
| "grad_norm": 1.7926893560129409, |
| "kl": 0.040283203125, |
| "learning_rate": 8.863983698420783e-07, |
| "loss": 0.0016, |
| "reward": 1.4312800765037537, |
| "reward_std": 0.6859093904495239, |
| "rewards/accuracy_reward": 0.43437500298023224, |
| "rewards/cosine_rewards": 0.02916320227086544, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0010081499349325895, |
| "step": 446 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1589.5625, |
| "epoch": 0.22771268466632705, |
| "grad_norm": 1.796069908482133, |
| "kl": 0.036865234375, |
| "learning_rate": 8.861436576668364e-07, |
| "loss": 0.0015, |
| "reward": 1.443231225013733, |
| "reward_std": 0.6114392578601837, |
| "rewards/accuracy_reward": 0.4375000074505806, |
| "rewards/cosine_rewards": 0.038026634603738785, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.001045387762133032, |
| "step": 447 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1672.09375, |
| "epoch": 0.228222109016811, |
| "grad_norm": 2.1300405555940802, |
| "kl": 0.0377197265625, |
| "learning_rate": 8.858889454915944e-07, |
| "loss": 0.0015, |
| "reward": 1.5443891882896423, |
| "reward_std": 0.5743480771780014, |
| "rewards/accuracy_reward": 0.550000011920929, |
| "rewards/cosine_rewards": 0.10473084449768066, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -0.0009665640536695719, |
| "step": 448 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1766.0, |
| "epoch": 0.22873153336729496, |
| "grad_norm": 4.287974732646472, |
| "kl": 0.037841796875, |
| "learning_rate": 8.856342333163524e-07, |
| "loss": 0.0015, |
| "reward": 1.3031042218208313, |
| "reward_std": 1.7085354328155518, |
| "rewards/accuracy_reward": 0.4593749940395355, |
| "rewards/cosine_rewards": 0.06326716393232346, |
| "rewards/format_reward": 0.78125, |
| "rewards/repetition_rewards": -0.0007880023040343076, |
| "step": 449 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1844.640625, |
| "epoch": 0.22924095771777891, |
| "grad_norm": 1.6528382190432698, |
| "kl": 0.0341796875, |
| "learning_rate": 8.853795211411105e-07, |
| "loss": 0.0014, |
| "reward": 0.5885469168424606, |
| "reward_std": 1.7182486653327942, |
| "rewards/accuracy_reward": 0.17812500149011612, |
| "rewards/cosine_rewards": -0.1980201005935669, |
| "rewards/format_reward": 0.609375, |
| "rewards/repetition_rewards": -0.0009329892345704138, |
| "step": 450 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1887.9375, |
| "epoch": 0.22975038206826287, |
| "grad_norm": 1.8787523511209072, |
| "kl": 0.0335693359375, |
| "learning_rate": 8.851248089658685e-07, |
| "loss": 0.0013, |
| "reward": 0.7307622581720352, |
| "reward_std": 1.600571632385254, |
| "rewards/accuracy_reward": 0.24062500894069672, |
| "rewards/cosine_rewards": -0.13401341438293457, |
| "rewards/format_reward": 0.625, |
| "rewards/repetition_rewards": -0.0008493586792610586, |
| "step": 451 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1833.78125, |
| "epoch": 0.23025980641874683, |
| "grad_norm": 6.893897138558536, |
| "kl": 0.0357666015625, |
| "learning_rate": 8.848700967906265e-07, |
| "loss": 0.0014, |
| "reward": 1.0294001996517181, |
| "reward_std": 1.7062013149261475, |
| "rewards/accuracy_reward": 0.40312499552965164, |
| "rewards/cosine_rewards": 0.0025026053190231323, |
| "rewards/format_reward": 0.625, |
| "rewards/repetition_rewards": -0.0012274246546439826, |
| "step": 452 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1943.453125, |
| "epoch": 0.23076923076923078, |
| "grad_norm": 2.3588444541934273, |
| "kl": 0.0322265625, |
| "learning_rate": 8.846153846153846e-07, |
| "loss": 0.0013, |
| "reward": 0.19596866890788078, |
| "reward_std": 1.8146210312843323, |
| "rewards/accuracy_reward": -0.02500000223517418, |
| "rewards/cosine_rewards": -0.3406580686569214, |
| "rewards/format_reward": 0.5625, |
| "rewards/repetition_rewards": -0.0008732638962101191, |
| "step": 453 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1863.109375, |
| "epoch": 0.2312786551197147, |
| "grad_norm": 1.5475872027740656, |
| "kl": 0.0400390625, |
| "learning_rate": 8.843606724401426e-07, |
| "loss": 0.0016, |
| "reward": 0.2073364406824112, |
| "reward_std": 1.7386137247085571, |
| "rewards/accuracy_reward": -0.043750010430812836, |
| "rewards/cosine_rewards": -0.3414689302444458, |
| "rewards/format_reward": 0.59375, |
| "rewards/repetition_rewards": -0.0011946168669965118, |
| "step": 454 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1659.65625, |
| "epoch": 0.23178807947019867, |
| "grad_norm": 3.381623642320965, |
| "kl": 0.0540771484375, |
| "learning_rate": 8.841059602649006e-07, |
| "loss": 0.0022, |
| "reward": 1.5414963960647583, |
| "reward_std": 1.3864411413669586, |
| "rewards/accuracy_reward": 0.6218750178813934, |
| "rewards/cosine_rewards": 0.20184022560715675, |
| "rewards/format_reward": 0.71875, |
| "rewards/repetition_rewards": -0.000968798267422244, |
| "step": 455 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1617.046875, |
| "epoch": 0.23229750382068262, |
| "grad_norm": 7.137141646753771, |
| "kl": 0.0372314453125, |
| "learning_rate": 8.838512480896586e-07, |
| "loss": 0.0015, |
| "reward": 1.1092736423015594, |
| "reward_std": 0.9871836006641388, |
| "rewards/accuracy_reward": 0.2687499839812517, |
| "rewards/cosine_rewards": -0.09614543057978153, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0008309493132401258, |
| "step": 456 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1511.703125, |
| "epoch": 0.23280692817116658, |
| "grad_norm": 2.858407735203425, |
| "kl": 0.0447998046875, |
| "learning_rate": 8.835965359144167e-07, |
| "loss": 0.0018, |
| "reward": 1.4474474489688873, |
| "reward_std": 0.8567388504743576, |
| "rewards/accuracy_reward": 0.4937500078231096, |
| "rewards/cosine_rewards": 0.06434839963912964, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -0.0012760092504322529, |
| "step": 457 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1550.828125, |
| "epoch": 0.23331635252165053, |
| "grad_norm": 2.432372091234863, |
| "kl": 0.0408935546875, |
| "learning_rate": 8.833418237391747e-07, |
| "loss": 0.0016, |
| "reward": 1.0046057403087616, |
| "reward_std": 1.0828097462654114, |
| "rewards/accuracy_reward": 0.20937498658895493, |
| "rewards/cosine_rewards": -0.14123845472931862, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0010308316559530795, |
| "step": 458 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1519.9375, |
| "epoch": 0.2338257768721345, |
| "grad_norm": 2.65385943980829, |
| "kl": 0.0380859375, |
| "learning_rate": 8.830871115639326e-07, |
| "loss": 0.0015, |
| "reward": 1.5737290382385254, |
| "reward_std": 0.676769882440567, |
| "rewards/accuracy_reward": 0.5187499821186066, |
| "rewards/cosine_rewards": 0.10273971408605576, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0008856799395289272, |
| "step": 459 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1412.171875, |
| "epoch": 0.23433520122261844, |
| "grad_norm": 7.668944991953695, |
| "kl": 0.03955078125, |
| "learning_rate": 8.828323993886907e-07, |
| "loss": 0.0016, |
| "reward": 1.2575648427009583, |
| "reward_std": 0.8219007402658463, |
| "rewards/accuracy_reward": 0.3499999940395355, |
| "rewards/cosine_rewards": -0.029430712573230267, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0005044575809733942, |
| "step": 460 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1428.671875, |
| "epoch": 0.2348446255731024, |
| "grad_norm": 3.3174885087942747, |
| "kl": 0.041259765625, |
| "learning_rate": 8.825776872134487e-07, |
| "loss": 0.0017, |
| "reward": 0.5390121340751648, |
| "reward_std": 0.6499587297439575, |
| "rewards/accuracy_reward": -0.09687501192092896, |
| "rewards/cosine_rewards": -0.3163621127605438, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0008757157484069467, |
| "step": 461 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1389.328125, |
| "epoch": 0.23535404992358636, |
| "grad_norm": 1.976395582002063, |
| "kl": 0.040771484375, |
| "learning_rate": 8.823229750382067e-07, |
| "loss": 0.0016, |
| "reward": 1.6086109280586243, |
| "reward_std": 0.5066869556903839, |
| "rewards/accuracy_reward": 0.5218749716877937, |
| "rewards/cosine_rewards": 0.08817524462938309, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -0.0014392710290849209, |
| "step": 462 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1436.625, |
| "epoch": 0.2358634742740703, |
| "grad_norm": 2.4713376444103146, |
| "kl": 0.039794921875, |
| "learning_rate": 8.820682628629647e-07, |
| "loss": 0.0016, |
| "reward": 1.111421525478363, |
| "reward_std": 0.9693822264671326, |
| "rewards/accuracy_reward": 0.24062500149011612, |
| "rewards/cosine_rewards": -0.11270357295870781, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0008748299151193351, |
| "step": 463 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1397.96875, |
| "epoch": 0.23637289862455427, |
| "grad_norm": 2.688736588573913, |
| "kl": 0.0450439453125, |
| "learning_rate": 8.818135506877228e-07, |
| "loss": 0.0018, |
| "reward": 1.0273907780647278, |
| "reward_std": 0.6092932820320129, |
| "rewards/accuracy_reward": 0.20937500149011612, |
| "rewards/cosine_rewards": -0.11841067671775818, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0010735246760305017, |
| "step": 464 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1502.5, |
| "epoch": 0.2368823229750382, |
| "grad_norm": 2.233484328017706, |
| "kl": 0.03955078125, |
| "learning_rate": 8.815588385124808e-07, |
| "loss": 0.0016, |
| "reward": 2.0390628576278687, |
| "reward_std": 0.4271709471940994, |
| "rewards/accuracy_reward": 0.7750000059604645, |
| "rewards/cosine_rewards": 0.26496873423457146, |
| "rewards/format_reward": 1.0, |
| "rewards/repetition_rewards": -0.0009058607101906091, |
| "step": 465 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1610.734375, |
| "epoch": 0.23739174732552215, |
| "grad_norm": 3.1291938497316867, |
| "kl": 0.0394287109375, |
| "learning_rate": 8.813041263372388e-07, |
| "loss": 0.0016, |
| "reward": 1.7407687306404114, |
| "reward_std": 0.8337388634681702, |
| "rewards/accuracy_reward": 0.659375011920929, |
| "rewards/cosine_rewards": 0.17613628506660461, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": -0.0009925005142576993, |
| "step": 466 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1568.890625, |
| "epoch": 0.2379011716760061, |
| "grad_norm": 2.121477514968572, |
| "kl": 0.0380859375, |
| "learning_rate": 8.810494141619969e-07, |
| "loss": 0.0015, |
| "reward": 1.3490102887153625, |
| "reward_std": 0.7418502867221832, |
| "rewards/accuracy_reward": 0.37812500447034836, |
| "rewards/cosine_rewards": 0.003062829375267029, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0009275085176341236, |
| "step": 467 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1698.8125, |
| "epoch": 0.23841059602649006, |
| "grad_norm": 2.4246076523727025, |
| "kl": 0.03662109375, |
| "learning_rate": 8.807947019867549e-07, |
| "loss": 0.0015, |
| "reward": 1.3666119575500488, |
| "reward_std": 1.175959825515747, |
| "rewards/accuracy_reward": 0.46562501788139343, |
| "rewards/cosine_rewards": 0.02737235650420189, |
| "rewards/format_reward": 0.875, |
| "rewards/repetition_rewards": -0.0013854140415787697, |
| "step": 468 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1691.90625, |
| "epoch": 0.23892002037697402, |
| "grad_norm": 1.3535821296606787, |
| "kl": 0.039306640625, |
| "learning_rate": 8.805399898115129e-07, |
| "loss": 0.0016, |
| "reward": 1.2414605617523193, |
| "reward_std": 1.0560136437416077, |
| "rewards/accuracy_reward": 0.3812499940395355, |
| "rewards/cosine_rewards": -0.02948123589158058, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -0.0009331759065389633, |
| "step": 469 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1686.46875, |
| "epoch": 0.23942944472745797, |
| "grad_norm": 1.6084637763929415, |
| "kl": 0.0467529296875, |
| "learning_rate": 8.802852776362711e-07, |
| "loss": 0.0019, |
| "reward": 2.0340508222579956, |
| "reward_std": 1.1313848793506622, |
| "rewards/accuracy_reward": 0.8312499821186066, |
| "rewards/cosine_rewards": 0.3132530748844147, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -0.001077289809472859, |
| "step": 470 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1638.4375, |
| "epoch": 0.23993886907794193, |
| "grad_norm": 3.9206215218528553, |
| "kl": 0.0384521484375, |
| "learning_rate": 8.800305654610291e-07, |
| "loss": 0.0015, |
| "reward": 1.417995810508728, |
| "reward_std": 0.7844535112380981, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/cosine_rewards": 0.04390082508325577, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0009050128574017435, |
| "step": 471 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1583.171875, |
| "epoch": 0.24044829342842589, |
| "grad_norm": 1.862615408125007, |
| "kl": 0.0404052734375, |
| "learning_rate": 8.797758532857871e-07, |
| "loss": 0.0016, |
| "reward": 1.3552428185939789, |
| "reward_std": 0.831163614988327, |
| "rewards/accuracy_reward": 0.40937500074505806, |
| "rewards/cosine_rewards": -0.005982518196105957, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.001274671230930835, |
| "step": 472 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1606.359375, |
| "epoch": 0.24095771777890984, |
| "grad_norm": 6.217207017794225, |
| "kl": 0.039794921875, |
| "learning_rate": 8.795211411105451e-07, |
| "loss": 0.0016, |
| "reward": 1.677711844444275, |
| "reward_std": 0.8612502366304398, |
| "rewards/accuracy_reward": 0.5781249701976776, |
| "rewards/cosine_rewards": 0.11638512089848518, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0011732576531358063, |
| "step": 473 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1521.84375, |
| "epoch": 0.2414671421293938, |
| "grad_norm": 2.9743621746841677, |
| "kl": 0.0421142578125, |
| "learning_rate": 8.792664289353031e-07, |
| "loss": 0.0017, |
| "reward": 1.578629732131958, |
| "reward_std": 0.6186130940914154, |
| "rewards/accuracy_reward": 0.5218749940395355, |
| "rewards/cosine_rewards": 0.08938230201601982, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0013775942497886717, |
| "step": 474 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1560.25, |
| "epoch": 0.24197656647987772, |
| "grad_norm": 3.931202850710427, |
| "kl": 0.0396728515625, |
| "learning_rate": 8.790117167600611e-07, |
| "loss": 0.0016, |
| "reward": 1.8553311824798584, |
| "reward_std": 0.5484062433242798, |
| "rewards/accuracy_reward": 0.6906249970197678, |
| "rewards/cosine_rewards": 0.1969544254243374, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0009982050396502018, |
| "step": 475 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1487.171875, |
| "epoch": 0.24248599083036168, |
| "grad_norm": 1.8115703582475124, |
| "kl": 0.0428466796875, |
| "learning_rate": 8.787570045848191e-07, |
| "loss": 0.0017, |
| "reward": 1.0939862728118896, |
| "reward_std": 0.652959406375885, |
| "rewards/accuracy_reward": 0.24062499776482582, |
| "rewards/cosine_rewards": -0.0988575927913189, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0009061352466233075, |
| "step": 476 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1421.203125, |
| "epoch": 0.24299541518084564, |
| "grad_norm": 50.509798046645564, |
| "kl": 0.0455322265625, |
| "learning_rate": 8.785022924095772e-07, |
| "loss": 0.0018, |
| "reward": 1.1556105613708496, |
| "reward_std": 0.8080581426620483, |
| "rewards/accuracy_reward": 0.26874998956918716, |
| "rewards/cosine_rewards": -0.06538418680429459, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0008802659867797047, |
| "step": 477 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1423.25, |
| "epoch": 0.2435048395313296, |
| "grad_norm": 2.143196092673566, |
| "kl": 0.042724609375, |
| "learning_rate": 8.782475802343352e-07, |
| "loss": 0.0017, |
| "reward": 1.4646123051643372, |
| "reward_std": 0.4019291028380394, |
| "rewards/accuracy_reward": 0.4374999925494194, |
| "rewards/cosine_rewards": 0.04411640763282776, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0013792455429211259, |
| "step": 478 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1525.359375, |
| "epoch": 0.24401426388181355, |
| "grad_norm": 1.4650345234014313, |
| "kl": 0.043701171875, |
| "learning_rate": 8.779928680590932e-07, |
| "loss": 0.0018, |
| "reward": 1.7237411737442017, |
| "reward_std": 0.6819100677967072, |
| "rewards/accuracy_reward": 0.6031249761581421, |
| "rewards/cosine_rewards": 0.13805609196424484, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0018149468814954162, |
| "step": 479 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1518.6875, |
| "epoch": 0.2445236882322975, |
| "grad_norm": 2.4194184684625166, |
| "kl": 0.0440673828125, |
| "learning_rate": 8.777381558838512e-07, |
| "loss": 0.0018, |
| "reward": 1.4634617269039154, |
| "reward_std": 0.4558331221342087, |
| "rewards/accuracy_reward": 0.46562497317790985, |
| "rewards/cosine_rewards": 0.03047458827495575, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0013878352474421263, |
| "step": 480 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1559.859375, |
| "epoch": 0.24503311258278146, |
| "grad_norm": 5.778258363606284, |
| "kl": 0.041015625, |
| "learning_rate": 8.774834437086093e-07, |
| "loss": 0.0016, |
| "reward": 1.1754435896873474, |
| "reward_std": 0.629539430141449, |
| "rewards/accuracy_reward": 0.2968749850988388, |
| "rewards/cosine_rewards": -0.058087632060050964, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.000843802816234529, |
| "step": 481 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1613.171875, |
| "epoch": 0.24554253693326542, |
| "grad_norm": 1.6143411312623293, |
| "kl": 0.0389404296875, |
| "learning_rate": 8.772287315333673e-07, |
| "loss": 0.0016, |
| "reward": 0.8586589694023132, |
| "reward_std": 0.45200832188129425, |
| "rewards/accuracy_reward": 0.09999999403953552, |
| "rewards/cosine_rewards": -0.22472049295902252, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0009954352863132954, |
| "step": 482 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1670.359375, |
| "epoch": 0.24605196128374937, |
| "grad_norm": 2.239924603374423, |
| "kl": 0.0592041015625, |
| "learning_rate": 8.769740193581253e-07, |
| "loss": 0.0024, |
| "reward": 1.522126853466034, |
| "reward_std": 0.8742709904909134, |
| "rewards/accuracy_reward": 0.4937499910593033, |
| "rewards/cosine_rewards": 0.060823358595371246, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0011965514277108014, |
| "step": 483 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1628.625, |
| "epoch": 0.24656138563423333, |
| "grad_norm": 8.106075584628705, |
| "kl": 0.0419921875, |
| "learning_rate": 8.767193071828834e-07, |
| "loss": 0.0017, |
| "reward": 0.9253878593444824, |
| "reward_std": 1.307717740535736, |
| "rewards/accuracy_reward": 0.18437500298023224, |
| "rewards/cosine_rewards": -0.14811599627137184, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -0.0014961253036744893, |
| "step": 484 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1648.875, |
| "epoch": 0.24707080998471728, |
| "grad_norm": 1.6847159437413002, |
| "kl": 0.0389404296875, |
| "learning_rate": 8.764645950076414e-07, |
| "loss": 0.0016, |
| "reward": 1.4973651766777039, |
| "reward_std": 0.9533334523439407, |
| "rewards/accuracy_reward": 0.5218749791383743, |
| "rewards/cosine_rewards": 0.0704129058867693, |
| "rewards/format_reward": 0.90625, |
| "rewards/repetition_rewards": -0.0011727037781383842, |
| "step": 485 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1511.140625, |
| "epoch": 0.2475802343352012, |
| "grad_norm": 1.736665525794252, |
| "kl": 0.0399169921875, |
| "learning_rate": 8.762098828323994e-07, |
| "loss": 0.0016, |
| "reward": 0.7075473368167877, |
| "reward_std": 0.8960316479206085, |
| "rewards/accuracy_reward": 0.015624992549419403, |
| "rewards/cosine_rewards": -0.2443552017211914, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.001222497143317014, |
| "step": 486 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1567.671875, |
| "epoch": 0.24808965868568517, |
| "grad_norm": 3.031931829741236, |
| "kl": 0.0386962890625, |
| "learning_rate": 8.759551706571575e-07, |
| "loss": 0.0015, |
| "reward": 1.3812061548233032, |
| "reward_std": 0.8888083398342133, |
| "rewards/accuracy_reward": 0.4093749672174454, |
| "rewards/cosine_rewards": 0.004479339346289635, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0013981764786876738, |
| "step": 487 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1457.359375, |
| "epoch": 0.24859908303616912, |
| "grad_norm": 5.524727047487839, |
| "kl": 0.0506591796875, |
| "learning_rate": 8.757004584819154e-07, |
| "loss": 0.002, |
| "reward": 1.8092041611671448, |
| "reward_std": 0.5159921646118164, |
| "rewards/accuracy_reward": 0.6343750059604645, |
| "rewards/cosine_rewards": 0.19165128469467163, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0011971485218964517, |
| "step": 488 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1379.078125, |
| "epoch": 0.24910850738665308, |
| "grad_norm": 7.278746642143023, |
| "kl": 0.055419921875, |
| "learning_rate": 8.754457463066734e-07, |
| "loss": 0.0022, |
| "reward": 1.194389447569847, |
| "reward_std": 0.5000828057527542, |
| "rewards/accuracy_reward": 0.26874998211860657, |
| "rewards/cosine_rewards": -0.04208715260028839, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0010234276414848864, |
| "step": 489 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1397.78125, |
| "epoch": 0.24961793173713703, |
| "grad_norm": 2.9100594613125352, |
| "kl": 0.0543212890625, |
| "learning_rate": 8.751910341314314e-07, |
| "loss": 0.0022, |
| "reward": 1.6665399670600891, |
| "reward_std": 0.6835527420043945, |
| "rewards/accuracy_reward": 0.6625000089406967, |
| "rewards/cosine_rewards": 0.11463410407304764, |
| "rewards/format_reward": 0.890625, |
| "rewards/repetition_rewards": -0.00121912601753138, |
| "step": 490 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1359.703125, |
| "epoch": 0.250127356087621, |
| "grad_norm": 12.325171247664876, |
| "kl": 0.0457763671875, |
| "learning_rate": 8.749363219561895e-07, |
| "loss": 0.0018, |
| "reward": 1.8410940766334534, |
| "reward_std": 0.4001428484916687, |
| "rewards/accuracy_reward": 0.690625011920929, |
| "rewards/cosine_rewards": 0.18309018202126026, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0013711884384974837, |
| "step": 491 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1470.75, |
| "epoch": 0.25063678043810494, |
| "grad_norm": 15.076912789505334, |
| "kl": 0.041015625, |
| "learning_rate": 8.746816097809475e-07, |
| "loss": 0.0016, |
| "reward": 1.2887136340141296, |
| "reward_std": 0.8450455367565155, |
| "rewards/accuracy_reward": 0.3531249761581421, |
| "rewards/cosine_rewards": -0.03244372457265854, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0007176562794484198, |
| "step": 492 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1348.140625, |
| "epoch": 0.2511462047885889, |
| "grad_norm": 10.32545410862146, |
| "kl": 0.05810546875, |
| "learning_rate": 8.744268976057055e-07, |
| "loss": 0.0023, |
| "reward": 1.227342277765274, |
| "reward_std": 1.0202240645885468, |
| "rewards/accuracy_reward": 0.3500000238418579, |
| "rewards/cosine_rewards": 0.002939566969871521, |
| "rewards/format_reward": 0.875, |
| "rewards/repetition_rewards": -0.0005973072838969529, |
| "step": 493 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1301.828125, |
| "epoch": 0.25165562913907286, |
| "grad_norm": 3.667270735290933, |
| "kl": 0.0645751953125, |
| "learning_rate": 8.741721854304636e-07, |
| "loss": 0.0026, |
| "reward": 1.2533040046691895, |
| "reward_std": 0.7434202134609222, |
| "rewards/accuracy_reward": 0.32500000298023224, |
| "rewards/cosine_rewards": -0.03934769332408905, |
| "rewards/format_reward": 0.96875, |
| "rewards/repetition_rewards": -0.0010982811218127608, |
| "step": 494 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1290.1875, |
| "epoch": 0.2521650534895568, |
| "grad_norm": 4.3465734782527345, |
| "kl": 0.0535888671875, |
| "learning_rate": 8.739174732552216e-07, |
| "loss": 0.0021, |
| "reward": 0.6353173404932022, |
| "reward_std": 0.6600025594234467, |
| "rewards/accuracy_reward": -0.040625013411045074, |
| "rewards/cosine_rewards": -0.2607284113764763, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0008293068385683, |
| "step": 495 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1355.375, |
| "epoch": 0.25267447784004077, |
| "grad_norm": 4.901543791199254, |
| "kl": 0.060546875, |
| "learning_rate": 8.736627610799796e-07, |
| "loss": 0.0024, |
| "reward": 1.1404387950897217, |
| "reward_std": 0.670623242855072, |
| "rewards/accuracy_reward": 0.26874999701976776, |
| "rewards/cosine_rewards": -0.11177334189414978, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0009129364043474197, |
| "step": 496 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1328.6875, |
| "epoch": 0.2531839021905247, |
| "grad_norm": 3.9728562721335745, |
| "kl": 0.0489501953125, |
| "learning_rate": 8.734080489047376e-07, |
| "loss": 0.002, |
| "reward": 1.1998997032642365, |
| "reward_std": 0.630705714225769, |
| "rewards/accuracy_reward": 0.32500001788139343, |
| "rewards/cosine_rewards": -0.06150183826684952, |
| "rewards/format_reward": 0.9375, |
| "rewards/repetition_rewards": -0.0010984738764818758, |
| "step": 497 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1373.671875, |
| "epoch": 0.2536933265410087, |
| "grad_norm": 3.325255325148765, |
| "kl": 0.04833984375, |
| "learning_rate": 8.731533367294957e-07, |
| "loss": 0.0019, |
| "reward": 1.2019822597503662, |
| "reward_std": 0.38447779417037964, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/cosine_rewards": -0.04712319001555443, |
| "rewards/format_reward": 0.953125, |
| "rewards/repetition_rewards": -0.0008945107110776007, |
| "step": 498 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1309.28125, |
| "epoch": 0.25420275089149263, |
| "grad_norm": 5.499632802088616, |
| "kl": 0.072265625, |
| "learning_rate": 8.728986245542537e-07, |
| "loss": 0.0029, |
| "reward": 1.6111189126968384, |
| "reward_std": 0.1892632469534874, |
| "rewards/accuracy_reward": 0.550000011920929, |
| "rewards/cosine_rewards": 0.07773812115192413, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0009941596072167158, |
| "step": 499 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1363.625, |
| "epoch": 0.2547121752419766, |
| "grad_norm": 7.195546109062687, |
| "kl": 0.0482177734375, |
| "learning_rate": 8.726439123790117e-07, |
| "loss": 0.0019, |
| "reward": 1.9320534467697144, |
| "reward_std": 0.4148600548505783, |
| "rewards/accuracy_reward": 0.7468750178813934, |
| "rewards/cosine_rewards": 0.20207761228084564, |
| "rewards/format_reward": 0.984375, |
| "rewards/repetition_rewards": -0.0012741541431751102, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 3926, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|