| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 687, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 146.640625, |
| "epoch": 0.001455604075691412, |
| "grad_norm": 2.0520484071530207, |
| "kl": 0.0, |
| "learning_rate": 4.7619047619047613e-08, |
| "loss": 0.0027, |
| "reward": -0.6811327934265137, |
| "reward_std": 1.2931699752807617, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.4427083134651184, |
| "step": 1 |
| }, |
| { |
| "completion_length": 141.84375, |
| "epoch": 0.002911208151382824, |
| "grad_norm": 2.0555621453821775, |
| "kl": 0.0, |
| "learning_rate": 9.523809523809523e-08, |
| "loss": 0.0025, |
| "reward": -0.39337241649627686, |
| "reward_std": 1.4007469415664673, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/format_reward": 0.5158854722976685, |
| "step": 2 |
| }, |
| { |
| "completion_length": 131.109375, |
| "epoch": 0.004366812227074236, |
| "grad_norm": 2.0052335093689617, |
| "kl": 0.000347137451171875, |
| "learning_rate": 1.4285714285714285e-07, |
| "loss": 0.0032, |
| "reward": -0.4100520610809326, |
| "reward_std": 1.3564808368682861, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.5098437666893005, |
| "step": 3 |
| }, |
| { |
| "completion_length": 147.109375, |
| "epoch": 0.005822416302765648, |
| "grad_norm": 9.897016495998724, |
| "kl": 0.0006103515625, |
| "learning_rate": 1.9047619047619045e-07, |
| "loss": -0.0005, |
| "reward": -0.5236002206802368, |
| "reward_std": 1.1223150491714478, |
| "rewards/accuracy_reward": 0.203125, |
| "rewards/format_reward": 0.5694466233253479, |
| "step": 4 |
| }, |
| { |
| "completion_length": 115.609375, |
| "epoch": 0.00727802037845706, |
| "grad_norm": 2.2619632214263254, |
| "kl": 0.000347137451171875, |
| "learning_rate": 2.3809523809523806e-07, |
| "loss": 0.0001, |
| "reward": -0.4710937440395355, |
| "reward_std": 1.1760644912719727, |
| "rewards/accuracy_reward": 0.234375, |
| "rewards/format_reward": 0.48657551407814026, |
| "step": 5 |
| }, |
| { |
| "completion_length": 128.390625, |
| "epoch": 0.008733624454148471, |
| "grad_norm": 2.025201991853294, |
| "kl": 0.000396728515625, |
| "learning_rate": 2.857142857142857e-07, |
| "loss": -0.0004, |
| "reward": -0.3249348998069763, |
| "reward_std": 1.27274489402771, |
| "rewards/accuracy_reward": 0.265625, |
| "rewards/format_reward": 0.5628255009651184, |
| "step": 6 |
| }, |
| { |
| "completion_length": 124.640625, |
| "epoch": 0.010189228529839884, |
| "grad_norm": 3.3130345990864427, |
| "kl": 0.00043487548828125, |
| "learning_rate": 3.333333333333333e-07, |
| "loss": -0.0007, |
| "reward": -0.6294205188751221, |
| "reward_std": 1.305110216140747, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.44719398021698, |
| "step": 7 |
| }, |
| { |
| "completion_length": 125.171875, |
| "epoch": 0.011644832605531296, |
| "grad_norm": 2.2278530283663818, |
| "kl": 0.0003452301025390625, |
| "learning_rate": 3.809523809523809e-07, |
| "loss": -0.0019, |
| "reward": -0.396751344203949, |
| "reward_std": 1.1712387800216675, |
| "rewards/accuracy_reward": 0.234375, |
| "rewards/format_reward": 0.5593684911727905, |
| "step": 8 |
| }, |
| { |
| "completion_length": 142.28125, |
| "epoch": 0.013100436681222707, |
| "grad_norm": 1.9409494681196875, |
| "kl": 0.0004138946533203125, |
| "learning_rate": 4.285714285714285e-07, |
| "loss": 0.001, |
| "reward": -0.7128320336341858, |
| "reward_std": 0.9856235980987549, |
| "rewards/accuracy_reward": 0.140625, |
| "rewards/format_reward": 0.5527409315109253, |
| "step": 9 |
| }, |
| { |
| "completion_length": 139.53125, |
| "epoch": 0.01455604075691412, |
| "grad_norm": 4.067951440500351, |
| "kl": 0.000446319580078125, |
| "learning_rate": 4.761904761904761e-07, |
| "loss": 0.0003, |
| "reward": -0.6937109231948853, |
| "reward_std": 1.2031134366989136, |
| "rewards/accuracy_reward": 0.171875, |
| "rewards/format_reward": 0.483502596616745, |
| "step": 10 |
| }, |
| { |
| "completion_length": 131.96875, |
| "epoch": 0.01601164483260553, |
| "grad_norm": 1.9513622824677586, |
| "kl": 0.0004024505615234375, |
| "learning_rate": 5.238095238095238e-07, |
| "loss": -0.0019, |
| "reward": -0.44268879294395447, |
| "reward_std": 1.2512993812561035, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5169466137886047, |
| "step": 11 |
| }, |
| { |
| "completion_length": 128.296875, |
| "epoch": 0.017467248908296942, |
| "grad_norm": 2.6073182015857483, |
| "kl": 0.0003662109375, |
| "learning_rate": 5.714285714285714e-07, |
| "loss": 0.0019, |
| "reward": -0.11339845508337021, |
| "reward_std": 1.379326581954956, |
| "rewards/accuracy_reward": 0.359375, |
| "rewards/format_reward": 0.5592578053474426, |
| "step": 12 |
| }, |
| { |
| "completion_length": 131.5625, |
| "epoch": 0.018922852983988356, |
| "grad_norm": 2.038224554514447, |
| "kl": 0.0004138946533203125, |
| "learning_rate": 6.19047619047619e-07, |
| "loss": 0.0019, |
| "reward": -0.5053775906562805, |
| "reward_std": 1.0422844886779785, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.5684505105018616, |
| "step": 13 |
| }, |
| { |
| "completion_length": 130.015625, |
| "epoch": 0.020378457059679767, |
| "grad_norm": 8.376191441689384, |
| "kl": 0.00080108642578125, |
| "learning_rate": 6.666666666666666e-07, |
| "loss": 0.0024, |
| "reward": -0.5506510734558105, |
| "reward_std": 1.289644479751587, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.45373696088790894, |
| "step": 14 |
| }, |
| { |
| "completion_length": 143.125, |
| "epoch": 0.021834061135371178, |
| "grad_norm": 8.674544330550578, |
| "kl": 0.0009918212890625, |
| "learning_rate": 7.142857142857143e-07, |
| "loss": 0.001, |
| "reward": -0.517591118812561, |
| "reward_std": 1.3039864301681519, |
| "rewards/accuracy_reward": 0.265625, |
| "rewards/format_reward": 0.48772138357162476, |
| "step": 15 |
| }, |
| { |
| "completion_length": 152.59375, |
| "epoch": 0.023289665211062592, |
| "grad_norm": 2.5986008608286215, |
| "kl": 0.0004673004150390625, |
| "learning_rate": 7.619047619047618e-07, |
| "loss": 0.0034, |
| "reward": -0.47635418176651, |
| "reward_std": 1.3350398540496826, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.4758723974227905, |
| "step": 16 |
| }, |
| { |
| "completion_length": 126.203125, |
| "epoch": 0.024745269286754003, |
| "grad_norm": 4.121392021849402, |
| "kl": 0.00116729736328125, |
| "learning_rate": 8.095238095238095e-07, |
| "loss": -0.0004, |
| "reward": -0.13250651955604553, |
| "reward_std": 1.1838587522506714, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.5381835699081421, |
| "step": 17 |
| }, |
| { |
| "completion_length": 130.640625, |
| "epoch": 0.026200873362445413, |
| "grad_norm": 3.933006083601506, |
| "kl": 0.000957489013671875, |
| "learning_rate": 8.57142857142857e-07, |
| "loss": -0.0009, |
| "reward": -0.29349610209465027, |
| "reward_std": 1.1805915832519531, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/format_reward": 0.5601236820220947, |
| "step": 18 |
| }, |
| { |
| "completion_length": 134.53125, |
| "epoch": 0.027656477438136828, |
| "grad_norm": 2.7373891464842437, |
| "kl": 0.00140380859375, |
| "learning_rate": 9.047619047619047e-07, |
| "loss": 0.0008, |
| "reward": -0.2783724069595337, |
| "reward_std": 1.172222375869751, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/format_reward": 0.5397005081176758, |
| "step": 19 |
| }, |
| { |
| "completion_length": 130.90625, |
| "epoch": 0.02911208151382824, |
| "grad_norm": 2.400503043349488, |
| "kl": 0.000904083251953125, |
| "learning_rate": 9.523809523809522e-07, |
| "loss": -0.0003, |
| "reward": -0.4976627826690674, |
| "reward_std": 1.1664050817489624, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.4811263084411621, |
| "step": 20 |
| }, |
| { |
| "completion_length": 139.421875, |
| "epoch": 0.03056768558951965, |
| "grad_norm": 1.7123015669045267, |
| "kl": 0.0011749267578125, |
| "learning_rate": 1e-06, |
| "loss": 0.0013, |
| "reward": -0.6022005081176758, |
| "reward_std": 0.8686124682426453, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.5883203148841858, |
| "step": 21 |
| }, |
| { |
| "completion_length": 144.796875, |
| "epoch": 0.03202328966521106, |
| "grad_norm": 1.9937512224942535, |
| "kl": 0.00160980224609375, |
| "learning_rate": 9.99994437237857e-07, |
| "loss": 0.0011, |
| "reward": -0.14617840945720673, |
| "reward_std": 1.3580482006072998, |
| "rewards/accuracy_reward": 0.328125, |
| "rewards/format_reward": 0.5967773199081421, |
| "step": 22 |
| }, |
| { |
| "completion_length": 124.6875, |
| "epoch": 0.033478893740902474, |
| "grad_norm": 2.0814826578918852, |
| "kl": 0.00128936767578125, |
| "learning_rate": 9.999777490752055e-07, |
| "loss": 0.0029, |
| "reward": -0.21195964515209198, |
| "reward_std": 1.2764006853103638, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.5546939969062805, |
| "step": 23 |
| }, |
| { |
| "completion_length": 140.765625, |
| "epoch": 0.034934497816593885, |
| "grad_norm": 1.77589421887042, |
| "kl": 0.0024871826171875, |
| "learning_rate": 9.999499358833744e-07, |
| "loss": 0.0023, |
| "reward": -0.4434700608253479, |
| "reward_std": 1.2595456838607788, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.589902400970459, |
| "step": 24 |
| }, |
| { |
| "completion_length": 135.53125, |
| "epoch": 0.036390101892285295, |
| "grad_norm": 1.9978141335810846, |
| "kl": 0.0023956298828125, |
| "learning_rate": 9.999109982812366e-07, |
| "loss": 0.0004, |
| "reward": 0.10095702111721039, |
| "reward_std": 1.251359462738037, |
| "rewards/accuracy_reward": 0.390625, |
| "rewards/format_reward": 0.6653059720993042, |
| "step": 25 |
| }, |
| { |
| "completion_length": 143.03125, |
| "epoch": 0.03784570596797671, |
| "grad_norm": 2.05478507206434, |
| "kl": 0.001953125, |
| "learning_rate": 9.998609371351943e-07, |
| "loss": 0.0017, |
| "reward": -0.4979492127895355, |
| "reward_std": 1.096949815750122, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.6168684959411621, |
| "step": 26 |
| }, |
| { |
| "completion_length": 142.390625, |
| "epoch": 0.039301310043668124, |
| "grad_norm": 9.38751585986725, |
| "kl": 0.2451171875, |
| "learning_rate": 9.997997535591607e-07, |
| "loss": 0.0004, |
| "reward": -0.13885416090488434, |
| "reward_std": 1.2641448974609375, |
| "rewards/accuracy_reward": 0.328125, |
| "rewards/format_reward": 0.5863932371139526, |
| "step": 27 |
| }, |
| { |
| "completion_length": 122.46875, |
| "epoch": 0.040756914119359534, |
| "grad_norm": 2.02901198267971, |
| "kl": 0.002410888671875, |
| "learning_rate": 9.997274489145347e-07, |
| "loss": 0.0019, |
| "reward": 0.0855598971247673, |
| "reward_std": 1.314124345779419, |
| "rewards/accuracy_reward": 0.421875, |
| "rewards/format_reward": 0.5462110042572021, |
| "step": 28 |
| }, |
| { |
| "completion_length": 138.21875, |
| "epoch": 0.042212518195050945, |
| "grad_norm": 1.9186274674092676, |
| "kl": 0.003692626953125, |
| "learning_rate": 9.9964402481017e-07, |
| "loss": 0.0007, |
| "reward": 0.14319661259651184, |
| "reward_std": 1.2557392120361328, |
| "rewards/accuracy_reward": 0.421875, |
| "rewards/format_reward": 0.6084700226783752, |
| "step": 29 |
| }, |
| { |
| "completion_length": 133.484375, |
| "epoch": 0.043668122270742356, |
| "grad_norm": 1.8000804655795466, |
| "kl": 0.00665283203125, |
| "learning_rate": 9.995494831023408e-07, |
| "loss": -0.0004, |
| "reward": -0.0694987028837204, |
| "reward_std": 1.264149785041809, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.6460742354393005, |
| "step": 30 |
| }, |
| { |
| "completion_length": 132.125, |
| "epoch": 0.04512372634643377, |
| "grad_norm": 2.7510703510304624, |
| "kl": 0.0225830078125, |
| "learning_rate": 9.994438258946988e-07, |
| "loss": -0.0004, |
| "reward": -0.2966731786727905, |
| "reward_std": 1.048037052154541, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.6053190231323242, |
| "step": 31 |
| }, |
| { |
| "completion_length": 138.5625, |
| "epoch": 0.046579330422125184, |
| "grad_norm": 1.702872778334861, |
| "kl": 0.0030059814453125, |
| "learning_rate": 9.993270555382281e-07, |
| "loss": -0.0019, |
| "reward": 0.10822266340255737, |
| "reward_std": 1.0069178342819214, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.6695116758346558, |
| "step": 32 |
| }, |
| { |
| "completion_length": 153.203125, |
| "epoch": 0.048034934497816595, |
| "grad_norm": 1.6790629896240423, |
| "kl": 0.0033721923828125, |
| "learning_rate": 9.991991746311915e-07, |
| "loss": -0.0004, |
| "reward": 0.37479168176651, |
| "reward_std": 1.2309496402740479, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.7032031416893005, |
| "step": 33 |
| }, |
| { |
| "completion_length": 122.59375, |
| "epoch": 0.049490538573508006, |
| "grad_norm": 1.926350880972545, |
| "kl": 0.004180908203125, |
| "learning_rate": 9.99060186019073e-07, |
| "loss": -0.0042, |
| "reward": 0.2343815118074417, |
| "reward_std": 0.9639967083930969, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.6073763370513916, |
| "step": 34 |
| }, |
| { |
| "completion_length": 121.46875, |
| "epoch": 0.050946142649199416, |
| "grad_norm": 1.9507933134145956, |
| "kl": 0.006256103515625, |
| "learning_rate": 9.989100927945153e-07, |
| "loss": -0.002, |
| "reward": 0.36109375953674316, |
| "reward_std": 1.2289752960205078, |
| "rewards/accuracy_reward": 0.453125, |
| "rewards/format_reward": 0.6778646111488342, |
| "step": 35 |
| }, |
| { |
| "completion_length": 113.640625, |
| "epoch": 0.05240174672489083, |
| "grad_norm": 1.7649346907748196, |
| "kl": 0.00714111328125, |
| "learning_rate": 9.9874889829725e-07, |
| "loss": -0.0019, |
| "reward": -0.09406250715255737, |
| "reward_std": 0.98655104637146, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.6245573163032532, |
| "step": 36 |
| }, |
| { |
| "completion_length": 140.671875, |
| "epoch": 0.053857350800582245, |
| "grad_norm": 1.6599989465316414, |
| "kl": 0.004180908203125, |
| "learning_rate": 9.985766061140232e-07, |
| "loss": 0.0024, |
| "reward": -0.051347650587558746, |
| "reward_std": 1.1097118854522705, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/format_reward": 0.6869857311248779, |
| "step": 37 |
| }, |
| { |
| "completion_length": 131.609375, |
| "epoch": 0.055312954876273655, |
| "grad_norm": 1.9002315583578684, |
| "kl": 0.004150390625, |
| "learning_rate": 9.983932200785172e-07, |
| "loss": -0.0051, |
| "reward": 0.016217432916164398, |
| "reward_std": 1.0292843580245972, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.6955012679100037, |
| "step": 38 |
| }, |
| { |
| "completion_length": 133.46875, |
| "epoch": 0.056768558951965066, |
| "grad_norm": 1.703318134865483, |
| "kl": 0.005767822265625, |
| "learning_rate": 9.98198744271263e-07, |
| "loss": -0.0047, |
| "reward": 0.3018620014190674, |
| "reward_std": 1.063158392906189, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.669726550579071, |
| "step": 39 |
| }, |
| { |
| "completion_length": 106.578125, |
| "epoch": 0.05822416302765648, |
| "grad_norm": 1.9072082640759345, |
| "kl": 0.020751953125, |
| "learning_rate": 9.979931830195522e-07, |
| "loss": -0.0023, |
| "reward": -0.04225911945104599, |
| "reward_std": 0.9956592321395874, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/format_reward": 0.6362695693969727, |
| "step": 40 |
| }, |
| { |
| "completion_length": 116.96875, |
| "epoch": 0.05967976710334789, |
| "grad_norm": 1.8801906920255198, |
| "kl": 0.006744384765625, |
| "learning_rate": 9.977765408973374e-07, |
| "loss": -0.005, |
| "reward": 0.21731121838092804, |
| "reward_std": 0.9288095235824585, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.6459180116653442, |
| "step": 41 |
| }, |
| { |
| "completion_length": 118.265625, |
| "epoch": 0.0611353711790393, |
| "grad_norm": 1.9537453615984006, |
| "kl": 0.009765625, |
| "learning_rate": 9.975488227251329e-07, |
| "loss": -0.0017, |
| "reward": -0.03567056730389595, |
| "reward_std": 1.0284496545791626, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.645800769329071, |
| "step": 42 |
| }, |
| { |
| "completion_length": 106.71875, |
| "epoch": 0.06259097525473072, |
| "grad_norm": 2.0269318680154806, |
| "kl": 0.00640869140625, |
| "learning_rate": 9.973100335699073e-07, |
| "loss": -0.0026, |
| "reward": 0.33292967081069946, |
| "reward_std": 1.1819396018981934, |
| "rewards/accuracy_reward": 0.453125, |
| "rewards/format_reward": 0.6497005224227905, |
| "step": 43 |
| }, |
| { |
| "completion_length": 120.84375, |
| "epoch": 0.06404657933042213, |
| "grad_norm": 1.954056839145857, |
| "kl": 0.00958251953125, |
| "learning_rate": 9.970601787449696e-07, |
| "loss": 0.0031, |
| "reward": 0.3369661271572113, |
| "reward_std": 1.1963883638381958, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.6441406011581421, |
| "step": 44 |
| }, |
| { |
| "completion_length": 119.484375, |
| "epoch": 0.06550218340611354, |
| "grad_norm": 1.9048660482499535, |
| "kl": 0.007110595703125, |
| "learning_rate": 9.967992638098515e-07, |
| "loss": 0.0017, |
| "reward": -0.12632161378860474, |
| "reward_std": 0.811303973197937, |
| "rewards/accuracy_reward": 0.234375, |
| "rewards/format_reward": 0.6902539134025574, |
| "step": 45 |
| }, |
| { |
| "completion_length": 108.234375, |
| "epoch": 0.06695778748180495, |
| "grad_norm": 1.954242990541008, |
| "kl": 0.01055908203125, |
| "learning_rate": 9.965272945701838e-07, |
| "loss": 0.0053, |
| "reward": 0.1287955790758133, |
| "reward_std": 1.0276615619659424, |
| "rewards/accuracy_reward": 0.359375, |
| "rewards/format_reward": 0.6505273580551147, |
| "step": 46 |
| }, |
| { |
| "completion_length": 111.203125, |
| "epoch": 0.06841339155749636, |
| "grad_norm": 2.0265711266703805, |
| "kl": 0.0101318359375, |
| "learning_rate": 9.962442770775673e-07, |
| "loss": -0.0007, |
| "reward": 0.44425129890441895, |
| "reward_std": 1.1991455554962158, |
| "rewards/accuracy_reward": 0.484375, |
| "rewards/format_reward": 0.675071656703949, |
| "step": 47 |
| }, |
| { |
| "completion_length": 109.75, |
| "epoch": 0.06986899563318777, |
| "grad_norm": 2.071460406757796, |
| "kl": 0.0103759765625, |
| "learning_rate": 9.959502176294382e-07, |
| "loss": 0.0048, |
| "reward": -0.17327472567558289, |
| "reward_std": 0.8208831548690796, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.678769588470459, |
| "step": 48 |
| }, |
| { |
| "completion_length": 101.453125, |
| "epoch": 0.07132459970887918, |
| "grad_norm": 2.1718985191781695, |
| "kl": 0.021728515625, |
| "learning_rate": 9.956451227689277e-07, |
| "loss": -0.0001, |
| "reward": 0.15954425930976868, |
| "reward_std": 0.9772539734840393, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.6399348974227905, |
| "step": 49 |
| }, |
| { |
| "completion_length": 106.046875, |
| "epoch": 0.07278020378457059, |
| "grad_norm": 2.1050743827136533, |
| "kl": 0.01318359375, |
| "learning_rate": 9.953289992847158e-07, |
| "loss": -0.0002, |
| "reward": 0.5692838430404663, |
| "reward_std": 1.0812323093414307, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.6715234518051147, |
| "step": 50 |
| }, |
| { |
| "completion_length": 102.359375, |
| "epoch": 0.07423580786026202, |
| "grad_norm": 1.9030170365144239, |
| "kl": 0.0135498046875, |
| "learning_rate": 9.950018542108817e-07, |
| "loss": 0.0007, |
| "reward": 0.5510026216506958, |
| "reward_std": 1.1338927745819092, |
| "rewards/accuracy_reward": 0.515625, |
| "rewards/format_reward": 0.6648176908493042, |
| "step": 51 |
| }, |
| { |
| "completion_length": 89.515625, |
| "epoch": 0.07569141193595343, |
| "grad_norm": 2.120469098312219, |
| "kl": 0.0157470703125, |
| "learning_rate": 9.946636948267467e-07, |
| "loss": 0.0016, |
| "reward": 0.5175260901451111, |
| "reward_std": 1.038745641708374, |
| "rewards/accuracy_reward": 0.515625, |
| "rewards/format_reward": 0.6268749833106995, |
| "step": 52 |
| }, |
| { |
| "completion_length": 91.75, |
| "epoch": 0.07714701601164484, |
| "grad_norm": 2.086860625748496, |
| "kl": 0.0145263671875, |
| "learning_rate": 9.943145286567113e-07, |
| "loss": -0.0006, |
| "reward": 0.42378902435302734, |
| "reward_std": 1.124879240989685, |
| "rewards/accuracy_reward": 0.484375, |
| "rewards/format_reward": 0.6217838525772095, |
| "step": 53 |
| }, |
| { |
| "completion_length": 97.9375, |
| "epoch": 0.07860262008733625, |
| "grad_norm": 2.019967385216881, |
| "kl": 0.0146484375, |
| "learning_rate": 9.93954363470089e-07, |
| "loss": 0.0035, |
| "reward": 0.3268880248069763, |
| "reward_std": 1.1483426094055176, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.6423567533493042, |
| "step": 54 |
| }, |
| { |
| "completion_length": 99.625, |
| "epoch": 0.08005822416302766, |
| "grad_norm": 2.046031430253623, |
| "kl": 0.0186767578125, |
| "learning_rate": 9.935832072809327e-07, |
| "loss": 0.0026, |
| "reward": 0.30184245109558105, |
| "reward_std": 1.0335478782653809, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.6567773818969727, |
| "step": 55 |
| }, |
| { |
| "completion_length": 93.578125, |
| "epoch": 0.08151382823871907, |
| "grad_norm": 2.1224187100626404, |
| "kl": 0.0186767578125, |
| "learning_rate": 9.932010683478573e-07, |
| "loss": -0.0015, |
| "reward": 0.7045247554779053, |
| "reward_std": 1.007187843322754, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.6370638608932495, |
| "step": 56 |
| }, |
| { |
| "completion_length": 99.71875, |
| "epoch": 0.08296943231441048, |
| "grad_norm": 2.0226555290069905, |
| "kl": 0.019287109375, |
| "learning_rate": 9.928079551738541e-07, |
| "loss": 0.004, |
| "reward": 0.24904297292232513, |
| "reward_std": 1.0046416521072388, |
| "rewards/accuracy_reward": 0.390625, |
| "rewards/format_reward": 0.6374022960662842, |
| "step": 57 |
| }, |
| { |
| "completion_length": 92.59375, |
| "epoch": 0.08442503639010189, |
| "grad_norm": 2.031040356216584, |
| "kl": 0.0264892578125, |
| "learning_rate": 9.92403876506104e-07, |
| "loss": -0.0008, |
| "reward": 0.5398828387260437, |
| "reward_std": 0.9229820966720581, |
| "rewards/accuracy_reward": 0.515625, |
| "rewards/format_reward": 0.639635443687439, |
| "step": 58 |
| }, |
| { |
| "completion_length": 91.3125, |
| "epoch": 0.0858806404657933, |
| "grad_norm": 2.018669545596409, |
| "kl": 0.026611328125, |
| "learning_rate": 9.919888413357807e-07, |
| "loss": 0.0024, |
| "reward": 0.05692709982395172, |
| "reward_std": 1.0041440725326538, |
| "rewards/accuracy_reward": 0.328125, |
| "rewards/format_reward": 0.6213802099227905, |
| "step": 59 |
| }, |
| { |
| "completion_length": 90.390625, |
| "epoch": 0.08733624454148471, |
| "grad_norm": 2.0871119564538034, |
| "kl": 0.0245361328125, |
| "learning_rate": 9.91562858897852e-07, |
| "loss": 0.0021, |
| "reward": 0.27358072996139526, |
| "reward_std": 0.9488109350204468, |
| "rewards/accuracy_reward": 0.421875, |
| "rewards/format_reward": 0.6085677146911621, |
| "step": 60 |
| }, |
| { |
| "completion_length": 90.953125, |
| "epoch": 0.08879184861717612, |
| "grad_norm": 2.0846084451671727, |
| "kl": 0.022216796875, |
| "learning_rate": 9.91125938670874e-07, |
| "loss": 0.0028, |
| "reward": 0.9214128255844116, |
| "reward_std": 0.9024526476860046, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.6374154090881348, |
| "step": 61 |
| }, |
| { |
| "completion_length": 87.796875, |
| "epoch": 0.09024745269286755, |
| "grad_norm": 2.0274598852707553, |
| "kl": 0.02490234375, |
| "learning_rate": 9.906780903767798e-07, |
| "loss": -0.0017, |
| "reward": 0.5340690016746521, |
| "reward_std": 0.9889509677886963, |
| "rewards/accuracy_reward": 0.515625, |
| "rewards/format_reward": 0.6374675035476685, |
| "step": 62 |
| }, |
| { |
| "completion_length": 88.484375, |
| "epoch": 0.09170305676855896, |
| "grad_norm": 2.37260281994688, |
| "kl": 0.02880859375, |
| "learning_rate": 9.902193239806634e-07, |
| "loss": 0.0023, |
| "reward": 0.42514973878860474, |
| "reward_std": 0.8481540679931641, |
| "rewards/accuracy_reward": 0.484375, |
| "rewards/format_reward": 0.6194596290588379, |
| "step": 63 |
| }, |
| { |
| "completion_length": 86.8125, |
| "epoch": 0.09315866084425037, |
| "grad_norm": 2.128381129079585, |
| "kl": 0.023193359375, |
| "learning_rate": 9.897496496905583e-07, |
| "loss": -0.0011, |
| "reward": 0.3611133098602295, |
| "reward_std": 1.0007762908935547, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.6315299272537231, |
| "step": 64 |
| }, |
| { |
| "completion_length": 83.96875, |
| "epoch": 0.09461426491994178, |
| "grad_norm": 2.1833555342889226, |
| "kl": 0.0262451171875, |
| "learning_rate": 9.892690779572096e-07, |
| "loss": 0.0028, |
| "reward": 0.4069466292858124, |
| "reward_std": 0.8976852297782898, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.6169465780258179, |
| "step": 65 |
| }, |
| { |
| "completion_length": 100.078125, |
| "epoch": 0.09606986899563319, |
| "grad_norm": 1.95497475262489, |
| "kl": 0.02783203125, |
| "learning_rate": 9.887776194738431e-07, |
| "loss": -0.0006, |
| "reward": 0.8320702910423279, |
| "reward_std": 1.0947431325912476, |
| "rewards/accuracy_reward": 0.640625, |
| "rewards/format_reward": 0.6451822519302368, |
| "step": 66 |
| }, |
| { |
| "completion_length": 89.546875, |
| "epoch": 0.0975254730713246, |
| "grad_norm": 2.4296708038911046, |
| "kl": 0.027099609375, |
| "learning_rate": 9.882752851759247e-07, |
| "loss": 0.0042, |
| "reward": 0.3521158695220947, |
| "reward_std": 0.9503006935119629, |
| "rewards/accuracy_reward": 0.453125, |
| "rewards/format_reward": 0.6041340827941895, |
| "step": 67 |
| }, |
| { |
| "completion_length": 90.5, |
| "epoch": 0.09898107714701601, |
| "grad_norm": 2.116653369847408, |
| "kl": 0.0250244140625, |
| "learning_rate": 9.877620862409192e-07, |
| "loss": 0.0022, |
| "reward": 0.1717708259820938, |
| "reward_std": 0.9714279174804688, |
| "rewards/accuracy_reward": 0.359375, |
| "rewards/format_reward": 0.6394791603088379, |
| "step": 68 |
| }, |
| { |
| "completion_length": 88.53125, |
| "epoch": 0.10043668122270742, |
| "grad_norm": 2.1712245174516704, |
| "kl": 0.031982421875, |
| "learning_rate": 9.872380340880416e-07, |
| "loss": -0.0013, |
| "reward": 0.5086783766746521, |
| "reward_std": 1.0054187774658203, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.6380794048309326, |
| "step": 69 |
| }, |
| { |
| "completion_length": 88.265625, |
| "epoch": 0.10189228529839883, |
| "grad_norm": 2.206351390007687, |
| "kl": 0.03466796875, |
| "learning_rate": 9.867031403780013e-07, |
| "loss": -0.006, |
| "reward": 0.5890104174613953, |
| "reward_std": 0.8417867422103882, |
| "rewards/accuracy_reward": 0.546875, |
| "rewards/format_reward": 0.6225130558013916, |
| "step": 70 |
| }, |
| { |
| "completion_length": 91.484375, |
| "epoch": 0.10334788937409024, |
| "grad_norm": 2.0985651707368183, |
| "kl": 0.0296630859375, |
| "learning_rate": 9.861574170127444e-07, |
| "loss": -0.0003, |
| "reward": 0.848574161529541, |
| "reward_std": 0.9138556718826294, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.6420508027076721, |
| "step": 71 |
| }, |
| { |
| "completion_length": 81.1875, |
| "epoch": 0.10480349344978165, |
| "grad_norm": 2.3307465788240993, |
| "kl": 0.037109375, |
| "learning_rate": 9.85600876135188e-07, |
| "loss": 0.0021, |
| "reward": 0.5502018332481384, |
| "reward_std": 0.8912982940673828, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.6144856214523315, |
| "step": 72 |
| }, |
| { |
| "completion_length": 86.5625, |
| "epoch": 0.10625909752547306, |
| "grad_norm": 2.1316996208970833, |
| "kl": 0.02880859375, |
| "learning_rate": 9.850335301289504e-07, |
| "loss": 0.0014, |
| "reward": -0.0640755146741867, |
| "reward_std": 0.8301602602005005, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.6316145658493042, |
| "step": 73 |
| }, |
| { |
| "completion_length": 78.46875, |
| "epoch": 0.10771470160116449, |
| "grad_norm": 2.147131800814305, |
| "kl": 0.03857421875, |
| "learning_rate": 9.844553916180746e-07, |
| "loss": -0.0031, |
| "reward": 0.55866539478302, |
| "reward_std": 1.0137114524841309, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.606178343296051, |
| "step": 74 |
| }, |
| { |
| "completion_length": 94.375, |
| "epoch": 0.1091703056768559, |
| "grad_norm": 2.134513729452084, |
| "kl": 0.037109375, |
| "learning_rate": 9.838664734667495e-07, |
| "loss": 0.0012, |
| "reward": 0.6748111844062805, |
| "reward_std": 0.863685131072998, |
| "rewards/accuracy_reward": 0.578125, |
| "rewards/format_reward": 0.6456054449081421, |
| "step": 75 |
| }, |
| { |
| "completion_length": 77.546875, |
| "epoch": 0.11062590975254731, |
| "grad_norm": 2.1654090092198013, |
| "kl": 0.033447265625, |
| "learning_rate": 9.832667887790206e-07, |
| "loss": -0.0003, |
| "reward": 0.39906901121139526, |
| "reward_std": 0.7819440364837646, |
| "rewards/accuracy_reward": 0.453125, |
| "rewards/format_reward": 0.6166341304779053, |
| "step": 76 |
| }, |
| { |
| "completion_length": 87.6875, |
| "epoch": 0.11208151382823872, |
| "grad_norm": 2.2117240462456476, |
| "kl": 0.033935546875, |
| "learning_rate": 9.826563508985016e-07, |
| "loss": -0.0008, |
| "reward": 0.10059896111488342, |
| "reward_std": 0.775175929069519, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.591393232345581, |
| "step": 77 |
| }, |
| { |
| "completion_length": 84.84375, |
| "epoch": 0.11353711790393013, |
| "grad_norm": 2.216126758365653, |
| "kl": 0.034912109375, |
| "learning_rate": 9.820351734080754e-07, |
| "loss": -0.0009, |
| "reward": 0.46383464336395264, |
| "reward_std": 0.6518522500991821, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.6175326108932495, |
| "step": 78 |
| }, |
| { |
| "completion_length": 71.5, |
| "epoch": 0.11499272197962154, |
| "grad_norm": 2.68226792040073, |
| "kl": 0.04345703125, |
| "learning_rate": 9.81403270129592e-07, |
| "loss": -0.0057, |
| "reward": 1.0123958587646484, |
| "reward_std": 0.7545869946479797, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.5960806608200073, |
| "step": 79 |
| }, |
| { |
| "completion_length": 80.859375, |
| "epoch": 0.11644832605531295, |
| "grad_norm": 2.249003158750042, |
| "kl": 0.03564453125, |
| "learning_rate": 9.807606551235627e-07, |
| "loss": -0.0056, |
| "reward": 0.9187760353088379, |
| "reward_std": 0.778544545173645, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.620690107345581, |
| "step": 80 |
| }, |
| { |
| "completion_length": 81.265625, |
| "epoch": 0.11790393013100436, |
| "grad_norm": 2.4482747382151695, |
| "kl": 0.043212890625, |
| "learning_rate": 9.801073426888446e-07, |
| "loss": 0.0019, |
| "reward": 0.7133723497390747, |
| "reward_std": 0.7031675577163696, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.6206510663032532, |
| "step": 81 |
| }, |
| { |
| "completion_length": 77.8125, |
| "epoch": 0.11935953420669577, |
| "grad_norm": 2.2325416864250265, |
| "kl": 0.04052734375, |
| "learning_rate": 9.794433473623248e-07, |
| "loss": 0.0035, |
| "reward": 0.6323372721672058, |
| "reward_std": 0.9128393530845642, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.6164127588272095, |
| "step": 82 |
| }, |
| { |
| "completion_length": 82.046875, |
| "epoch": 0.12081513828238719, |
| "grad_norm": 2.1020592126259126, |
| "kl": 0.032958984375, |
| "learning_rate": 9.787686839185954e-07, |
| "loss": -0.0001, |
| "reward": 1.0958268642425537, |
| "reward_std": 0.6903193593025208, |
| "rewards/accuracy_reward": 0.765625, |
| "rewards/format_reward": 0.622962236404419, |
| "step": 83 |
| }, |
| { |
| "completion_length": 75.625, |
| "epoch": 0.1222707423580786, |
| "grad_norm": 2.225966097945577, |
| "kl": 0.041015625, |
| "learning_rate": 9.780833673696254e-07, |
| "loss": 0.0035, |
| "reward": 0.7522070407867432, |
| "reward_std": 0.7671902179718018, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.6155664920806885, |
| "step": 84 |
| }, |
| { |
| "completion_length": 83.359375, |
| "epoch": 0.12372634643377002, |
| "grad_norm": 2.2693402594500034, |
| "kl": 0.0458984375, |
| "learning_rate": 9.773874129644267e-07, |
| "loss": -0.0006, |
| "reward": 0.6304752826690674, |
| "reward_std": 0.8777204751968384, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.6244465708732605, |
| "step": 85 |
| }, |
| { |
| "completion_length": 76.90625, |
| "epoch": 0.12518195050946143, |
| "grad_norm": 2.138087833605439, |
| "kl": 0.0478515625, |
| "learning_rate": 9.766808361887148e-07, |
| "loss": -0.0009, |
| "reward": 0.8740299344062805, |
| "reward_std": 0.823813796043396, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.6129882335662842, |
| "step": 86 |
| }, |
| { |
| "completion_length": 78.640625, |
| "epoch": 0.12663755458515283, |
| "grad_norm": 2.644901616780376, |
| "kl": 0.0400390625, |
| "learning_rate": 9.759636527645632e-07, |
| "loss": 0.0005, |
| "reward": 0.7980924248695374, |
| "reward_std": 0.8134920001029968, |
| "rewards/accuracy_reward": 0.640625, |
| "rewards/format_reward": 0.6185481548309326, |
| "step": 87 |
| }, |
| { |
| "completion_length": 73.578125, |
| "epoch": 0.12809315866084425, |
| "grad_norm": 2.308714342568071, |
| "kl": 0.047607421875, |
| "learning_rate": 9.752358786500558e-07, |
| "loss": 0.0015, |
| "reward": 0.7030664682388306, |
| "reward_std": 0.874236524105072, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.6089127659797668, |
| "step": 88 |
| }, |
| { |
| "completion_length": 82.390625, |
| "epoch": 0.12954876273653565, |
| "grad_norm": 2.2405963594949982, |
| "kl": 0.038330078125, |
| "learning_rate": 9.744975300389293e-07, |
| "loss": 0.0043, |
| "reward": 0.5240559577941895, |
| "reward_std": 0.8964687585830688, |
| "rewards/accuracy_reward": 0.515625, |
| "rewards/format_reward": 0.6264387369155884, |
| "step": 89 |
| }, |
| { |
| "completion_length": 80.78125, |
| "epoch": 0.13100436681222707, |
| "grad_norm": 2.2976726395774385, |
| "kl": 0.034423828125, |
| "learning_rate": 9.737486233602147e-07, |
| "loss": 0.0015, |
| "reward": 0.5632421970367432, |
| "reward_std": 0.7829184532165527, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.6213281750679016, |
| "step": 90 |
| }, |
| { |
| "completion_length": 78.078125, |
| "epoch": 0.1324599708879185, |
| "grad_norm": 2.102179867053138, |
| "kl": 0.05712890625, |
| "learning_rate": 9.729891752778711e-07, |
| "loss": -0.0009, |
| "reward": 0.9788346290588379, |
| "reward_std": 0.6263606548309326, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.6095768213272095, |
| "step": 91 |
| }, |
| { |
| "completion_length": 89.0625, |
| "epoch": 0.1339155749636099, |
| "grad_norm": 2.4407305687279295, |
| "kl": 0.034912109375, |
| "learning_rate": 9.722192026904144e-07, |
| "loss": -0.0027, |
| "reward": 0.9294661283493042, |
| "reward_std": 0.9660316109657288, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.6371874809265137, |
| "step": 92 |
| }, |
| { |
| "completion_length": 82.0625, |
| "epoch": 0.13537117903930132, |
| "grad_norm": 2.0885102072083717, |
| "kl": 0.0400390625, |
| "learning_rate": 9.71438722730542e-07, |
| "loss": 0.0019, |
| "reward": 1.2173632383346558, |
| "reward_std": 0.5946205854415894, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.6133268475532532, |
| "step": 93 |
| }, |
| { |
| "completion_length": 77.515625, |
| "epoch": 0.13682678311499272, |
| "grad_norm": 2.356967419590387, |
| "kl": 0.038330078125, |
| "learning_rate": 9.706477527647516e-07, |
| "loss": -0.0035, |
| "reward": 0.7038216590881348, |
| "reward_std": 0.6569962501525879, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.6144986748695374, |
| "step": 94 |
| }, |
| { |
| "completion_length": 81.21875, |
| "epoch": 0.13828238719068414, |
| "grad_norm": 2.297586439480163, |
| "kl": 0.03857421875, |
| "learning_rate": 9.698463103929541e-07, |
| "loss": 0.0037, |
| "reward": 0.02611328661441803, |
| "reward_std": 0.5660784840583801, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/format_reward": 0.6138085722923279, |
| "step": 95 |
| }, |
| { |
| "completion_length": 78.6875, |
| "epoch": 0.13973799126637554, |
| "grad_norm": 2.183856708817705, |
| "kl": 0.04736328125, |
| "learning_rate": 9.69034413448083e-07, |
| "loss": -0.0024, |
| "reward": 0.8519986867904663, |
| "reward_std": 0.8133624792098999, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.6173242330551147, |
| "step": 96 |
| }, |
| { |
| "completion_length": 71.03125, |
| "epoch": 0.14119359534206696, |
| "grad_norm": 2.497616068899775, |
| "kl": 0.08154296875, |
| "learning_rate": 9.682120799956961e-07, |
| "loss": 0.0024, |
| "reward": 0.7480989694595337, |
| "reward_std": 0.7188245058059692, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.59375, |
| "step": 97 |
| }, |
| { |
| "completion_length": 75.328125, |
| "epoch": 0.14264919941775836, |
| "grad_norm": 2.521282427156024, |
| "kl": 0.0556640625, |
| "learning_rate": 9.673793283335756e-07, |
| "loss": -0.002, |
| "reward": 1.0954035520553589, |
| "reward_std": 0.6585423946380615, |
| "rewards/accuracy_reward": 0.765625, |
| "rewards/format_reward": 0.6114453077316284, |
| "step": 98 |
| }, |
| { |
| "completion_length": 82.234375, |
| "epoch": 0.14410480349344978, |
| "grad_norm": 2.1738986479204794, |
| "kl": 0.04345703125, |
| "learning_rate": 9.665361769913186e-07, |
| "loss": -0.0008, |
| "reward": 0.48210933804512024, |
| "reward_std": 0.6862488389015198, |
| "rewards/accuracy_reward": 0.515625, |
| "rewards/format_reward": 0.5999479293823242, |
| "step": 99 |
| }, |
| { |
| "completion_length": 85.5625, |
| "epoch": 0.14556040756914118, |
| "grad_norm": 2.1837000916722658, |
| "kl": 0.044189453125, |
| "learning_rate": 9.656826447299271e-07, |
| "loss": 0.0001, |
| "reward": 0.8835351467132568, |
| "reward_std": 0.5615145564079285, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.6321679949760437, |
| "step": 100 |
| }, |
| { |
| "completion_length": 83.546875, |
| "epoch": 0.1470160116448326, |
| "grad_norm": 2.087738988339914, |
| "kl": 0.04052734375, |
| "learning_rate": 9.648187505413884e-07, |
| "loss": 0.0008, |
| "reward": 0.7421875, |
| "reward_std": 0.6311359405517578, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.6250260472297668, |
| "step": 101 |
| }, |
| { |
| "completion_length": 88.03125, |
| "epoch": 0.14847161572052403, |
| "grad_norm": 2.1250846849492526, |
| "kl": 0.04638671875, |
| "learning_rate": 9.639445136482546e-07, |
| "loss": -0.0005, |
| "reward": 0.9741601943969727, |
| "reward_std": 0.681174635887146, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.6371548771858215, |
| "step": 102 |
| }, |
| { |
| "completion_length": 74.359375, |
| "epoch": 0.14992721979621543, |
| "grad_norm": 2.409258859083457, |
| "kl": 0.04150390625, |
| "learning_rate": 9.63059953503213e-07, |
| "loss": 0.0013, |
| "reward": 1.1404622793197632, |
| "reward_std": 0.38158488273620605, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.6089909672737122, |
| "step": 103 |
| }, |
| { |
| "completion_length": 87.359375, |
| "epoch": 0.15138282387190685, |
| "grad_norm": 2.088104027671848, |
| "kl": 0.0380859375, |
| "learning_rate": 9.621650897886541e-07, |
| "loss": -0.0018, |
| "reward": 0.7542252540588379, |
| "reward_std": 0.6290829181671143, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.6356835961341858, |
| "step": 104 |
| }, |
| { |
| "completion_length": 80.703125, |
| "epoch": 0.15283842794759825, |
| "grad_norm": 1.965318350453434, |
| "kl": 0.035400390625, |
| "learning_rate": 9.612599424162343e-07, |
| "loss": -0.0008, |
| "reward": 1.0488801002502441, |
| "reward_std": 0.48284074664115906, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.6244661808013916, |
| "step": 105 |
| }, |
| { |
| "completion_length": 93.875, |
| "epoch": 0.15429403202328967, |
| "grad_norm": 2.1219245641446975, |
| "kl": 0.038818359375, |
| "learning_rate": 9.603445315264316e-07, |
| "loss": 0.0014, |
| "reward": 1.0476692914962769, |
| "reward_std": 0.5120445489883423, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.6368489265441895, |
| "step": 106 |
| }, |
| { |
| "completion_length": 86.890625, |
| "epoch": 0.15574963609898107, |
| "grad_norm": 2.036182966894686, |
| "kl": 0.044189453125, |
| "learning_rate": 9.59418877488098e-07, |
| "loss": -0.004, |
| "reward": 0.955091118812561, |
| "reward_std": 0.804840624332428, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.6345182657241821, |
| "step": 107 |
| }, |
| { |
| "completion_length": 90.484375, |
| "epoch": 0.1572052401746725, |
| "grad_norm": 2.042159784075266, |
| "kl": 0.03173828125, |
| "learning_rate": 9.584830008980067e-07, |
| "loss": 0.0002, |
| "reward": 0.3213476538658142, |
| "reward_std": 0.6555300354957581, |
| "rewards/accuracy_reward": 0.421875, |
| "rewards/format_reward": 0.63895183801651, |
| "step": 108 |
| }, |
| { |
| "completion_length": 94.625, |
| "epoch": 0.1586608442503639, |
| "grad_norm": 1.9758420259115719, |
| "kl": 0.040771484375, |
| "learning_rate": 9.57536922580393e-07, |
| "loss": -0.0006, |
| "reward": 1.0781381130218506, |
| "reward_std": 0.7523989677429199, |
| "rewards/accuracy_reward": 0.765625, |
| "rewards/format_reward": 0.6387500762939453, |
| "step": 109 |
| }, |
| { |
| "completion_length": 91.6875, |
| "epoch": 0.16011644832605532, |
| "grad_norm": 2.331594748973951, |
| "kl": 0.0419921875, |
| "learning_rate": 9.565806635864917e-07, |
| "loss": -0.0045, |
| "reward": 0.5411393046379089, |
| "reward_std": 0.7696890830993652, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.6179623007774353, |
| "step": 110 |
| }, |
| { |
| "completion_length": 90.84375, |
| "epoch": 0.1615720524017467, |
| "grad_norm": 2.2436142517878648, |
| "kl": 0.038330078125, |
| "learning_rate": 9.556142451940679e-07, |
| "loss": 0.0008, |
| "reward": 0.2418619841337204, |
| "reward_std": 0.7631776332855225, |
| "rewards/accuracy_reward": 0.390625, |
| "rewards/format_reward": 0.6416015625, |
| "step": 111 |
| }, |
| { |
| "completion_length": 90.125, |
| "epoch": 0.16302765647743814, |
| "grad_norm": 1.8856149828565558, |
| "kl": 0.0478515625, |
| "learning_rate": 9.546376889069443e-07, |
| "loss": -0.0035, |
| "reward": 0.37279295921325684, |
| "reward_std": 0.6837524175643921, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.6407356858253479, |
| "step": 112 |
| }, |
| { |
| "completion_length": 98.203125, |
| "epoch": 0.16448326055312956, |
| "grad_norm": 2.173251868707325, |
| "kl": 0.03466796875, |
| "learning_rate": 9.536510164545222e-07, |
| "loss": 0.0007, |
| "reward": 1.06194007396698, |
| "reward_std": 0.6035024523735046, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.6475651264190674, |
| "step": 113 |
| }, |
| { |
| "completion_length": 103.3125, |
| "epoch": 0.16593886462882096, |
| "grad_norm": 2.0219223010119887, |
| "kl": 0.0439453125, |
| "learning_rate": 9.526542497912983e-07, |
| "loss": 0.0012, |
| "reward": 0.4283137917518616, |
| "reward_std": 0.6393612027168274, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.6627669334411621, |
| "step": 114 |
| }, |
| { |
| "completion_length": 100.796875, |
| "epoch": 0.16739446870451238, |
| "grad_norm": 2.0798140558182134, |
| "kl": 0.031005859375, |
| "learning_rate": 9.516474110963761e-07, |
| "loss": -0.0026, |
| "reward": 1.1091991662979126, |
| "reward_std": 0.4762105345726013, |
| "rewards/accuracy_reward": 0.765625, |
| "rewards/format_reward": 0.66357421875, |
| "step": 115 |
| }, |
| { |
| "completion_length": 100.421875, |
| "epoch": 0.16885007278020378, |
| "grad_norm": 2.2236939272257445, |
| "kl": 0.0478515625, |
| "learning_rate": 9.506305227729723e-07, |
| "loss": 0.0058, |
| "reward": 0.6438932418823242, |
| "reward_std": 0.509583592414856, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.6582422256469727, |
| "step": 116 |
| }, |
| { |
| "completion_length": 101.171875, |
| "epoch": 0.1703056768558952, |
| "grad_norm": 2.1980519442731468, |
| "kl": 0.0281982421875, |
| "learning_rate": 9.496036074479184e-07, |
| "loss": -0.001, |
| "reward": 0.827063798904419, |
| "reward_std": 0.9014047384262085, |
| "rewards/accuracy_reward": 0.640625, |
| "rewards/format_reward": 0.65771484375, |
| "step": 117 |
| }, |
| { |
| "completion_length": 96.125, |
| "epoch": 0.1717612809315866, |
| "grad_norm": 2.037191204255324, |
| "kl": 0.04150390625, |
| "learning_rate": 9.48566687971157e-07, |
| "loss": -0.0004, |
| "reward": 1.3227994441986084, |
| "reward_std": 0.5190377235412598, |
| "rewards/accuracy_reward": 0.859375, |
| "rewards/format_reward": 0.6540234684944153, |
| "step": 118 |
| }, |
| { |
| "completion_length": 109.09375, |
| "epoch": 0.17321688500727803, |
| "grad_norm": 2.0272853115442544, |
| "kl": 0.03466796875, |
| "learning_rate": 9.475197874152339e-07, |
| "loss": -0.0, |
| "reward": -0.18306639790534973, |
| "reward_std": 0.5607576966285706, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.664681077003479, |
| "step": 119 |
| }, |
| { |
| "completion_length": 96.515625, |
| "epoch": 0.17467248908296942, |
| "grad_norm": 2.0492268918355374, |
| "kl": 0.041748046875, |
| "learning_rate": 9.464629290747842e-07, |
| "loss": -0.0008, |
| "reward": 0.7684569954872131, |
| "reward_std": 0.7265533208847046, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.6427409052848816, |
| "step": 120 |
| }, |
| { |
| "completion_length": 102.375, |
| "epoch": 0.17612809315866085, |
| "grad_norm": 1.9898855505081523, |
| "kl": 0.041259765625, |
| "learning_rate": 9.453961364660142e-07, |
| "loss": -0.001, |
| "reward": 0.7294921875, |
| "reward_std": 0.3104066252708435, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.6654947996139526, |
| "step": 121 |
| }, |
| { |
| "completion_length": 100.15625, |
| "epoch": 0.17758369723435224, |
| "grad_norm": 2.3735086249364046, |
| "kl": 0.037109375, |
| "learning_rate": 9.443194333261779e-07, |
| "loss": -0.0009, |
| "reward": 1.1522916555404663, |
| "reward_std": 0.48791223764419556, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 0.660559892654419, |
| "step": 122 |
| }, |
| { |
| "completion_length": 98.1875, |
| "epoch": 0.17903930131004367, |
| "grad_norm": 2.2233975652336175, |
| "kl": 0.02880859375, |
| "learning_rate": 9.432328436130493e-07, |
| "loss": -0.0013, |
| "reward": 1.3009765148162842, |
| "reward_std": 0.29463040828704834, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 0.656653642654419, |
| "step": 123 |
| }, |
| { |
| "completion_length": 114.890625, |
| "epoch": 0.1804949053857351, |
| "grad_norm": 2.0022084670374865, |
| "kl": 0.037841796875, |
| "learning_rate": 9.421363915043889e-07, |
| "loss": 0.0007, |
| "reward": 0.9197134971618652, |
| "reward_std": 0.3231443166732788, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.6893749833106995, |
| "step": 124 |
| }, |
| { |
| "completion_length": 99.171875, |
| "epoch": 0.1819505094614265, |
| "grad_norm": 1.8630637664051692, |
| "kl": 0.029296875, |
| "learning_rate": 9.410301013974056e-07, |
| "loss": -0.0012, |
| "reward": 0.63825523853302, |
| "reward_std": 0.3934669494628906, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.6578906178474426, |
| "step": 125 |
| }, |
| { |
| "completion_length": 100.859375, |
| "epoch": 0.18340611353711792, |
| "grad_norm": 1.9824110470356529, |
| "kl": 0.039794921875, |
| "learning_rate": 9.399139979082147e-07, |
| "loss": -0.0003, |
| "reward": 0.7100846171379089, |
| "reward_std": 0.5975295901298523, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.6606184244155884, |
| "step": 126 |
| }, |
| { |
| "completion_length": 116.953125, |
| "epoch": 0.1848617176128093, |
| "grad_norm": 1.8889996265777742, |
| "kl": 0.027587890625, |
| "learning_rate": 9.387881058712888e-07, |
| "loss": 0.005, |
| "reward": 0.8783528804779053, |
| "reward_std": 0.8908267021179199, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.6875325441360474, |
| "step": 127 |
| }, |
| { |
| "completion_length": 107.375, |
| "epoch": 0.18631732168850074, |
| "grad_norm": 2.077089346407135, |
| "kl": 0.03173828125, |
| "learning_rate": 9.376524503389065e-07, |
| "loss": -0.002, |
| "reward": 0.4938216507434845, |
| "reward_std": 0.5928758978843689, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.669837236404419, |
| "step": 128 |
| }, |
| { |
| "completion_length": 117.109375, |
| "epoch": 0.18777292576419213, |
| "grad_norm": 1.8893938122618363, |
| "kl": 0.0294189453125, |
| "learning_rate": 9.36507056580594e-07, |
| "loss": -0.0018, |
| "reward": 0.7212304472923279, |
| "reward_std": 0.593620777130127, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.6912304759025574, |
| "step": 129 |
| }, |
| { |
| "completion_length": 123.234375, |
| "epoch": 0.18922852983988356, |
| "grad_norm": 1.7898651835340285, |
| "kl": 0.03173828125, |
| "learning_rate": 9.353519500825637e-07, |
| "loss": 0.0031, |
| "reward": 0.4010286331176758, |
| "reward_std": 0.5465176105499268, |
| "rewards/accuracy_reward": 0.453125, |
| "rewards/format_reward": 0.7014323472976685, |
| "step": 130 |
| }, |
| { |
| "completion_length": 104.0, |
| "epoch": 0.19068413391557495, |
| "grad_norm": 2.2172071907289346, |
| "kl": 0.0277099609375, |
| "learning_rate": 9.341871565471463e-07, |
| "loss": 0.0001, |
| "reward": 0.7860090732574463, |
| "reward_std": 0.4027497172355652, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.6570768356323242, |
| "step": 131 |
| }, |
| { |
| "completion_length": 114.546875, |
| "epoch": 0.19213973799126638, |
| "grad_norm": 1.886899003286242, |
| "kl": 0.0296630859375, |
| "learning_rate": 9.330127018922193e-07, |
| "loss": -0.0045, |
| "reward": 0.7923893332481384, |
| "reward_std": 0.5422953367233276, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.6841601133346558, |
| "step": 132 |
| }, |
| { |
| "completion_length": 127.171875, |
| "epoch": 0.19359534206695778, |
| "grad_norm": 1.940672486847996, |
| "kl": 0.0286865234375, |
| "learning_rate": 9.318286122506302e-07, |
| "loss": 0.0018, |
| "reward": 0.7649609446525574, |
| "reward_std": 0.9051897525787354, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.6996744871139526, |
| "step": 133 |
| }, |
| { |
| "completion_length": 122.59375, |
| "epoch": 0.1950509461426492, |
| "grad_norm": 2.012027311337077, |
| "kl": 0.03271484375, |
| "learning_rate": 9.306349139696154e-07, |
| "loss": 0.0021, |
| "reward": 0.9356836080551147, |
| "reward_std": 0.3557165265083313, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.6972851157188416, |
| "step": 134 |
| }, |
| { |
| "completion_length": 125.34375, |
| "epoch": 0.1965065502183406, |
| "grad_norm": 1.9803313783521628, |
| "kl": 0.03466796875, |
| "learning_rate": 9.29431633610213e-07, |
| "loss": -0.0039, |
| "reward": 1.0579817295074463, |
| "reward_std": 0.7984186410903931, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.6721354126930237, |
| "step": 135 |
| }, |
| { |
| "completion_length": 151.75, |
| "epoch": 0.19796215429403202, |
| "grad_norm": 1.6537670662618684, |
| "kl": 0.0308837890625, |
| "learning_rate": 9.282187979466729e-07, |
| "loss": 0.0042, |
| "reward": 0.8291862607002258, |
| "reward_std": 0.5554116368293762, |
| "rewards/accuracy_reward": 0.640625, |
| "rewards/format_reward": 0.7382357120513916, |
| "step": 136 |
| }, |
| { |
| "completion_length": 147.265625, |
| "epoch": 0.19941775836972345, |
| "grad_norm": 1.7201446218582972, |
| "kl": 0.031982421875, |
| "learning_rate": 9.269964339658604e-07, |
| "loss": 0.006, |
| "reward": 1.283261775970459, |
| "reward_std": 0.5926542282104492, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 0.7361132502555847, |
| "step": 137 |
| }, |
| { |
| "completion_length": 134.375, |
| "epoch": 0.20087336244541484, |
| "grad_norm": 1.874315712431397, |
| "kl": 0.03173828125, |
| "learning_rate": 9.257645688666555e-07, |
| "loss": -0.0038, |
| "reward": 1.069654941558838, |
| "reward_std": 0.37698203325271606, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.7255665063858032, |
| "step": 138 |
| }, |
| { |
| "completion_length": 136.75, |
| "epoch": 0.20232896652110627, |
| "grad_norm": 1.814100036642101, |
| "kl": 0.0301513671875, |
| "learning_rate": 9.245232300593488e-07, |
| "loss": 0.0007, |
| "reward": 0.5383853912353516, |
| "reward_std": 0.32863178849220276, |
| "rewards/accuracy_reward": 0.515625, |
| "rewards/format_reward": 0.7274609804153442, |
| "step": 139 |
| }, |
| { |
| "completion_length": 156.84375, |
| "epoch": 0.20378457059679767, |
| "grad_norm": 1.7660157876424847, |
| "kl": 0.042236328125, |
| "learning_rate": 9.232724451650302e-07, |
| "loss": -0.0023, |
| "reward": 1.042018175125122, |
| "reward_std": 0.4296218752861023, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.7598958611488342, |
| "step": 140 |
| }, |
| { |
| "completion_length": 152.078125, |
| "epoch": 0.2052401746724891, |
| "grad_norm": 1.9326818117418496, |
| "kl": 0.03662109375, |
| "learning_rate": 9.220122420149752e-07, |
| "loss": -0.0015, |
| "reward": 0.7897005081176758, |
| "reward_std": 0.8649002313613892, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.7523828148841858, |
| "step": 141 |
| }, |
| { |
| "completion_length": 146.171875, |
| "epoch": 0.2066957787481805, |
| "grad_norm": 2.3170747889179975, |
| "kl": 0.04150390625, |
| "learning_rate": 9.207426486500251e-07, |
| "loss": 0.0029, |
| "reward": 0.9602214097976685, |
| "reward_std": 0.48202353715896606, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.7439453601837158, |
| "step": 142 |
| }, |
| { |
| "completion_length": 153.78125, |
| "epoch": 0.2081513828238719, |
| "grad_norm": 1.8295711779834491, |
| "kl": 0.0380859375, |
| "learning_rate": 9.194636933199637e-07, |
| "loss": -0.0005, |
| "reward": 0.6159765720367432, |
| "reward_std": 0.6443432569503784, |
| "rewards/accuracy_reward": 0.546875, |
| "rewards/format_reward": 0.7611978650093079, |
| "step": 143 |
| }, |
| { |
| "completion_length": 167.6875, |
| "epoch": 0.2096069868995633, |
| "grad_norm": 1.8055857818112881, |
| "kl": 0.03466796875, |
| "learning_rate": 9.18175404482888e-07, |
| "loss": -0.0004, |
| "reward": 0.7853385210037231, |
| "reward_std": 0.6822813153266907, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.7819271087646484, |
| "step": 144 |
| }, |
| { |
| "completion_length": 149.59375, |
| "epoch": 0.21106259097525473, |
| "grad_norm": 1.7611225555314238, |
| "kl": 0.0361328125, |
| "learning_rate": 9.168778108045758e-07, |
| "loss": 0.0014, |
| "reward": 1.3780207633972168, |
| "reward_std": 0.5230543613433838, |
| "rewards/accuracy_reward": 0.859375, |
| "rewards/format_reward": 0.7534895539283752, |
| "step": 145 |
| }, |
| { |
| "completion_length": 154.515625, |
| "epoch": 0.21251819505094613, |
| "grad_norm": 1.8484664379659073, |
| "kl": 0.03515625, |
| "learning_rate": 9.155709411578467e-07, |
| "loss": -0.0028, |
| "reward": 0.663769543170929, |
| "reward_std": 0.8413479328155518, |
| "rewards/accuracy_reward": 0.578125, |
| "rewards/format_reward": 0.7554622292518616, |
| "step": 146 |
| }, |
| { |
| "completion_length": 169.25, |
| "epoch": 0.21397379912663755, |
| "grad_norm": 1.5659465541368485, |
| "kl": 0.03125, |
| "learning_rate": 9.14254824621921e-07, |
| "loss": 0.0025, |
| "reward": 0.8898242115974426, |
| "reward_std": 0.61174476146698, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.7810482382774353, |
| "step": 147 |
| }, |
| { |
| "completion_length": 183.078125, |
| "epoch": 0.21542940320232898, |
| "grad_norm": 1.9240333532034173, |
| "kl": 0.0299072265625, |
| "learning_rate": 9.129294904817715e-07, |
| "loss": 0.0055, |
| "reward": -0.32472002506256104, |
| "reward_std": 0.5251954793930054, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.804186224937439, |
| "step": 148 |
| }, |
| { |
| "completion_length": 188.078125, |
| "epoch": 0.21688500727802038, |
| "grad_norm": 1.584600042627141, |
| "kl": 0.037353515625, |
| "learning_rate": 9.115949682274727e-07, |
| "loss": -0.0006, |
| "reward": 1.0250911712646484, |
| "reward_std": 0.7966851592063904, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.7925260066986084, |
| "step": 149 |
| }, |
| { |
| "completion_length": 192.75, |
| "epoch": 0.2183406113537118, |
| "grad_norm": 1.7968297696360862, |
| "kl": 0.034912109375, |
| "learning_rate": 9.102512875535438e-07, |
| "loss": 0.0024, |
| "reward": 0.9074022769927979, |
| "reward_std": 0.5267736315727234, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.8198763132095337, |
| "step": 150 |
| }, |
| { |
| "completion_length": 181.109375, |
| "epoch": 0.2197962154294032, |
| "grad_norm": 1.487459256780106, |
| "kl": 0.034912109375, |
| "learning_rate": 9.088984783582889e-07, |
| "loss": 0.0008, |
| "reward": 1.1278971433639526, |
| "reward_std": 0.6906546950340271, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.771959662437439, |
| "step": 151 |
| }, |
| { |
| "completion_length": 168.859375, |
| "epoch": 0.22125181950509462, |
| "grad_norm": 1.6688905020817022, |
| "kl": 0.033203125, |
| "learning_rate": 9.075365707431311e-07, |
| "loss": 0.0037, |
| "reward": 0.38999348878860474, |
| "reward_std": 0.7020710110664368, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.7868424654006958, |
| "step": 152 |
| }, |
| { |
| "completion_length": 189.75, |
| "epoch": 0.22270742358078602, |
| "grad_norm": 1.5057560252284368, |
| "kl": 0.03369140625, |
| "learning_rate": 9.061655950119429e-07, |
| "loss": -0.0008, |
| "reward": 0.7166080474853516, |
| "reward_std": 0.7406556010246277, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.7738346457481384, |
| "step": 153 |
| }, |
| { |
| "completion_length": 195.203125, |
| "epoch": 0.22416302765647744, |
| "grad_norm": 1.6050100488071573, |
| "kl": 0.029296875, |
| "learning_rate": 9.04785581670372e-07, |
| "loss": -0.0032, |
| "reward": 0.00641275942325592, |
| "reward_std": 0.9503659605979919, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.8109830617904663, |
| "step": 154 |
| }, |
| { |
| "completion_length": 209.5625, |
| "epoch": 0.22561863173216884, |
| "grad_norm": 1.671842517453717, |
| "kl": 0.030517578125, |
| "learning_rate": 9.033965614251622e-07, |
| "loss": 0.002, |
| "reward": 1.051744818687439, |
| "reward_std": 0.6885015964508057, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.8231119513511658, |
| "step": 155 |
| }, |
| { |
| "completion_length": 182.359375, |
| "epoch": 0.22707423580786026, |
| "grad_norm": 1.7685493760830322, |
| "kl": 0.03466796875, |
| "learning_rate": 9.019985651834703e-07, |
| "loss": 0.0004, |
| "reward": 1.1568944454193115, |
| "reward_std": 0.6391496658325195, |
| "rewards/accuracy_reward": 0.765625, |
| "rewards/format_reward": 0.7899413704872131, |
| "step": 156 |
| }, |
| { |
| "completion_length": 203.671875, |
| "epoch": 0.22852983988355166, |
| "grad_norm": 1.6158229805841648, |
| "kl": 0.04541015625, |
| "learning_rate": 9.005916240521787e-07, |
| "loss": -0.0008, |
| "reward": 1.3469856977462769, |
| "reward_std": 0.5584310293197632, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.8243294358253479, |
| "step": 157 |
| }, |
| { |
| "completion_length": 198.0, |
| "epoch": 0.22998544395924309, |
| "grad_norm": 1.5287888769470326, |
| "kl": 0.03173828125, |
| "learning_rate": 8.99175769337203e-07, |
| "loss": -0.0002, |
| "reward": 0.5042838454246521, |
| "reward_std": 0.3807409107685089, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.8300260901451111, |
| "step": 158 |
| }, |
| { |
| "completion_length": 198.5625, |
| "epoch": 0.2314410480349345, |
| "grad_norm": 1.6980632238166218, |
| "kl": 0.045654296875, |
| "learning_rate": 8.97751032542795e-07, |
| "loss": 0.0048, |
| "reward": 0.7278580665588379, |
| "reward_std": 0.8472484350204468, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.809654951095581, |
| "step": 159 |
| }, |
| { |
| "completion_length": 209.734375, |
| "epoch": 0.2328966521106259, |
| "grad_norm": 1.4763866341269176, |
| "kl": 0.03759765625, |
| "learning_rate": 8.963174453708424e-07, |
| "loss": -0.0002, |
| "reward": 1.0725455284118652, |
| "reward_std": 0.9471028447151184, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.8029752969741821, |
| "step": 160 |
| }, |
| { |
| "completion_length": 204.84375, |
| "epoch": 0.23435225618631733, |
| "grad_norm": 1.509135110734139, |
| "kl": 0.039794921875, |
| "learning_rate": 8.94875039720163e-07, |
| "loss": 0.0021, |
| "reward": 0.7245508432388306, |
| "reward_std": 0.28923317790031433, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.8137694597244263, |
| "step": 161 |
| }, |
| { |
| "completion_length": 205.6875, |
| "epoch": 0.23580786026200873, |
| "grad_norm": 1.670333061374936, |
| "kl": 0.03466796875, |
| "learning_rate": 8.934238476857949e-07, |
| "loss": -0.0015, |
| "reward": 1.22831392288208, |
| "reward_std": 0.6582134366035461, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.7960742115974426, |
| "step": 162 |
| }, |
| { |
| "completion_length": 219.171875, |
| "epoch": 0.23726346433770015, |
| "grad_norm": 1.4108741219708183, |
| "kl": 0.04345703125, |
| "learning_rate": 8.919639015582828e-07, |
| "loss": 0.0007, |
| "reward": 0.8317968845367432, |
| "reward_std": 0.8654596209526062, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.8558332920074463, |
| "step": 163 |
| }, |
| { |
| "completion_length": 221.921875, |
| "epoch": 0.23871906841339155, |
| "grad_norm": 1.5210214995683051, |
| "kl": 0.038330078125, |
| "learning_rate": 8.904952338229587e-07, |
| "loss": 0.0028, |
| "reward": 1.3629167079925537, |
| "reward_std": 0.4224995970726013, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 0.8020312786102295, |
| "step": 164 |
| }, |
| { |
| "completion_length": 221.109375, |
| "epoch": 0.24017467248908297, |
| "grad_norm": 1.570373533347191, |
| "kl": 0.047119140625, |
| "learning_rate": 8.890178771592197e-07, |
| "loss": -0.003, |
| "reward": 1.1900064945220947, |
| "reward_std": 0.45047634840011597, |
| "rewards/accuracy_reward": 0.765625, |
| "rewards/format_reward": 0.8544987440109253, |
| "step": 165 |
| }, |
| { |
| "completion_length": 226.859375, |
| "epoch": 0.24163027656477437, |
| "grad_norm": 1.4369155698170966, |
| "kl": 0.041748046875, |
| "learning_rate": 8.875318644398007e-07, |
| "loss": 0.0001, |
| "reward": 1.1164518594741821, |
| "reward_std": 0.6411557793617249, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.8440819978713989, |
| "step": 166 |
| }, |
| { |
| "completion_length": 217.828125, |
| "epoch": 0.2430858806404658, |
| "grad_norm": 1.5294020027875026, |
| "kl": 0.04150390625, |
| "learning_rate": 8.860372287300431e-07, |
| "loss": -0.0025, |
| "reward": 0.8133528828620911, |
| "reward_std": 0.3728664517402649, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.8488085269927979, |
| "step": 167 |
| }, |
| { |
| "completion_length": 232.46875, |
| "epoch": 0.2445414847161572, |
| "grad_norm": 1.4493765933491527, |
| "kl": 0.043701171875, |
| "learning_rate": 8.845340032871583e-07, |
| "loss": 0.0056, |
| "reward": 0.8572330474853516, |
| "reward_std": 0.6895395517349243, |
| "rewards/accuracy_reward": 0.640625, |
| "rewards/format_reward": 0.82414710521698, |
| "step": 168 |
| }, |
| { |
| "completion_length": 226.5, |
| "epoch": 0.24599708879184862, |
| "grad_norm": 1.4764300782122841, |
| "kl": 0.0556640625, |
| "learning_rate": 8.83022221559489e-07, |
| "loss": 0.0009, |
| "reward": 0.47426432371139526, |
| "reward_std": 0.8444293737411499, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.8511523604393005, |
| "step": 169 |
| }, |
| { |
| "completion_length": 237.8125, |
| "epoch": 0.24745269286754004, |
| "grad_norm": 1.6769320026302001, |
| "kl": 0.0498046875, |
| "learning_rate": 8.815019171857637e-07, |
| "loss": -0.0027, |
| "reward": 1.0890560150146484, |
| "reward_std": 0.5113028287887573, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.8479622602462769, |
| "step": 170 |
| }, |
| { |
| "completion_length": 261.34375, |
| "epoch": 0.24890829694323144, |
| "grad_norm": 1.2657644862990098, |
| "kl": 0.039794921875, |
| "learning_rate": 8.799731239943487e-07, |
| "loss": 0.0036, |
| "reward": 1.0662890672683716, |
| "reward_std": 0.6503676176071167, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.8455598950386047, |
| "step": 171 |
| }, |
| { |
| "completion_length": 242.8125, |
| "epoch": 0.25036390101892286, |
| "grad_norm": 1.5140610941306871, |
| "kl": 0.0439453125, |
| "learning_rate": 8.784358760024959e-07, |
| "loss": 0.0, |
| "reward": 1.0661003589630127, |
| "reward_std": 0.7506436705589294, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.7991602420806885, |
| "step": 172 |
| }, |
| { |
| "completion_length": 272.265625, |
| "epoch": 0.25181950509461426, |
| "grad_norm": 1.3002869973291449, |
| "kl": 0.04345703125, |
| "learning_rate": 8.768902074155847e-07, |
| "loss": 0.0012, |
| "reward": 0.5323176383972168, |
| "reward_std": 0.7601783275604248, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.8805208206176758, |
| "step": 173 |
| }, |
| { |
| "completion_length": 260.171875, |
| "epoch": 0.25327510917030566, |
| "grad_norm": 1.2395481937954542, |
| "kl": 0.04345703125, |
| "learning_rate": 8.753361526263621e-07, |
| "loss": 0.0013, |
| "reward": 1.0828580856323242, |
| "reward_std": 0.8798565864562988, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.8691210746765137, |
| "step": 174 |
| }, |
| { |
| "completion_length": 268.65625, |
| "epoch": 0.2547307132459971, |
| "grad_norm": 1.324781469590268, |
| "kl": 0.046875, |
| "learning_rate": 8.737737462141768e-07, |
| "loss": -0.0019, |
| "reward": 1.4227733612060547, |
| "reward_std": 0.8888717293739319, |
| "rewards/accuracy_reward": 0.859375, |
| "rewards/format_reward": 0.8309636116027832, |
| "step": 175 |
| }, |
| { |
| "completion_length": 266.9375, |
| "epoch": 0.2561863173216885, |
| "grad_norm": 1.2486364117958004, |
| "kl": 0.049072265625, |
| "learning_rate": 8.722030229442095e-07, |
| "loss": 0.0013, |
| "reward": 1.1527018547058105, |
| "reward_std": 0.9395639300346375, |
| "rewards/accuracy_reward": 0.765625, |
| "rewards/format_reward": 0.8161133527755737, |
| "step": 176 |
| }, |
| { |
| "completion_length": 287.296875, |
| "epoch": 0.2576419213973799, |
| "grad_norm": 1.3449980174695708, |
| "kl": 0.0458984375, |
| "learning_rate": 8.706240177667001e-07, |
| "loss": -0.0028, |
| "reward": 0.6147265434265137, |
| "reward_std": 0.7540205717086792, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.8909245133399963, |
| "step": 177 |
| }, |
| { |
| "completion_length": 270.5, |
| "epoch": 0.2590975254730713, |
| "grad_norm": 1.6585671029978815, |
| "kl": 0.052490234375, |
| "learning_rate": 8.690367658161694e-07, |
| "loss": 0.0004, |
| "reward": 0.9076562523841858, |
| "reward_std": 0.698926568031311, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.88106769323349, |
| "step": 178 |
| }, |
| { |
| "completion_length": 299.03125, |
| "epoch": 0.26055312954876275, |
| "grad_norm": 1.27502354364528, |
| "kl": 0.04443359375, |
| "learning_rate": 8.674413024106379e-07, |
| "loss": 0.0011, |
| "reward": 0.06214843690395355, |
| "reward_std": 0.6285444498062134, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.8771094083786011, |
| "step": 179 |
| }, |
| { |
| "completion_length": 311.5625, |
| "epoch": 0.26200873362445415, |
| "grad_norm": 1.1519559026695043, |
| "kl": 0.044677734375, |
| "learning_rate": 8.658376630508391e-07, |
| "loss": 0.0012, |
| "reward": 0.49518883228302, |
| "reward_std": 1.1731948852539062, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.8484830856323242, |
| "step": 180 |
| }, |
| { |
| "completion_length": 338.453125, |
| "epoch": 0.26346433770014555, |
| "grad_norm": 1.1630608606801263, |
| "kl": 0.03662109375, |
| "learning_rate": 8.642258834194305e-07, |
| "loss": 0.0, |
| "reward": 0.5747005343437195, |
| "reward_std": 1.185742974281311, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.855039119720459, |
| "step": 181 |
| }, |
| { |
| "completion_length": 298.203125, |
| "epoch": 0.264919941775837, |
| "grad_norm": 1.092799001313928, |
| "kl": 0.05078125, |
| "learning_rate": 8.626059993801986e-07, |
| "loss": 0.0013, |
| "reward": 1.1474609375, |
| "reward_std": 1.3674826622009277, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 0.7989974021911621, |
| "step": 182 |
| }, |
| { |
| "completion_length": 316.59375, |
| "epoch": 0.2663755458515284, |
| "grad_norm": 1.1905408752898774, |
| "kl": 0.03662109375, |
| "learning_rate": 8.609780469772621e-07, |
| "loss": -0.0003, |
| "reward": 0.4997330904006958, |
| "reward_std": 1.4191782474517822, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.7479101419448853, |
| "step": 183 |
| }, |
| { |
| "completion_length": 343.40625, |
| "epoch": 0.2678311499272198, |
| "grad_norm": 1.1407997883526075, |
| "kl": 0.038818359375, |
| "learning_rate": 8.593420624342691e-07, |
| "loss": -0.0005, |
| "reward": -0.2410416603088379, |
| "reward_std": 1.0026159286499023, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/format_reward": 0.806236982345581, |
| "step": 184 |
| }, |
| { |
| "completion_length": 292.59375, |
| "epoch": 0.2692867540029112, |
| "grad_norm": 1.3091775192956971, |
| "kl": 0.057373046875, |
| "learning_rate": 8.57698082153591e-07, |
| "loss": 0.0022, |
| "reward": 0.5668359398841858, |
| "reward_std": 1.0860553979873657, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.8349218368530273, |
| "step": 185 |
| }, |
| { |
| "completion_length": 297.1875, |
| "epoch": 0.27074235807860264, |
| "grad_norm": 1.1878094968164294, |
| "kl": 0.03857421875, |
| "learning_rate": 8.560461427155128e-07, |
| "loss": 0.0024, |
| "reward": 0.8997591137886047, |
| "reward_std": 1.2781544923782349, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.8541991710662842, |
| "step": 186 |
| }, |
| { |
| "completion_length": 283.4375, |
| "epoch": 0.27219796215429404, |
| "grad_norm": 1.3178336609529575, |
| "kl": 0.048828125, |
| "learning_rate": 8.543862808774191e-07, |
| "loss": -0.0003, |
| "reward": 1.5494986772537231, |
| "reward_std": 0.4223147928714752, |
| "rewards/accuracy_reward": 0.875, |
| "rewards/format_reward": 0.9061654210090637, |
| "step": 187 |
| }, |
| { |
| "completion_length": 296.890625, |
| "epoch": 0.27365356622998543, |
| "grad_norm": 1.2645455173847566, |
| "kl": 0.04541015625, |
| "learning_rate": 8.527185335729765e-07, |
| "loss": 0.0035, |
| "reward": 0.7015169262886047, |
| "reward_std": 1.2943627834320068, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.8435481786727905, |
| "step": 188 |
| }, |
| { |
| "completion_length": 329.90625, |
| "epoch": 0.27510917030567683, |
| "grad_norm": 1.1581867103902632, |
| "kl": 0.0498046875, |
| "learning_rate": 8.510429379113113e-07, |
| "loss": -0.004, |
| "reward": 0.47220054268836975, |
| "reward_std": 0.8791263103485107, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.8547526001930237, |
| "step": 189 |
| }, |
| { |
| "completion_length": 330.734375, |
| "epoch": 0.2765647743813683, |
| "grad_norm": 1.242052928211484, |
| "kl": 0.0419921875, |
| "learning_rate": 8.493595311761836e-07, |
| "loss": 0.0019, |
| "reward": 0.0584830716252327, |
| "reward_std": 0.9290468096733093, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.7730534076690674, |
| "step": 190 |
| }, |
| { |
| "completion_length": 298.8125, |
| "epoch": 0.2780203784570597, |
| "grad_norm": 1.0587431693427292, |
| "kl": 0.04931640625, |
| "learning_rate": 8.47668350825159e-07, |
| "loss": 0.002, |
| "reward": 0.8388606309890747, |
| "reward_std": 0.9359092712402344, |
| "rewards/accuracy_reward": 0.640625, |
| "rewards/format_reward": 0.8803450465202332, |
| "step": 191 |
| }, |
| { |
| "completion_length": 295.765625, |
| "epoch": 0.2794759825327511, |
| "grad_norm": 1.2325069141615899, |
| "kl": 0.044189453125, |
| "learning_rate": 8.459694344887731e-07, |
| "loss": -0.0002, |
| "reward": 1.281217336654663, |
| "reward_std": 0.7809577584266663, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.8709959983825684, |
| "step": 192 |
| }, |
| { |
| "completion_length": 344.203125, |
| "epoch": 0.28093158660844253, |
| "grad_norm": 1.0295971882868977, |
| "kl": 0.03857421875, |
| "learning_rate": 8.44262819969696e-07, |
| "loss": 0.0018, |
| "reward": -0.01013021171092987, |
| "reward_std": 1.1098777055740356, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.8218880295753479, |
| "step": 193 |
| }, |
| { |
| "completion_length": 341.453125, |
| "epoch": 0.2823871906841339, |
| "grad_norm": 1.1002232540991268, |
| "kl": 0.043212890625, |
| "learning_rate": 8.425485452418905e-07, |
| "loss": -0.0031, |
| "reward": 1.3040039539337158, |
| "reward_std": 1.2101850509643555, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.8196288347244263, |
| "step": 194 |
| }, |
| { |
| "completion_length": 313.125, |
| "epoch": 0.2838427947598253, |
| "grad_norm": 1.2142916830344008, |
| "kl": 0.04443359375, |
| "learning_rate": 8.408266484497664e-07, |
| "loss": 0.0029, |
| "reward": 0.5552148222923279, |
| "reward_std": 0.7991162538528442, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.8420246839523315, |
| "step": 195 |
| }, |
| { |
| "completion_length": 295.796875, |
| "epoch": 0.2852983988355167, |
| "grad_norm": 1.0476158305891778, |
| "kl": 0.0439453125, |
| "learning_rate": 8.39097167907333e-07, |
| "loss": -0.0038, |
| "reward": 0.6949739456176758, |
| "reward_std": 1.1475563049316406, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.8432031273841858, |
| "step": 196 |
| }, |
| { |
| "completion_length": 286.109375, |
| "epoch": 0.2867540029112082, |
| "grad_norm": 1.326917469312428, |
| "kl": 0.04931640625, |
| "learning_rate": 8.373601420973463e-07, |
| "loss": -0.0043, |
| "reward": 0.5129296779632568, |
| "reward_std": 0.5196168422698975, |
| "rewards/accuracy_reward": 0.515625, |
| "rewards/format_reward": 0.9051302075386047, |
| "step": 197 |
| }, |
| { |
| "completion_length": 267.21875, |
| "epoch": 0.28820960698689957, |
| "grad_norm": 1.4099736885345076, |
| "kl": 0.054443359375, |
| "learning_rate": 8.356156096704514e-07, |
| "loss": 0.0035, |
| "reward": 0.9967187643051147, |
| "reward_std": 0.6953111886978149, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.8652864694595337, |
| "step": 198 |
| }, |
| { |
| "completion_length": 291.3125, |
| "epoch": 0.28966521106259097, |
| "grad_norm": 1.3657308201508438, |
| "kl": 0.04296875, |
| "learning_rate": 8.338636094443241e-07, |
| "loss": 0.0016, |
| "reward": 0.6036978960037231, |
| "reward_std": 0.5057134628295898, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.8577994704246521, |
| "step": 199 |
| }, |
| { |
| "completion_length": 301.484375, |
| "epoch": 0.29112081513828236, |
| "grad_norm": 1.1206020594596815, |
| "kl": 0.04345703125, |
| "learning_rate": 8.32104180402807e-07, |
| "loss": 0.001, |
| "reward": 0.6353580951690674, |
| "reward_std": 1.240022897720337, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.8110611438751221, |
| "step": 200 |
| }, |
| { |
| "completion_length": 276.0, |
| "epoch": 0.2925764192139738, |
| "grad_norm": 1.4814987178126335, |
| "kl": 0.041015625, |
| "learning_rate": 8.303373616950406e-07, |
| "loss": 0.0003, |
| "reward": 0.9168750047683716, |
| "reward_std": 0.5409839749336243, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.8954036235809326, |
| "step": 201 |
| }, |
| { |
| "completion_length": 284.796875, |
| "epoch": 0.2940320232896652, |
| "grad_norm": 1.3014402895334682, |
| "kl": 0.04296875, |
| "learning_rate": 8.285631926345943e-07, |
| "loss": 0.0009, |
| "reward": 1.4197134971618652, |
| "reward_std": 0.7836724519729614, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.9230338335037231, |
| "step": 202 |
| }, |
| { |
| "completion_length": 303.765625, |
| "epoch": 0.2954876273653566, |
| "grad_norm": 1.4100146478905302, |
| "kl": 0.044921875, |
| "learning_rate": 8.267817126985897e-07, |
| "loss": 0.0032, |
| "reward": 1.148378849029541, |
| "reward_std": 0.8374971151351929, |
| "rewards/accuracy_reward": 0.765625, |
| "rewards/format_reward": 0.8262174725532532, |
| "step": 203 |
| }, |
| { |
| "completion_length": 259.140625, |
| "epoch": 0.29694323144104806, |
| "grad_norm": 1.3834197464383897, |
| "kl": 0.046630859375, |
| "learning_rate": 8.249929615268233e-07, |
| "loss": 0.0032, |
| "reward": 0.3902604579925537, |
| "reward_std": 0.24537065625190735, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.9234635829925537, |
| "step": 204 |
| }, |
| { |
| "completion_length": 309.109375, |
| "epoch": 0.29839883551673946, |
| "grad_norm": 1.2269049796652634, |
| "kl": 0.03857421875, |
| "learning_rate": 8.231969789208845e-07, |
| "loss": -0.0007, |
| "reward": 0.6886327862739563, |
| "reward_std": 1.174818992614746, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.8217057585716248, |
| "step": 205 |
| }, |
| { |
| "completion_length": 285.484375, |
| "epoch": 0.29985443959243085, |
| "grad_norm": 1.2547594996135933, |
| "kl": 0.0458984375, |
| "learning_rate": 8.213938048432696e-07, |
| "loss": 0.0074, |
| "reward": 0.8353320360183716, |
| "reward_std": 0.9169821739196777, |
| "rewards/accuracy_reward": 0.640625, |
| "rewards/format_reward": 0.8625064492225647, |
| "step": 206 |
| }, |
| { |
| "completion_length": 269.609375, |
| "epoch": 0.30131004366812225, |
| "grad_norm": 1.3754430566964044, |
| "kl": 0.046142578125, |
| "learning_rate": 8.195834794164924e-07, |
| "loss": 0.0009, |
| "reward": 1.1299283504486084, |
| "reward_std": 0.8097370862960815, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.91706383228302, |
| "step": 207 |
| }, |
| { |
| "completion_length": 278.21875, |
| "epoch": 0.3027656477438137, |
| "grad_norm": 1.5758484032719748, |
| "kl": 0.045166015625, |
| "learning_rate": 8.17766042922192e-07, |
| "loss": -0.006, |
| "reward": 1.1924219131469727, |
| "reward_std": 0.35831767320632935, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.8785676956176758, |
| "step": 208 |
| }, |
| { |
| "completion_length": 255.671875, |
| "epoch": 0.3042212518195051, |
| "grad_norm": 1.3050143773992107, |
| "kl": 0.0458984375, |
| "learning_rate": 8.15941535800236e-07, |
| "loss": -0.003, |
| "reward": 1.0622721910476685, |
| "reward_std": 0.30105501413345337, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.87548828125, |
| "step": 209 |
| }, |
| { |
| "completion_length": 275.90625, |
| "epoch": 0.3056768558951965, |
| "grad_norm": 1.1249884714957055, |
| "kl": 0.041015625, |
| "learning_rate": 8.141099986478212e-07, |
| "loss": -0.0028, |
| "reward": 1.0719857215881348, |
| "reward_std": 1.051162838935852, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.8484700918197632, |
| "step": 210 |
| }, |
| { |
| "completion_length": 285.9375, |
| "epoch": 0.3071324599708879, |
| "grad_norm": 1.3343580700065274, |
| "kl": 0.049560546875, |
| "learning_rate": 8.122714722185695e-07, |
| "loss": 0.0006, |
| "reward": 1.1121940612792969, |
| "reward_std": 0.45470452308654785, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.8535741567611694, |
| "step": 211 |
| }, |
| { |
| "completion_length": 286.96875, |
| "epoch": 0.30858806404657935, |
| "grad_norm": 1.2721134321125864, |
| "kl": 0.04052734375, |
| "learning_rate": 8.104259974216218e-07, |
| "loss": 0.0011, |
| "reward": 1.3401693105697632, |
| "reward_std": 0.7562965154647827, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.9284895658493042, |
| "step": 212 |
| }, |
| { |
| "completion_length": 263.28125, |
| "epoch": 0.31004366812227074, |
| "grad_norm": 1.2793045432243102, |
| "kl": 0.0419921875, |
| "learning_rate": 8.085736153207276e-07, |
| "loss": -0.0016, |
| "reward": 1.68442702293396, |
| "reward_std": 0.5725850462913513, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 0.8700129985809326, |
| "step": 213 |
| }, |
| { |
| "completion_length": 292.578125, |
| "epoch": 0.31149927219796214, |
| "grad_norm": 1.1759325546130317, |
| "kl": 0.039794921875, |
| "learning_rate": 8.067143671333309e-07, |
| "loss": -0.0033, |
| "reward": 0.6993098855018616, |
| "reward_std": 1.116674542427063, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.8843098878860474, |
| "step": 214 |
| }, |
| { |
| "completion_length": 293.84375, |
| "epoch": 0.3129548762736536, |
| "grad_norm": 1.3260579782974318, |
| "kl": 0.036865234375, |
| "learning_rate": 8.048482942296535e-07, |
| "loss": -0.0036, |
| "reward": 0.4943684935569763, |
| "reward_std": 0.884007453918457, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.8536393642425537, |
| "step": 215 |
| }, |
| { |
| "completion_length": 313.453125, |
| "epoch": 0.314410480349345, |
| "grad_norm": 1.244589688629832, |
| "kl": 0.041259765625, |
| "learning_rate": 8.02975438131774e-07, |
| "loss": 0.0005, |
| "reward": 0.7252734899520874, |
| "reward_std": 1.0751287937164307, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.8259244561195374, |
| "step": 216 |
| }, |
| { |
| "completion_length": 320.90625, |
| "epoch": 0.3158660844250364, |
| "grad_norm": 1.2906749232909462, |
| "kl": 0.0341796875, |
| "learning_rate": 8.010958405127047e-07, |
| "loss": 0.0034, |
| "reward": 1.0169856548309326, |
| "reward_std": 1.000779628753662, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.8830013275146484, |
| "step": 217 |
| }, |
| { |
| "completion_length": 255.71875, |
| "epoch": 0.3173216885007278, |
| "grad_norm": 1.309866431103487, |
| "kl": 0.042236328125, |
| "learning_rate": 7.992095431954634e-07, |
| "loss": 0.0027, |
| "reward": 0.8923372626304626, |
| "reward_std": 0.30078914761543274, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.8763346672058105, |
| "step": 218 |
| }, |
| { |
| "completion_length": 258.109375, |
| "epoch": 0.31877729257641924, |
| "grad_norm": 1.3975143387297508, |
| "kl": 0.040771484375, |
| "learning_rate": 7.973165881521433e-07, |
| "loss": 0.0012, |
| "reward": 1.0036718845367432, |
| "reward_std": 0.9058128595352173, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.8883463740348816, |
| "step": 219 |
| }, |
| { |
| "completion_length": 276.6875, |
| "epoch": 0.32023289665211063, |
| "grad_norm": 1.2837989231665352, |
| "kl": 0.03759765625, |
| "learning_rate": 7.954170175029791e-07, |
| "loss": -0.0022, |
| "reward": 0.8400716185569763, |
| "reward_std": 0.7807621955871582, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.9180272817611694, |
| "step": 220 |
| }, |
| { |
| "completion_length": 280.125, |
| "epoch": 0.32168850072780203, |
| "grad_norm": 1.4163987682065777, |
| "kl": 0.04736328125, |
| "learning_rate": 7.935108735154092e-07, |
| "loss": 0.0044, |
| "reward": 0.7034765481948853, |
| "reward_std": 0.44748401641845703, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.8766406178474426, |
| "step": 221 |
| }, |
| { |
| "completion_length": 277.4375, |
| "epoch": 0.3231441048034934, |
| "grad_norm": 1.2806875439187504, |
| "kl": 0.04150390625, |
| "learning_rate": 7.915981986031366e-07, |
| "loss": 0.0006, |
| "reward": 1.4459569454193115, |
| "reward_std": 0.6417677402496338, |
| "rewards/accuracy_reward": 0.859375, |
| "rewards/format_reward": 0.8500716090202332, |
| "step": 222 |
| }, |
| { |
| "completion_length": 304.625, |
| "epoch": 0.3245997088791849, |
| "grad_norm": 1.1924907907457698, |
| "kl": 0.037353515625, |
| "learning_rate": 7.896790353251835e-07, |
| "loss": 0.0001, |
| "reward": 0.46823570132255554, |
| "reward_std": 0.7769339084625244, |
| "rewards/accuracy_reward": 0.515625, |
| "rewards/format_reward": 0.8649283647537231, |
| "step": 223 |
| }, |
| { |
| "completion_length": 303.1875, |
| "epoch": 0.3260553129548763, |
| "grad_norm": 1.1503304963010326, |
| "kl": 0.03466796875, |
| "learning_rate": 7.877534263849451e-07, |
| "loss": 0.0005, |
| "reward": 1.0099999904632568, |
| "reward_std": 0.964565634727478, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.8617057204246521, |
| "step": 224 |
| }, |
| { |
| "completion_length": 282.5, |
| "epoch": 0.32751091703056767, |
| "grad_norm": 1.1761505740028426, |
| "kl": 0.039306640625, |
| "learning_rate": 7.858214146292393e-07, |
| "loss": -0.0016, |
| "reward": 0.07833331823348999, |
| "reward_std": 1.1625947952270508, |
| "rewards/accuracy_reward": 0.390625, |
| "rewards/format_reward": 0.8088932037353516, |
| "step": 225 |
| }, |
| { |
| "completion_length": 312.515625, |
| "epoch": 0.3289665211062591, |
| "grad_norm": 1.1648396839632396, |
| "kl": 0.0361328125, |
| "learning_rate": 7.838830430473538e-07, |
| "loss": 0.001, |
| "reward": -0.08507812023162842, |
| "reward_std": 0.9128743410110474, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.8148698210716248, |
| "step": 226 |
| }, |
| { |
| "completion_length": 306.5, |
| "epoch": 0.3304221251819505, |
| "grad_norm": 0.9816488205657019, |
| "kl": 0.036376953125, |
| "learning_rate": 7.819383547700889e-07, |
| "loss": 0.0021, |
| "reward": 0.7689843773841858, |
| "reward_std": 1.1138982772827148, |
| "rewards/accuracy_reward": 0.640625, |
| "rewards/format_reward": 0.8328125476837158, |
| "step": 227 |
| }, |
| { |
| "completion_length": 266.03125, |
| "epoch": 0.3318777292576419, |
| "grad_norm": 1.3503817696996208, |
| "kl": 0.048828125, |
| "learning_rate": 7.799873930687977e-07, |
| "loss": 0.002, |
| "reward": 0.3949218690395355, |
| "reward_std": 0.8604871034622192, |
| "rewards/accuracy_reward": 0.484375, |
| "rewards/format_reward": 0.8740885257720947, |
| "step": 228 |
| }, |
| { |
| "completion_length": 294.15625, |
| "epoch": 0.3333333333333333, |
| "grad_norm": 1.0990493246122626, |
| "kl": 0.03369140625, |
| "learning_rate": 7.780302013544238e-07, |
| "loss": 0.0002, |
| "reward": 1.1642253398895264, |
| "reward_std": 0.8671401143074036, |
| "rewards/accuracy_reward": 0.765625, |
| "rewards/format_reward": 0.8458007574081421, |
| "step": 229 |
| }, |
| { |
| "completion_length": 321.265625, |
| "epoch": 0.33478893740902477, |
| "grad_norm": 0.9938258028137048, |
| "kl": 0.037841796875, |
| "learning_rate": 7.760668231765351e-07, |
| "loss": -0.0014, |
| "reward": 0.6853646039962769, |
| "reward_std": 1.0251922607421875, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.8308333158493042, |
| "step": 230 |
| }, |
| { |
| "completion_length": 309.53125, |
| "epoch": 0.33624454148471616, |
| "grad_norm": 1.123171517093054, |
| "kl": 0.037353515625, |
| "learning_rate": 7.740973022223549e-07, |
| "loss": -0.0024, |
| "reward": 1.0861718654632568, |
| "reward_std": 1.1104838848114014, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.8241406083106995, |
| "step": 231 |
| }, |
| { |
| "completion_length": 309.796875, |
| "epoch": 0.33770014556040756, |
| "grad_norm": 1.0934016007582914, |
| "kl": 0.033935546875, |
| "learning_rate": 7.721216823157894e-07, |
| "loss": -0.0063, |
| "reward": 1.000657558441162, |
| "reward_std": 0.6190701127052307, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.875084638595581, |
| "step": 232 |
| }, |
| { |
| "completion_length": 277.453125, |
| "epoch": 0.33915574963609896, |
| "grad_norm": 1.4389877479066715, |
| "kl": 0.035400390625, |
| "learning_rate": 7.701400074164535e-07, |
| "loss": -0.0021, |
| "reward": 1.146998643875122, |
| "reward_std": 0.6130830645561218, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.8996028900146484, |
| "step": 233 |
| }, |
| { |
| "completion_length": 292.296875, |
| "epoch": 0.3406113537117904, |
| "grad_norm": 1.254688662869592, |
| "kl": 0.031982421875, |
| "learning_rate": 7.681523216186911e-07, |
| "loss": -0.0019, |
| "reward": 1.3471484184265137, |
| "reward_std": 0.8775838613510132, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.92557293176651, |
| "step": 234 |
| }, |
| { |
| "completion_length": 277.265625, |
| "epoch": 0.3420669577874818, |
| "grad_norm": 1.1736210621507872, |
| "kl": 0.037841796875, |
| "learning_rate": 7.661586691505961e-07, |
| "loss": 0.0023, |
| "reward": 0.8513671159744263, |
| "reward_std": 0.8359072208404541, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.9101953506469727, |
| "step": 235 |
| }, |
| { |
| "completion_length": 262.328125, |
| "epoch": 0.3435225618631732, |
| "grad_norm": 1.2639347156124685, |
| "kl": 0.03857421875, |
| "learning_rate": 7.641590943730258e-07, |
| "loss": 0.0009, |
| "reward": 1.421054720878601, |
| "reward_std": 0.36540117859840393, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.9068489670753479, |
| "step": 236 |
| }, |
| { |
| "completion_length": 289.109375, |
| "epoch": 0.34497816593886466, |
| "grad_norm": 1.2779446490859168, |
| "kl": 0.0419921875, |
| "learning_rate": 7.621536417786158e-07, |
| "loss": 0.0021, |
| "reward": 0.836510419845581, |
| "reward_std": 0.8601652383804321, |
| "rewards/accuracy_reward": 0.640625, |
| "rewards/format_reward": 0.8924999833106995, |
| "step": 237 |
| }, |
| { |
| "completion_length": 287.0625, |
| "epoch": 0.34643377001455605, |
| "grad_norm": 1.255028005763755, |
| "kl": 0.033935546875, |
| "learning_rate": 7.601423559907894e-07, |
| "loss": 0.0035, |
| "reward": 0.6539322733879089, |
| "reward_std": 0.5522056818008423, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.9175911545753479, |
| "step": 238 |
| }, |
| { |
| "completion_length": 289.578125, |
| "epoch": 0.34788937409024745, |
| "grad_norm": 1.1981964024570817, |
| "kl": 0.035400390625, |
| "learning_rate": 7.581252817627644e-07, |
| "loss": -0.0022, |
| "reward": 1.0136327743530273, |
| "reward_std": 0.5532514452934265, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.8591275215148926, |
| "step": 239 |
| }, |
| { |
| "completion_length": 294.765625, |
| "epoch": 0.34934497816593885, |
| "grad_norm": 1.3473651417615715, |
| "kl": 0.039306640625, |
| "learning_rate": 7.561024639765571e-07, |
| "loss": -0.0036, |
| "reward": 0.2712695300579071, |
| "reward_std": 0.544538676738739, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.8741991519927979, |
| "step": 240 |
| }, |
| { |
| "completion_length": 294.953125, |
| "epoch": 0.3508005822416303, |
| "grad_norm": 1.2518247684034733, |
| "kl": 0.039306640625, |
| "learning_rate": 7.540739476419846e-07, |
| "loss": -0.0002, |
| "reward": 0.4361327886581421, |
| "reward_std": 0.4436946213245392, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.8888281583786011, |
| "step": 241 |
| }, |
| { |
| "completion_length": 285.640625, |
| "epoch": 0.3522561863173217, |
| "grad_norm": 1.3471385559147369, |
| "kl": 0.037109375, |
| "learning_rate": 7.520397778956622e-07, |
| "loss": 0.0002, |
| "reward": 0.8719531297683716, |
| "reward_std": 0.7764471769332886, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.9443880319595337, |
| "step": 242 |
| }, |
| { |
| "completion_length": 278.0625, |
| "epoch": 0.3537117903930131, |
| "grad_norm": 1.245756955751304, |
| "kl": 0.0458984375, |
| "learning_rate": 7.5e-07, |
| "loss": 0.0024, |
| "reward": 0.9515169262886047, |
| "reward_std": 0.46474969387054443, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.8945116400718689, |
| "step": 243 |
| }, |
| { |
| "completion_length": 273.65625, |
| "epoch": 0.3551673944687045, |
| "grad_norm": 1.3186357128195956, |
| "kl": 0.041015625, |
| "learning_rate": 7.479546593421947e-07, |
| "loss": 0.0008, |
| "reward": 0.5497395992279053, |
| "reward_std": 0.9145887494087219, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.9244270324707031, |
| "step": 244 |
| }, |
| { |
| "completion_length": 302.703125, |
| "epoch": 0.35662299854439594, |
| "grad_norm": 1.1625630519045944, |
| "kl": 0.035400390625, |
| "learning_rate": 7.459038014332209e-07, |
| "loss": 0.0016, |
| "reward": 0.05195310711860657, |
| "reward_std": 0.509819507598877, |
| "rewards/accuracy_reward": 0.359375, |
| "rewards/format_reward": 0.9129297733306885, |
| "step": 245 |
| }, |
| { |
| "completion_length": 311.40625, |
| "epoch": 0.35807860262008734, |
| "grad_norm": 1.3125269152541976, |
| "kl": 0.0380859375, |
| "learning_rate": 7.438474719068173e-07, |
| "loss": -0.0042, |
| "reward": 1.84199857711792, |
| "reward_std": 0.3553314805030823, |
| "rewards/accuracy_reward": 0.96875, |
| "rewards/format_reward": 0.9321680068969727, |
| "step": 246 |
| }, |
| { |
| "completion_length": 295.4375, |
| "epoch": 0.35953420669577874, |
| "grad_norm": 1.2573734594613415, |
| "kl": 0.036865234375, |
| "learning_rate": 7.417857165184723e-07, |
| "loss": 0.0018, |
| "reward": 0.6018099188804626, |
| "reward_std": 0.4691210389137268, |
| "rewards/accuracy_reward": 0.546875, |
| "rewards/format_reward": 0.9123698472976685, |
| "step": 247 |
| }, |
| { |
| "completion_length": 306.125, |
| "epoch": 0.3609898107714702, |
| "grad_norm": 1.2433464807920045, |
| "kl": 0.03564453125, |
| "learning_rate": 7.397185811444049e-07, |
| "loss": 0.0005, |
| "reward": 0.889244794845581, |
| "reward_std": 0.806644856929779, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.8913020491600037, |
| "step": 248 |
| }, |
| { |
| "completion_length": 315.3125, |
| "epoch": 0.3624454148471616, |
| "grad_norm": 1.1237789290801656, |
| "kl": 0.042236328125, |
| "learning_rate": 7.376461117805449e-07, |
| "loss": -0.001, |
| "reward": 1.3869401216506958, |
| "reward_std": 0.9981783628463745, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.8924478888511658, |
| "step": 249 |
| }, |
| { |
| "completion_length": 299.75, |
| "epoch": 0.363901018922853, |
| "grad_norm": 1.1632799586225018, |
| "kl": 0.03759765625, |
| "learning_rate": 7.355683545415089e-07, |
| "loss": 0.0006, |
| "reward": 0.85239577293396, |
| "reward_std": 0.6559892892837524, |
| "rewards/accuracy_reward": 0.640625, |
| "rewards/format_reward": 0.8976563215255737, |
| "step": 250 |
| }, |
| { |
| "completion_length": 279.09375, |
| "epoch": 0.3653566229985444, |
| "grad_norm": 1.1514063608017875, |
| "kl": 0.04248046875, |
| "learning_rate": 7.33485355659574e-07, |
| "loss": -0.0002, |
| "reward": 0.6367447972297668, |
| "reward_std": 1.1905419826507568, |
| "rewards/accuracy_reward": 0.546875, |
| "rewards/format_reward": 0.913867175579071, |
| "step": 251 |
| }, |
| { |
| "completion_length": 319.765625, |
| "epoch": 0.36681222707423583, |
| "grad_norm": 0.9469865528331761, |
| "kl": 0.0361328125, |
| "learning_rate": 7.313971614836495e-07, |
| "loss": -0.0009, |
| "reward": 0.50822913646698, |
| "reward_std": 0.788324236869812, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.8841667175292969, |
| "step": 252 |
| }, |
| { |
| "completion_length": 321.515625, |
| "epoch": 0.3682678311499272, |
| "grad_norm": 1.1463942286208362, |
| "kl": 0.038818359375, |
| "learning_rate": 7.293038184782454e-07, |
| "loss": -0.0042, |
| "reward": 1.2917838096618652, |
| "reward_std": 0.6375994086265564, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.89473956823349, |
| "step": 253 |
| }, |
| { |
| "completion_length": 341.265625, |
| "epoch": 0.3697234352256186, |
| "grad_norm": 0.9758868458165839, |
| "kl": 0.032958984375, |
| "learning_rate": 7.272053732224387e-07, |
| "loss": 0.004, |
| "reward": -0.20222003757953644, |
| "reward_std": 0.5149335265159607, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/format_reward": 0.8958659172058105, |
| "step": 254 |
| }, |
| { |
| "completion_length": 332.5625, |
| "epoch": 0.37117903930131, |
| "grad_norm": 1.1030567827011573, |
| "kl": 0.039794921875, |
| "learning_rate": 7.251018724088366e-07, |
| "loss": 0.0035, |
| "reward": 0.6221875548362732, |
| "reward_std": 1.2777836322784424, |
| "rewards/accuracy_reward": 0.578125, |
| "rewards/format_reward": 0.87088543176651, |
| "step": 255 |
| }, |
| { |
| "completion_length": 296.59375, |
| "epoch": 0.3726346433770015, |
| "grad_norm": 1.1648513737927273, |
| "kl": 0.042724609375, |
| "learning_rate": 7.22993362842538e-07, |
| "loss": 0.0038, |
| "reward": 0.18544921278953552, |
| "reward_std": 0.922585666179657, |
| "rewards/accuracy_reward": 0.421875, |
| "rewards/format_reward": 0.8398241996765137, |
| "step": 256 |
| }, |
| { |
| "completion_length": 325.078125, |
| "epoch": 0.37409024745269287, |
| "grad_norm": 1.063758965452858, |
| "kl": 0.03662109375, |
| "learning_rate": 7.208798914400915e-07, |
| "loss": 0.0011, |
| "reward": 0.6968294382095337, |
| "reward_std": 1.144836187362671, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.8460221290588379, |
| "step": 257 |
| }, |
| { |
| "completion_length": 315.84375, |
| "epoch": 0.37554585152838427, |
| "grad_norm": 1.1772919287263088, |
| "kl": 0.036376953125, |
| "learning_rate": 7.187615052284521e-07, |
| "loss": -0.0012, |
| "reward": 1.352858066558838, |
| "reward_std": 0.5259904861450195, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.9533268213272095, |
| "step": 258 |
| }, |
| { |
| "completion_length": 317.984375, |
| "epoch": 0.37700145560407566, |
| "grad_norm": 1.0657146598710652, |
| "kl": 0.03759765625, |
| "learning_rate": 7.166382513439343e-07, |
| "loss": 0.0, |
| "reward": 0.8918294310569763, |
| "reward_std": 0.6458997130393982, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.8966862559318542, |
| "step": 259 |
| }, |
| { |
| "completion_length": 319.5, |
| "epoch": 0.3784570596797671, |
| "grad_norm": 1.074343844728951, |
| "kl": 0.037841796875, |
| "learning_rate": 7.145101770311633e-07, |
| "loss": -0.0006, |
| "reward": 0.8149153590202332, |
| "reward_std": 1.0369267463684082, |
| "rewards/accuracy_reward": 0.640625, |
| "rewards/format_reward": 0.86767578125, |
| "step": 260 |
| }, |
| { |
| "completion_length": 308.359375, |
| "epoch": 0.3799126637554585, |
| "grad_norm": 1.291348430451882, |
| "kl": 0.03662109375, |
| "learning_rate": 7.12377329642024e-07, |
| "loss": 0.0032, |
| "reward": 1.153606653213501, |
| "reward_std": 0.9500166773796082, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.8819010853767395, |
| "step": 261 |
| }, |
| { |
| "completion_length": 325.78125, |
| "epoch": 0.3813682678311499, |
| "grad_norm": 1.0948918878420022, |
| "kl": 0.0380859375, |
| "learning_rate": 7.102397566346072e-07, |
| "loss": -0.0025, |
| "reward": 0.6883333325386047, |
| "reward_std": 0.6285832524299622, |
| "rewards/accuracy_reward": 0.578125, |
| "rewards/format_reward": 0.9229167103767395, |
| "step": 262 |
| }, |
| { |
| "completion_length": 303.671875, |
| "epoch": 0.38282387190684136, |
| "grad_norm": 1.2246966458821558, |
| "kl": 0.039306640625, |
| "learning_rate": 7.080975055721537e-07, |
| "loss": 0.0028, |
| "reward": 1.2784569263458252, |
| "reward_std": 0.6068631410598755, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 0.9090169072151184, |
| "step": 263 |
| }, |
| { |
| "completion_length": 301.515625, |
| "epoch": 0.38427947598253276, |
| "grad_norm": 1.274689095724219, |
| "kl": 0.034423828125, |
| "learning_rate": 7.059506241219964e-07, |
| "loss": 0.0038, |
| "reward": 0.5575065016746521, |
| "reward_std": 0.2938391864299774, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.9139648079872131, |
| "step": 264 |
| }, |
| { |
| "completion_length": 317.65625, |
| "epoch": 0.38573508005822416, |
| "grad_norm": 1.1114171001211495, |
| "kl": 0.037353515625, |
| "learning_rate": 7.037991600544982e-07, |
| "loss": 0.0001, |
| "reward": 0.32826173305511475, |
| "reward_std": 1.0576179027557373, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.8831444978713989, |
| "step": 265 |
| }, |
| { |
| "completion_length": 307.125, |
| "epoch": 0.38719068413391555, |
| "grad_norm": 1.0365067938257357, |
| "kl": 0.03271484375, |
| "learning_rate": 7.016431612419906e-07, |
| "loss": 0.0014, |
| "reward": 0.7509114742279053, |
| "reward_std": 1.0201146602630615, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.8540234565734863, |
| "step": 266 |
| }, |
| { |
| "completion_length": 315.0625, |
| "epoch": 0.388646288209607, |
| "grad_norm": 1.1542972359276211, |
| "kl": 0.039794921875, |
| "learning_rate": 6.994826756577081e-07, |
| "loss": 0.0028, |
| "reward": 0.3422461152076721, |
| "reward_std": 0.7236604690551758, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.8930273652076721, |
| "step": 267 |
| }, |
| { |
| "completion_length": 292.453125, |
| "epoch": 0.3901018922852984, |
| "grad_norm": 1.247691385135112, |
| "kl": 0.03759765625, |
| "learning_rate": 6.973177513747204e-07, |
| "loss": 0.004, |
| "reward": 1.1121549606323242, |
| "reward_std": 0.2723959684371948, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.9376627206802368, |
| "step": 268 |
| }, |
| { |
| "completion_length": 327.25, |
| "epoch": 0.3915574963609898, |
| "grad_norm": 1.1547141576656188, |
| "kl": 0.03662109375, |
| "learning_rate": 6.951484365648627e-07, |
| "loss": 0.0021, |
| "reward": 0.8944270610809326, |
| "reward_std": 0.8052605390548706, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.8651041984558105, |
| "step": 269 |
| }, |
| { |
| "completion_length": 303.03125, |
| "epoch": 0.3930131004366812, |
| "grad_norm": 1.3851574923540422, |
| "kl": 0.03955078125, |
| "learning_rate": 6.929747794976643e-07, |
| "loss": -0.0045, |
| "reward": 1.0955729484558105, |
| "reward_std": 0.38242411613464355, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.9138151407241821, |
| "step": 270 |
| }, |
| { |
| "completion_length": 271.625, |
| "epoch": 0.39446870451237265, |
| "grad_norm": 1.3401818240693362, |
| "kl": 0.037353515625, |
| "learning_rate": 6.907968285392743e-07, |
| "loss": -0.0006, |
| "reward": 1.7740495204925537, |
| "reward_std": 0.47597813606262207, |
| "rewards/accuracy_reward": 0.953125, |
| "rewards/format_reward": 0.9092708826065063, |
| "step": 271 |
| }, |
| { |
| "completion_length": 294.453125, |
| "epoch": 0.39592430858806404, |
| "grad_norm": 1.2186833540315785, |
| "kl": 0.036865234375, |
| "learning_rate": 6.886146321513849e-07, |
| "loss": -0.0039, |
| "reward": 0.8727213740348816, |
| "reward_std": 0.7979456186294556, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.86697918176651, |
| "step": 272 |
| }, |
| { |
| "completion_length": 299.578125, |
| "epoch": 0.39737991266375544, |
| "grad_norm": 1.1830905690032625, |
| "kl": 0.0390625, |
| "learning_rate": 6.864282388901543e-07, |
| "loss": -0.0064, |
| "reward": 0.28068357706069946, |
| "reward_std": 0.5684016942977905, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.9229100942611694, |
| "step": 273 |
| }, |
| { |
| "completion_length": 298.40625, |
| "epoch": 0.3988355167394469, |
| "grad_norm": 1.1221914279750511, |
| "kl": 0.035888671875, |
| "learning_rate": 6.84237697405125e-07, |
| "loss": -0.003, |
| "reward": 1.0086263418197632, |
| "reward_std": 0.8679967522621155, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.8907877206802368, |
| "step": 274 |
| }, |
| { |
| "completion_length": 326.9375, |
| "epoch": 0.4002911208151383, |
| "grad_norm": 1.0580769064192943, |
| "kl": 0.0400390625, |
| "learning_rate": 6.820430564381419e-07, |
| "loss": -0.0007, |
| "reward": 0.6745051741600037, |
| "reward_std": 0.84361732006073, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.8712891340255737, |
| "step": 275 |
| }, |
| { |
| "completion_length": 293.09375, |
| "epoch": 0.4017467248908297, |
| "grad_norm": 1.2858645148857393, |
| "kl": 0.04541015625, |
| "learning_rate": 6.79844364822268e-07, |
| "loss": -0.0051, |
| "reward": 0.7683008313179016, |
| "reward_std": 0.6962195634841919, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.893470048904419, |
| "step": 276 |
| }, |
| { |
| "completion_length": 328.234375, |
| "epoch": 0.4032023289665211, |
| "grad_norm": 1.0468195864436123, |
| "kl": 0.038818359375, |
| "learning_rate": 6.776416714806969e-07, |
| "loss": -0.0035, |
| "reward": 1.0561068058013916, |
| "reward_std": 1.1647756099700928, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.8450260162353516, |
| "step": 277 |
| }, |
| { |
| "completion_length": 309.640625, |
| "epoch": 0.40465793304221254, |
| "grad_norm": 1.3703569385022722, |
| "kl": 0.042724609375, |
| "learning_rate": 6.754350254256652e-07, |
| "loss": 0.003, |
| "reward": 1.3729296922683716, |
| "reward_std": 0.5724613070487976, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 0.9223437309265137, |
| "step": 278 |
| }, |
| { |
| "completion_length": 299.453125, |
| "epoch": 0.40611353711790393, |
| "grad_norm": 1.1513843198550995, |
| "kl": 0.0458984375, |
| "learning_rate": 6.732244757573618e-07, |
| "loss": 0.0021, |
| "reward": 0.58970046043396, |
| "reward_std": 0.6756365299224854, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.8674869537353516, |
| "step": 279 |
| }, |
| { |
| "completion_length": 285.890625, |
| "epoch": 0.40756914119359533, |
| "grad_norm": 1.293846147841921, |
| "kl": 0.0390625, |
| "learning_rate": 6.710100716628344e-07, |
| "loss": 0.0049, |
| "reward": 0.7233072519302368, |
| "reward_std": 0.29892754554748535, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.8979557752609253, |
| "step": 280 |
| }, |
| { |
| "completion_length": 266.390625, |
| "epoch": 0.4090247452692867, |
| "grad_norm": 1.2289155325973107, |
| "kl": 0.041259765625, |
| "learning_rate": 6.687918624148963e-07, |
| "loss": -0.0007, |
| "reward": 0.04064452648162842, |
| "reward_std": 0.2825842499732971, |
| "rewards/accuracy_reward": 0.359375, |
| "rewards/format_reward": 0.8914909362792969, |
| "step": 281 |
| }, |
| { |
| "completion_length": 297.8125, |
| "epoch": 0.4104803493449782, |
| "grad_norm": 1.1879387558526018, |
| "kl": 0.037109375, |
| "learning_rate": 6.665698973710288e-07, |
| "loss": -0.0028, |
| "reward": 0.9278711080551147, |
| "reward_std": 0.7110856771469116, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.8445507884025574, |
| "step": 282 |
| }, |
| { |
| "completion_length": 300.5625, |
| "epoch": 0.4119359534206696, |
| "grad_norm": 1.1205832368306863, |
| "kl": 0.03759765625, |
| "learning_rate": 6.643442259722845e-07, |
| "loss": -0.0021, |
| "reward": 0.5016796588897705, |
| "reward_std": 0.8245861530303955, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.8688150644302368, |
| "step": 283 |
| }, |
| { |
| "completion_length": 291.46875, |
| "epoch": 0.413391557496361, |
| "grad_norm": 1.1232126206960045, |
| "kl": 0.040771484375, |
| "learning_rate": 6.621148977421855e-07, |
| "loss": 0.0054, |
| "reward": 1.0367382764816284, |
| "reward_std": 0.8832352757453918, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.9021419286727905, |
| "step": 284 |
| }, |
| { |
| "completion_length": 288.328125, |
| "epoch": 0.4148471615720524, |
| "grad_norm": 1.186415542575965, |
| "kl": 0.035888671875, |
| "learning_rate": 6.598819622856226e-07, |
| "loss": -0.0035, |
| "reward": 1.6761784553527832, |
| "reward_std": 0.69477379322052, |
| "rewards/accuracy_reward": 0.921875, |
| "rewards/format_reward": 0.9105534553527832, |
| "step": 285 |
| }, |
| { |
| "completion_length": 292.609375, |
| "epoch": 0.4163027656477438, |
| "grad_norm": 1.2789098266158088, |
| "kl": 0.043212890625, |
| "learning_rate": 6.576454692877512e-07, |
| "loss": 0.0009, |
| "reward": 1.4440560340881348, |
| "reward_std": 0.40633606910705566, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 0.9008529186248779, |
| "step": 286 |
| }, |
| { |
| "completion_length": 300.9375, |
| "epoch": 0.4177583697234352, |
| "grad_norm": 1.1125232437916257, |
| "kl": 0.039306640625, |
| "learning_rate": 6.554054685128856e-07, |
| "loss": 0.003, |
| "reward": 0.5055012702941895, |
| "reward_std": 0.5191164612770081, |
| "rewards/accuracy_reward": 0.515625, |
| "rewards/format_reward": 0.9176367521286011, |
| "step": 287 |
| }, |
| { |
| "completion_length": 305.1875, |
| "epoch": 0.4192139737991266, |
| "grad_norm": 1.3508336793233238, |
| "kl": 0.042236328125, |
| "learning_rate": 6.531620098033918e-07, |
| "loss": -0.0003, |
| "reward": 0.727037787437439, |
| "reward_std": 0.7846024036407471, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.9105533957481384, |
| "step": 288 |
| }, |
| { |
| "completion_length": 286.953125, |
| "epoch": 0.42066957787481807, |
| "grad_norm": 1.355472255753869, |
| "kl": 0.0419921875, |
| "learning_rate": 6.509151430785785e-07, |
| "loss": 0.0057, |
| "reward": 1.5655077695846558, |
| "reward_std": 0.363979309797287, |
| "rewards/accuracy_reward": 0.875, |
| "rewards/format_reward": 0.9299349188804626, |
| "step": 289 |
| }, |
| { |
| "completion_length": 324.296875, |
| "epoch": 0.42212518195050946, |
| "grad_norm": 1.0617875027109431, |
| "kl": 0.042236328125, |
| "learning_rate": 6.486649183335862e-07, |
| "loss": 0.0028, |
| "reward": 0.3495572805404663, |
| "reward_std": 0.61403489112854, |
| "rewards/accuracy_reward": 0.484375, |
| "rewards/format_reward": 0.8596354126930237, |
| "step": 290 |
| }, |
| { |
| "completion_length": 303.375, |
| "epoch": 0.42358078602620086, |
| "grad_norm": 1.2306825555758596, |
| "kl": 0.040283203125, |
| "learning_rate": 6.464113856382751e-07, |
| "loss": 0.0013, |
| "reward": 0.9183528423309326, |
| "reward_std": 0.4856081008911133, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.88413405418396, |
| "step": 291 |
| }, |
| { |
| "completion_length": 339.09375, |
| "epoch": 0.42503639010189226, |
| "grad_norm": 0.9937441597737549, |
| "kl": 0.035400390625, |
| "learning_rate": 6.441545951361109e-07, |
| "loss": -0.0029, |
| "reward": 0.5682356357574463, |
| "reward_std": 1.1054012775421143, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.8453580737113953, |
| "step": 292 |
| }, |
| { |
| "completion_length": 325.109375, |
| "epoch": 0.4264919941775837, |
| "grad_norm": 1.1320952087408245, |
| "kl": 0.03857421875, |
| "learning_rate": 6.418945970430485e-07, |
| "loss": -0.0018, |
| "reward": 1.6307356357574463, |
| "reward_std": 0.8666630983352661, |
| "rewards/accuracy_reward": 0.90625, |
| "rewards/format_reward": 0.9119856357574463, |
| "step": 293 |
| }, |
| { |
| "completion_length": 312.953125, |
| "epoch": 0.4279475982532751, |
| "grad_norm": 1.0902739141585274, |
| "kl": 0.042236328125, |
| "learning_rate": 6.39631441646415e-07, |
| "loss": -0.003, |
| "reward": 0.8973242044448853, |
| "reward_std": 0.8014050722122192, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.8655273914337158, |
| "step": 294 |
| }, |
| { |
| "completion_length": 317.765625, |
| "epoch": 0.4294032023289665, |
| "grad_norm": 1.1492762649925403, |
| "kl": 0.04052734375, |
| "learning_rate": 6.373651793037916e-07, |
| "loss": -0.001, |
| "reward": 0.6366666555404663, |
| "reward_std": 0.6303349733352661, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.9156770706176758, |
| "step": 295 |
| }, |
| { |
| "completion_length": 303.640625, |
| "epoch": 0.43085880640465796, |
| "grad_norm": 1.1237468628260645, |
| "kl": 0.044921875, |
| "learning_rate": 6.35095860441891e-07, |
| "loss": 0.0027, |
| "reward": 0.8331836462020874, |
| "reward_std": 0.38058170676231384, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.9308528900146484, |
| "step": 296 |
| }, |
| { |
| "completion_length": 309.765625, |
| "epoch": 0.43231441048034935, |
| "grad_norm": 1.079089420677735, |
| "kl": 0.041015625, |
| "learning_rate": 6.328235355554381e-07, |
| "loss": 0.0039, |
| "reward": 1.14655601978302, |
| "reward_std": 0.9032782912254333, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.9077669382095337, |
| "step": 297 |
| }, |
| { |
| "completion_length": 327.1875, |
| "epoch": 0.43377001455604075, |
| "grad_norm": 1.1012850272675234, |
| "kl": 0.041015625, |
| "learning_rate": 6.305482552060441e-07, |
| "loss": -0.0018, |
| "reward": 0.41550779342651367, |
| "reward_std": 0.9270626306533813, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.8848437070846558, |
| "step": 298 |
| }, |
| { |
| "completion_length": 321.90625, |
| "epoch": 0.43522561863173215, |
| "grad_norm": 1.1334862363298608, |
| "kl": 0.0439453125, |
| "learning_rate": 6.282700700210826e-07, |
| "loss": 0.0016, |
| "reward": 0.8490754961967468, |
| "reward_std": 0.8852044939994812, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.950390636920929, |
| "step": 299 |
| }, |
| { |
| "completion_length": 319.46875, |
| "epoch": 0.4366812227074236, |
| "grad_norm": 1.3237664738865698, |
| "kl": 0.04052734375, |
| "learning_rate": 6.259890306925626e-07, |
| "loss": 0.0024, |
| "reward": 1.1324349641799927, |
| "reward_std": 0.6891584992408752, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.9130728840827942, |
| "step": 300 |
| }, |
| { |
| "completion_length": 340.21875, |
| "epoch": 0.438136826783115, |
| "grad_norm": 0.925628291932368, |
| "kl": 0.039794921875, |
| "learning_rate": 6.237051879760013e-07, |
| "loss": -0.0004, |
| "reward": -0.13870440423488617, |
| "reward_std": 0.4732888340950012, |
| "rewards/accuracy_reward": 0.328125, |
| "rewards/format_reward": 0.8486002683639526, |
| "step": 301 |
| }, |
| { |
| "completion_length": 293.5, |
| "epoch": 0.4395924308588064, |
| "grad_norm": 1.3848838366572507, |
| "kl": 0.048583984375, |
| "learning_rate": 6.214185926892935e-07, |
| "loss": 0.0011, |
| "reward": 0.7430468797683716, |
| "reward_std": 0.5002673864364624, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.9450520873069763, |
| "step": 302 |
| }, |
| { |
| "completion_length": 301.5, |
| "epoch": 0.4410480349344978, |
| "grad_norm": 1.1287435583620058, |
| "kl": 0.057373046875, |
| "learning_rate": 6.191292957115824e-07, |
| "loss": 0.0004, |
| "reward": 1.1467642784118652, |
| "reward_std": 0.2716268002986908, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.9220898151397705, |
| "step": 303 |
| }, |
| { |
| "completion_length": 313.65625, |
| "epoch": 0.44250363901018924, |
| "grad_norm": 1.1031415899812522, |
| "kl": 0.0419921875, |
| "learning_rate": 6.168373479821263e-07, |
| "loss": 0.0023, |
| "reward": 0.6278645992279053, |
| "reward_std": 0.4927278757095337, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.9092448353767395, |
| "step": 304 |
| }, |
| { |
| "completion_length": 324.65625, |
| "epoch": 0.44395924308588064, |
| "grad_norm": 1.1753628489624894, |
| "kl": 0.04296875, |
| "learning_rate": 6.145428004991649e-07, |
| "loss": 0.0031, |
| "reward": 1.2421419620513916, |
| "reward_std": 0.5803054571151733, |
| "rewards/accuracy_reward": 0.765625, |
| "rewards/format_reward": 0.93757164478302, |
| "step": 305 |
| }, |
| { |
| "completion_length": 329.421875, |
| "epoch": 0.44541484716157204, |
| "grad_norm": 1.0124718909261303, |
| "kl": 0.042724609375, |
| "learning_rate": 6.122457043187862e-07, |
| "loss": 0.0026, |
| "reward": 1.2116667032241821, |
| "reward_std": 0.6427367925643921, |
| "rewards/accuracy_reward": 0.765625, |
| "rewards/format_reward": 0.90095055103302, |
| "step": 306 |
| }, |
| { |
| "completion_length": 308.265625, |
| "epoch": 0.4468704512372635, |
| "grad_norm": 1.0168849937197908, |
| "kl": 0.047607421875, |
| "learning_rate": 6.099461105537888e-07, |
| "loss": -0.0027, |
| "reward": 0.7654426693916321, |
| "reward_std": 0.5294202566146851, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.9123828411102295, |
| "step": 307 |
| }, |
| { |
| "completion_length": 312.984375, |
| "epoch": 0.4483260553129549, |
| "grad_norm": 1.304050444945634, |
| "kl": 0.04443359375, |
| "learning_rate": 6.076440703725452e-07, |
| "loss": -0.0045, |
| "reward": 0.4798697829246521, |
| "reward_std": 0.11288902163505554, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.9344010353088379, |
| "step": 308 |
| }, |
| { |
| "completion_length": 316.625, |
| "epoch": 0.4497816593886463, |
| "grad_norm": 1.0318927611491964, |
| "kl": 0.047607421875, |
| "learning_rate": 6.053396349978631e-07, |
| "loss": -0.0003, |
| "reward": 0.36076819896698, |
| "reward_std": 0.37749600410461426, |
| "rewards/accuracy_reward": 0.484375, |
| "rewards/format_reward": 0.8674870133399963, |
| "step": 309 |
| }, |
| { |
| "completion_length": 321.265625, |
| "epoch": 0.4512372634643377, |
| "grad_norm": 1.054780956617541, |
| "kl": 0.0517578125, |
| "learning_rate": 6.030328557058463e-07, |
| "loss": 0.0008, |
| "reward": 0.5132877826690674, |
| "reward_std": 0.8486453294754028, |
| "rewards/accuracy_reward": 0.546875, |
| "rewards/format_reward": 0.8326627612113953, |
| "step": 310 |
| }, |
| { |
| "completion_length": 321.21875, |
| "epoch": 0.45269286754002913, |
| "grad_norm": 1.1712340167228774, |
| "kl": 0.045654296875, |
| "learning_rate": 6.007237838247525e-07, |
| "loss": 0.0004, |
| "reward": 1.3080989122390747, |
| "reward_std": 0.7990570068359375, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.91447913646698, |
| "step": 311 |
| }, |
| { |
| "completion_length": 306.953125, |
| "epoch": 0.45414847161572053, |
| "grad_norm": 1.2366218066275552, |
| "kl": 0.050048828125, |
| "learning_rate": 5.984124707338527e-07, |
| "loss": -0.0022, |
| "reward": 1.22621750831604, |
| "reward_std": 0.586514413356781, |
| "rewards/accuracy_reward": 0.765625, |
| "rewards/format_reward": 0.9193814396858215, |
| "step": 312 |
| }, |
| { |
| "completion_length": 290.796875, |
| "epoch": 0.4556040756914119, |
| "grad_norm": 1.4431350657604642, |
| "kl": 0.051513671875, |
| "learning_rate": 5.960989678622864e-07, |
| "loss": 0.0034, |
| "reward": 1.7498502731323242, |
| "reward_std": 0.417039155960083, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 0.9267643094062805, |
| "step": 313 |
| }, |
| { |
| "completion_length": 324.75, |
| "epoch": 0.4570596797671033, |
| "grad_norm": 1.2410969845454607, |
| "kl": 0.0419921875, |
| "learning_rate": 5.937833266879186e-07, |
| "loss": -0.0065, |
| "reward": 1.30293607711792, |
| "reward_std": 1.056250810623169, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.9061523675918579, |
| "step": 314 |
| }, |
| { |
| "completion_length": 299.71875, |
| "epoch": 0.4585152838427948, |
| "grad_norm": 1.206619606919444, |
| "kl": 0.049072265625, |
| "learning_rate": 5.914655987361933e-07, |
| "loss": 0.0001, |
| "reward": 1.1884570121765137, |
| "reward_std": 0.744032621383667, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 0.8389909267425537, |
| "step": 315 |
| }, |
| { |
| "completion_length": 320.125, |
| "epoch": 0.45997088791848617, |
| "grad_norm": 1.3350598933022013, |
| "kl": 0.048583984375, |
| "learning_rate": 5.891458355789879e-07, |
| "loss": 0.0027, |
| "reward": 0.7094922065734863, |
| "reward_std": 0.5736180543899536, |
| "rewards/accuracy_reward": 0.578125, |
| "rewards/format_reward": 0.9550390839576721, |
| "step": 316 |
| }, |
| { |
| "completion_length": 317.203125, |
| "epoch": 0.46142649199417757, |
| "grad_norm": 1.1181223383097147, |
| "kl": 0.05078125, |
| "learning_rate": 5.868240888334652e-07, |
| "loss": 0.001, |
| "reward": 0.6829947829246521, |
| "reward_std": 0.4590635299682617, |
| "rewards/accuracy_reward": 0.578125, |
| "rewards/format_reward": 0.9185676574707031, |
| "step": 317 |
| }, |
| { |
| "completion_length": 340.171875, |
| "epoch": 0.462882096069869, |
| "grad_norm": 1.0567451773483107, |
| "kl": 0.052734375, |
| "learning_rate": 5.845004101609246e-07, |
| "loss": 0.0011, |
| "reward": 0.6952344179153442, |
| "reward_std": 0.4979342818260193, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.9068880677223206, |
| "step": 318 |
| }, |
| { |
| "completion_length": 320.375, |
| "epoch": 0.4643377001455604, |
| "grad_norm": 1.216157657027118, |
| "kl": 0.0498046875, |
| "learning_rate": 5.82174851265653e-07, |
| "loss": 0.0047, |
| "reward": 0.7820637822151184, |
| "reward_std": 0.6226429343223572, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.8854231834411621, |
| "step": 319 |
| }, |
| { |
| "completion_length": 324.625, |
| "epoch": 0.4657933042212518, |
| "grad_norm": 1.091320761141732, |
| "kl": 0.04541015625, |
| "learning_rate": 5.798474638937747e-07, |
| "loss": 0.0003, |
| "reward": 1.2951171398162842, |
| "reward_std": 0.6852482557296753, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 0.9411588907241821, |
| "step": 320 |
| }, |
| { |
| "completion_length": 310.015625, |
| "epoch": 0.4672489082969432, |
| "grad_norm": 1.305156958844844, |
| "kl": 0.054443359375, |
| "learning_rate": 5.775182998320989e-07, |
| "loss": -0.0021, |
| "reward": 1.2777929306030273, |
| "reward_std": 1.1254336833953857, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.8748111724853516, |
| "step": 321 |
| }, |
| { |
| "completion_length": 330.21875, |
| "epoch": 0.46870451237263466, |
| "grad_norm": 1.2302790562732457, |
| "kl": 0.045166015625, |
| "learning_rate": 5.751874109069684e-07, |
| "loss": -0.0025, |
| "reward": 1.56145179271698, |
| "reward_std": 0.5644485950469971, |
| "rewards/accuracy_reward": 0.875, |
| "rewards/format_reward": 0.9349414110183716, |
| "step": 322 |
| }, |
| { |
| "completion_length": 341.703125, |
| "epoch": 0.47016011644832606, |
| "grad_norm": 1.0929022937248165, |
| "kl": 0.04736328125, |
| "learning_rate": 5.728548489831057e-07, |
| "loss": 0.0025, |
| "reward": 0.21611331403255463, |
| "reward_std": 0.6904685497283936, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.8760481476783752, |
| "step": 323 |
| }, |
| { |
| "completion_length": 316.984375, |
| "epoch": 0.47161572052401746, |
| "grad_norm": 1.048375002277041, |
| "kl": 0.04736328125, |
| "learning_rate": 5.705206659624596e-07, |
| "loss": -0.001, |
| "reward": 1.0557878017425537, |
| "reward_std": 0.761435866355896, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.9400846362113953, |
| "step": 324 |
| }, |
| { |
| "completion_length": 323.625, |
| "epoch": 0.47307132459970885, |
| "grad_norm": 1.1891964320438795, |
| "kl": 0.0498046875, |
| "learning_rate": 5.6818491378305e-07, |
| "loss": -0.0003, |
| "reward": 1.0568814277648926, |
| "reward_std": 0.422406941652298, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.9388867020606995, |
| "step": 325 |
| }, |
| { |
| "completion_length": 358.796875, |
| "epoch": 0.4745269286754003, |
| "grad_norm": 1.1285434321384047, |
| "kl": 0.046630859375, |
| "learning_rate": 5.658476444178118e-07, |
| "loss": -0.0037, |
| "reward": 0.16022136807441711, |
| "reward_std": 0.4567154049873352, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.9228385090827942, |
| "step": 326 |
| }, |
| { |
| "completion_length": 311.484375, |
| "epoch": 0.4759825327510917, |
| "grad_norm": 1.2560433618274836, |
| "kl": 0.05224609375, |
| "learning_rate": 5.635089098734393e-07, |
| "loss": 0.0005, |
| "reward": 1.5854361057281494, |
| "reward_std": 0.5700551271438599, |
| "rewards/accuracy_reward": 0.875, |
| "rewards/format_reward": 0.9499154090881348, |
| "step": 327 |
| }, |
| { |
| "completion_length": 350.953125, |
| "epoch": 0.4774381368267831, |
| "grad_norm": 1.0742309488389477, |
| "kl": 0.046142578125, |
| "learning_rate": 5.611687621892286e-07, |
| "loss": 0.0009, |
| "reward": 1.1679883003234863, |
| "reward_std": 0.7857703566551208, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.9162304401397705, |
| "step": 328 |
| }, |
| { |
| "completion_length": 341.96875, |
| "epoch": 0.47889374090247455, |
| "grad_norm": 1.1156444737560618, |
| "kl": 0.046142578125, |
| "learning_rate": 5.588272534359192e-07, |
| "loss": 0.005, |
| "reward": 1.3799349069595337, |
| "reward_std": 0.4269542694091797, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 0.9424349069595337, |
| "step": 329 |
| }, |
| { |
| "completion_length": 329.125, |
| "epoch": 0.48034934497816595, |
| "grad_norm": 1.0577263144575106, |
| "kl": 0.04833984375, |
| "learning_rate": 5.564844357145364e-07, |
| "loss": 0.0029, |
| "reward": 1.4966275691986084, |
| "reward_std": 0.5279096961021423, |
| "rewards/accuracy_reward": 0.859375, |
| "rewards/format_reward": 0.9164062738418579, |
| "step": 330 |
| }, |
| { |
| "completion_length": 336.40625, |
| "epoch": 0.48180494905385735, |
| "grad_norm": 1.218724139541029, |
| "kl": 0.051513671875, |
| "learning_rate": 5.541403611552309e-07, |
| "loss": 0.0004, |
| "reward": 0.5468424558639526, |
| "reward_std": 0.6334984302520752, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.9350846409797668, |
| "step": 331 |
| }, |
| { |
| "completion_length": 372.921875, |
| "epoch": 0.48326055312954874, |
| "grad_norm": 0.9652643655545916, |
| "kl": 0.043212890625, |
| "learning_rate": 5.517950819161196e-07, |
| "loss": -0.0024, |
| "reward": 0.44192707538604736, |
| "reward_std": 1.4124200344085693, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.8328776359558105, |
| "step": 332 |
| }, |
| { |
| "completion_length": 353.0, |
| "epoch": 0.4847161572052402, |
| "grad_norm": 1.0409879516651472, |
| "kl": 0.046142578125, |
| "learning_rate": 5.49448650182125e-07, |
| "loss": 0.0024, |
| "reward": 0.6646744608879089, |
| "reward_std": 0.7230473756790161, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.8712239265441895, |
| "step": 333 |
| }, |
| { |
| "completion_length": 333.578125, |
| "epoch": 0.4861717612809316, |
| "grad_norm": 1.0907676643802855, |
| "kl": 0.052978515625, |
| "learning_rate": 5.47101118163813e-07, |
| "loss": 0.0032, |
| "reward": 1.0463411808013916, |
| "reward_std": 0.6876223087310791, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.9171745181083679, |
| "step": 334 |
| }, |
| { |
| "completion_length": 345.171875, |
| "epoch": 0.487627365356623, |
| "grad_norm": 1.0768322878334375, |
| "kl": 0.051025390625, |
| "learning_rate": 5.447525380962334e-07, |
| "loss": 0.0012, |
| "reward": 0.48595699667930603, |
| "reward_std": 0.8556101322174072, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.8658137917518616, |
| "step": 335 |
| }, |
| { |
| "completion_length": 357.90625, |
| "epoch": 0.4890829694323144, |
| "grad_norm": 1.0818933763915959, |
| "kl": 0.0439453125, |
| "learning_rate": 5.424029622377546e-07, |
| "loss": -0.0016, |
| "reward": 0.9220898151397705, |
| "reward_std": 1.0697258710861206, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.8551627397537231, |
| "step": 336 |
| }, |
| { |
| "completion_length": 367.09375, |
| "epoch": 0.49053857350800584, |
| "grad_norm": 0.9408245385193819, |
| "kl": 0.046142578125, |
| "learning_rate": 5.400524428689035e-07, |
| "loss": -0.0019, |
| "reward": 0.23695963621139526, |
| "reward_std": 0.7709278464317322, |
| "rewards/accuracy_reward": 0.453125, |
| "rewards/format_reward": 0.8613867163658142, |
| "step": 337 |
| }, |
| { |
| "completion_length": 341.09375, |
| "epoch": 0.49199417758369723, |
| "grad_norm": 0.9905746624278071, |
| "kl": 0.050048828125, |
| "learning_rate": 5.377010322912008e-07, |
| "loss": -0.0, |
| "reward": 0.875012993812561, |
| "reward_std": 0.9516023397445679, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.8464062213897705, |
| "step": 338 |
| }, |
| { |
| "completion_length": 340.9375, |
| "epoch": 0.49344978165938863, |
| "grad_norm": 1.1600321561212144, |
| "kl": 0.052001953125, |
| "learning_rate": 5.353487828259972e-07, |
| "loss": 0.0018, |
| "reward": 0.9952148795127869, |
| "reward_std": 0.7256425619125366, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.9173762798309326, |
| "step": 339 |
| }, |
| { |
| "completion_length": 362.953125, |
| "epoch": 0.4949053857350801, |
| "grad_norm": 0.9677169222469415, |
| "kl": 0.043212890625, |
| "learning_rate": 5.329957468133103e-07, |
| "loss": -0.0019, |
| "reward": 0.9681054353713989, |
| "reward_std": 0.8885953426361084, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.8975194692611694, |
| "step": 340 |
| }, |
| { |
| "completion_length": 348.46875, |
| "epoch": 0.4963609898107715, |
| "grad_norm": 0.9992111334554165, |
| "kl": 0.04541015625, |
| "learning_rate": 5.306419766106581e-07, |
| "loss": -0.0002, |
| "reward": 0.6875976324081421, |
| "reward_std": 0.5412222146987915, |
| "rewards/accuracy_reward": 0.578125, |
| "rewards/format_reward": 0.9448763132095337, |
| "step": 341 |
| }, |
| { |
| "completion_length": 339.234375, |
| "epoch": 0.4978165938864629, |
| "grad_norm": 1.0798384611489298, |
| "kl": 0.047119140625, |
| "learning_rate": 5.282875245918962e-07, |
| "loss": -0.0015, |
| "reward": 1.2949869632720947, |
| "reward_std": 0.6558061838150024, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.8986979722976685, |
| "step": 342 |
| }, |
| { |
| "completion_length": 328.71875, |
| "epoch": 0.4992721979621543, |
| "grad_norm": 1.1460335335257221, |
| "kl": 0.056884765625, |
| "learning_rate": 5.259324431460506e-07, |
| "loss": 0.0016, |
| "reward": 1.0809309482574463, |
| "reward_std": 0.42390185594558716, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.9173502326011658, |
| "step": 343 |
| }, |
| { |
| "completion_length": 327.828125, |
| "epoch": 0.5007278020378457, |
| "grad_norm": 1.1732063642209178, |
| "kl": 0.046142578125, |
| "learning_rate": 5.235767846761529e-07, |
| "loss": -0.003, |
| "reward": 1.85539710521698, |
| "reward_std": 0.26673340797424316, |
| "rewards/accuracy_reward": 0.96875, |
| "rewards/format_reward": 0.9491471648216248, |
| "step": 344 |
| }, |
| { |
| "completion_length": 351.890625, |
| "epoch": 0.5021834061135371, |
| "grad_norm": 1.1152451844872624, |
| "kl": 0.0478515625, |
| "learning_rate": 5.212206015980741e-07, |
| "loss": 0.0033, |
| "reward": 1.0626237392425537, |
| "reward_std": 0.6199690103530884, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.9055924415588379, |
| "step": 345 |
| }, |
| { |
| "completion_length": 339.921875, |
| "epoch": 0.5036390101892285, |
| "grad_norm": 1.1045572732346232, |
| "kl": 0.046142578125, |
| "learning_rate": 5.188639463393586e-07, |
| "loss": 0.0048, |
| "reward": 1.1775846481323242, |
| "reward_std": 0.4957619905471802, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.9695507287979126, |
| "step": 346 |
| }, |
| { |
| "completion_length": 356.515625, |
| "epoch": 0.50509461426492, |
| "grad_norm": 1.0563165828060364, |
| "kl": 0.04638671875, |
| "learning_rate": 5.165068713380567e-07, |
| "loss": 0.0018, |
| "reward": 1.2808787822723389, |
| "reward_std": 0.6754826903343201, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.8902539610862732, |
| "step": 347 |
| }, |
| { |
| "completion_length": 333.265625, |
| "epoch": 0.5065502183406113, |
| "grad_norm": 1.1133538964235559, |
| "kl": 0.0458984375, |
| "learning_rate": 5.141494290415591e-07, |
| "loss": 0.001, |
| "reward": 1.0276042222976685, |
| "reward_std": 0.6215536594390869, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.951171875, |
| "step": 348 |
| }, |
| { |
| "completion_length": 355.34375, |
| "epoch": 0.5080058224163028, |
| "grad_norm": 1.0253948324473836, |
| "kl": 0.04150390625, |
| "learning_rate": 5.117916719054285e-07, |
| "loss": 0.0003, |
| "reward": 0.8305078744888306, |
| "reward_std": 0.8258182406425476, |
| "rewards/accuracy_reward": 0.640625, |
| "rewards/format_reward": 0.90471351146698, |
| "step": 349 |
| }, |
| { |
| "completion_length": 329.4375, |
| "epoch": 0.5094614264919942, |
| "grad_norm": 1.1732665473569692, |
| "kl": 0.049072265625, |
| "learning_rate": 5.094336523922335e-07, |
| "loss": 0.0032, |
| "reward": 0.745130181312561, |
| "reward_std": 0.2655390501022339, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.948046863079071, |
| "step": 350 |
| }, |
| { |
| "completion_length": 358.890625, |
| "epoch": 0.5109170305676856, |
| "grad_norm": 1.1759706647822885, |
| "kl": 0.046142578125, |
| "learning_rate": 5.07075422970381e-07, |
| "loss": 0.0022, |
| "reward": 1.322669267654419, |
| "reward_std": 0.8540600538253784, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.9266666173934937, |
| "step": 351 |
| }, |
| { |
| "completion_length": 340.71875, |
| "epoch": 0.512372634643377, |
| "grad_norm": 1.280765883903055, |
| "kl": 0.04931640625, |
| "learning_rate": 5.047170361129483e-07, |
| "loss": -0.0051, |
| "reward": 0.7140104174613953, |
| "reward_std": 0.7314082384109497, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.8824348449707031, |
| "step": 352 |
| }, |
| { |
| "completion_length": 365.671875, |
| "epoch": 0.5138282387190685, |
| "grad_norm": 1.0208673688095518, |
| "kl": 0.041259765625, |
| "learning_rate": 5.023585442965162e-07, |
| "loss": 0.002, |
| "reward": 0.8170312643051147, |
| "reward_std": 0.6544123888015747, |
| "rewards/accuracy_reward": 0.640625, |
| "rewards/format_reward": 0.8933594226837158, |
| "step": 353 |
| }, |
| { |
| "completion_length": 343.65625, |
| "epoch": 0.5152838427947598, |
| "grad_norm": 1.0616017021974933, |
| "kl": 0.044921875, |
| "learning_rate": 5e-07, |
| "loss": -0.0017, |
| "reward": 1.360579490661621, |
| "reward_std": 0.6262814998626709, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 0.9170117378234863, |
| "step": 354 |
| }, |
| { |
| "completion_length": 346.1875, |
| "epoch": 0.5167394468704513, |
| "grad_norm": 0.9668421147018488, |
| "kl": 0.04296875, |
| "learning_rate": 4.976414557034839e-07, |
| "loss": 0.0022, |
| "reward": -0.3669726252555847, |
| "reward_std": 0.33296334743499756, |
| "rewards/accuracy_reward": 0.234375, |
| "rewards/format_reward": 0.9034309387207031, |
| "step": 355 |
| }, |
| { |
| "completion_length": 345.4375, |
| "epoch": 0.5181950509461426, |
| "grad_norm": 1.2913148050931513, |
| "kl": 0.041015625, |
| "learning_rate": 4.952829638870515e-07, |
| "loss": -0.0009, |
| "reward": 0.7532292008399963, |
| "reward_std": 0.7976954579353333, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.9168879985809326, |
| "step": 356 |
| }, |
| { |
| "completion_length": 368.5, |
| "epoch": 0.519650655021834, |
| "grad_norm": 0.8900198844773329, |
| "kl": 0.040283203125, |
| "learning_rate": 4.92924577029619e-07, |
| "loss": -0.0012, |
| "reward": 1.5500717163085938, |
| "reward_std": 1.1232110261917114, |
| "rewards/accuracy_reward": 0.890625, |
| "rewards/format_reward": 0.8781965970993042, |
| "step": 357 |
| }, |
| { |
| "completion_length": 350.390625, |
| "epoch": 0.5211062590975255, |
| "grad_norm": 1.0655510860501003, |
| "kl": 0.050537109375, |
| "learning_rate": 4.905663476077665e-07, |
| "loss": -0.0054, |
| "reward": 1.3917381763458252, |
| "reward_std": 0.9621044397354126, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.9068945050239563, |
| "step": 358 |
| }, |
| { |
| "completion_length": 329.6875, |
| "epoch": 0.5225618631732168, |
| "grad_norm": 1.1100376051956256, |
| "kl": 0.048828125, |
| "learning_rate": 4.882083280945716e-07, |
| "loss": 0.0028, |
| "reward": 0.8387891054153442, |
| "reward_std": 0.6230899691581726, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.9546484351158142, |
| "step": 359 |
| }, |
| { |
| "completion_length": 346.484375, |
| "epoch": 0.5240174672489083, |
| "grad_norm": 1.2301004521682009, |
| "kl": 0.047607421875, |
| "learning_rate": 4.85850570958441e-07, |
| "loss": 0.0047, |
| "reward": 0.77734375, |
| "reward_std": 0.2949136197566986, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.93436199426651, |
| "step": 360 |
| }, |
| { |
| "completion_length": 324.4375, |
| "epoch": 0.5254730713245997, |
| "grad_norm": 1.150417363080493, |
| "kl": 0.04541015625, |
| "learning_rate": 4.834931286619432e-07, |
| "loss": -0.0014, |
| "reward": 1.0515625476837158, |
| "reward_std": 0.45344188809394836, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.9319791793823242, |
| "step": 361 |
| }, |
| { |
| "completion_length": 338.5625, |
| "epoch": 0.5269286754002911, |
| "grad_norm": 1.0871105100886687, |
| "kl": 0.05224609375, |
| "learning_rate": 4.811360536606415e-07, |
| "loss": 0.0006, |
| "reward": 1.0285286903381348, |
| "reward_std": 0.5896121263504028, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.9137629866600037, |
| "step": 362 |
| }, |
| { |
| "completion_length": 346.4375, |
| "epoch": 0.5283842794759825, |
| "grad_norm": 1.16086348005986, |
| "kl": 0.047119140625, |
| "learning_rate": 4.787793984019259e-07, |
| "loss": -0.0023, |
| "reward": 1.1483073234558105, |
| "reward_std": 0.1926122009754181, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.9851562976837158, |
| "step": 363 |
| }, |
| { |
| "completion_length": 330.25, |
| "epoch": 0.529839883551674, |
| "grad_norm": 1.122777391792208, |
| "kl": 0.04345703125, |
| "learning_rate": 4.764232153238472e-07, |
| "loss": -0.0037, |
| "reward": 1.3178515434265137, |
| "reward_std": 0.3849244713783264, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 0.9673958420753479, |
| "step": 364 |
| }, |
| { |
| "completion_length": 331.015625, |
| "epoch": 0.5312954876273653, |
| "grad_norm": 1.2307126175016525, |
| "kl": 0.047607421875, |
| "learning_rate": 4.7406755685394943e-07, |
| "loss": -0.0013, |
| "reward": 0.8999348878860474, |
| "reward_std": 0.7801786065101624, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.910559892654419, |
| "step": 365 |
| }, |
| { |
| "completion_length": 335.90625, |
| "epoch": 0.5327510917030568, |
| "grad_norm": 1.222675950836803, |
| "kl": 0.04931640625, |
| "learning_rate": 4.7171247540810377e-07, |
| "loss": 0.0039, |
| "reward": 0.7975065112113953, |
| "reward_std": 0.47923994064331055, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.9561002254486084, |
| "step": 366 |
| }, |
| { |
| "completion_length": 335.640625, |
| "epoch": 0.5342066957787481, |
| "grad_norm": 1.0798428455730729, |
| "kl": 0.04443359375, |
| "learning_rate": 4.693580233893419e-07, |
| "loss": -0.0001, |
| "reward": 0.7751432657241821, |
| "reward_std": 0.8045872449874878, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.9402864575386047, |
| "step": 367 |
| }, |
| { |
| "completion_length": 327.671875, |
| "epoch": 0.5356622998544396, |
| "grad_norm": 1.2982851617600433, |
| "kl": 0.052001953125, |
| "learning_rate": 4.6700425318668983e-07, |
| "loss": -0.0008, |
| "reward": 1.1960091590881348, |
| "reward_std": 0.4194261431694031, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.9822461009025574, |
| "step": 368 |
| }, |
| { |
| "completion_length": 311.640625, |
| "epoch": 0.537117903930131, |
| "grad_norm": 1.1430814971743037, |
| "kl": 0.046630859375, |
| "learning_rate": 4.646512171740027e-07, |
| "loss": 0.002, |
| "reward": 0.6666406393051147, |
| "reward_std": 0.41208669543266296, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.9482681751251221, |
| "step": 369 |
| }, |
| { |
| "completion_length": 332.796875, |
| "epoch": 0.5385735080058224, |
| "grad_norm": 1.1537373469060968, |
| "kl": 0.043701171875, |
| "learning_rate": 4.6229896770879925e-07, |
| "loss": -0.0008, |
| "reward": 0.8343229293823242, |
| "reward_std": 0.35879752039909363, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.9422265291213989, |
| "step": 370 |
| }, |
| { |
| "completion_length": 329.578125, |
| "epoch": 0.5400291120815138, |
| "grad_norm": 1.117973192100589, |
| "kl": 0.04541015625, |
| "learning_rate": 4.599475571310964e-07, |
| "loss": 0.0006, |
| "reward": 0.95947265625, |
| "reward_std": 0.6068175435066223, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.9388346672058105, |
| "step": 371 |
| }, |
| { |
| "completion_length": 344.375, |
| "epoch": 0.5414847161572053, |
| "grad_norm": 1.196872378128055, |
| "kl": 0.04931640625, |
| "learning_rate": 4.5759703776224555e-07, |
| "loss": 0.0017, |
| "reward": 1.1496614217758179, |
| "reward_std": 0.7437654733657837, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.9437239170074463, |
| "step": 372 |
| }, |
| { |
| "completion_length": 323.984375, |
| "epoch": 0.5429403202328966, |
| "grad_norm": 1.2948089305623915, |
| "kl": 0.050537109375, |
| "learning_rate": 4.552474619037668e-07, |
| "loss": -0.0012, |
| "reward": 1.1745052337646484, |
| "reward_std": 0.1955292969942093, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.9643619656562805, |
| "step": 373 |
| }, |
| { |
| "completion_length": 334.40625, |
| "epoch": 0.5443959243085881, |
| "grad_norm": 1.1752903191077182, |
| "kl": 0.045166015625, |
| "learning_rate": 4.528988818361869e-07, |
| "loss": -0.0005, |
| "reward": 0.9639453291893005, |
| "reward_std": 0.7098885774612427, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.9328385591506958, |
| "step": 374 |
| }, |
| { |
| "completion_length": 336.34375, |
| "epoch": 0.5458515283842795, |
| "grad_norm": 1.0217530778000898, |
| "kl": 0.0458984375, |
| "learning_rate": 4.505513498178751e-07, |
| "loss": 0.0044, |
| "reward": 1.0537173748016357, |
| "reward_std": 0.42270171642303467, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.936022162437439, |
| "step": 375 |
| }, |
| { |
| "completion_length": 336.140625, |
| "epoch": 0.5473071324599709, |
| "grad_norm": 1.1035824214262215, |
| "kl": 0.041748046875, |
| "learning_rate": 4.4820491808388035e-07, |
| "loss": 0.0024, |
| "reward": 1.3975260257720947, |
| "reward_std": 0.5420940518379211, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 0.9560156464576721, |
| "step": 376 |
| }, |
| { |
| "completion_length": 319.171875, |
| "epoch": 0.5487627365356623, |
| "grad_norm": 1.1858924715525574, |
| "kl": 0.048583984375, |
| "learning_rate": 4.45859638844769e-07, |
| "loss": 0.0027, |
| "reward": 1.609043002128601, |
| "reward_std": 0.04546473175287247, |
| "rewards/accuracy_reward": 0.875, |
| "rewards/format_reward": 0.9743945598602295, |
| "step": 377 |
| }, |
| { |
| "completion_length": 332.078125, |
| "epoch": 0.5502183406113537, |
| "grad_norm": 1.137595761389025, |
| "kl": 0.051513671875, |
| "learning_rate": 4.4351556428546365e-07, |
| "loss": 0.0042, |
| "reward": 1.075364589691162, |
| "reward_std": 0.9671132564544678, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.8985416293144226, |
| "step": 378 |
| }, |
| { |
| "completion_length": 365.34375, |
| "epoch": 0.5516739446870451, |
| "grad_norm": 1.064698625138544, |
| "kl": 0.0439453125, |
| "learning_rate": 4.411727465640808e-07, |
| "loss": 0.0007, |
| "reward": 0.705507755279541, |
| "reward_std": 0.2698853611946106, |
| "rewards/accuracy_reward": 0.578125, |
| "rewards/format_reward": 0.958437442779541, |
| "step": 379 |
| }, |
| { |
| "completion_length": 332.1875, |
| "epoch": 0.5531295487627366, |
| "grad_norm": 0.954227809746772, |
| "kl": 0.047119140625, |
| "learning_rate": 4.388312378107714e-07, |
| "loss": 0.0019, |
| "reward": 0.017766959965229034, |
| "reward_std": 0.4671540856361389, |
| "rewards/accuracy_reward": 0.359375, |
| "rewards/format_reward": 0.9211132526397705, |
| "step": 380 |
| }, |
| { |
| "completion_length": 357.34375, |
| "epoch": 0.5545851528384279, |
| "grad_norm": 1.0746234998917534, |
| "kl": 0.04638671875, |
| "learning_rate": 4.364910901265606e-07, |
| "loss": 0.0009, |
| "reward": 0.9512304663658142, |
| "reward_std": 0.9008158445358276, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.9242773056030273, |
| "step": 381 |
| }, |
| { |
| "completion_length": 353.171875, |
| "epoch": 0.5560407569141194, |
| "grad_norm": 1.1000427889048656, |
| "kl": 0.04638671875, |
| "learning_rate": 4.341523555821881e-07, |
| "loss": 0.001, |
| "reward": 0.6827343702316284, |
| "reward_std": 0.32222965359687805, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.8887369632720947, |
| "step": 382 |
| }, |
| { |
| "completion_length": 330.015625, |
| "epoch": 0.5574963609898108, |
| "grad_norm": 1.0062009920301431, |
| "kl": 0.052734375, |
| "learning_rate": 4.3181508621695015e-07, |
| "loss": 0.0044, |
| "reward": 1.2704167366027832, |
| "reward_std": 0.3719358444213867, |
| "rewards/accuracy_reward": 0.765625, |
| "rewards/format_reward": 0.9685026407241821, |
| "step": 383 |
| }, |
| { |
| "completion_length": 358.90625, |
| "epoch": 0.5589519650655022, |
| "grad_norm": 1.0500378444634122, |
| "kl": 0.055419921875, |
| "learning_rate": 4.294793340375404e-07, |
| "loss": 0.0022, |
| "reward": 0.5693033933639526, |
| "reward_std": 0.7073459029197693, |
| "rewards/accuracy_reward": 0.546875, |
| "rewards/format_reward": 0.9236393570899963, |
| "step": 384 |
| }, |
| { |
| "completion_length": 317.875, |
| "epoch": 0.5604075691411936, |
| "grad_norm": 1.2585391427250612, |
| "kl": 0.052978515625, |
| "learning_rate": 4.271451510168943e-07, |
| "loss": 0.0019, |
| "reward": 1.0041340589523315, |
| "reward_std": 0.6024578809738159, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.9240299463272095, |
| "step": 385 |
| }, |
| { |
| "completion_length": 340.671875, |
| "epoch": 0.5618631732168851, |
| "grad_norm": 1.171525425235524, |
| "kl": 0.055419921875, |
| "learning_rate": 4.248125890930316e-07, |
| "loss": 0.001, |
| "reward": 0.5767643451690674, |
| "reward_std": 0.9831317067146301, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.868079423904419, |
| "step": 386 |
| }, |
| { |
| "completion_length": 348.484375, |
| "epoch": 0.5633187772925764, |
| "grad_norm": 1.1137932938055821, |
| "kl": 0.0498046875, |
| "learning_rate": 4.22481700167901e-07, |
| "loss": 0.0038, |
| "reward": 1.6028320789337158, |
| "reward_std": 0.785223126411438, |
| "rewards/accuracy_reward": 0.890625, |
| "rewards/format_reward": 0.9309570789337158, |
| "step": 387 |
| }, |
| { |
| "completion_length": 357.015625, |
| "epoch": 0.5647743813682679, |
| "grad_norm": 1.1683554196934958, |
| "kl": 0.04541015625, |
| "learning_rate": 4.201525361062254e-07, |
| "loss": 0.002, |
| "reward": 0.885696530342102, |
| "reward_std": 0.9379022121429443, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.9048242568969727, |
| "step": 388 |
| }, |
| { |
| "completion_length": 349.765625, |
| "epoch": 0.5662299854439592, |
| "grad_norm": 1.0285239615931752, |
| "kl": 0.045654296875, |
| "learning_rate": 4.17825148734347e-07, |
| "loss": -0.0016, |
| "reward": 1.4252278804779053, |
| "reward_std": 0.555104672908783, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.9401627779006958, |
| "step": 389 |
| }, |
| { |
| "completion_length": 344.578125, |
| "epoch": 0.5676855895196506, |
| "grad_norm": 0.9824416716745002, |
| "kl": 0.046875, |
| "learning_rate": 4.154995898390755e-07, |
| "loss": 0.0001, |
| "reward": 0.30329427123069763, |
| "reward_std": 0.8160994648933411, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.8936979174613953, |
| "step": 390 |
| }, |
| { |
| "completion_length": 362.265625, |
| "epoch": 0.5691411935953421, |
| "grad_norm": 1.0347605373720594, |
| "kl": 0.04345703125, |
| "learning_rate": 4.131759111665348e-07, |
| "loss": 0.0006, |
| "reward": 0.3581119775772095, |
| "reward_std": 0.6576919555664062, |
| "rewards/accuracy_reward": 0.484375, |
| "rewards/format_reward": 0.8985155820846558, |
| "step": 391 |
| }, |
| { |
| "completion_length": 354.8125, |
| "epoch": 0.5705967976710334, |
| "grad_norm": 1.0801343806418962, |
| "kl": 0.04736328125, |
| "learning_rate": 4.1085416442101203e-07, |
| "loss": 0.0022, |
| "reward": 1.1929036378860474, |
| "reward_std": 1.0718597173690796, |
| "rewards/accuracy_reward": 0.765625, |
| "rewards/format_reward": 0.8935286402702332, |
| "step": 392 |
| }, |
| { |
| "completion_length": 337.828125, |
| "epoch": 0.5720524017467249, |
| "grad_norm": 0.9437613999658896, |
| "kl": 0.042236328125, |
| "learning_rate": 4.0853440126380666e-07, |
| "loss": -0.0027, |
| "reward": 1.2306054830551147, |
| "reward_std": 0.06505398452281952, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.9732356667518616, |
| "step": 393 |
| }, |
| { |
| "completion_length": 353.15625, |
| "epoch": 0.5735080058224163, |
| "grad_norm": 1.0240210848060178, |
| "kl": 0.042724609375, |
| "learning_rate": 4.0621667331208156e-07, |
| "loss": 0.0022, |
| "reward": 1.047447919845581, |
| "reward_std": 0.5841343402862549, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.9282682538032532, |
| "step": 394 |
| }, |
| { |
| "completion_length": 371.28125, |
| "epoch": 0.5749636098981077, |
| "grad_norm": 0.9665992608409852, |
| "kl": 0.04931640625, |
| "learning_rate": 4.0390103213771363e-07, |
| "loss": -0.0038, |
| "reward": 0.1188020408153534, |
| "reward_std": 0.9848098158836365, |
| "rewards/accuracy_reward": 0.421875, |
| "rewards/format_reward": 0.8411197662353516, |
| "step": 395 |
| }, |
| { |
| "completion_length": 341.75, |
| "epoch": 0.5764192139737991, |
| "grad_norm": 1.2322164783845382, |
| "kl": 0.04833984375, |
| "learning_rate": 4.015875292661473e-07, |
| "loss": 0.0022, |
| "reward": 0.32570311427116394, |
| "reward_std": 0.9908155798912048, |
| "rewards/accuracy_reward": 0.453125, |
| "rewards/format_reward": 0.9544401168823242, |
| "step": 396 |
| }, |
| { |
| "completion_length": 333.5625, |
| "epoch": 0.5778748180494906, |
| "grad_norm": 1.082096942344887, |
| "kl": 0.0498046875, |
| "learning_rate": 3.9927621617524736e-07, |
| "loss": 0.0069, |
| "reward": 1.061464786529541, |
| "reward_std": 0.4430278539657593, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.8951106667518616, |
| "step": 397 |
| }, |
| { |
| "completion_length": 349.640625, |
| "epoch": 0.5793304221251819, |
| "grad_norm": 1.1730680460847622, |
| "kl": 0.046142578125, |
| "learning_rate": 3.969671442941538e-07, |
| "loss": 0.0014, |
| "reward": 0.3286914527416229, |
| "reward_std": 0.6564339995384216, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.9076367616653442, |
| "step": 398 |
| }, |
| { |
| "completion_length": 347.859375, |
| "epoch": 0.5807860262008734, |
| "grad_norm": 0.936643056977854, |
| "kl": 0.048095703125, |
| "learning_rate": 3.94660365002137e-07, |
| "loss": 0.001, |
| "reward": -0.10537109524011612, |
| "reward_std": 0.5952030420303345, |
| "rewards/accuracy_reward": 0.328125, |
| "rewards/format_reward": 0.8903840780258179, |
| "step": 399 |
| }, |
| { |
| "completion_length": 361.015625, |
| "epoch": 0.5822416302765647, |
| "grad_norm": 0.8588675688237927, |
| "kl": 0.047119140625, |
| "learning_rate": 3.923559296274549e-07, |
| "loss": -0.0018, |
| "reward": 0.81843101978302, |
| "reward_std": 0.8495593667030334, |
| "rewards/accuracy_reward": 0.640625, |
| "rewards/format_reward": 0.895423173904419, |
| "step": 400 |
| }, |
| { |
| "completion_length": 327.78125, |
| "epoch": 0.5836972343522562, |
| "grad_norm": 1.1927270347808032, |
| "kl": 0.046142578125, |
| "learning_rate": 3.900538894462112e-07, |
| "loss": -0.0011, |
| "reward": 0.4946874976158142, |
| "reward_std": 0.5388225317001343, |
| "rewards/accuracy_reward": 0.515625, |
| "rewards/format_reward": 0.9400129914283752, |
| "step": 401 |
| }, |
| { |
| "completion_length": 335.125, |
| "epoch": 0.5851528384279476, |
| "grad_norm": 1.1416346509953534, |
| "kl": 0.050048828125, |
| "learning_rate": 3.877542956812136e-07, |
| "loss": -0.0009, |
| "reward": 1.2370572090148926, |
| "reward_std": 0.7617213129997253, |
| "rewards/accuracy_reward": 0.765625, |
| "rewards/format_reward": 0.9355729222297668, |
| "step": 402 |
| }, |
| { |
| "completion_length": 339.640625, |
| "epoch": 0.586608442503639, |
| "grad_norm": 1.2059860449472852, |
| "kl": 0.048828125, |
| "learning_rate": 3.8545719950083503e-07, |
| "loss": 0.0032, |
| "reward": 1.136875033378601, |
| "reward_std": 0.3422207832336426, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.9610416293144226, |
| "step": 403 |
| }, |
| { |
| "completion_length": 311.203125, |
| "epoch": 0.5880640465793304, |
| "grad_norm": 1.2427528264658227, |
| "kl": 0.057373046875, |
| "learning_rate": 3.831626520178738e-07, |
| "loss": 0.0021, |
| "reward": 1.1694010496139526, |
| "reward_std": 0.21720562875270844, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.9581640958786011, |
| "step": 404 |
| }, |
| { |
| "completion_length": 337.6875, |
| "epoch": 0.5895196506550219, |
| "grad_norm": 1.1003634317266355, |
| "kl": 0.05224609375, |
| "learning_rate": 3.8087070428841753e-07, |
| "loss": 0.0002, |
| "reward": 0.48251304030418396, |
| "reward_std": 0.9312198758125305, |
| "rewards/accuracy_reward": 0.515625, |
| "rewards/format_reward": 0.9204167127609253, |
| "step": 405 |
| }, |
| { |
| "completion_length": 339.484375, |
| "epoch": 0.5909752547307132, |
| "grad_norm": 0.9678972898174403, |
| "kl": 0.044677734375, |
| "learning_rate": 3.785814073107064e-07, |
| "loss": 0.0013, |
| "reward": 1.5340235233306885, |
| "reward_std": 0.5008847117424011, |
| "rewards/accuracy_reward": 0.859375, |
| "rewards/format_reward": 0.94998699426651, |
| "step": 406 |
| }, |
| { |
| "completion_length": 308.375, |
| "epoch": 0.5924308588064047, |
| "grad_norm": 1.047424956436592, |
| "kl": 0.047607421875, |
| "learning_rate": 3.762948120239988e-07, |
| "loss": -0.0021, |
| "reward": 0.9827408790588379, |
| "reward_std": 0.627294659614563, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.9047591090202332, |
| "step": 407 |
| }, |
| { |
| "completion_length": 317.78125, |
| "epoch": 0.5938864628820961, |
| "grad_norm": 1.31987404266799, |
| "kl": 0.049560546875, |
| "learning_rate": 3.7401096930743746e-07, |
| "loss": -0.0035, |
| "reward": 1.220130205154419, |
| "reward_std": 0.5085282325744629, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.9641405940055847, |
| "step": 408 |
| }, |
| { |
| "completion_length": 317.328125, |
| "epoch": 0.5953420669577875, |
| "grad_norm": 1.258115336162508, |
| "kl": 0.0458984375, |
| "learning_rate": 3.717299299789175e-07, |
| "loss": 0.0015, |
| "reward": 0.8206315040588379, |
| "reward_std": 0.16567295789718628, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.980527400970459, |
| "step": 409 |
| }, |
| { |
| "completion_length": 311.0625, |
| "epoch": 0.5967976710334789, |
| "grad_norm": 1.0797344698183151, |
| "kl": 0.0400390625, |
| "learning_rate": 3.6945174479395584e-07, |
| "loss": 0.0006, |
| "reward": 0.4124348759651184, |
| "reward_std": 0.5062814950942993, |
| "rewards/accuracy_reward": 0.484375, |
| "rewards/format_reward": 0.9456771612167358, |
| "step": 410 |
| }, |
| { |
| "completion_length": 326.015625, |
| "epoch": 0.5982532751091703, |
| "grad_norm": 0.9392678790614373, |
| "kl": 0.051513671875, |
| "learning_rate": 3.6717646444456193e-07, |
| "loss": 0.0014, |
| "reward": 0.35626304149627686, |
| "reward_std": 0.4074528217315674, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.9396615028381348, |
| "step": 411 |
| }, |
| { |
| "completion_length": 320.078125, |
| "epoch": 0.5997088791848617, |
| "grad_norm": 1.3610941268678678, |
| "kl": 0.044921875, |
| "learning_rate": 3.649041395581089e-07, |
| "loss": 0.0001, |
| "reward": 0.6594465970993042, |
| "reward_std": 0.6716386079788208, |
| "rewards/accuracy_reward": 0.578125, |
| "rewards/format_reward": 0.9064517617225647, |
| "step": 412 |
| }, |
| { |
| "completion_length": 327.734375, |
| "epoch": 0.6011644832605532, |
| "grad_norm": 1.2129906269391977, |
| "kl": 0.048095703125, |
| "learning_rate": 3.6263482069620865e-07, |
| "loss": -0.0036, |
| "reward": 1.199205756187439, |
| "reward_std": 0.3400747776031494, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.9805078506469727, |
| "step": 413 |
| }, |
| { |
| "completion_length": 322.78125, |
| "epoch": 0.6026200873362445, |
| "grad_norm": 1.0044565070885603, |
| "kl": 0.047607421875, |
| "learning_rate": 3.6036855835358496e-07, |
| "loss": 0.0017, |
| "reward": 0.64725261926651, |
| "reward_std": 0.4769784212112427, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.9339061975479126, |
| "step": 414 |
| }, |
| { |
| "completion_length": 356.59375, |
| "epoch": 0.604075691411936, |
| "grad_norm": 1.144784540293772, |
| "kl": 0.0478515625, |
| "learning_rate": 3.581054029569516e-07, |
| "loss": -0.005, |
| "reward": 0.7913802266120911, |
| "reward_std": 0.649086058139801, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.9033724069595337, |
| "step": 415 |
| }, |
| { |
| "completion_length": 327.5625, |
| "epoch": 0.6055312954876274, |
| "grad_norm": 1.1441222735837988, |
| "kl": 0.050537109375, |
| "learning_rate": 3.55845404863889e-07, |
| "loss": 0.0033, |
| "reward": 1.7637498378753662, |
| "reward_std": 0.5468021631240845, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 0.9512500166893005, |
| "step": 416 |
| }, |
| { |
| "completion_length": 304.46875, |
| "epoch": 0.6069868995633187, |
| "grad_norm": 1.2559225472205957, |
| "kl": 0.04638671875, |
| "learning_rate": 3.535886143617248e-07, |
| "loss": -0.0045, |
| "reward": 0.8564192652702332, |
| "reward_std": 0.08503374457359314, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.9544661641120911, |
| "step": 417 |
| }, |
| { |
| "completion_length": 319.765625, |
| "epoch": 0.6084425036390102, |
| "grad_norm": 1.1233374483471503, |
| "kl": 0.048583984375, |
| "learning_rate": 3.513350816664138e-07, |
| "loss": 0.0005, |
| "reward": 1.250644564628601, |
| "reward_std": 0.6800768375396729, |
| "rewards/accuracy_reward": 0.765625, |
| "rewards/format_reward": 0.93666011095047, |
| "step": 418 |
| }, |
| { |
| "completion_length": 319.390625, |
| "epoch": 0.6098981077147017, |
| "grad_norm": 1.1149493029080904, |
| "kl": 0.04736328125, |
| "learning_rate": 3.4908485692142164e-07, |
| "loss": 0.0031, |
| "reward": 1.9074804782867432, |
| "reward_std": 0.20882548391819, |
| "rewards/accuracy_reward": 0.984375, |
| "rewards/format_reward": 0.9543554782867432, |
| "step": 419 |
| }, |
| { |
| "completion_length": 322.484375, |
| "epoch": 0.611353711790393, |
| "grad_norm": 1.1771319406793013, |
| "kl": 0.05078125, |
| "learning_rate": 3.4683799019660833e-07, |
| "loss": -0.0012, |
| "reward": 0.4527343511581421, |
| "reward_std": 0.1347397118806839, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.9300520420074463, |
| "step": 420 |
| }, |
| { |
| "completion_length": 320.984375, |
| "epoch": 0.6128093158660844, |
| "grad_norm": 1.253919818449011, |
| "kl": 0.04541015625, |
| "learning_rate": 3.4459453148711437e-07, |
| "loss": -0.0025, |
| "reward": 1.2054883241653442, |
| "reward_std": 0.34866058826446533, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.9499544501304626, |
| "step": 421 |
| }, |
| { |
| "completion_length": 308.75, |
| "epoch": 0.6142649199417758, |
| "grad_norm": 1.2228103194283946, |
| "kl": 0.0546875, |
| "learning_rate": 3.423545307122488e-07, |
| "loss": 0.0045, |
| "reward": 1.38002610206604, |
| "reward_std": 0.5425729751586914, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 0.9391015768051147, |
| "step": 422 |
| }, |
| { |
| "completion_length": 334.921875, |
| "epoch": 0.6157205240174672, |
| "grad_norm": 1.0491711033896147, |
| "kl": 0.0517578125, |
| "learning_rate": 3.4011803771437735e-07, |
| "loss": 0.0027, |
| "reward": 1.441979169845581, |
| "reward_std": 0.9929073452949524, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 0.9042838215827942, |
| "step": 423 |
| }, |
| { |
| "completion_length": 341.25, |
| "epoch": 0.6171761280931587, |
| "grad_norm": 1.1145287911850592, |
| "kl": 0.046142578125, |
| "learning_rate": 3.378851022578146e-07, |
| "loss": -0.0005, |
| "reward": 0.8648567199707031, |
| "reward_std": 0.6062259674072266, |
| "rewards/accuracy_reward": 0.640625, |
| "rewards/format_reward": 0.94259113073349, |
| "step": 424 |
| }, |
| { |
| "completion_length": 323.328125, |
| "epoch": 0.61863173216885, |
| "grad_norm": 1.2328706888305836, |
| "kl": 0.059326171875, |
| "learning_rate": 3.356557740277156e-07, |
| "loss": -0.0007, |
| "reward": 1.2751758098602295, |
| "reward_std": 0.894574761390686, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 0.9299153685569763, |
| "step": 425 |
| }, |
| { |
| "completion_length": 331.3125, |
| "epoch": 0.6200873362445415, |
| "grad_norm": 1.047466025286303, |
| "kl": 0.048828125, |
| "learning_rate": 3.334301026289712e-07, |
| "loss": 0.0037, |
| "reward": 1.5336458683013916, |
| "reward_std": 0.252202570438385, |
| "rewards/accuracy_reward": 0.859375, |
| "rewards/format_reward": 0.9498567581176758, |
| "step": 426 |
| }, |
| { |
| "completion_length": 338.671875, |
| "epoch": 0.6215429403202329, |
| "grad_norm": 1.0464703790815202, |
| "kl": 0.04833984375, |
| "learning_rate": 3.312081375851038e-07, |
| "loss": 0.0015, |
| "reward": 1.5034700632095337, |
| "reward_std": 0.9077455997467041, |
| "rewards/accuracy_reward": 0.859375, |
| "rewards/format_reward": 0.9250195026397705, |
| "step": 427 |
| }, |
| { |
| "completion_length": 334.8125, |
| "epoch": 0.6229985443959243, |
| "grad_norm": 1.1030610799748342, |
| "kl": 0.047119140625, |
| "learning_rate": 3.2898992833716563e-07, |
| "loss": 0.002, |
| "reward": 1.0385351181030273, |
| "reward_std": 0.5042393803596497, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.9632617235183716, |
| "step": 428 |
| }, |
| { |
| "completion_length": 341.953125, |
| "epoch": 0.6244541484716157, |
| "grad_norm": 0.9140662347223634, |
| "kl": 0.05224609375, |
| "learning_rate": 3.2677552424263834e-07, |
| "loss": -0.0018, |
| "reward": 0.9026367664337158, |
| "reward_std": 0.9520148038864136, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.8862174153327942, |
| "step": 429 |
| }, |
| { |
| "completion_length": 355.328125, |
| "epoch": 0.6259097525473072, |
| "grad_norm": 1.0653314224895472, |
| "kl": 0.046630859375, |
| "learning_rate": 3.2456497457433475e-07, |
| "loss": 0.0019, |
| "reward": 0.8542708158493042, |
| "reward_std": 0.8604847192764282, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.8829036355018616, |
| "step": 430 |
| }, |
| { |
| "completion_length": 353.859375, |
| "epoch": 0.6273653566229985, |
| "grad_norm": 1.0396201810405379, |
| "kl": 0.0478515625, |
| "learning_rate": 3.2235832851930315e-07, |
| "loss": -0.0011, |
| "reward": 1.552734375, |
| "reward_std": 0.19750574231147766, |
| "rewards/accuracy_reward": 0.859375, |
| "rewards/format_reward": 0.974609375, |
| "step": 431 |
| }, |
| { |
| "completion_length": 330.3125, |
| "epoch": 0.62882096069869, |
| "grad_norm": 1.149398763893242, |
| "kl": 0.048828125, |
| "learning_rate": 3.201556351777321e-07, |
| "loss": 0.0008, |
| "reward": 1.3697071075439453, |
| "reward_std": 0.5695977210998535, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 0.9322070479393005, |
| "step": 432 |
| }, |
| { |
| "completion_length": 339.59375, |
| "epoch": 0.6302765647743813, |
| "grad_norm": 1.2047591749520952, |
| "kl": 0.045654296875, |
| "learning_rate": 3.1795694356185797e-07, |
| "loss": -0.0004, |
| "reward": 1.2822070121765137, |
| "reward_std": 0.7462552785873413, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 0.9377539157867432, |
| "step": 433 |
| }, |
| { |
| "completion_length": 354.828125, |
| "epoch": 0.6317321688500728, |
| "grad_norm": 1.0709344253665385, |
| "kl": 0.056396484375, |
| "learning_rate": 3.157623025948751e-07, |
| "loss": 0.0054, |
| "reward": 0.5403645634651184, |
| "reward_std": 0.7718614339828491, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.9401432275772095, |
| "step": 434 |
| }, |
| { |
| "completion_length": 367.015625, |
| "epoch": 0.6331877729257642, |
| "grad_norm": 0.9081689546614261, |
| "kl": 0.042236328125, |
| "learning_rate": 3.135717611098457e-07, |
| "loss": -0.0028, |
| "reward": 0.5762760639190674, |
| "reward_std": 0.9539381265640259, |
| "rewards/accuracy_reward": 0.546875, |
| "rewards/format_reward": 0.9339452981948853, |
| "step": 435 |
| }, |
| { |
| "completion_length": 361.609375, |
| "epoch": 0.6346433770014556, |
| "grad_norm": 0.9935443191736562, |
| "kl": 0.050048828125, |
| "learning_rate": 3.11385367848615e-07, |
| "loss": 0.0029, |
| "reward": 1.0945442914962769, |
| "reward_std": 0.364665687084198, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.9768098592758179, |
| "step": 436 |
| }, |
| { |
| "completion_length": 365.984375, |
| "epoch": 0.636098981077147, |
| "grad_norm": 1.1539983550423447, |
| "kl": 0.046142578125, |
| "learning_rate": 3.0920317146072574e-07, |
| "loss": -0.003, |
| "reward": 0.9360742568969727, |
| "reward_std": 0.656904935836792, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.9138607382774353, |
| "step": 437 |
| }, |
| { |
| "completion_length": 364.125, |
| "epoch": 0.6375545851528385, |
| "grad_norm": 1.0332344243928102, |
| "kl": 0.04638671875, |
| "learning_rate": 3.070252205023355e-07, |
| "loss": -0.0011, |
| "reward": 0.920188844203949, |
| "reward_std": 0.4112701714038849, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.9488345980644226, |
| "step": 438 |
| }, |
| { |
| "completion_length": 363.140625, |
| "epoch": 0.6390101892285298, |
| "grad_norm": 0.8434589833001687, |
| "kl": 0.04345703125, |
| "learning_rate": 3.048515634351373e-07, |
| "loss": -0.0007, |
| "reward": 1.021744728088379, |
| "reward_std": 0.9885683655738831, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.9123697876930237, |
| "step": 439 |
| }, |
| { |
| "completion_length": 349.859375, |
| "epoch": 0.6404657933042213, |
| "grad_norm": 1.100810311524533, |
| "kl": 0.049072265625, |
| "learning_rate": 3.026822486252796e-07, |
| "loss": 0.004, |
| "reward": 1.1635351181030273, |
| "reward_std": 0.23715665936470032, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.9604101777076721, |
| "step": 440 |
| }, |
| { |
| "completion_length": 361.28125, |
| "epoch": 0.6419213973799127, |
| "grad_norm": 0.9996154209580299, |
| "kl": 0.043701171875, |
| "learning_rate": 3.005173243422918e-07, |
| "loss": -0.0018, |
| "reward": 1.4946484565734863, |
| "reward_std": 0.36391258239746094, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 0.9633983969688416, |
| "step": 441 |
| }, |
| { |
| "completion_length": 355.890625, |
| "epoch": 0.6433770014556041, |
| "grad_norm": 1.064092737707726, |
| "kl": 0.04931640625, |
| "learning_rate": 2.983568387580093e-07, |
| "loss": 0.0009, |
| "reward": 0.9472330808639526, |
| "reward_std": 1.0713083744049072, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.9316080808639526, |
| "step": 442 |
| }, |
| { |
| "completion_length": 352.546875, |
| "epoch": 0.6448326055312955, |
| "grad_norm": 1.2213995677397735, |
| "kl": 0.046142578125, |
| "learning_rate": 2.9620083994550184e-07, |
| "loss": -0.0025, |
| "reward": 1.1790754795074463, |
| "reward_std": 0.2134915590286255, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.9699999690055847, |
| "step": 443 |
| }, |
| { |
| "completion_length": 343.625, |
| "epoch": 0.6462882096069869, |
| "grad_norm": 0.9866828975217354, |
| "kl": 0.046142578125, |
| "learning_rate": 2.940493758780037e-07, |
| "loss": 0.0007, |
| "reward": 1.290442705154419, |
| "reward_std": 0.4115654230117798, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 0.945742130279541, |
| "step": 444 |
| }, |
| { |
| "completion_length": 369.09375, |
| "epoch": 0.6477438136826783, |
| "grad_norm": 0.7976157219662836, |
| "kl": 0.04833984375, |
| "learning_rate": 2.919024944278462e-07, |
| "loss": -0.0017, |
| "reward": 1.1873372793197632, |
| "reward_std": 0.38416576385498047, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.9347070455551147, |
| "step": 445 |
| }, |
| { |
| "completion_length": 349.140625, |
| "epoch": 0.6491994177583698, |
| "grad_norm": 1.034098435240967, |
| "kl": 0.047119140625, |
| "learning_rate": 2.8976024336539297e-07, |
| "loss": -0.001, |
| "reward": 0.949485719203949, |
| "reward_std": 0.7218962907791138, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.9301497936248779, |
| "step": 446 |
| }, |
| { |
| "completion_length": 361.703125, |
| "epoch": 0.6506550218340611, |
| "grad_norm": 0.9093087268112631, |
| "kl": 0.0537109375, |
| "learning_rate": 2.8762267035797606e-07, |
| "loss": -0.003, |
| "reward": 1.5500586032867432, |
| "reward_std": 0.852545976638794, |
| "rewards/accuracy_reward": 0.890625, |
| "rewards/format_reward": 0.8781836032867432, |
| "step": 447 |
| }, |
| { |
| "completion_length": 362.65625, |
| "epoch": 0.6521106259097526, |
| "grad_norm": 0.9962666508897389, |
| "kl": 0.047607421875, |
| "learning_rate": 2.8548982296883685e-07, |
| "loss": -0.0015, |
| "reward": 0.7888085842132568, |
| "reward_std": 0.24111366271972656, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.9591341614723206, |
| "step": 448 |
| }, |
| { |
| "completion_length": 365.578125, |
| "epoch": 0.653566229985444, |
| "grad_norm": 1.0644973570656397, |
| "kl": 0.047607421875, |
| "learning_rate": 2.8336174865606583e-07, |
| "loss": -0.0012, |
| "reward": 1.458815097808838, |
| "reward_std": 0.6925506591796875, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 0.9265494346618652, |
| "step": 449 |
| }, |
| { |
| "completion_length": 377.78125, |
| "epoch": 0.6550218340611353, |
| "grad_norm": 0.8690653477908721, |
| "kl": 0.04541015625, |
| "learning_rate": 2.8123849477154806e-07, |
| "loss": 0.0022, |
| "reward": 0.6074088215827942, |
| "reward_std": 0.6371817588806152, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.91539067029953, |
| "step": 450 |
| }, |
| { |
| "completion_length": 343.5625, |
| "epoch": 0.6564774381368268, |
| "grad_norm": 0.9959008179108618, |
| "kl": 0.045166015625, |
| "learning_rate": 2.791201085599084e-07, |
| "loss": 0.0026, |
| "reward": 1.1102409362792969, |
| "reward_std": 0.37362387776374817, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.9482877254486084, |
| "step": 451 |
| }, |
| { |
| "completion_length": 372.015625, |
| "epoch": 0.6579330422125182, |
| "grad_norm": 1.1144444930254087, |
| "kl": 0.0517578125, |
| "learning_rate": 2.770066371574621e-07, |
| "loss": 0.0046, |
| "reward": 0.5106966495513916, |
| "reward_std": 0.35071802139282227, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.9128841161727905, |
| "step": 452 |
| }, |
| { |
| "completion_length": 370.703125, |
| "epoch": 0.6593886462882096, |
| "grad_norm": 0.9690064498501554, |
| "kl": 0.047119140625, |
| "learning_rate": 2.748981275911633e-07, |
| "loss": -0.0043, |
| "reward": 0.20597657561302185, |
| "reward_std": 0.7211107015609741, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.8844531178474426, |
| "step": 453 |
| }, |
| { |
| "completion_length": 364.328125, |
| "epoch": 0.660844250363901, |
| "grad_norm": 0.9366506361406456, |
| "kl": 0.044189453125, |
| "learning_rate": 2.7279462677756126e-07, |
| "loss": -0.0036, |
| "reward": 0.7447395920753479, |
| "reward_std": 0.6155897974967957, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.9127213954925537, |
| "step": 454 |
| }, |
| { |
| "completion_length": 350.484375, |
| "epoch": 0.6622998544395924, |
| "grad_norm": 1.0086261231667006, |
| "kl": 0.047607421875, |
| "learning_rate": 2.7069618152175464e-07, |
| "loss": -0.0014, |
| "reward": 1.0693293809890747, |
| "reward_std": 0.44240403175354004, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.9077669382095337, |
| "step": 455 |
| }, |
| { |
| "completion_length": 352.890625, |
| "epoch": 0.6637554585152838, |
| "grad_norm": 1.017662444972447, |
| "kl": 0.051513671875, |
| "learning_rate": 2.6860283851635063e-07, |
| "loss": -0.0016, |
| "reward": 1.1334245204925537, |
| "reward_std": 0.8161476850509644, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.9266666769981384, |
| "step": 456 |
| }, |
| { |
| "completion_length": 363.09375, |
| "epoch": 0.6652110625909753, |
| "grad_norm": 0.9088775376959461, |
| "kl": 0.049072265625, |
| "learning_rate": 2.6651464434042596e-07, |
| "loss": 0.0007, |
| "reward": 1.1859569549560547, |
| "reward_std": 0.7904758453369141, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.9305403232574463, |
| "step": 457 |
| }, |
| { |
| "completion_length": 358.1875, |
| "epoch": 0.6666666666666666, |
| "grad_norm": 0.956917415239815, |
| "kl": 0.054931640625, |
| "learning_rate": 2.6443164545849113e-07, |
| "loss": -0.0023, |
| "reward": 0.31612628698349, |
| "reward_std": 0.6449480652809143, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.8944726586341858, |
| "step": 458 |
| }, |
| { |
| "completion_length": 346.40625, |
| "epoch": 0.6681222707423581, |
| "grad_norm": 1.1043376858496878, |
| "kl": 0.048828125, |
| "learning_rate": 2.6235388821945495e-07, |
| "loss": 0.0022, |
| "reward": 1.320787787437439, |
| "reward_std": 0.5146819353103638, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.9273632764816284, |
| "step": 459 |
| }, |
| { |
| "completion_length": 358.453125, |
| "epoch": 0.6695778748180495, |
| "grad_norm": 1.1203658951352162, |
| "kl": 0.054443359375, |
| "learning_rate": 2.602814188555951e-07, |
| "loss": -0.004, |
| "reward": 0.97126305103302, |
| "reward_std": 0.8944222927093506, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.9062368869781494, |
| "step": 460 |
| }, |
| { |
| "completion_length": 373.3125, |
| "epoch": 0.6710334788937409, |
| "grad_norm": 1.0496443261695805, |
| "kl": 0.04443359375, |
| "learning_rate": 2.5821428348152786e-07, |
| "loss": -0.0001, |
| "reward": 0.44576171040534973, |
| "reward_std": 1.2483347654342651, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.8427278399467468, |
| "step": 461 |
| }, |
| { |
| "completion_length": 344.953125, |
| "epoch": 0.6724890829694323, |
| "grad_norm": 1.053156674987038, |
| "kl": 0.050048828125, |
| "learning_rate": 2.561525280931828e-07, |
| "loss": 0.0003, |
| "reward": 0.9904752373695374, |
| "reward_std": 0.34265437722206116, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.9724804759025574, |
| "step": 462 |
| }, |
| { |
| "completion_length": 370.65625, |
| "epoch": 0.6739446870451238, |
| "grad_norm": 0.940859770356533, |
| "kl": 0.046875, |
| "learning_rate": 2.5409619856677913e-07, |
| "loss": -0.0002, |
| "reward": 0.28072917461395264, |
| "reward_std": 0.7043707370758057, |
| "rewards/accuracy_reward": 0.453125, |
| "rewards/format_reward": 0.9188151359558105, |
| "step": 463 |
| }, |
| { |
| "completion_length": 350.5, |
| "epoch": 0.6754002911208151, |
| "grad_norm": 0.9974930451895179, |
| "kl": 0.052490234375, |
| "learning_rate": 2.5204534065780533e-07, |
| "loss": 0.0036, |
| "reward": 1.0816080570220947, |
| "reward_std": 0.7134989500045776, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.9230663776397705, |
| "step": 464 |
| }, |
| { |
| "completion_length": 344.09375, |
| "epoch": 0.6768558951965066, |
| "grad_norm": 1.0005662998651903, |
| "kl": 0.05419921875, |
| "learning_rate": 2.500000000000001e-07, |
| "loss": 0.0036, |
| "reward": 1.0266536474227905, |
| "reward_std": 0.7755259871482849, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.909726619720459, |
| "step": 465 |
| }, |
| { |
| "completion_length": 373.8125, |
| "epoch": 0.6783114992721979, |
| "grad_norm": 0.8697379409789556, |
| "kl": 0.050048828125, |
| "learning_rate": 2.4796022210433764e-07, |
| "loss": 0.0015, |
| "reward": 0.6482356786727905, |
| "reward_std": 1.0505759716033936, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.8648502230644226, |
| "step": 466 |
| }, |
| { |
| "completion_length": 350.203125, |
| "epoch": 0.6797671033478894, |
| "grad_norm": 1.0448464801188178, |
| "kl": 0.04345703125, |
| "learning_rate": 2.4592605235801537e-07, |
| "loss": 0.0024, |
| "reward": 1.170351505279541, |
| "reward_std": 0.9689666032791138, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.91190105676651, |
| "step": 467 |
| }, |
| { |
| "completion_length": 349.625, |
| "epoch": 0.6812227074235808, |
| "grad_norm": 1.0606796035208255, |
| "kl": 0.048828125, |
| "learning_rate": 2.438975360234429e-07, |
| "loss": 0.0004, |
| "reward": 1.5108983516693115, |
| "reward_std": 0.5824877023696899, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 0.9749218821525574, |
| "step": 468 |
| }, |
| { |
| "completion_length": 334.25, |
| "epoch": 0.6826783114992722, |
| "grad_norm": 1.0491904469297986, |
| "kl": 0.049072265625, |
| "learning_rate": 2.4187471823723555e-07, |
| "loss": -0.0005, |
| "reward": 1.7903971672058105, |
| "reward_std": 0.5473105907440186, |
| "rewards/accuracy_reward": 0.953125, |
| "rewards/format_reward": 0.9310221672058105, |
| "step": 469 |
| }, |
| { |
| "completion_length": 368.1875, |
| "epoch": 0.6841339155749636, |
| "grad_norm": 0.8795420237835467, |
| "kl": 0.04296875, |
| "learning_rate": 2.3985764400921054e-07, |
| "loss": -0.0016, |
| "reward": 0.8880664110183716, |
| "reward_std": 0.7747170329093933, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.9181705713272095, |
| "step": 470 |
| }, |
| { |
| "completion_length": 353.734375, |
| "epoch": 0.6855895196506551, |
| "grad_norm": 1.0274200723626312, |
| "kl": 0.051513671875, |
| "learning_rate": 2.378463582213842e-07, |
| "loss": 0.0011, |
| "reward": 1.3674869537353516, |
| "reward_std": 0.8639088869094849, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 0.9249348640441895, |
| "step": 471 |
| }, |
| { |
| "completion_length": 371.578125, |
| "epoch": 0.6870451237263464, |
| "grad_norm": 0.9918082918405166, |
| "kl": 0.04443359375, |
| "learning_rate": 2.3584090562697424e-07, |
| "loss": -0.0038, |
| "reward": 0.47871094942092896, |
| "reward_std": 0.9248123168945312, |
| "rewards/accuracy_reward": 0.515625, |
| "rewards/format_reward": 0.9306640625, |
| "step": 472 |
| }, |
| { |
| "completion_length": 367.421875, |
| "epoch": 0.6885007278020379, |
| "grad_norm": 0.9925350173805925, |
| "kl": 0.053466796875, |
| "learning_rate": 2.33841330849404e-07, |
| "loss": -0.0016, |
| "reward": 0.6097005009651184, |
| "reward_std": 0.6457391381263733, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.9147526025772095, |
| "step": 473 |
| }, |
| { |
| "completion_length": 354.078125, |
| "epoch": 0.6899563318777293, |
| "grad_norm": 0.9429812617452892, |
| "kl": 0.044189453125, |
| "learning_rate": 2.3184767838130882e-07, |
| "loss": -0.002, |
| "reward": 0.7430534362792969, |
| "reward_std": 1.0212041139602661, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.8673632740974426, |
| "step": 474 |
| }, |
| { |
| "completion_length": 350.171875, |
| "epoch": 0.6914119359534207, |
| "grad_norm": 1.2082129742972214, |
| "kl": 0.04638671875, |
| "learning_rate": 2.298599925835466e-07, |
| "loss": 0.0004, |
| "reward": 0.455533891916275, |
| "reward_std": 0.4722123444080353, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.9366145730018616, |
| "step": 475 |
| }, |
| { |
| "completion_length": 366.515625, |
| "epoch": 0.6928675400291121, |
| "grad_norm": 1.0274981124427542, |
| "kl": 0.048828125, |
| "learning_rate": 2.2787831768421046e-07, |
| "loss": 0.0033, |
| "reward": 0.10612629354000092, |
| "reward_std": 0.7683642506599426, |
| "rewards/accuracy_reward": 0.390625, |
| "rewards/format_reward": 0.9080273509025574, |
| "step": 476 |
| }, |
| { |
| "completion_length": 369.484375, |
| "epoch": 0.6943231441048034, |
| "grad_norm": 1.0175711783602979, |
| "kl": 0.04052734375, |
| "learning_rate": 2.2590269777764514e-07, |
| "loss": -0.0005, |
| "reward": 1.336686134338379, |
| "reward_std": 0.7981055974960327, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.9460612535476685, |
| "step": 477 |
| }, |
| { |
| "completion_length": 337.78125, |
| "epoch": 0.6957787481804949, |
| "grad_norm": 1.1846182441564443, |
| "kl": 0.048828125, |
| "learning_rate": 2.2393317682346479e-07, |
| "loss": 0.0045, |
| "reward": 0.29799482226371765, |
| "reward_std": 0.4017283320426941, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.9656379818916321, |
| "step": 478 |
| }, |
| { |
| "completion_length": 342.171875, |
| "epoch": 0.6972343522561864, |
| "grad_norm": 1.0582752970242983, |
| "kl": 0.04443359375, |
| "learning_rate": 2.219697986455762e-07, |
| "loss": -0.0034, |
| "reward": 0.9232031106948853, |
| "reward_std": 0.38019859790802, |
| "rewards/accuracy_reward": 0.640625, |
| "rewards/format_reward": 0.988684892654419, |
| "step": 479 |
| }, |
| { |
| "completion_length": 331.15625, |
| "epoch": 0.6986899563318777, |
| "grad_norm": 0.956549348435419, |
| "kl": 0.0537109375, |
| "learning_rate": 2.2001260693120232e-07, |
| "loss": -0.0002, |
| "reward": 0.63113933801651, |
| "reward_std": 0.21865397691726685, |
| "rewards/accuracy_reward": 0.546875, |
| "rewards/format_reward": 0.9857747554779053, |
| "step": 480 |
| }, |
| { |
| "completion_length": 320.21875, |
| "epoch": 0.7001455604075691, |
| "grad_norm": 1.149313470515847, |
| "kl": 0.056396484375, |
| "learning_rate": 2.1806164522991115e-07, |
| "loss": 0.0058, |
| "reward": 1.1596614122390747, |
| "reward_std": 0.2571667730808258, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.9488281011581421, |
| "step": 481 |
| }, |
| { |
| "completion_length": 347.265625, |
| "epoch": 0.7016011644832606, |
| "grad_norm": 1.0533814952805514, |
| "kl": 0.048095703125, |
| "learning_rate": 2.1611695695264605e-07, |
| "loss": -0.0009, |
| "reward": 0.5740299224853516, |
| "reward_std": 0.5495303869247437, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.9683268666267395, |
| "step": 482 |
| }, |
| { |
| "completion_length": 331.109375, |
| "epoch": 0.7030567685589519, |
| "grad_norm": 0.934218888424527, |
| "kl": 0.049560546875, |
| "learning_rate": 2.1417858537076067e-07, |
| "loss": 0.0015, |
| "reward": 1.4878125190734863, |
| "reward_std": 0.37252911925315857, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 0.9549609422683716, |
| "step": 483 |
| }, |
| { |
| "completion_length": 359.734375, |
| "epoch": 0.7045123726346434, |
| "grad_norm": 1.0518061573140363, |
| "kl": 0.046142578125, |
| "learning_rate": 2.122465736150549e-07, |
| "loss": 0.0049, |
| "reward": 1.4919662475585938, |
| "reward_std": 0.26884597539901733, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 0.9607161283493042, |
| "step": 484 |
| }, |
| { |
| "completion_length": 340.125, |
| "epoch": 0.7059679767103348, |
| "grad_norm": 0.9147536840648907, |
| "kl": 0.046630859375, |
| "learning_rate": 2.1032096467481664e-07, |
| "loss": 0.0008, |
| "reward": 0.4311913847923279, |
| "reward_std": 0.18409988284111023, |
| "rewards/accuracy_reward": 0.484375, |
| "rewards/format_reward": 0.9738216400146484, |
| "step": 485 |
| }, |
| { |
| "completion_length": 348.34375, |
| "epoch": 0.7074235807860262, |
| "grad_norm": 1.1058446642922939, |
| "kl": 0.044677734375, |
| "learning_rate": 2.0840180139686332e-07, |
| "loss": 0.0021, |
| "reward": -0.23982422053813934, |
| "reward_std": 0.7118780612945557, |
| "rewards/accuracy_reward": 0.265625, |
| "rewards/format_reward": 0.941347599029541, |
| "step": 486 |
| }, |
| { |
| "completion_length": 348.796875, |
| "epoch": 0.7088791848617176, |
| "grad_norm": 0.936587157845369, |
| "kl": 0.044921875, |
| "learning_rate": 2.0648912648459072e-07, |
| "loss": 0.0001, |
| "reward": 0.8201823234558105, |
| "reward_std": 0.6160249710083008, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.9445312023162842, |
| "step": 487 |
| }, |
| { |
| "completion_length": 350.484375, |
| "epoch": 0.710334788937409, |
| "grad_norm": 1.0710239125130703, |
| "kl": 0.0439453125, |
| "learning_rate": 2.0458298249702095e-07, |
| "loss": 0.0004, |
| "reward": 1.0420703887939453, |
| "reward_std": 0.5851905345916748, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.9298177361488342, |
| "step": 488 |
| }, |
| { |
| "completion_length": 356.015625, |
| "epoch": 0.7117903930131004, |
| "grad_norm": 0.8863497690353069, |
| "kl": 0.04296875, |
| "learning_rate": 2.026834118478567e-07, |
| "loss": -0.0044, |
| "reward": 1.2863867282867432, |
| "reward_std": 0.7339239716529846, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 0.9426367282867432, |
| "step": 489 |
| }, |
| { |
| "completion_length": 345.890625, |
| "epoch": 0.7132459970887919, |
| "grad_norm": 1.0595443329712118, |
| "kl": 0.0498046875, |
| "learning_rate": 2.007904568045366e-07, |
| "loss": -0.0012, |
| "reward": 0.6528515815734863, |
| "reward_std": 0.4416601359844208, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.9612500071525574, |
| "step": 490 |
| }, |
| { |
| "completion_length": 340.859375, |
| "epoch": 0.7147016011644832, |
| "grad_norm": 1.102733406546571, |
| "kl": 0.052001953125, |
| "learning_rate": 1.9890415948729534e-07, |
| "loss": -0.0017, |
| "reward": 1.5185351371765137, |
| "reward_std": 0.5196734666824341, |
| "rewards/accuracy_reward": 0.859375, |
| "rewards/format_reward": 0.93973308801651, |
| "step": 491 |
| }, |
| { |
| "completion_length": 344.796875, |
| "epoch": 0.7161572052401747, |
| "grad_norm": 1.013811804087309, |
| "kl": 0.04296875, |
| "learning_rate": 1.9702456186822592e-07, |
| "loss": 0.0057, |
| "reward": 1.5015950202941895, |
| "reward_std": 0.2588142156600952, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 0.9537825584411621, |
| "step": 492 |
| }, |
| { |
| "completion_length": 323.671875, |
| "epoch": 0.7176128093158661, |
| "grad_norm": 1.1965785844666434, |
| "kl": 0.06201171875, |
| "learning_rate": 1.9515170577034657e-07, |
| "loss": -0.0009, |
| "reward": 1.3499219417572021, |
| "reward_std": 0.564841091632843, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.9540885090827942, |
| "step": 493 |
| }, |
| { |
| "completion_length": 337.328125, |
| "epoch": 0.7190684133915575, |
| "grad_norm": 1.1320885181605083, |
| "kl": 0.053955078125, |
| "learning_rate": 1.93285632866669e-07, |
| "loss": -0.0003, |
| "reward": 1.0488346815109253, |
| "reward_std": 0.430465966463089, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.9381445646286011, |
| "step": 494 |
| }, |
| { |
| "completion_length": 336.59375, |
| "epoch": 0.7205240174672489, |
| "grad_norm": 0.7656987950423686, |
| "kl": 0.05029296875, |
| "learning_rate": 1.914263846792725e-07, |
| "loss": -0.0014, |
| "reward": 0.2095833271741867, |
| "reward_std": 0.22649352252483368, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.9746744632720947, |
| "step": 495 |
| }, |
| { |
| "completion_length": 345.4375, |
| "epoch": 0.7219796215429404, |
| "grad_norm": 1.0169013561590603, |
| "kl": 0.043701171875, |
| "learning_rate": 1.895740025783782e-07, |
| "loss": -0.0001, |
| "reward": 1.433619737625122, |
| "reward_std": 0.5415338277816772, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.9466406106948853, |
| "step": 496 |
| }, |
| { |
| "completion_length": 358.96875, |
| "epoch": 0.7234352256186317, |
| "grad_norm": 1.036810610063429, |
| "kl": 0.0458984375, |
| "learning_rate": 1.8772852778143062e-07, |
| "loss": -0.0019, |
| "reward": 1.3713606595993042, |
| "reward_std": 0.6118265390396118, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 0.9330794811248779, |
| "step": 497 |
| }, |
| { |
| "completion_length": 347.625, |
| "epoch": 0.7248908296943232, |
| "grad_norm": 0.9756569197859266, |
| "kl": 0.0478515625, |
| "learning_rate": 1.858900013521788e-07, |
| "loss": -0.0005, |
| "reward": 1.2231640815734863, |
| "reward_std": 0.06950952857732773, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.9696353673934937, |
| "step": 498 |
| }, |
| { |
| "completion_length": 342.609375, |
| "epoch": 0.7263464337700145, |
| "grad_norm": 1.104737132675653, |
| "kl": 0.047119140625, |
| "learning_rate": 1.8405846419976394e-07, |
| "loss": 0.0046, |
| "reward": 1.1284375190734863, |
| "reward_std": 0.30641376972198486, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.9205468893051147, |
| "step": 499 |
| }, |
| { |
| "completion_length": 341.171875, |
| "epoch": 0.727802037845706, |
| "grad_norm": 1.0685649265760062, |
| "kl": 0.047119140625, |
| "learning_rate": 1.8223395707780786e-07, |
| "loss": -0.0022, |
| "reward": 0.746009111404419, |
| "reward_std": 0.37746497988700867, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.95305335521698, |
| "step": 500 |
| }, |
| { |
| "completion_length": 339.109375, |
| "epoch": 0.7292576419213974, |
| "grad_norm": 1.1494787457478763, |
| "kl": 0.06201171875, |
| "learning_rate": 1.8041652058350766e-07, |
| "loss": 0.0024, |
| "reward": 0.6165429353713989, |
| "reward_std": 1.0153552293777466, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.9255794286727905, |
| "step": 501 |
| }, |
| { |
| "completion_length": 347.234375, |
| "epoch": 0.7307132459970888, |
| "grad_norm": 1.0597503387535725, |
| "kl": 0.044677734375, |
| "learning_rate": 1.7860619515673032e-07, |
| "loss": 0.0031, |
| "reward": 1.761705756187439, |
| "reward_std": 0.34419363737106323, |
| "rewards/accuracy_reward": 0.921875, |
| "rewards/format_reward": 0.9930989742279053, |
| "step": 502 |
| }, |
| { |
| "completion_length": 358.609375, |
| "epoch": 0.7321688500727802, |
| "grad_norm": 1.1777505730146933, |
| "kl": 0.0498046875, |
| "learning_rate": 1.7680302107911544e-07, |
| "loss": 0.0014, |
| "reward": 0.7594987154006958, |
| "reward_std": 0.6815189123153687, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.9232096672058105, |
| "step": 503 |
| }, |
| { |
| "completion_length": 352.9375, |
| "epoch": 0.7336244541484717, |
| "grad_norm": 1.0262394911227826, |
| "kl": 0.051025390625, |
| "learning_rate": 1.7500703847317662e-07, |
| "loss": 0.0005, |
| "reward": 0.17281901836395264, |
| "reward_std": 0.6659858226776123, |
| "rewards/accuracy_reward": 0.421875, |
| "rewards/format_reward": 0.8982356786727905, |
| "step": 504 |
| }, |
| { |
| "completion_length": 366.046875, |
| "epoch": 0.735080058224163, |
| "grad_norm": 0.8757486128031936, |
| "kl": 0.046142578125, |
| "learning_rate": 1.7321828730141037e-07, |
| "loss": -0.003, |
| "reward": 0.9162174463272095, |
| "reward_std": 0.6034565567970276, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.9449284076690674, |
| "step": 505 |
| }, |
| { |
| "completion_length": 350.703125, |
| "epoch": 0.7365356622998545, |
| "grad_norm": 1.027601535912833, |
| "kl": 0.048828125, |
| "learning_rate": 1.7143680736540572e-07, |
| "loss": -0.0017, |
| "reward": 0.7806705236434937, |
| "reward_std": 0.4177248477935791, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.9474283456802368, |
| "step": 506 |
| }, |
| { |
| "completion_length": 345.546875, |
| "epoch": 0.7379912663755459, |
| "grad_norm": 0.9635347526020929, |
| "kl": 0.051025390625, |
| "learning_rate": 1.6966263830495935e-07, |
| "loss": 0.0, |
| "reward": 1.5247917175292969, |
| "reward_std": 0.18780048191547394, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 0.9935416579246521, |
| "step": 507 |
| }, |
| { |
| "completion_length": 347.53125, |
| "epoch": 0.7394468704512372, |
| "grad_norm": 1.0624576316371095, |
| "kl": 0.048095703125, |
| "learning_rate": 1.6789581959719294e-07, |
| "loss": 0.0018, |
| "reward": 0.1759960949420929, |
| "reward_std": 0.6078127026557922, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.9451627731323242, |
| "step": 508 |
| }, |
| { |
| "completion_length": 357.21875, |
| "epoch": 0.7409024745269287, |
| "grad_norm": 0.9931955691969966, |
| "kl": 0.045654296875, |
| "learning_rate": 1.661363905556758e-07, |
| "loss": 0.0016, |
| "reward": 0.9983984231948853, |
| "reward_std": 0.25032860040664673, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.9772396087646484, |
| "step": 509 |
| }, |
| { |
| "completion_length": 335.78125, |
| "epoch": 0.74235807860262, |
| "grad_norm": 1.1766955680022009, |
| "kl": 0.056640625, |
| "learning_rate": 1.6438439032954853e-07, |
| "loss": -0.0019, |
| "reward": 0.9316536784172058, |
| "reward_std": 0.9252973794937134, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.9565234184265137, |
| "step": 510 |
| }, |
| { |
| "completion_length": 351.8125, |
| "epoch": 0.7438136826783115, |
| "grad_norm": 1.0104315217544244, |
| "kl": 0.047119140625, |
| "learning_rate": 1.6263985790265383e-07, |
| "loss": -0.0008, |
| "reward": 0.7268945574760437, |
| "reward_std": 0.5565764904022217, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.9422982335090637, |
| "step": 511 |
| }, |
| { |
| "completion_length": 339.59375, |
| "epoch": 0.745269286754003, |
| "grad_norm": 1.0913421651153437, |
| "kl": 0.053955078125, |
| "learning_rate": 1.609028320926668e-07, |
| "loss": 0.0001, |
| "reward": 0.7289843559265137, |
| "reward_std": 0.42006969451904297, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.9395182132720947, |
| "step": 512 |
| }, |
| { |
| "completion_length": 328.71875, |
| "epoch": 0.7467248908296943, |
| "grad_norm": 1.3247436820269498, |
| "kl": 0.056396484375, |
| "learning_rate": 1.5917335155023366e-07, |
| "loss": -0.0024, |
| "reward": 1.460852861404419, |
| "reward_std": 0.3853009045124054, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.9632877707481384, |
| "step": 513 |
| }, |
| { |
| "completion_length": 343.625, |
| "epoch": 0.7481804949053857, |
| "grad_norm": 0.8746716302390848, |
| "kl": 0.04541015625, |
| "learning_rate": 1.574514547581095e-07, |
| "loss": 0.0023, |
| "reward": 0.48206380009651184, |
| "reward_std": 0.2509358525276184, |
| "rewards/accuracy_reward": 0.515625, |
| "rewards/format_reward": 0.9288216233253479, |
| "step": 514 |
| }, |
| { |
| "completion_length": 341.65625, |
| "epoch": 0.7496360989810772, |
| "grad_norm": 1.1764333370638496, |
| "kl": 0.046875, |
| "learning_rate": 1.557371800303039e-07, |
| "loss": -0.0049, |
| "reward": 1.1197460889816284, |
| "reward_std": 0.38245856761932373, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.9527279138565063, |
| "step": 515 |
| }, |
| { |
| "completion_length": 347.953125, |
| "epoch": 0.7510917030567685, |
| "grad_norm": 0.9861112517300733, |
| "kl": 0.055419921875, |
| "learning_rate": 1.5403056551122694e-07, |
| "loss": -0.0008, |
| "reward": 1.1204687356948853, |
| "reward_std": 0.37004244327545166, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.96037757396698, |
| "step": 516 |
| }, |
| { |
| "completion_length": 344.5, |
| "epoch": 0.75254730713246, |
| "grad_norm": 1.1969290490481124, |
| "kl": 0.04833984375, |
| "learning_rate": 1.5233164917484114e-07, |
| "loss": -0.0003, |
| "reward": 1.2382487058639526, |
| "reward_std": 0.022989844903349876, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.9881705641746521, |
| "step": 517 |
| }, |
| { |
| "completion_length": 351.859375, |
| "epoch": 0.7540029112081513, |
| "grad_norm": 0.8687338106556315, |
| "kl": 0.044677734375, |
| "learning_rate": 1.5064046882381626e-07, |
| "loss": 0.0009, |
| "reward": 0.549817681312561, |
| "reward_std": 0.1450074017047882, |
| "rewards/accuracy_reward": 0.515625, |
| "rewards/format_reward": 0.9933593273162842, |
| "step": 518 |
| }, |
| { |
| "completion_length": 342.375, |
| "epoch": 0.7554585152838428, |
| "grad_norm": 1.1222413107637277, |
| "kl": 0.051513671875, |
| "learning_rate": 1.4895706208868876e-07, |
| "loss": 0.0008, |
| "reward": 0.9473632574081421, |
| "reward_std": 0.6324166655540466, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.9235742092132568, |
| "step": 519 |
| }, |
| { |
| "completion_length": 361.09375, |
| "epoch": 0.7569141193595342, |
| "grad_norm": 0.8767855411501952, |
| "kl": 0.04248046875, |
| "learning_rate": 1.4728146642702338e-07, |
| "loss": 0.0043, |
| "reward": 0.6358333826065063, |
| "reward_std": 0.42567020654678345, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.9458463788032532, |
| "step": 520 |
| }, |
| { |
| "completion_length": 339.40625, |
| "epoch": 0.7583697234352256, |
| "grad_norm": 1.1957414378411515, |
| "kl": 0.046875, |
| "learning_rate": 1.4561371912258098e-07, |
| "loss": 0.0018, |
| "reward": 1.0426563024520874, |
| "reward_std": 0.49093982577323914, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.972434937953949, |
| "step": 521 |
| }, |
| { |
| "completion_length": 356.4375, |
| "epoch": 0.759825327510917, |
| "grad_norm": 1.1172524921233873, |
| "kl": 0.05419921875, |
| "learning_rate": 1.4395385728448727e-07, |
| "loss": -0.0025, |
| "reward": 1.7248958349227905, |
| "reward_std": 0.41973060369491577, |
| "rewards/accuracy_reward": 0.921875, |
| "rewards/format_reward": 0.9582551717758179, |
| "step": 522 |
| }, |
| { |
| "completion_length": 338.921875, |
| "epoch": 0.7612809315866085, |
| "grad_norm": 1.0582473955837757, |
| "kl": 0.044921875, |
| "learning_rate": 1.423019178464091e-07, |
| "loss": 0.0002, |
| "reward": 1.3815103769302368, |
| "reward_std": 0.5760092735290527, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 0.9437500238418579, |
| "step": 523 |
| }, |
| { |
| "completion_length": 343.59375, |
| "epoch": 0.7627365356622998, |
| "grad_norm": 0.9984931203320635, |
| "kl": 0.046875, |
| "learning_rate": 1.406579375657308e-07, |
| "loss": 0.0039, |
| "reward": 0.6505143642425537, |
| "reward_std": 0.33663293719291687, |
| "rewards/accuracy_reward": 0.578125, |
| "rewards/format_reward": 0.9126237034797668, |
| "step": 524 |
| }, |
| { |
| "completion_length": 341.40625, |
| "epoch": 0.7641921397379913, |
| "grad_norm": 1.1754051815019624, |
| "kl": 0.051513671875, |
| "learning_rate": 1.3902195302273778e-07, |
| "loss": -0.004, |
| "reward": 1.7554621696472168, |
| "reward_std": 0.6817976236343384, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 0.9429622888565063, |
| "step": 525 |
| }, |
| { |
| "completion_length": 333.265625, |
| "epoch": 0.7656477438136827, |
| "grad_norm": 1.0633972900525368, |
| "kl": 0.05078125, |
| "learning_rate": 1.373940006198014e-07, |
| "loss": 0.0001, |
| "reward": 1.441927194595337, |
| "reward_std": 0.5073419809341431, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.9575521349906921, |
| "step": 526 |
| }, |
| { |
| "completion_length": 338.953125, |
| "epoch": 0.7671033478893741, |
| "grad_norm": 0.98335951118238, |
| "kl": 0.0478515625, |
| "learning_rate": 1.3577411658056965e-07, |
| "loss": 0.0003, |
| "reward": 0.7285221219062805, |
| "reward_std": 0.41777336597442627, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.9417122602462769, |
| "step": 527 |
| }, |
| { |
| "completion_length": 342.265625, |
| "epoch": 0.7685589519650655, |
| "grad_norm": 1.104214294364568, |
| "kl": 0.048828125, |
| "learning_rate": 1.3416233694916086e-07, |
| "loss": -0.0018, |
| "reward": 1.0809439420700073, |
| "reward_std": 0.498748779296875, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.9592252969741821, |
| "step": 528 |
| }, |
| { |
| "completion_length": 365.03125, |
| "epoch": 0.7700145560407569, |
| "grad_norm": 0.8864241710102599, |
| "kl": 0.05322265625, |
| "learning_rate": 1.325586975893621e-07, |
| "loss": -0.001, |
| "reward": 1.0926563739776611, |
| "reward_std": 0.29806235432624817, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.9340364933013916, |
| "step": 529 |
| }, |
| { |
| "completion_length": 355.90625, |
| "epoch": 0.7714701601164483, |
| "grad_norm": 0.8844305655178724, |
| "kl": 0.045654296875, |
| "learning_rate": 1.3096323418383043e-07, |
| "loss": 0.0012, |
| "reward": 1.4333789348602295, |
| "reward_std": 0.4198879301548004, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.9490039348602295, |
| "step": 530 |
| }, |
| { |
| "completion_length": 343.890625, |
| "epoch": 0.7729257641921398, |
| "grad_norm": 1.253496882295048, |
| "kl": 0.048828125, |
| "learning_rate": 1.2937598223330005e-07, |
| "loss": 0.0014, |
| "reward": 1.2209309339523315, |
| "reward_std": 0.08089350908994675, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.96630859375, |
| "step": 531 |
| }, |
| { |
| "completion_length": 338.375, |
| "epoch": 0.7743813682678311, |
| "grad_norm": 1.1782096109944808, |
| "kl": 0.046630859375, |
| "learning_rate": 1.2779697705579058e-07, |
| "loss": 0.0036, |
| "reward": 0.9942382574081421, |
| "reward_std": 0.33707913756370544, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.97488933801651, |
| "step": 532 |
| }, |
| { |
| "completion_length": 356.609375, |
| "epoch": 0.7758369723435226, |
| "grad_norm": 1.0305776858800557, |
| "kl": 0.04345703125, |
| "learning_rate": 1.262262537858233e-07, |
| "loss": -0.0023, |
| "reward": 0.9166210889816284, |
| "reward_std": 0.7489937543869019, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.9442252516746521, |
| "step": 533 |
| }, |
| { |
| "completion_length": 366.984375, |
| "epoch": 0.777292576419214, |
| "grad_norm": 1.0309599470321495, |
| "kl": 0.043212890625, |
| "learning_rate": 1.2466384737363779e-07, |
| "loss": 0.0028, |
| "reward": 0.9351236820220947, |
| "reward_std": 1.2396140098571777, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.9184830784797668, |
| "step": 534 |
| }, |
| { |
| "completion_length": 347.921875, |
| "epoch": 0.7787481804949054, |
| "grad_norm": 0.9649435296818741, |
| "kl": 0.05078125, |
| "learning_rate": 1.231097925844153e-07, |
| "loss": -0.003, |
| "reward": 1.10358726978302, |
| "reward_std": 0.4084942936897278, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.94621741771698, |
| "step": 535 |
| }, |
| { |
| "completion_length": 353.625, |
| "epoch": 0.7802037845705968, |
| "grad_norm": 1.2290305710541891, |
| "kl": 0.050537109375, |
| "learning_rate": 1.215641239975042e-07, |
| "loss": 0.004, |
| "reward": 0.35907554626464844, |
| "reward_std": 0.4438338279724121, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.9410285949707031, |
| "step": 536 |
| }, |
| { |
| "completion_length": 356.546875, |
| "epoch": 0.7816593886462883, |
| "grad_norm": 0.7192201796065968, |
| "kl": 0.046142578125, |
| "learning_rate": 1.2002687600565137e-07, |
| "loss": -0.0018, |
| "reward": 1.6950325965881348, |
| "reward_std": 0.20082132518291473, |
| "rewards/accuracy_reward": 0.90625, |
| "rewards/format_reward": 0.97628253698349, |
| "step": 537 |
| }, |
| { |
| "completion_length": 348.703125, |
| "epoch": 0.7831149927219796, |
| "grad_norm": 1.0661868149196754, |
| "kl": 0.052734375, |
| "learning_rate": 1.1849808281423635e-07, |
| "loss": 0.002, |
| "reward": 1.1826952695846558, |
| "reward_std": 0.5324006080627441, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.9778385162353516, |
| "step": 538 |
| }, |
| { |
| "completion_length": 354.0625, |
| "epoch": 0.784570596797671, |
| "grad_norm": 1.0553040942110303, |
| "kl": 0.049072265625, |
| "learning_rate": 1.1697777844051104e-07, |
| "loss": -0.0017, |
| "reward": 1.595253825187683, |
| "reward_std": 0.9109958410263062, |
| "rewards/accuracy_reward": 0.890625, |
| "rewards/format_reward": 0.9233788847923279, |
| "step": 539 |
| }, |
| { |
| "completion_length": 352.875, |
| "epoch": 0.7860262008733624, |
| "grad_norm": 0.8135854803568479, |
| "kl": 0.052734375, |
| "learning_rate": 1.1546599671284158e-07, |
| "loss": 0.0048, |
| "reward": 0.06436197459697723, |
| "reward_std": 0.49560049176216125, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.9329817295074463, |
| "step": 540 |
| }, |
| { |
| "completion_length": 356.609375, |
| "epoch": 0.7874818049490538, |
| "grad_norm": 1.098756609309082, |
| "kl": 0.043701171875, |
| "learning_rate": 1.1396277126995707e-07, |
| "loss": -0.0004, |
| "reward": 0.7316992282867432, |
| "reward_std": 0.6466116905212402, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.9482096433639526, |
| "step": 541 |
| }, |
| { |
| "completion_length": 360.78125, |
| "epoch": 0.7889374090247453, |
| "grad_norm": 1.0121213548186219, |
| "kl": 0.046630859375, |
| "learning_rate": 1.1246813556019924e-07, |
| "loss": -0.0013, |
| "reward": 1.4149739742279053, |
| "reward_std": 0.7345938086509705, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.9305989742279053, |
| "step": 542 |
| }, |
| { |
| "completion_length": 351.8125, |
| "epoch": 0.7903930131004366, |
| "grad_norm": 1.1589095869023704, |
| "kl": 0.04736328125, |
| "learning_rate": 1.1098212284078035e-07, |
| "loss": -0.0025, |
| "reward": 1.481673240661621, |
| "reward_std": 0.5085718631744385, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.9943815469741821, |
| "step": 543 |
| }, |
| { |
| "completion_length": 345.8125, |
| "epoch": 0.7918486171761281, |
| "grad_norm": 0.9205851416631115, |
| "kl": 0.048583984375, |
| "learning_rate": 1.0950476617704124e-07, |
| "loss": 0.0009, |
| "reward": 1.3531510829925537, |
| "reward_std": 0.41115716099739075, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.9585286378860474, |
| "step": 544 |
| }, |
| { |
| "completion_length": 364.125, |
| "epoch": 0.7933042212518195, |
| "grad_norm": 0.9830413575226257, |
| "kl": 0.041748046875, |
| "learning_rate": 1.0803609844171719e-07, |
| "loss": -0.0028, |
| "reward": 0.51806640625, |
| "reward_std": 0.560853123664856, |
| "rewards/accuracy_reward": 0.515625, |
| "rewards/format_reward": 0.9600846767425537, |
| "step": 545 |
| }, |
| { |
| "completion_length": 363.234375, |
| "epoch": 0.7947598253275109, |
| "grad_norm": 0.9801296629236323, |
| "kl": 0.04736328125, |
| "learning_rate": 1.0657615231420491e-07, |
| "loss": 0.0013, |
| "reward": 0.36118483543395996, |
| "reward_std": 0.406131386756897, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.9475390315055847, |
| "step": 546 |
| }, |
| { |
| "completion_length": 372.859375, |
| "epoch": 0.7962154294032023, |
| "grad_norm": 0.8933107929367656, |
| "kl": 0.0458984375, |
| "learning_rate": 1.0512496027983714e-07, |
| "loss": 0.0033, |
| "reward": 0.27970701456069946, |
| "reward_std": 0.969652533531189, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.871985673904419, |
| "step": 547 |
| }, |
| { |
| "completion_length": 376.1875, |
| "epoch": 0.7976710334788938, |
| "grad_norm": 0.8824515938688806, |
| "kl": 0.04541015625, |
| "learning_rate": 1.0368255462915765e-07, |
| "loss": 0.0006, |
| "reward": 0.76199871301651, |
| "reward_std": 0.5525078773498535, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.9308398365974426, |
| "step": 548 |
| }, |
| { |
| "completion_length": 372.171875, |
| "epoch": 0.7991266375545851, |
| "grad_norm": 0.9171428811714127, |
| "kl": 0.04541015625, |
| "learning_rate": 1.0224896745720512e-07, |
| "loss": 0.002, |
| "reward": 1.1354882717132568, |
| "reward_std": 1.028761863708496, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.8847330808639526, |
| "step": 549 |
| }, |
| { |
| "completion_length": 374.25, |
| "epoch": 0.8005822416302766, |
| "grad_norm": 0.9349834701869527, |
| "kl": 0.04931640625, |
| "learning_rate": 1.00824230662797e-07, |
| "loss": 0.0021, |
| "reward": 0.27811199426651, |
| "reward_std": 1.2048439979553223, |
| "rewards/accuracy_reward": 0.453125, |
| "rewards/format_reward": 0.9164583683013916, |
| "step": 550 |
| }, |
| { |
| "completion_length": 354.34375, |
| "epoch": 0.8020378457059679, |
| "grad_norm": 1.0746749824810193, |
| "kl": 0.04443359375, |
| "learning_rate": 9.940837594782125e-08, |
| "loss": 0.001, |
| "reward": 0.9130924344062805, |
| "reward_std": 0.9886895418167114, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.9405273199081421, |
| "step": 551 |
| }, |
| { |
| "completion_length": 369.28125, |
| "epoch": 0.8034934497816594, |
| "grad_norm": 0.9674452242381952, |
| "kl": 0.04443359375, |
| "learning_rate": 9.800143481652979e-08, |
| "loss": -0.0007, |
| "reward": 0.9438866972923279, |
| "reward_std": 0.6287566423416138, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.9270898103713989, |
| "step": 552 |
| }, |
| { |
| "completion_length": 375.828125, |
| "epoch": 0.8049490538573508, |
| "grad_norm": 0.9793357755444789, |
| "kl": 0.042236328125, |
| "learning_rate": 9.660343857483799e-08, |
| "loss": -0.0034, |
| "reward": 1.215846300125122, |
| "reward_std": 1.2285206317901611, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 0.8707551956176758, |
| "step": 553 |
| }, |
| { |
| "completion_length": 355.03125, |
| "epoch": 0.8064046579330422, |
| "grad_norm": 0.9372538556562692, |
| "kl": 0.045166015625, |
| "learning_rate": 9.521441832962801e-08, |
| "loss": -0.0035, |
| "reward": 0.5855793952941895, |
| "reward_std": 0.4810252785682678, |
| "rewards/accuracy_reward": 0.546875, |
| "rewards/format_reward": 0.9439908862113953, |
| "step": 554 |
| }, |
| { |
| "completion_length": 340.40625, |
| "epoch": 0.8078602620087336, |
| "grad_norm": 1.131677567848908, |
| "kl": 0.0478515625, |
| "learning_rate": 9.383440498805712e-08, |
| "loss": 0.0006, |
| "reward": 1.6682096719741821, |
| "reward_std": 0.6599315404891968, |
| "rewards/accuracy_reward": 0.921875, |
| "rewards/format_reward": 0.9025846719741821, |
| "step": 555 |
| }, |
| { |
| "completion_length": 360.125, |
| "epoch": 0.8093158660844251, |
| "grad_norm": 0.9570116291494392, |
| "kl": 0.048583984375, |
| "learning_rate": 9.246342925686884e-08, |
| "loss": -0.0017, |
| "reward": 0.6679362058639526, |
| "reward_std": 0.5891071557998657, |
| "rewards/accuracy_reward": 0.578125, |
| "rewards/format_reward": 0.9264127612113953, |
| "step": 556 |
| }, |
| { |
| "completion_length": 350.296875, |
| "epoch": 0.8107714701601164, |
| "grad_norm": 1.0041594196588526, |
| "kl": 0.047607421875, |
| "learning_rate": 9.110152164171125e-08, |
| "loss": 0.0003, |
| "reward": 1.5817317962646484, |
| "reward_std": 0.5429558157920837, |
| "rewards/accuracy_reward": 0.875, |
| "rewards/format_reward": 0.9567317962646484, |
| "step": 557 |
| }, |
| { |
| "completion_length": 346.484375, |
| "epoch": 0.8122270742358079, |
| "grad_norm": 1.1612136223210474, |
| "kl": 0.045654296875, |
| "learning_rate": 8.974871244645626e-08, |
| "loss": -0.0012, |
| "reward": 1.318763017654419, |
| "reward_std": 0.8850969672203064, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.9250651001930237, |
| "step": 558 |
| }, |
| { |
| "completion_length": 346.96875, |
| "epoch": 0.8136826783114993, |
| "grad_norm": 0.9594471396023049, |
| "kl": 0.05224609375, |
| "learning_rate": 8.840503177252745e-08, |
| "loss": -0.0016, |
| "reward": 1.168027400970459, |
| "reward_std": 0.8090516328811646, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.9134830832481384, |
| "step": 559 |
| }, |
| { |
| "completion_length": 355.625, |
| "epoch": 0.8151382823871907, |
| "grad_norm": 1.165829749030139, |
| "kl": 0.0458984375, |
| "learning_rate": 8.707050951822842e-08, |
| "loss": -0.0029, |
| "reward": 0.23499347269535065, |
| "reward_std": 0.6717128753662109, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.9124023914337158, |
| "step": 560 |
| }, |
| { |
| "completion_length": 348.375, |
| "epoch": 0.8165938864628821, |
| "grad_norm": 0.8797964228562034, |
| "kl": 0.0439453125, |
| "learning_rate": 8.574517537807896e-08, |
| "loss": 0.0035, |
| "reward": 0.8617708086967468, |
| "reward_std": 0.06106572225689888, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.9768619537353516, |
| "step": 561 |
| }, |
| { |
| "completion_length": 356.109375, |
| "epoch": 0.8180494905385735, |
| "grad_norm": 0.9540926767615193, |
| "kl": 0.04833984375, |
| "learning_rate": 8.442905884215329e-08, |
| "loss": 0.0001, |
| "reward": 0.7149023413658142, |
| "reward_std": 0.884148895740509, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.9297460913658142, |
| "step": 562 |
| }, |
| { |
| "completion_length": 337.484375, |
| "epoch": 0.8195050946142649, |
| "grad_norm": 1.0541518957324363, |
| "kl": 0.0498046875, |
| "learning_rate": 8.31221891954243e-08, |
| "loss": -0.0055, |
| "reward": 0.9072656631469727, |
| "reward_std": 0.35776764154434204, |
| "rewards/accuracy_reward": 0.640625, |
| "rewards/format_reward": 0.9742317199707031, |
| "step": 563 |
| }, |
| { |
| "completion_length": 349.96875, |
| "epoch": 0.8209606986899564, |
| "grad_norm": 1.0803926660638092, |
| "kl": 0.047119140625, |
| "learning_rate": 8.182459551711197e-08, |
| "loss": 0.001, |
| "reward": 1.44936203956604, |
| "reward_std": 0.49504512548446655, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.9640494585037231, |
| "step": 564 |
| }, |
| { |
| "completion_length": 350.421875, |
| "epoch": 0.8224163027656477, |
| "grad_norm": 1.0513767136216559, |
| "kl": 0.0576171875, |
| "learning_rate": 8.053630668003642e-08, |
| "loss": 0.0045, |
| "reward": 0.4982747435569763, |
| "reward_std": 0.013658922165632248, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.9944205284118652, |
| "step": 565 |
| }, |
| { |
| "completion_length": 372.59375, |
| "epoch": 0.8238719068413392, |
| "grad_norm": 0.9692434119961149, |
| "kl": 0.04833984375, |
| "learning_rate": 7.925735134997491e-08, |
| "loss": 0.0001, |
| "reward": 0.5592772960662842, |
| "reward_std": 0.5283293724060059, |
| "rewards/accuracy_reward": 0.546875, |
| "rewards/format_reward": 0.9178320169448853, |
| "step": 566 |
| }, |
| { |
| "completion_length": 342.453125, |
| "epoch": 0.8253275109170306, |
| "grad_norm": 1.0913320663341668, |
| "kl": 0.048095703125, |
| "learning_rate": 7.798775798502482e-08, |
| "loss": 0.0006, |
| "reward": 1.8661328554153442, |
| "reward_std": 0.2512561082839966, |
| "rewards/accuracy_reward": 0.96875, |
| "rewards/format_reward": 0.9598828554153442, |
| "step": 567 |
| }, |
| { |
| "completion_length": 344.15625, |
| "epoch": 0.826783114992722, |
| "grad_norm": 1.0413698797681266, |
| "kl": 0.04638671875, |
| "learning_rate": 7.672755483496979e-08, |
| "loss": 0.0034, |
| "reward": 1.9249870777130127, |
| "reward_std": 0.19554658234119415, |
| "rewards/accuracy_reward": 0.984375, |
| "rewards/format_reward": 0.9718619585037231, |
| "step": 568 |
| }, |
| { |
| "completion_length": 355.34375, |
| "epoch": 0.8282387190684134, |
| "grad_norm": 1.1642983816622974, |
| "kl": 0.04541015625, |
| "learning_rate": 7.547676994065116e-08, |
| "loss": -0.0025, |
| "reward": 1.4583983421325684, |
| "reward_std": 0.6337254643440247, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.9729557037353516, |
| "step": 569 |
| }, |
| { |
| "completion_length": 358.421875, |
| "epoch": 0.8296943231441049, |
| "grad_norm": 0.8768810528600219, |
| "kl": 0.045654296875, |
| "learning_rate": 7.423543113334435e-08, |
| "loss": -0.0034, |
| "reward": 0.9223567843437195, |
| "reward_std": 0.24935288727283478, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.9516797065734863, |
| "step": 570 |
| }, |
| { |
| "completion_length": 377.25, |
| "epoch": 0.8311499272197962, |
| "grad_norm": 0.9846837227637683, |
| "kl": 0.04443359375, |
| "learning_rate": 7.300356603413965e-08, |
| "loss": 0.0001, |
| "reward": 1.3689582347869873, |
| "reward_std": 1.1683030128479004, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.8845832347869873, |
| "step": 571 |
| }, |
| { |
| "completion_length": 360.484375, |
| "epoch": 0.8326055312954876, |
| "grad_norm": 1.008771276578655, |
| "kl": 0.051025390625, |
| "learning_rate": 7.178120205332716e-08, |
| "loss": -0.0002, |
| "reward": 1.635097622871399, |
| "reward_std": 0.7983198165893555, |
| "rewards/accuracy_reward": 0.90625, |
| "rewards/format_reward": 0.9163476228713989, |
| "step": 572 |
| }, |
| { |
| "completion_length": 352.296875, |
| "epoch": 0.834061135371179, |
| "grad_norm": 1.1642594254533514, |
| "kl": 0.0498046875, |
| "learning_rate": 7.056836638978696e-08, |
| "loss": 0.0036, |
| "reward": 0.44958335161209106, |
| "reward_std": 0.6141500473022461, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.9442839026451111, |
| "step": 573 |
| }, |
| { |
| "completion_length": 336.0625, |
| "epoch": 0.8355167394468704, |
| "grad_norm": 1.0723516782661382, |
| "kl": 0.05029296875, |
| "learning_rate": 6.936508603038465e-08, |
| "loss": 0.0009, |
| "reward": 1.4923112392425537, |
| "reward_std": 0.3686758875846863, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 0.9570898413658142, |
| "step": 574 |
| }, |
| { |
| "completion_length": 362.203125, |
| "epoch": 0.8369723435225619, |
| "grad_norm": 0.8489752039238883, |
| "kl": 0.04443359375, |
| "learning_rate": 6.817138774936975e-08, |
| "loss": -0.0025, |
| "reward": 1.5311849117279053, |
| "reward_std": 0.48614656925201416, |
| "rewards/accuracy_reward": 0.859375, |
| "rewards/format_reward": 0.952552080154419, |
| "step": 575 |
| }, |
| { |
| "completion_length": 321.90625, |
| "epoch": 0.8384279475982532, |
| "grad_norm": 1.2381358943286065, |
| "kl": 0.057861328125, |
| "learning_rate": 6.698729810778064e-08, |
| "loss": 0.0007, |
| "reward": 1.2007226943969727, |
| "reward_std": 0.3871755599975586, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.9507226943969727, |
| "step": 576 |
| }, |
| { |
| "completion_length": 345.765625, |
| "epoch": 0.8398835516739447, |
| "grad_norm": 0.9963944986671528, |
| "kl": 0.04638671875, |
| "learning_rate": 6.58128434528537e-08, |
| "loss": -0.0006, |
| "reward": 0.7609505653381348, |
| "reward_std": 0.811994194984436, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.9278906583786011, |
| "step": 577 |
| }, |
| { |
| "completion_length": 345.453125, |
| "epoch": 0.8413391557496361, |
| "grad_norm": 0.9963445120426506, |
| "kl": 0.0498046875, |
| "learning_rate": 6.464804991743628e-08, |
| "loss": 0.0036, |
| "reward": 1.5395898818969727, |
| "reward_std": 0.23713621497154236, |
| "rewards/accuracy_reward": 0.859375, |
| "rewards/format_reward": 0.959876298904419, |
| "step": 578 |
| }, |
| { |
| "completion_length": 357.3125, |
| "epoch": 0.8427947598253275, |
| "grad_norm": 0.9805282214375126, |
| "kl": 0.048095703125, |
| "learning_rate": 6.349294341940592e-08, |
| "loss": 0.0006, |
| "reward": 1.4974348545074463, |
| "reward_std": 0.36625921726226807, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 0.9642578363418579, |
| "step": 579 |
| }, |
| { |
| "completion_length": 368.546875, |
| "epoch": 0.8442503639010189, |
| "grad_norm": 0.9270265739588998, |
| "kl": 0.043701171875, |
| "learning_rate": 6.234754966109351e-08, |
| "loss": -0.001, |
| "reward": 1.1266862154006958, |
| "reward_std": 0.35941964387893677, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.9662825465202332, |
| "step": 580 |
| }, |
| { |
| "completion_length": 338.53125, |
| "epoch": 0.8457059679767104, |
| "grad_norm": 1.0906414405939546, |
| "kl": 0.049072265625, |
| "learning_rate": 6.12118941287112e-08, |
| "loss": 0.002, |
| "reward": 1.5338281393051147, |
| "reward_std": 0.24283231794834137, |
| "rewards/accuracy_reward": 0.859375, |
| "rewards/format_reward": 0.9543749690055847, |
| "step": 581 |
| }, |
| { |
| "completion_length": 343.84375, |
| "epoch": 0.8471615720524017, |
| "grad_norm": 0.9818201939354958, |
| "kl": 0.04638671875, |
| "learning_rate": 6.008600209178538e-08, |
| "loss": -0.0039, |
| "reward": 1.5875390768051147, |
| "reward_std": 0.33346858620643616, |
| "rewards/accuracy_reward": 0.875, |
| "rewards/format_reward": 0.9625390768051147, |
| "step": 582 |
| }, |
| { |
| "completion_length": 375.671875, |
| "epoch": 0.8486171761280932, |
| "grad_norm": 1.0416004517786164, |
| "kl": 0.04736328125, |
| "learning_rate": 5.8969898602594325e-08, |
| "loss": 0.0057, |
| "reward": 0.5021549463272095, |
| "reward_std": 0.6197409629821777, |
| "rewards/accuracy_reward": 0.515625, |
| "rewards/format_reward": 0.9480664134025574, |
| "step": 583 |
| }, |
| { |
| "completion_length": 368.109375, |
| "epoch": 0.8500727802037845, |
| "grad_norm": 0.8470887591348754, |
| "kl": 0.048828125, |
| "learning_rate": 5.786360849561117e-08, |
| "loss": 0.0019, |
| "reward": 1.3838281631469727, |
| "reward_std": 0.6718348264694214, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 0.9463281631469727, |
| "step": 584 |
| }, |
| { |
| "completion_length": 379.875, |
| "epoch": 0.851528384279476, |
| "grad_norm": 0.9181167404523447, |
| "kl": 0.044921875, |
| "learning_rate": 5.676715638695062e-08, |
| "loss": 0.0009, |
| "reward": 0.4963216185569763, |
| "reward_std": 0.826927900314331, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.9025716185569763, |
| "step": 585 |
| }, |
| { |
| "completion_length": 366.78125, |
| "epoch": 0.8529839883551674, |
| "grad_norm": 1.2129962706817803, |
| "kl": 0.05419921875, |
| "learning_rate": 5.5680566673822096e-08, |
| "loss": -0.0009, |
| "reward": 0.641100287437439, |
| "reward_std": 1.051483154296875, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.9454361796379089, |
| "step": 586 |
| }, |
| { |
| "completion_length": 352.34375, |
| "epoch": 0.8544395924308588, |
| "grad_norm": 0.9410634324985974, |
| "kl": 0.04931640625, |
| "learning_rate": 5.4603863533985825e-08, |
| "loss": -0.0023, |
| "reward": 1.0430793762207031, |
| "reward_std": 0.46832895278930664, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.9298372268676758, |
| "step": 587 |
| }, |
| { |
| "completion_length": 362.78125, |
| "epoch": 0.8558951965065502, |
| "grad_norm": 1.0843006668744697, |
| "kl": 0.044921875, |
| "learning_rate": 5.353707092521581e-08, |
| "loss": 0.0068, |
| "reward": 1.073815107345581, |
| "reward_std": 0.4660176932811737, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.9153515100479126, |
| "step": 588 |
| }, |
| { |
| "completion_length": 348.21875, |
| "epoch": 0.8573508005822417, |
| "grad_norm": 1.0292956885097058, |
| "kl": 0.048095703125, |
| "learning_rate": 5.2480212584766035e-08, |
| "loss": -0.0002, |
| "reward": 1.5367252826690674, |
| "reward_std": 0.8215746879577637, |
| "rewards/accuracy_reward": 0.875, |
| "rewards/format_reward": 0.9117252826690674, |
| "step": 589 |
| }, |
| { |
| "completion_length": 356.265625, |
| "epoch": 0.858806404657933, |
| "grad_norm": 0.9099297684040104, |
| "kl": 0.04638671875, |
| "learning_rate": 5.143331202884299e-08, |
| "loss": -0.0024, |
| "reward": 1.44970703125, |
| "reward_std": 0.39304301142692566, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.9629882574081421, |
| "step": 590 |
| }, |
| { |
| "completion_length": 364.65625, |
| "epoch": 0.8602620087336245, |
| "grad_norm": 0.9309941077444687, |
| "kl": 0.048583984375, |
| "learning_rate": 5.039639255208156e-08, |
| "loss": -0.0002, |
| "reward": 1.375429630279541, |
| "reward_std": 0.5939683318138123, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 0.9332812428474426, |
| "step": 591 |
| }, |
| { |
| "completion_length": 367.625, |
| "epoch": 0.8617176128093159, |
| "grad_norm": 0.8096322535425639, |
| "kl": 0.044921875, |
| "learning_rate": 4.9369477227027614e-08, |
| "loss": -0.0, |
| "reward": 1.422376275062561, |
| "reward_std": 0.4559106230735779, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.9346680045127869, |
| "step": 592 |
| }, |
| { |
| "completion_length": 348.34375, |
| "epoch": 0.8631732168850073, |
| "grad_norm": 0.9888972852645406, |
| "kl": 0.047119140625, |
| "learning_rate": 4.835258890362387e-08, |
| "loss": 0.0014, |
| "reward": 0.8151302337646484, |
| "reward_std": 0.5511770844459534, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.9372526407241821, |
| "step": 593 |
| }, |
| { |
| "completion_length": 359.03125, |
| "epoch": 0.8646288209606987, |
| "grad_norm": 1.0504733411057308, |
| "kl": 0.050537109375, |
| "learning_rate": 4.7345750208701684e-08, |
| "loss": -0.0021, |
| "reward": 1.6032031774520874, |
| "reward_std": 0.6642707586288452, |
| "rewards/accuracy_reward": 0.890625, |
| "rewards/format_reward": 0.9313281178474426, |
| "step": 594 |
| }, |
| { |
| "completion_length": 360.5625, |
| "epoch": 0.86608442503639, |
| "grad_norm": 0.9767718704060626, |
| "kl": 0.047119140625, |
| "learning_rate": 4.634898354547778e-08, |
| "loss": 0.0029, |
| "reward": 0.6824283599853516, |
| "reward_std": 0.3918249011039734, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.8971809148788452, |
| "step": 595 |
| }, |
| { |
| "completion_length": 365.375, |
| "epoch": 0.8675400291120815, |
| "grad_norm": 0.7220183880376431, |
| "kl": 0.0419921875, |
| "learning_rate": 4.536231109305577e-08, |
| "loss": -0.0012, |
| "reward": 1.7962956428527832, |
| "reward_std": 0.3715449273586273, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 0.9828841686248779, |
| "step": 596 |
| }, |
| { |
| "completion_length": 369.734375, |
| "epoch": 0.868995633187773, |
| "grad_norm": 1.013373247031297, |
| "kl": 0.048828125, |
| "learning_rate": 4.4385754805932095e-08, |
| "loss": 0.0003, |
| "reward": 1.1727409362792969, |
| "reward_std": 0.9622257351875305, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.9174544215202332, |
| "step": 597 |
| }, |
| { |
| "completion_length": 373.765625, |
| "epoch": 0.8704512372634643, |
| "grad_norm": 0.9611521678835144, |
| "kl": 0.0478515625, |
| "learning_rate": 4.341933641350842e-08, |
| "loss": 0.0004, |
| "reward": 0.015071600675582886, |
| "reward_std": 0.68045574426651, |
| "rewards/accuracy_reward": 0.359375, |
| "rewards/format_reward": 0.9341601729393005, |
| "step": 598 |
| }, |
| { |
| "completion_length": 354.03125, |
| "epoch": 0.8719068413391557, |
| "grad_norm": 1.1525733336623423, |
| "kl": 0.046630859375, |
| "learning_rate": 4.2463077419606976e-08, |
| "loss": 0.0017, |
| "reward": 1.3347785472869873, |
| "reward_std": 0.567120373249054, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.9418619871139526, |
| "step": 599 |
| }, |
| { |
| "completion_length": 363.328125, |
| "epoch": 0.8733624454148472, |
| "grad_norm": 1.0773160423814951, |
| "kl": 0.048095703125, |
| "learning_rate": 4.151699910199336e-08, |
| "loss": 0.0036, |
| "reward": 0.6707291603088379, |
| "reward_std": 0.8057493567466736, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.8792187571525574, |
| "step": 600 |
| }, |
| { |
| "completion_length": 359.28125, |
| "epoch": 0.8748180494905385, |
| "grad_norm": 0.9746476457350777, |
| "kl": 0.0537109375, |
| "learning_rate": 4.058112251190193e-08, |
| "loss": -0.0024, |
| "reward": 0.4454817771911621, |
| "reward_std": 0.7263391613960266, |
| "rewards/accuracy_reward": 0.515625, |
| "rewards/format_reward": 0.8914192318916321, |
| "step": 601 |
| }, |
| { |
| "completion_length": 380.09375, |
| "epoch": 0.87627365356623, |
| "grad_norm": 0.9946978532938755, |
| "kl": 0.046630859375, |
| "learning_rate": 3.9655468473568435e-08, |
| "loss": -0.0026, |
| "reward": 0.7158983945846558, |
| "reward_std": 0.7142089009284973, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.9306380152702332, |
| "step": 602 |
| }, |
| { |
| "completion_length": 370.9375, |
| "epoch": 0.8777292576419214, |
| "grad_norm": 1.0502943403626153, |
| "kl": 0.0498046875, |
| "learning_rate": 3.8740057583765694e-08, |
| "loss": -0.0026, |
| "reward": 1.6357030868530273, |
| "reward_std": 0.7968278527259827, |
| "rewards/accuracy_reward": 0.90625, |
| "rewards/format_reward": 0.9169531464576721, |
| "step": 603 |
| }, |
| { |
| "completion_length": 367.421875, |
| "epoch": 0.8791848617176128, |
| "grad_norm": 0.8640177808219943, |
| "kl": 0.048828125, |
| "learning_rate": 3.783491021134588e-08, |
| "loss": 0.0039, |
| "reward": 1.4072265625, |
| "reward_std": 0.551764965057373, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.9208593368530273, |
| "step": 604 |
| }, |
| { |
| "completion_length": 366.359375, |
| "epoch": 0.8806404657933042, |
| "grad_norm": 0.8946058550020072, |
| "kl": 0.053466796875, |
| "learning_rate": 3.694004649678706e-08, |
| "loss": -0.0015, |
| "reward": 1.261816382408142, |
| "reward_std": 0.89951491355896, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 0.9160351753234863, |
| "step": 605 |
| }, |
| { |
| "completion_length": 346.609375, |
| "epoch": 0.8820960698689956, |
| "grad_norm": 0.9508950509942107, |
| "kl": 0.046142578125, |
| "learning_rate": 3.6055486351745324e-08, |
| "loss": 0.0035, |
| "reward": 1.8074610233306885, |
| "reward_std": 0.5373015999794006, |
| "rewards/accuracy_reward": 0.953125, |
| "rewards/format_reward": 0.9480859041213989, |
| "step": 606 |
| }, |
| { |
| "completion_length": 359.171875, |
| "epoch": 0.883551673944687, |
| "grad_norm": 1.0635120558945663, |
| "kl": 0.050537109375, |
| "learning_rate": 3.51812494586114e-08, |
| "loss": 0.0011, |
| "reward": -0.21584634482860565, |
| "reward_std": 0.69797682762146, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/format_reward": 0.8804166316986084, |
| "step": 607 |
| }, |
| { |
| "completion_length": 359.65625, |
| "epoch": 0.8850072780203785, |
| "grad_norm": 0.9616362685575608, |
| "kl": 0.0478515625, |
| "learning_rate": 3.4317355270072954e-08, |
| "loss": 0.0031, |
| "reward": 0.5914518237113953, |
| "reward_std": 0.7840473651885986, |
| "rewards/accuracy_reward": 0.546875, |
| "rewards/format_reward": 0.9474804997444153, |
| "step": 608 |
| }, |
| { |
| "completion_length": 374.515625, |
| "epoch": 0.8864628820960698, |
| "grad_norm": 0.9665570781215977, |
| "kl": 0.0537109375, |
| "learning_rate": 3.3463823008681334e-08, |
| "loss": 0.0009, |
| "reward": 0.6901432275772095, |
| "reward_std": 0.44793906807899475, |
| "rewards/accuracy_reward": 0.578125, |
| "rewards/format_reward": 0.9485155940055847, |
| "step": 609 |
| }, |
| { |
| "completion_length": 348.96875, |
| "epoch": 0.8879184861717613, |
| "grad_norm": 1.1514296655136664, |
| "kl": 0.056396484375, |
| "learning_rate": 3.2620671666424515e-08, |
| "loss": -0.0005, |
| "reward": 1.0574414730072021, |
| "reward_std": 0.2184952199459076, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.9922200441360474, |
| "step": 610 |
| }, |
| { |
| "completion_length": 386.796875, |
| "epoch": 0.8893740902474527, |
| "grad_norm": 0.8572317909478371, |
| "kl": 0.046630859375, |
| "learning_rate": 3.17879200043038e-08, |
| "loss": -0.0004, |
| "reward": 0.48298177123069763, |
| "reward_std": 0.4573482573032379, |
| "rewards/accuracy_reward": 0.515625, |
| "rewards/format_reward": 0.93610680103302, |
| "step": 611 |
| }, |
| { |
| "completion_length": 364.4375, |
| "epoch": 0.8908296943231441, |
| "grad_norm": 0.80677467402995, |
| "kl": 0.046142578125, |
| "learning_rate": 3.0965586551917054e-08, |
| "loss": 0.0023, |
| "reward": 0.9373893141746521, |
| "reward_std": 0.7911602258682251, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.9207747578620911, |
| "step": 612 |
| }, |
| { |
| "completion_length": 377.734375, |
| "epoch": 0.8922852983988355, |
| "grad_norm": 0.7229029396190814, |
| "kl": 0.046875, |
| "learning_rate": 3.015368960704584e-08, |
| "loss": 0.0026, |
| "reward": 1.031211018562317, |
| "reward_std": 0.7297381162643433, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.91817706823349, |
| "step": 613 |
| }, |
| { |
| "completion_length": 348.046875, |
| "epoch": 0.893740902474527, |
| "grad_norm": 1.2890732275734214, |
| "kl": 0.056640625, |
| "learning_rate": 2.935224723524843e-08, |
| "loss": -0.0016, |
| "reward": -0.2784309983253479, |
| "reward_std": 0.33096808195114136, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.9581446051597595, |
| "step": 614 |
| }, |
| { |
| "completion_length": 341.65625, |
| "epoch": 0.8951965065502183, |
| "grad_norm": 1.1372194523462842, |
| "kl": 0.04638671875, |
| "learning_rate": 2.8561277269457895e-08, |
| "loss": -0.0019, |
| "reward": 1.5267903804779053, |
| "reward_std": 0.19310006499290466, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 0.99211585521698, |
| "step": 615 |
| }, |
| { |
| "completion_length": 340.84375, |
| "epoch": 0.8966521106259098, |
| "grad_norm": 1.1633831512202182, |
| "kl": 0.054443359375, |
| "learning_rate": 2.7780797309585603e-08, |
| "loss": 0.0015, |
| "reward": 1.0949218273162842, |
| "reward_std": 0.4195671081542969, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.9380208253860474, |
| "step": 616 |
| }, |
| { |
| "completion_length": 340.375, |
| "epoch": 0.8981077147016011, |
| "grad_norm": 1.0572682704568244, |
| "kl": 0.0546875, |
| "learning_rate": 2.701082472212879e-08, |
| "loss": 0.0005, |
| "reward": 1.088769555091858, |
| "reward_std": 1.067856788635254, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.9237044453620911, |
| "step": 617 |
| }, |
| { |
| "completion_length": 365.3125, |
| "epoch": 0.8995633187772926, |
| "grad_norm": 0.8282223286971002, |
| "kl": 0.054931640625, |
| "learning_rate": 2.625137663978516e-08, |
| "loss": 0.0001, |
| "reward": 1.419173240661621, |
| "reward_std": 0.7184128761291504, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.9299284219741821, |
| "step": 618 |
| }, |
| { |
| "completion_length": 354.203125, |
| "epoch": 0.901018922852984, |
| "grad_norm": 1.0373995243480785, |
| "kl": 0.046142578125, |
| "learning_rate": 2.5502469961070637e-08, |
| "loss": -0.0006, |
| "reward": 0.5479947924613953, |
| "reward_std": 0.4919097423553467, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.9511458873748779, |
| "step": 619 |
| }, |
| { |
| "completion_length": 370.484375, |
| "epoch": 0.9024745269286754, |
| "grad_norm": 0.9212810693479083, |
| "kl": 0.046142578125, |
| "learning_rate": 2.4764121349944265e-08, |
| "loss": 0.0015, |
| "reward": 0.3472330868244171, |
| "reward_std": 0.448018878698349, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.9308269023895264, |
| "step": 620 |
| }, |
| { |
| "completion_length": 358.921875, |
| "epoch": 0.9039301310043668, |
| "grad_norm": 0.9860633655951645, |
| "kl": 0.0478515625, |
| "learning_rate": 2.4036347235436738e-08, |
| "loss": 0.0069, |
| "reward": 0.37333983182907104, |
| "reward_std": 0.5936962962150574, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.9631054997444153, |
| "step": 621 |
| }, |
| { |
| "completion_length": 351.390625, |
| "epoch": 0.9053857350800583, |
| "grad_norm": 0.9338847956064392, |
| "kl": 0.04736328125, |
| "learning_rate": 2.331916381128535e-08, |
| "loss": 0.0042, |
| "reward": 1.354824185371399, |
| "reward_std": 0.553050696849823, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.9633399248123169, |
| "step": 622 |
| }, |
| { |
| "completion_length": 363.1875, |
| "epoch": 0.9068413391557496, |
| "grad_norm": 0.877438715066185, |
| "kl": 0.04443359375, |
| "learning_rate": 2.2612587035573226e-08, |
| "loss": 0.0024, |
| "reward": 0.77873694896698, |
| "reward_std": 0.24482090771198273, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.9492838382720947, |
| "step": 623 |
| }, |
| { |
| "completion_length": 377.453125, |
| "epoch": 0.9082969432314411, |
| "grad_norm": 0.9221142915068276, |
| "kl": 0.04150390625, |
| "learning_rate": 2.1916632630374577e-08, |
| "loss": 0.0007, |
| "reward": 1.2329556941986084, |
| "reward_std": 0.8266670107841492, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 0.8883593082427979, |
| "step": 624 |
| }, |
| { |
| "completion_length": 357.9375, |
| "epoch": 0.9097525473071325, |
| "grad_norm": 1.0177312459104024, |
| "kl": 0.048583984375, |
| "learning_rate": 2.123131608140455e-08, |
| "loss": -0.001, |
| "reward": 1.74072265625, |
| "reward_std": 0.59820157289505, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 0.92822265625, |
| "step": 625 |
| }, |
| { |
| "completion_length": 352.171875, |
| "epoch": 0.9112081513828238, |
| "grad_norm": 0.8169726421475969, |
| "kl": 0.051025390625, |
| "learning_rate": 2.0556652637675144e-08, |
| "loss": -0.0005, |
| "reward": 1.4498176574707031, |
| "reward_std": 0.48735594749450684, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.9654426574707031, |
| "step": 626 |
| }, |
| { |
| "completion_length": 347.40625, |
| "epoch": 0.9126637554585153, |
| "grad_norm": 1.043701865873264, |
| "kl": 0.049072265625, |
| "learning_rate": 1.989265731115525e-08, |
| "loss": 0.003, |
| "reward": 1.333925724029541, |
| "reward_std": 0.8411956429481506, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.9422720670700073, |
| "step": 627 |
| }, |
| { |
| "completion_length": 363.640625, |
| "epoch": 0.9141193595342066, |
| "grad_norm": 0.9420184278378598, |
| "kl": 0.047607421875, |
| "learning_rate": 1.9239344876437248e-08, |
| "loss": 0.0015, |
| "reward": 1.0000911951065063, |
| "reward_std": 0.5910583138465881, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.9331510663032532, |
| "step": 628 |
| }, |
| { |
| "completion_length": 355.1875, |
| "epoch": 0.9155749636098981, |
| "grad_norm": 0.9554452056503712, |
| "kl": 0.052978515625, |
| "learning_rate": 1.8596729870407835e-08, |
| "loss": 0.0032, |
| "reward": 1.8222005367279053, |
| "reward_std": 0.49411827325820923, |
| "rewards/accuracy_reward": 0.953125, |
| "rewards/format_reward": 0.9628255367279053, |
| "step": 629 |
| }, |
| { |
| "completion_length": 347.140625, |
| "epoch": 0.9170305676855895, |
| "grad_norm": 0.9306151510914769, |
| "kl": 0.046142578125, |
| "learning_rate": 1.796482659192472e-08, |
| "loss": 0.0041, |
| "reward": 1.479524850845337, |
| "reward_std": 0.41134506464004517, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 0.9424023032188416, |
| "step": 630 |
| }, |
| { |
| "completion_length": 374.625, |
| "epoch": 0.9184861717612809, |
| "grad_norm": 0.8480676856677177, |
| "kl": 0.047119140625, |
| "learning_rate": 1.7343649101498327e-08, |
| "loss": 0.0028, |
| "reward": 1.3734700679779053, |
| "reward_std": 0.3788111209869385, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 0.9359700679779053, |
| "step": 631 |
| }, |
| { |
| "completion_length": 353.765625, |
| "epoch": 0.9199417758369723, |
| "grad_norm": 0.8894056614696153, |
| "kl": 0.0595703125, |
| "learning_rate": 1.6733211220979315e-08, |
| "loss": -0.0011, |
| "reward": 0.7032226324081421, |
| "reward_std": 0.6665799617767334, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.9133919477462769, |
| "step": 632 |
| }, |
| { |
| "completion_length": 354.5, |
| "epoch": 0.9213973799126638, |
| "grad_norm": 1.2302918226628299, |
| "kl": 0.0478515625, |
| "learning_rate": 1.6133526533250563e-08, |
| "loss": -0.0005, |
| "reward": 1.0562500953674316, |
| "reward_std": 0.6852624416351318, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.9466797113418579, |
| "step": 633 |
| }, |
| { |
| "completion_length": 364.296875, |
| "epoch": 0.9228529839883551, |
| "grad_norm": 0.794210762993565, |
| "kl": 0.042236328125, |
| "learning_rate": 1.5544608381925285e-08, |
| "loss": 0.0023, |
| "reward": 1.3560612201690674, |
| "reward_std": 0.5529812574386597, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.9654362201690674, |
| "step": 634 |
| }, |
| { |
| "completion_length": 375.125, |
| "epoch": 0.9243085880640466, |
| "grad_norm": 1.1478859293196242, |
| "kl": 0.046630859375, |
| "learning_rate": 1.4966469871049604e-08, |
| "loss": -0.0025, |
| "reward": 0.9822721481323242, |
| "reward_std": 0.7605947256088257, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.9166861772537231, |
| "step": 635 |
| }, |
| { |
| "completion_length": 367.328125, |
| "epoch": 0.925764192139738, |
| "grad_norm": 0.9763592041679303, |
| "kl": 0.0458984375, |
| "learning_rate": 1.4399123864811902e-08, |
| "loss": 0.0015, |
| "reward": 1.4207422733306885, |
| "reward_std": 0.8127451539039612, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.9341145753860474, |
| "step": 636 |
| }, |
| { |
| "completion_length": 350.515625, |
| "epoch": 0.9272197962154294, |
| "grad_norm": 1.133505236634761, |
| "kl": 0.053466796875, |
| "learning_rate": 1.384258298725549e-08, |
| "loss": -0.002, |
| "reward": 1.8829882144927979, |
| "reward_std": 0.3248525857925415, |
| "rewards/accuracy_reward": 0.96875, |
| "rewards/format_reward": 0.9767382740974426, |
| "step": 637 |
| }, |
| { |
| "completion_length": 362.71875, |
| "epoch": 0.9286754002911208, |
| "grad_norm": 0.9584500222009057, |
| "kl": 0.0439453125, |
| "learning_rate": 1.3296859621998668e-08, |
| "loss": 0.0008, |
| "reward": 1.1840624809265137, |
| "reward_std": 0.1869007647037506, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.9802994728088379, |
| "step": 638 |
| }, |
| { |
| "completion_length": 351.375, |
| "epoch": 0.9301310043668122, |
| "grad_norm": 1.10049930360626, |
| "kl": 0.051513671875, |
| "learning_rate": 1.2761965911958384e-08, |
| "loss": 0.0002, |
| "reward": 1.2603776454925537, |
| "reward_std": 0.6707699298858643, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 0.9109375476837158, |
| "step": 639 |
| }, |
| { |
| "completion_length": 366.0625, |
| "epoch": 0.9315866084425036, |
| "grad_norm": 1.1123121973401144, |
| "kl": 0.047607421875, |
| "learning_rate": 1.2237913759080676e-08, |
| "loss": 0.001, |
| "reward": 0.48548179864883423, |
| "reward_std": 0.06013864278793335, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.9767447710037231, |
| "step": 640 |
| }, |
| { |
| "completion_length": 366.96875, |
| "epoch": 0.9330422125181951, |
| "grad_norm": 1.0164203486015158, |
| "kl": 0.047607421875, |
| "learning_rate": 1.1724714824075332e-08, |
| "loss": -0.0036, |
| "reward": 0.3951367139816284, |
| "reward_std": 0.7576174736022949, |
| "rewards/accuracy_reward": 0.484375, |
| "rewards/format_reward": 0.931347668170929, |
| "step": 641 |
| }, |
| { |
| "completion_length": 360.703125, |
| "epoch": 0.9344978165938864, |
| "grad_norm": 0.899137742081826, |
| "kl": 0.045654296875, |
| "learning_rate": 1.1222380526156927e-08, |
| "loss": -0.0016, |
| "reward": 0.7644987106323242, |
| "reward_std": 0.2871595323085785, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.9332357048988342, |
| "step": 642 |
| }, |
| { |
| "completion_length": 343.53125, |
| "epoch": 0.9359534206695779, |
| "grad_norm": 1.1461565307561832, |
| "kl": 0.05078125, |
| "learning_rate": 1.073092204279019e-08, |
| "loss": -0.0061, |
| "reward": 1.073411464691162, |
| "reward_std": 0.4045974314212799, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.9620703458786011, |
| "step": 643 |
| }, |
| { |
| "completion_length": 359.09375, |
| "epoch": 0.9374090247452693, |
| "grad_norm": 0.9684956712684953, |
| "kl": 0.05224609375, |
| "learning_rate": 1.0250350309441825e-08, |
| "loss": 0.0023, |
| "reward": 1.4904427528381348, |
| "reward_std": 0.36797118186950684, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 0.9588932394981384, |
| "step": 644 |
| }, |
| { |
| "completion_length": 355.78125, |
| "epoch": 0.9388646288209607, |
| "grad_norm": 1.0295786196933356, |
| "kl": 0.0498046875, |
| "learning_rate": 9.780676019336632e-09, |
| "loss": -0.0004, |
| "reward": 1.1840624809265137, |
| "reward_std": 0.1880166232585907, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.9797786474227905, |
| "step": 645 |
| }, |
| { |
| "completion_length": 341.78125, |
| "epoch": 0.9403202328966521, |
| "grad_norm": 0.8966605070859263, |
| "kl": 0.048095703125, |
| "learning_rate": 9.32190962322027e-09, |
| "loss": 0.0022, |
| "reward": 1.1638997793197632, |
| "reward_std": 0.4949903190135956, |
| "rewards/accuracy_reward": 0.734375, |
| "rewards/format_reward": 0.9582747220993042, |
| "step": 646 |
| }, |
| { |
| "completion_length": 350.515625, |
| "epoch": 0.9417758369723436, |
| "grad_norm": 1.1365206921282076, |
| "kl": 0.050537109375, |
| "learning_rate": 8.874061329125936e-09, |
| "loss": 0.0033, |
| "reward": 0.8706445097923279, |
| "reward_std": 0.02185887284576893, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.9920898675918579, |
| "step": 647 |
| }, |
| { |
| "completion_length": 368.203125, |
| "epoch": 0.9432314410480349, |
| "grad_norm": 0.9073495313031953, |
| "kl": 0.048095703125, |
| "learning_rate": 8.437141102147882e-09, |
| "loss": -0.0029, |
| "reward": 0.3735416531562805, |
| "reward_std": 0.5898939967155457, |
| "rewards/accuracy_reward": 0.484375, |
| "rewards/format_reward": 0.9196093678474426, |
| "step": 648 |
| }, |
| { |
| "completion_length": 355.296875, |
| "epoch": 0.9446870451237264, |
| "grad_norm": 0.9644518486493937, |
| "kl": 0.04638671875, |
| "learning_rate": 8.011158664219253e-09, |
| "loss": 0.0028, |
| "reward": 1.4891471862792969, |
| "reward_std": 0.37131497263908386, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 0.9561783671379089, |
| "step": 649 |
| }, |
| { |
| "completion_length": 357.859375, |
| "epoch": 0.9461426491994177, |
| "grad_norm": 0.915855189506075, |
| "kl": 0.052001953125, |
| "learning_rate": 7.59612349389599e-09, |
| "loss": -0.0002, |
| "reward": 1.8685481548309326, |
| "reward_std": 0.24537202715873718, |
| "rewards/accuracy_reward": 0.96875, |
| "rewards/format_reward": 0.9622981548309326, |
| "step": 650 |
| }, |
| { |
| "completion_length": 344.15625, |
| "epoch": 0.9475982532751092, |
| "grad_norm": 0.9405891712802695, |
| "kl": 0.050537109375, |
| "learning_rate": 7.1920448261457715e-09, |
| "loss": -0.0006, |
| "reward": 0.40004557371139526, |
| "reward_std": 0.2768644690513611, |
| "rewards/accuracy_reward": 0.484375, |
| "rewards/format_reward": 0.9441732168197632, |
| "step": 651 |
| }, |
| { |
| "completion_length": 374.21875, |
| "epoch": 0.9490538573508006, |
| "grad_norm": 0.9649816496242495, |
| "kl": 0.05078125, |
| "learning_rate": 6.798931652142737e-09, |
| "loss": -0.0019, |
| "reward": 0.29408854246139526, |
| "reward_std": 0.7213122248649597, |
| "rewards/accuracy_reward": 0.453125, |
| "rewards/format_reward": 0.9326822757720947, |
| "step": 652 |
| }, |
| { |
| "completion_length": 367.28125, |
| "epoch": 0.950509461426492, |
| "grad_norm": 0.9385056166107855, |
| "kl": 0.05224609375, |
| "learning_rate": 6.416792719067143e-09, |
| "loss": 0.0026, |
| "reward": 0.6902929544448853, |
| "reward_std": 0.43664127588272095, |
| "rewards/accuracy_reward": 0.578125, |
| "rewards/format_reward": 0.9491862058639526, |
| "step": 653 |
| }, |
| { |
| "completion_length": 349.265625, |
| "epoch": 0.9519650655021834, |
| "grad_norm": 1.1227123204211185, |
| "kl": 0.05419921875, |
| "learning_rate": 6.045636529911025e-09, |
| "loss": 0.0015, |
| "reward": 0.9635221362113953, |
| "reward_std": 0.18498189747333527, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.9879622459411621, |
| "step": 654 |
| }, |
| { |
| "completion_length": 356.90625, |
| "epoch": 0.9534206695778749, |
| "grad_norm": 0.9808379639308646, |
| "kl": 0.05859375, |
| "learning_rate": 5.685471343288672e-09, |
| "loss": 0.0018, |
| "reward": 1.1685742139816284, |
| "reward_std": 0.8473724722862244, |
| "rewards/accuracy_reward": 0.765625, |
| "rewards/format_reward": 0.8689258098602295, |
| "step": 655 |
| }, |
| { |
| "completion_length": 367.015625, |
| "epoch": 0.9548762736535662, |
| "grad_norm": 0.9461780907460997, |
| "kl": 0.05078125, |
| "learning_rate": 5.33630517325323e-09, |
| "loss": 0.0026, |
| "reward": 0.7643359303474426, |
| "reward_std": 0.6334630250930786, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.9341406226158142, |
| "step": 656 |
| }, |
| { |
| "completion_length": 354.828125, |
| "epoch": 0.9563318777292577, |
| "grad_norm": 1.0187044029454269, |
| "kl": 0.056396484375, |
| "learning_rate": 4.998145789118114e-09, |
| "loss": 0.0004, |
| "reward": 0.34561195969581604, |
| "reward_std": 0.3423462212085724, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.9248437285423279, |
| "step": 657 |
| }, |
| { |
| "completion_length": 356.8125, |
| "epoch": 0.9577874818049491, |
| "grad_norm": 0.9264971002008862, |
| "kl": 0.0478515625, |
| "learning_rate": 4.671000715284146e-09, |
| "loss": 0.0036, |
| "reward": 1.7602018117904663, |
| "reward_std": 0.6684818267822266, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 0.9477018117904663, |
| "step": 658 |
| }, |
| { |
| "completion_length": 360.484375, |
| "epoch": 0.9592430858806404, |
| "grad_norm": 0.899630251698082, |
| "kl": 0.048583984375, |
| "learning_rate": 4.354877231072307e-09, |
| "loss": 0.0017, |
| "reward": 0.73576819896698, |
| "reward_std": 0.8185403347015381, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.8940234184265137, |
| "step": 659 |
| }, |
| { |
| "completion_length": 368.375, |
| "epoch": 0.9606986899563319, |
| "grad_norm": 0.7875268897862577, |
| "kl": 0.04345703125, |
| "learning_rate": 4.049782370561583e-09, |
| "loss": 0.0017, |
| "reward": 0.9524999856948853, |
| "reward_std": 0.8121271133422852, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.9330078363418579, |
| "step": 660 |
| }, |
| { |
| "completion_length": 359.25, |
| "epoch": 0.9621542940320232, |
| "grad_norm": 1.0118312486353827, |
| "kl": 0.046630859375, |
| "learning_rate": 3.755722922432481e-09, |
| "loss": -0.0042, |
| "reward": 1.806471347808838, |
| "reward_std": 0.41752493381500244, |
| "rewards/accuracy_reward": 0.953125, |
| "rewards/format_reward": 0.9470964074134827, |
| "step": 661 |
| }, |
| { |
| "completion_length": 365.0, |
| "epoch": 0.9636098981077147, |
| "grad_norm": 1.029349953751044, |
| "kl": 0.048828125, |
| "learning_rate": 3.4727054298161473e-09, |
| "loss": 0.0019, |
| "reward": 0.17289060354232788, |
| "reward_std": 0.7982980012893677, |
| "rewards/accuracy_reward": 0.421875, |
| "rewards/format_reward": 0.8995833396911621, |
| "step": 662 |
| }, |
| { |
| "completion_length": 381.40625, |
| "epoch": 0.9650655021834061, |
| "grad_norm": 0.8922375425023275, |
| "kl": 0.0478515625, |
| "learning_rate": 3.200736190148545e-09, |
| "loss": 0.0037, |
| "reward": 1.12339186668396, |
| "reward_std": 0.8738340139389038, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.8733919262886047, |
| "step": 663 |
| }, |
| { |
| "completion_length": 347.40625, |
| "epoch": 0.9665211062590975, |
| "grad_norm": 1.046079535967673, |
| "kl": 0.050537109375, |
| "learning_rate": 2.9398212550303945e-09, |
| "loss": 0.0013, |
| "reward": 1.3840559720993042, |
| "reward_std": 0.5832023024559021, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 0.9441732168197632, |
| "step": 664 |
| }, |
| { |
| "completion_length": 339.375, |
| "epoch": 0.9679767103347889, |
| "grad_norm": 0.9349120246936703, |
| "kl": 0.048095703125, |
| "learning_rate": 2.6899664300925607e-09, |
| "loss": -0.0006, |
| "reward": 0.6614192724227905, |
| "reward_std": 0.7153116464614868, |
| "rewards/accuracy_reward": 0.578125, |
| "rewards/format_reward": 0.9226171970367432, |
| "step": 665 |
| }, |
| { |
| "completion_length": 369.828125, |
| "epoch": 0.9694323144104804, |
| "grad_norm": 0.8189860245778907, |
| "kl": 0.046875, |
| "learning_rate": 2.451177274866989e-09, |
| "loss": -0.0006, |
| "reward": 0.96561199426651, |
| "reward_std": 0.43986329436302185, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.9498568177223206, |
| "step": 666 |
| }, |
| { |
| "completion_length": 359.171875, |
| "epoch": 0.9708879184861717, |
| "grad_norm": 0.9437761108308588, |
| "kl": 0.0478515625, |
| "learning_rate": 2.2234591026626946e-09, |
| "loss": -0.0038, |
| "reward": 0.9382357001304626, |
| "reward_std": 0.76799476146698, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.9165430068969727, |
| "step": 667 |
| }, |
| { |
| "completion_length": 378.453125, |
| "epoch": 0.9723435225618632, |
| "grad_norm": 0.9460750716630619, |
| "kl": 0.044677734375, |
| "learning_rate": 2.0068169804478564e-09, |
| "loss": 0.0022, |
| "reward": 0.8572070002555847, |
| "reward_std": 0.639157772064209, |
| "rewards/accuracy_reward": 0.640625, |
| "rewards/format_reward": 0.9332357048988342, |
| "step": 668 |
| }, |
| { |
| "completion_length": 324.75, |
| "epoch": 0.9737991266375546, |
| "grad_norm": 1.0371341680952215, |
| "kl": 0.04833984375, |
| "learning_rate": 1.8012557287367391e-09, |
| "loss": -0.0001, |
| "reward": 0.5965365171432495, |
| "reward_std": 0.1833423525094986, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.9920833706855774, |
| "step": 669 |
| }, |
| { |
| "completion_length": 347.953125, |
| "epoch": 0.975254730713246, |
| "grad_norm": 1.00932152995975, |
| "kl": 0.056640625, |
| "learning_rate": 1.6067799214828926e-09, |
| "loss": 0.0008, |
| "reward": 1.5535807609558105, |
| "reward_std": 0.20138989388942719, |
| "rewards/accuracy_reward": 0.859375, |
| "rewards/format_reward": 0.9731640815734863, |
| "step": 670 |
| }, |
| { |
| "completion_length": 342.890625, |
| "epoch": 0.9767103347889374, |
| "grad_norm": 0.9803179005691086, |
| "kl": 0.05224609375, |
| "learning_rate": 1.4233938859767868e-09, |
| "loss": 0.0033, |
| "reward": 1.337246060371399, |
| "reward_std": 0.5977544784545898, |
| "rewards/accuracy_reward": 0.796875, |
| "rewards/format_reward": 0.9449414610862732, |
| "step": 671 |
| }, |
| { |
| "completion_length": 361.375, |
| "epoch": 0.9781659388646288, |
| "grad_norm": 0.9610606033508311, |
| "kl": 0.0498046875, |
| "learning_rate": 1.251101702750168e-09, |
| "loss": 0.004, |
| "reward": 0.9087890386581421, |
| "reward_std": 0.7360955476760864, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.9337239265441895, |
| "step": 672 |
| }, |
| { |
| "completion_length": 357.65625, |
| "epoch": 0.9796215429403202, |
| "grad_norm": 0.9783219664641567, |
| "kl": 0.048095703125, |
| "learning_rate": 1.0899072054846303e-09, |
| "loss": 0.0021, |
| "reward": 1.361875057220459, |
| "reward_std": 0.5983988046646118, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 0.9243749380111694, |
| "step": 673 |
| }, |
| { |
| "completion_length": 353.78125, |
| "epoch": 0.9810771470160117, |
| "grad_norm": 0.8663994288006219, |
| "kl": 0.05712890625, |
| "learning_rate": 9.398139809268514e-10, |
| "loss": 0.0056, |
| "reward": 1.0991926193237305, |
| "reward_std": 0.4083336889743805, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.9410547018051147, |
| "step": 674 |
| }, |
| { |
| "completion_length": 369.703125, |
| "epoch": 0.982532751091703, |
| "grad_norm": 1.0224494770074284, |
| "kl": 0.047119140625, |
| "learning_rate": 8.008253688084887e-10, |
| "loss": -0.0016, |
| "reward": 0.15850260853767395, |
| "reward_std": 0.9653068780899048, |
| "rewards/accuracy_reward": 0.421875, |
| "rewards/format_reward": 0.8858072757720947, |
| "step": 675 |
| }, |
| { |
| "completion_length": 349.8125, |
| "epoch": 0.9839883551673945, |
| "grad_norm": 1.0418234894905996, |
| "kl": 0.047607421875, |
| "learning_rate": 6.729444617717961e-10, |
| "loss": -0.0003, |
| "reward": 1.2996224164962769, |
| "reward_std": 0.4023718535900116, |
| "rewards/accuracy_reward": 0.765625, |
| "rewards/format_reward": 0.9947004914283752, |
| "step": 676 |
| }, |
| { |
| "completion_length": 363.703125, |
| "epoch": 0.9854439592430859, |
| "grad_norm": 1.0444763973789726, |
| "kl": 0.046875, |
| "learning_rate": 5.56174105301066e-10, |
| "loss": 0.0024, |
| "reward": 0.9435481429100037, |
| "reward_std": 0.7777630090713501, |
| "rewards/accuracy_reward": 0.671875, |
| "rewards/format_reward": 0.9277669191360474, |
| "step": 677 |
| }, |
| { |
| "completion_length": 343.484375, |
| "epoch": 0.9868995633187773, |
| "grad_norm": 0.97449213723852, |
| "kl": 0.052734375, |
| "learning_rate": 4.5051689765929213e-10, |
| "loss": -0.0009, |
| "reward": 0.13971352577209473, |
| "reward_std": 0.23464931547641754, |
| "rewards/accuracy_reward": 0.390625, |
| "rewards/format_reward": 0.9606120586395264, |
| "step": 678 |
| }, |
| { |
| "completion_length": 360.53125, |
| "epoch": 0.9883551673944687, |
| "grad_norm": 0.9192762084564559, |
| "kl": 0.050048828125, |
| "learning_rate": 3.559751898299934e-10, |
| "loss": 0.0019, |
| "reward": 1.4475326538085938, |
| "reward_std": 0.4938472509384155, |
| "rewards/accuracy_reward": 0.828125, |
| "rewards/format_reward": 0.9631575345993042, |
| "step": 679 |
| }, |
| { |
| "completion_length": 362.625, |
| "epoch": 0.9898107714701602, |
| "grad_norm": 0.7829504697419019, |
| "kl": 0.046875, |
| "learning_rate": 2.725510854653668e-10, |
| "loss": 0.0006, |
| "reward": 0.9168750047683716, |
| "reward_std": 0.1419457346200943, |
| "rewards/accuracy_reward": 0.640625, |
| "rewards/format_reward": 0.9950000047683716, |
| "step": 680 |
| }, |
| { |
| "completion_length": 345.0625, |
| "epoch": 0.9912663755458515, |
| "grad_norm": 1.0399738160256335, |
| "kl": 0.060302734375, |
| "learning_rate": 2.002464408392135e-10, |
| "loss": 0.0024, |
| "reward": 1.6794726848602295, |
| "reward_std": 0.7667683959007263, |
| "rewards/accuracy_reward": 0.921875, |
| "rewards/format_reward": 0.9138476848602295, |
| "step": 681 |
| }, |
| { |
| "completion_length": 361.859375, |
| "epoch": 0.992721979621543, |
| "grad_norm": 0.9175441007019236, |
| "kl": 0.048828125, |
| "learning_rate": 1.390628648056391e-10, |
| "loss": 0.001, |
| "reward": 0.7809114456176758, |
| "reward_std": 0.5863924622535706, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/format_reward": 0.9467839002609253, |
| "step": 682 |
| }, |
| { |
| "completion_length": 361.796875, |
| "epoch": 0.9941775836972343, |
| "grad_norm": 0.9367681869995809, |
| "kl": 0.044677734375, |
| "learning_rate": 8.900171876341511e-11, |
| "loss": 0.0024, |
| "reward": 0.8453580737113953, |
| "reward_std": 0.8822904229164124, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.9622981548309326, |
| "step": 683 |
| }, |
| { |
| "completion_length": 360.484375, |
| "epoch": 0.9956331877729258, |
| "grad_norm": 0.9988321881169316, |
| "kl": 0.046630859375, |
| "learning_rate": 5.006411662555887e-11, |
| "loss": -0.0066, |
| "reward": 0.6559830904006958, |
| "reward_std": 0.5935498476028442, |
| "rewards/accuracy_reward": 0.578125, |
| "rewards/format_reward": 0.9169206023216248, |
| "step": 684 |
| }, |
| { |
| "completion_length": 344.28125, |
| "epoch": 0.9970887918486172, |
| "grad_norm": 1.0544057765799288, |
| "kl": 0.049072265625, |
| "learning_rate": 2.2250924794520175e-11, |
| "loss": 0.0023, |
| "reward": 1.2797396183013916, |
| "reward_std": 0.35647517442703247, |
| "rewards/accuracy_reward": 0.765625, |
| "rewards/format_reward": 0.9755468368530273, |
| "step": 685 |
| }, |
| { |
| "completion_length": 370.171875, |
| "epoch": 0.9985443959243085, |
| "grad_norm": 0.872227207794643, |
| "kl": 0.04541015625, |
| "learning_rate": 5.562762142974353e-12, |
| "loss": 0.0014, |
| "reward": 1.0113476514816284, |
| "reward_std": 0.7452950477600098, |
| "rewards/accuracy_reward": 0.703125, |
| "rewards/format_reward": 0.9019725918769836, |
| "step": 686 |
| }, |
| { |
| "completion_length": 382.890625, |
| "epoch": 1.0, |
| "grad_norm": 0.9830707932910157, |
| "kl": 0.042724609375, |
| "learning_rate": 0.0, |
| "loss": 0.0014, |
| "reward": 1.15053391456604, |
| "reward_std": 1.0555355548858643, |
| "rewards/accuracy_reward": 0.765625, |
| "rewards/format_reward": 0.8536588549613953, |
| "step": 687 |
| }, |
| { |
| "epoch": 1.0, |
| "step": 687, |
| "total_flos": 0.0, |
| "train_loss": 0.00033314188768961123, |
| "train_runtime": 116282.9812, |
| "train_samples_per_second": 0.047, |
| "train_steps_per_second": 0.006 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 687, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 50000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|