TinyLLaVA-Video-R1 / trainer_state.json
Zhang199's picture
first commit
3162f59
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 687,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 146.640625,
"epoch": 0.001455604075691412,
"grad_norm": 2.0520484071530207,
"kl": 0.0,
"learning_rate": 4.7619047619047613e-08,
"loss": 0.0027,
"reward": -0.6811327934265137,
"reward_std": 1.2931699752807617,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.4427083134651184,
"step": 1
},
{
"completion_length": 141.84375,
"epoch": 0.002911208151382824,
"grad_norm": 2.0555621453821775,
"kl": 0.0,
"learning_rate": 9.523809523809523e-08,
"loss": 0.0025,
"reward": -0.39337241649627686,
"reward_std": 1.4007469415664673,
"rewards/accuracy_reward": 0.296875,
"rewards/format_reward": 0.5158854722976685,
"step": 2
},
{
"completion_length": 131.109375,
"epoch": 0.004366812227074236,
"grad_norm": 2.0052335093689617,
"kl": 0.000347137451171875,
"learning_rate": 1.4285714285714285e-07,
"loss": 0.0032,
"reward": -0.4100520610809326,
"reward_std": 1.3564808368682861,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.5098437666893005,
"step": 3
},
{
"completion_length": 147.109375,
"epoch": 0.005822416302765648,
"grad_norm": 9.897016495998724,
"kl": 0.0006103515625,
"learning_rate": 1.9047619047619045e-07,
"loss": -0.0005,
"reward": -0.5236002206802368,
"reward_std": 1.1223150491714478,
"rewards/accuracy_reward": 0.203125,
"rewards/format_reward": 0.5694466233253479,
"step": 4
},
{
"completion_length": 115.609375,
"epoch": 0.00727802037845706,
"grad_norm": 2.2619632214263254,
"kl": 0.000347137451171875,
"learning_rate": 2.3809523809523806e-07,
"loss": 0.0001,
"reward": -0.4710937440395355,
"reward_std": 1.1760644912719727,
"rewards/accuracy_reward": 0.234375,
"rewards/format_reward": 0.48657551407814026,
"step": 5
},
{
"completion_length": 128.390625,
"epoch": 0.008733624454148471,
"grad_norm": 2.025201991853294,
"kl": 0.000396728515625,
"learning_rate": 2.857142857142857e-07,
"loss": -0.0004,
"reward": -0.3249348998069763,
"reward_std": 1.27274489402771,
"rewards/accuracy_reward": 0.265625,
"rewards/format_reward": 0.5628255009651184,
"step": 6
},
{
"completion_length": 124.640625,
"epoch": 0.010189228529839884,
"grad_norm": 3.3130345990864427,
"kl": 0.00043487548828125,
"learning_rate": 3.333333333333333e-07,
"loss": -0.0007,
"reward": -0.6294205188751221,
"reward_std": 1.305110216140747,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.44719398021698,
"step": 7
},
{
"completion_length": 125.171875,
"epoch": 0.011644832605531296,
"grad_norm": 2.2278530283663818,
"kl": 0.0003452301025390625,
"learning_rate": 3.809523809523809e-07,
"loss": -0.0019,
"reward": -0.396751344203949,
"reward_std": 1.1712387800216675,
"rewards/accuracy_reward": 0.234375,
"rewards/format_reward": 0.5593684911727905,
"step": 8
},
{
"completion_length": 142.28125,
"epoch": 0.013100436681222707,
"grad_norm": 1.9409494681196875,
"kl": 0.0004138946533203125,
"learning_rate": 4.285714285714285e-07,
"loss": 0.001,
"reward": -0.7128320336341858,
"reward_std": 0.9856235980987549,
"rewards/accuracy_reward": 0.140625,
"rewards/format_reward": 0.5527409315109253,
"step": 9
},
{
"completion_length": 139.53125,
"epoch": 0.01455604075691412,
"grad_norm": 4.067951440500351,
"kl": 0.000446319580078125,
"learning_rate": 4.761904761904761e-07,
"loss": 0.0003,
"reward": -0.6937109231948853,
"reward_std": 1.2031134366989136,
"rewards/accuracy_reward": 0.171875,
"rewards/format_reward": 0.483502596616745,
"step": 10
},
{
"completion_length": 131.96875,
"epoch": 0.01601164483260553,
"grad_norm": 1.9513622824677586,
"kl": 0.0004024505615234375,
"learning_rate": 5.238095238095238e-07,
"loss": -0.0019,
"reward": -0.44268879294395447,
"reward_std": 1.2512993812561035,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5169466137886047,
"step": 11
},
{
"completion_length": 128.296875,
"epoch": 0.017467248908296942,
"grad_norm": 2.6073182015857483,
"kl": 0.0003662109375,
"learning_rate": 5.714285714285714e-07,
"loss": 0.0019,
"reward": -0.11339845508337021,
"reward_std": 1.379326581954956,
"rewards/accuracy_reward": 0.359375,
"rewards/format_reward": 0.5592578053474426,
"step": 12
},
{
"completion_length": 131.5625,
"epoch": 0.018922852983988356,
"grad_norm": 2.038224554514447,
"kl": 0.0004138946533203125,
"learning_rate": 6.19047619047619e-07,
"loss": 0.0019,
"reward": -0.5053775906562805,
"reward_std": 1.0422844886779785,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.5684505105018616,
"step": 13
},
{
"completion_length": 130.015625,
"epoch": 0.020378457059679767,
"grad_norm": 8.376191441689384,
"kl": 0.00080108642578125,
"learning_rate": 6.666666666666666e-07,
"loss": 0.0024,
"reward": -0.5506510734558105,
"reward_std": 1.289644479751587,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.45373696088790894,
"step": 14
},
{
"completion_length": 143.125,
"epoch": 0.021834061135371178,
"grad_norm": 8.674544330550578,
"kl": 0.0009918212890625,
"learning_rate": 7.142857142857143e-07,
"loss": 0.001,
"reward": -0.517591118812561,
"reward_std": 1.3039864301681519,
"rewards/accuracy_reward": 0.265625,
"rewards/format_reward": 0.48772138357162476,
"step": 15
},
{
"completion_length": 152.59375,
"epoch": 0.023289665211062592,
"grad_norm": 2.5986008608286215,
"kl": 0.0004673004150390625,
"learning_rate": 7.619047619047618e-07,
"loss": 0.0034,
"reward": -0.47635418176651,
"reward_std": 1.3350398540496826,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.4758723974227905,
"step": 16
},
{
"completion_length": 126.203125,
"epoch": 0.024745269286754003,
"grad_norm": 4.121392021849402,
"kl": 0.00116729736328125,
"learning_rate": 8.095238095238095e-07,
"loss": -0.0004,
"reward": -0.13250651955604553,
"reward_std": 1.1838587522506714,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.5381835699081421,
"step": 17
},
{
"completion_length": 130.640625,
"epoch": 0.026200873362445413,
"grad_norm": 3.933006083601506,
"kl": 0.000957489013671875,
"learning_rate": 8.57142857142857e-07,
"loss": -0.0009,
"reward": -0.29349610209465027,
"reward_std": 1.1805915832519531,
"rewards/accuracy_reward": 0.296875,
"rewards/format_reward": 0.5601236820220947,
"step": 18
},
{
"completion_length": 134.53125,
"epoch": 0.027656477438136828,
"grad_norm": 2.7373891464842437,
"kl": 0.00140380859375,
"learning_rate": 9.047619047619047e-07,
"loss": 0.0008,
"reward": -0.2783724069595337,
"reward_std": 1.172222375869751,
"rewards/accuracy_reward": 0.296875,
"rewards/format_reward": 0.5397005081176758,
"step": 19
},
{
"completion_length": 130.90625,
"epoch": 0.02911208151382824,
"grad_norm": 2.400503043349488,
"kl": 0.000904083251953125,
"learning_rate": 9.523809523809522e-07,
"loss": -0.0003,
"reward": -0.4976627826690674,
"reward_std": 1.1664050817489624,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.4811263084411621,
"step": 20
},
{
"completion_length": 139.421875,
"epoch": 0.03056768558951965,
"grad_norm": 1.7123015669045267,
"kl": 0.0011749267578125,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": -0.6022005081176758,
"reward_std": 0.8686124682426453,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.5883203148841858,
"step": 21
},
{
"completion_length": 144.796875,
"epoch": 0.03202328966521106,
"grad_norm": 1.9937512224942535,
"kl": 0.00160980224609375,
"learning_rate": 9.99994437237857e-07,
"loss": 0.0011,
"reward": -0.14617840945720673,
"reward_std": 1.3580482006072998,
"rewards/accuracy_reward": 0.328125,
"rewards/format_reward": 0.5967773199081421,
"step": 22
},
{
"completion_length": 124.6875,
"epoch": 0.033478893740902474,
"grad_norm": 2.0814826578918852,
"kl": 0.00128936767578125,
"learning_rate": 9.999777490752055e-07,
"loss": 0.0029,
"reward": -0.21195964515209198,
"reward_std": 1.2764006853103638,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.5546939969062805,
"step": 23
},
{
"completion_length": 140.765625,
"epoch": 0.034934497816593885,
"grad_norm": 1.77589421887042,
"kl": 0.0024871826171875,
"learning_rate": 9.999499358833744e-07,
"loss": 0.0023,
"reward": -0.4434700608253479,
"reward_std": 1.2595456838607788,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.589902400970459,
"step": 24
},
{
"completion_length": 135.53125,
"epoch": 0.036390101892285295,
"grad_norm": 1.9978141335810846,
"kl": 0.0023956298828125,
"learning_rate": 9.999109982812366e-07,
"loss": 0.0004,
"reward": 0.10095702111721039,
"reward_std": 1.251359462738037,
"rewards/accuracy_reward": 0.390625,
"rewards/format_reward": 0.6653059720993042,
"step": 25
},
{
"completion_length": 143.03125,
"epoch": 0.03784570596797671,
"grad_norm": 2.05478507206434,
"kl": 0.001953125,
"learning_rate": 9.998609371351943e-07,
"loss": 0.0017,
"reward": -0.4979492127895355,
"reward_std": 1.096949815750122,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.6168684959411621,
"step": 26
},
{
"completion_length": 142.390625,
"epoch": 0.039301310043668124,
"grad_norm": 9.38751585986725,
"kl": 0.2451171875,
"learning_rate": 9.997997535591607e-07,
"loss": 0.0004,
"reward": -0.13885416090488434,
"reward_std": 1.2641448974609375,
"rewards/accuracy_reward": 0.328125,
"rewards/format_reward": 0.5863932371139526,
"step": 27
},
{
"completion_length": 122.46875,
"epoch": 0.040756914119359534,
"grad_norm": 2.02901198267971,
"kl": 0.002410888671875,
"learning_rate": 9.997274489145347e-07,
"loss": 0.0019,
"reward": 0.0855598971247673,
"reward_std": 1.314124345779419,
"rewards/accuracy_reward": 0.421875,
"rewards/format_reward": 0.5462110042572021,
"step": 28
},
{
"completion_length": 138.21875,
"epoch": 0.042212518195050945,
"grad_norm": 1.9186274674092676,
"kl": 0.003692626953125,
"learning_rate": 9.9964402481017e-07,
"loss": 0.0007,
"reward": 0.14319661259651184,
"reward_std": 1.2557392120361328,
"rewards/accuracy_reward": 0.421875,
"rewards/format_reward": 0.6084700226783752,
"step": 29
},
{
"completion_length": 133.484375,
"epoch": 0.043668122270742356,
"grad_norm": 1.8000804655795466,
"kl": 0.00665283203125,
"learning_rate": 9.995494831023408e-07,
"loss": -0.0004,
"reward": -0.0694987028837204,
"reward_std": 1.264149785041809,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.6460742354393005,
"step": 30
},
{
"completion_length": 132.125,
"epoch": 0.04512372634643377,
"grad_norm": 2.7510703510304624,
"kl": 0.0225830078125,
"learning_rate": 9.994438258946988e-07,
"loss": -0.0004,
"reward": -0.2966731786727905,
"reward_std": 1.048037052154541,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.6053190231323242,
"step": 31
},
{
"completion_length": 138.5625,
"epoch": 0.046579330422125184,
"grad_norm": 1.702872778334861,
"kl": 0.0030059814453125,
"learning_rate": 9.993270555382281e-07,
"loss": -0.0019,
"reward": 0.10822266340255737,
"reward_std": 1.0069178342819214,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.6695116758346558,
"step": 32
},
{
"completion_length": 153.203125,
"epoch": 0.048034934497816595,
"grad_norm": 1.6790629896240423,
"kl": 0.0033721923828125,
"learning_rate": 9.991991746311915e-07,
"loss": -0.0004,
"reward": 0.37479168176651,
"reward_std": 1.2309496402740479,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.7032031416893005,
"step": 33
},
{
"completion_length": 122.59375,
"epoch": 0.049490538573508006,
"grad_norm": 1.926350880972545,
"kl": 0.004180908203125,
"learning_rate": 9.99060186019073e-07,
"loss": -0.0042,
"reward": 0.2343815118074417,
"reward_std": 0.9639967083930969,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.6073763370513916,
"step": 34
},
{
"completion_length": 121.46875,
"epoch": 0.050946142649199416,
"grad_norm": 1.9507933134145956,
"kl": 0.006256103515625,
"learning_rate": 9.989100927945153e-07,
"loss": -0.002,
"reward": 0.36109375953674316,
"reward_std": 1.2289752960205078,
"rewards/accuracy_reward": 0.453125,
"rewards/format_reward": 0.6778646111488342,
"step": 35
},
{
"completion_length": 113.640625,
"epoch": 0.05240174672489083,
"grad_norm": 1.7649346907748196,
"kl": 0.00714111328125,
"learning_rate": 9.9874889829725e-07,
"loss": -0.0019,
"reward": -0.09406250715255737,
"reward_std": 0.98655104637146,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.6245573163032532,
"step": 36
},
{
"completion_length": 140.671875,
"epoch": 0.053857350800582245,
"grad_norm": 1.6599989465316414,
"kl": 0.004180908203125,
"learning_rate": 9.985766061140232e-07,
"loss": 0.0024,
"reward": -0.051347650587558746,
"reward_std": 1.1097118854522705,
"rewards/accuracy_reward": 0.296875,
"rewards/format_reward": 0.6869857311248779,
"step": 37
},
{
"completion_length": 131.609375,
"epoch": 0.055312954876273655,
"grad_norm": 1.9002315583578684,
"kl": 0.004150390625,
"learning_rate": 9.983932200785172e-07,
"loss": -0.0051,
"reward": 0.016217432916164398,
"reward_std": 1.0292843580245972,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.6955012679100037,
"step": 38
},
{
"completion_length": 133.46875,
"epoch": 0.056768558951965066,
"grad_norm": 1.703318134865483,
"kl": 0.005767822265625,
"learning_rate": 9.98198744271263e-07,
"loss": -0.0047,
"reward": 0.3018620014190674,
"reward_std": 1.063158392906189,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.669726550579071,
"step": 39
},
{
"completion_length": 106.578125,
"epoch": 0.05822416302765648,
"grad_norm": 1.9072082640759345,
"kl": 0.020751953125,
"learning_rate": 9.979931830195522e-07,
"loss": -0.0023,
"reward": -0.04225911945104599,
"reward_std": 0.9956592321395874,
"rewards/accuracy_reward": 0.296875,
"rewards/format_reward": 0.6362695693969727,
"step": 40
},
{
"completion_length": 116.96875,
"epoch": 0.05967976710334789,
"grad_norm": 1.8801906920255198,
"kl": 0.006744384765625,
"learning_rate": 9.977765408973374e-07,
"loss": -0.005,
"reward": 0.21731121838092804,
"reward_std": 0.9288095235824585,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.6459180116653442,
"step": 41
},
{
"completion_length": 118.265625,
"epoch": 0.0611353711790393,
"grad_norm": 1.9537453615984006,
"kl": 0.009765625,
"learning_rate": 9.975488227251329e-07,
"loss": -0.0017,
"reward": -0.03567056730389595,
"reward_std": 1.0284496545791626,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.645800769329071,
"step": 42
},
{
"completion_length": 106.71875,
"epoch": 0.06259097525473072,
"grad_norm": 2.0269318680154806,
"kl": 0.00640869140625,
"learning_rate": 9.973100335699073e-07,
"loss": -0.0026,
"reward": 0.33292967081069946,
"reward_std": 1.1819396018981934,
"rewards/accuracy_reward": 0.453125,
"rewards/format_reward": 0.6497005224227905,
"step": 43
},
{
"completion_length": 120.84375,
"epoch": 0.06404657933042213,
"grad_norm": 1.954056839145857,
"kl": 0.00958251953125,
"learning_rate": 9.970601787449696e-07,
"loss": 0.0031,
"reward": 0.3369661271572113,
"reward_std": 1.1963883638381958,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.6441406011581421,
"step": 44
},
{
"completion_length": 119.484375,
"epoch": 0.06550218340611354,
"grad_norm": 1.9048660482499535,
"kl": 0.007110595703125,
"learning_rate": 9.967992638098515e-07,
"loss": 0.0017,
"reward": -0.12632161378860474,
"reward_std": 0.811303973197937,
"rewards/accuracy_reward": 0.234375,
"rewards/format_reward": 0.6902539134025574,
"step": 45
},
{
"completion_length": 108.234375,
"epoch": 0.06695778748180495,
"grad_norm": 1.954242990541008,
"kl": 0.01055908203125,
"learning_rate": 9.965272945701838e-07,
"loss": 0.0053,
"reward": 0.1287955790758133,
"reward_std": 1.0276615619659424,
"rewards/accuracy_reward": 0.359375,
"rewards/format_reward": 0.6505273580551147,
"step": 46
},
{
"completion_length": 111.203125,
"epoch": 0.06841339155749636,
"grad_norm": 2.0265711266703805,
"kl": 0.0101318359375,
"learning_rate": 9.962442770775673e-07,
"loss": -0.0007,
"reward": 0.44425129890441895,
"reward_std": 1.1991455554962158,
"rewards/accuracy_reward": 0.484375,
"rewards/format_reward": 0.675071656703949,
"step": 47
},
{
"completion_length": 109.75,
"epoch": 0.06986899563318777,
"grad_norm": 2.071460406757796,
"kl": 0.0103759765625,
"learning_rate": 9.959502176294382e-07,
"loss": 0.0048,
"reward": -0.17327472567558289,
"reward_std": 0.8208831548690796,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.678769588470459,
"step": 48
},
{
"completion_length": 101.453125,
"epoch": 0.07132459970887918,
"grad_norm": 2.1718985191781695,
"kl": 0.021728515625,
"learning_rate": 9.956451227689277e-07,
"loss": -0.0001,
"reward": 0.15954425930976868,
"reward_std": 0.9772539734840393,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.6399348974227905,
"step": 49
},
{
"completion_length": 106.046875,
"epoch": 0.07278020378457059,
"grad_norm": 2.1050743827136533,
"kl": 0.01318359375,
"learning_rate": 9.953289992847158e-07,
"loss": -0.0002,
"reward": 0.5692838430404663,
"reward_std": 1.0812323093414307,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.6715234518051147,
"step": 50
},
{
"completion_length": 102.359375,
"epoch": 0.07423580786026202,
"grad_norm": 1.9030170365144239,
"kl": 0.0135498046875,
"learning_rate": 9.950018542108817e-07,
"loss": 0.0007,
"reward": 0.5510026216506958,
"reward_std": 1.1338927745819092,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.6648176908493042,
"step": 51
},
{
"completion_length": 89.515625,
"epoch": 0.07569141193595343,
"grad_norm": 2.120469098312219,
"kl": 0.0157470703125,
"learning_rate": 9.946636948267467e-07,
"loss": 0.0016,
"reward": 0.5175260901451111,
"reward_std": 1.038745641708374,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.6268749833106995,
"step": 52
},
{
"completion_length": 91.75,
"epoch": 0.07714701601164484,
"grad_norm": 2.086860625748496,
"kl": 0.0145263671875,
"learning_rate": 9.943145286567113e-07,
"loss": -0.0006,
"reward": 0.42378902435302734,
"reward_std": 1.124879240989685,
"rewards/accuracy_reward": 0.484375,
"rewards/format_reward": 0.6217838525772095,
"step": 53
},
{
"completion_length": 97.9375,
"epoch": 0.07860262008733625,
"grad_norm": 2.019967385216881,
"kl": 0.0146484375,
"learning_rate": 9.93954363470089e-07,
"loss": 0.0035,
"reward": 0.3268880248069763,
"reward_std": 1.1483426094055176,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.6423567533493042,
"step": 54
},
{
"completion_length": 99.625,
"epoch": 0.08005822416302766,
"grad_norm": 2.046031430253623,
"kl": 0.0186767578125,
"learning_rate": 9.935832072809327e-07,
"loss": 0.0026,
"reward": 0.30184245109558105,
"reward_std": 1.0335478782653809,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.6567773818969727,
"step": 55
},
{
"completion_length": 93.578125,
"epoch": 0.08151382823871907,
"grad_norm": 2.1224187100626404,
"kl": 0.0186767578125,
"learning_rate": 9.932010683478573e-07,
"loss": -0.0015,
"reward": 0.7045247554779053,
"reward_std": 1.007187843322754,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.6370638608932495,
"step": 56
},
{
"completion_length": 99.71875,
"epoch": 0.08296943231441048,
"grad_norm": 2.0226555290069905,
"kl": 0.019287109375,
"learning_rate": 9.928079551738541e-07,
"loss": 0.004,
"reward": 0.24904297292232513,
"reward_std": 1.0046416521072388,
"rewards/accuracy_reward": 0.390625,
"rewards/format_reward": 0.6374022960662842,
"step": 57
},
{
"completion_length": 92.59375,
"epoch": 0.08442503639010189,
"grad_norm": 2.031040356216584,
"kl": 0.0264892578125,
"learning_rate": 9.92403876506104e-07,
"loss": -0.0008,
"reward": 0.5398828387260437,
"reward_std": 0.9229820966720581,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.639635443687439,
"step": 58
},
{
"completion_length": 91.3125,
"epoch": 0.0858806404657933,
"grad_norm": 2.018669545596409,
"kl": 0.026611328125,
"learning_rate": 9.919888413357807e-07,
"loss": 0.0024,
"reward": 0.05692709982395172,
"reward_std": 1.0041440725326538,
"rewards/accuracy_reward": 0.328125,
"rewards/format_reward": 0.6213802099227905,
"step": 59
},
{
"completion_length": 90.390625,
"epoch": 0.08733624454148471,
"grad_norm": 2.0871119564538034,
"kl": 0.0245361328125,
"learning_rate": 9.91562858897852e-07,
"loss": 0.0021,
"reward": 0.27358072996139526,
"reward_std": 0.9488109350204468,
"rewards/accuracy_reward": 0.421875,
"rewards/format_reward": 0.6085677146911621,
"step": 60
},
{
"completion_length": 90.953125,
"epoch": 0.08879184861717612,
"grad_norm": 2.0846084451671727,
"kl": 0.022216796875,
"learning_rate": 9.91125938670874e-07,
"loss": 0.0028,
"reward": 0.9214128255844116,
"reward_std": 0.9024526476860046,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.6374154090881348,
"step": 61
},
{
"completion_length": 87.796875,
"epoch": 0.09024745269286755,
"grad_norm": 2.0274598852707553,
"kl": 0.02490234375,
"learning_rate": 9.906780903767798e-07,
"loss": -0.0017,
"reward": 0.5340690016746521,
"reward_std": 0.9889509677886963,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.6374675035476685,
"step": 62
},
{
"completion_length": 88.484375,
"epoch": 0.09170305676855896,
"grad_norm": 2.37260281994688,
"kl": 0.02880859375,
"learning_rate": 9.902193239806634e-07,
"loss": 0.0023,
"reward": 0.42514973878860474,
"reward_std": 0.8481540679931641,
"rewards/accuracy_reward": 0.484375,
"rewards/format_reward": 0.6194596290588379,
"step": 63
},
{
"completion_length": 86.8125,
"epoch": 0.09315866084425037,
"grad_norm": 2.128381129079585,
"kl": 0.023193359375,
"learning_rate": 9.897496496905583e-07,
"loss": -0.0011,
"reward": 0.3611133098602295,
"reward_std": 1.0007762908935547,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.6315299272537231,
"step": 64
},
{
"completion_length": 83.96875,
"epoch": 0.09461426491994178,
"grad_norm": 2.1833555342889226,
"kl": 0.0262451171875,
"learning_rate": 9.892690779572096e-07,
"loss": 0.0028,
"reward": 0.4069466292858124,
"reward_std": 0.8976852297782898,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.6169465780258179,
"step": 65
},
{
"completion_length": 100.078125,
"epoch": 0.09606986899563319,
"grad_norm": 1.95497475262489,
"kl": 0.02783203125,
"learning_rate": 9.887776194738431e-07,
"loss": -0.0006,
"reward": 0.8320702910423279,
"reward_std": 1.0947431325912476,
"rewards/accuracy_reward": 0.640625,
"rewards/format_reward": 0.6451822519302368,
"step": 66
},
{
"completion_length": 89.546875,
"epoch": 0.0975254730713246,
"grad_norm": 2.4296708038911046,
"kl": 0.027099609375,
"learning_rate": 9.882752851759247e-07,
"loss": 0.0042,
"reward": 0.3521158695220947,
"reward_std": 0.9503006935119629,
"rewards/accuracy_reward": 0.453125,
"rewards/format_reward": 0.6041340827941895,
"step": 67
},
{
"completion_length": 90.5,
"epoch": 0.09898107714701601,
"grad_norm": 2.116653369847408,
"kl": 0.0250244140625,
"learning_rate": 9.877620862409192e-07,
"loss": 0.0022,
"reward": 0.1717708259820938,
"reward_std": 0.9714279174804688,
"rewards/accuracy_reward": 0.359375,
"rewards/format_reward": 0.6394791603088379,
"step": 68
},
{
"completion_length": 88.53125,
"epoch": 0.10043668122270742,
"grad_norm": 2.1712245174516704,
"kl": 0.031982421875,
"learning_rate": 9.872380340880416e-07,
"loss": -0.0013,
"reward": 0.5086783766746521,
"reward_std": 1.0054187774658203,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.6380794048309326,
"step": 69
},
{
"completion_length": 88.265625,
"epoch": 0.10189228529839883,
"grad_norm": 2.206351390007687,
"kl": 0.03466796875,
"learning_rate": 9.867031403780013e-07,
"loss": -0.006,
"reward": 0.5890104174613953,
"reward_std": 0.8417867422103882,
"rewards/accuracy_reward": 0.546875,
"rewards/format_reward": 0.6225130558013916,
"step": 70
},
{
"completion_length": 91.484375,
"epoch": 0.10334788937409024,
"grad_norm": 2.0985651707368183,
"kl": 0.0296630859375,
"learning_rate": 9.861574170127444e-07,
"loss": -0.0003,
"reward": 0.848574161529541,
"reward_std": 0.9138556718826294,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.6420508027076721,
"step": 71
},
{
"completion_length": 81.1875,
"epoch": 0.10480349344978165,
"grad_norm": 2.3307465788240993,
"kl": 0.037109375,
"learning_rate": 9.85600876135188e-07,
"loss": 0.0021,
"reward": 0.5502018332481384,
"reward_std": 0.8912982940673828,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.6144856214523315,
"step": 72
},
{
"completion_length": 86.5625,
"epoch": 0.10625909752547306,
"grad_norm": 2.1316996208970833,
"kl": 0.02880859375,
"learning_rate": 9.850335301289504e-07,
"loss": 0.0014,
"reward": -0.0640755146741867,
"reward_std": 0.8301602602005005,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.6316145658493042,
"step": 73
},
{
"completion_length": 78.46875,
"epoch": 0.10771470160116449,
"grad_norm": 2.147131800814305,
"kl": 0.03857421875,
"learning_rate": 9.844553916180746e-07,
"loss": -0.0031,
"reward": 0.55866539478302,
"reward_std": 1.0137114524841309,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.606178343296051,
"step": 74
},
{
"completion_length": 94.375,
"epoch": 0.1091703056768559,
"grad_norm": 2.134513729452084,
"kl": 0.037109375,
"learning_rate": 9.838664734667495e-07,
"loss": 0.0012,
"reward": 0.6748111844062805,
"reward_std": 0.863685131072998,
"rewards/accuracy_reward": 0.578125,
"rewards/format_reward": 0.6456054449081421,
"step": 75
},
{
"completion_length": 77.546875,
"epoch": 0.11062590975254731,
"grad_norm": 2.1654090092198013,
"kl": 0.033447265625,
"learning_rate": 9.832667887790206e-07,
"loss": -0.0003,
"reward": 0.39906901121139526,
"reward_std": 0.7819440364837646,
"rewards/accuracy_reward": 0.453125,
"rewards/format_reward": 0.6166341304779053,
"step": 76
},
{
"completion_length": 87.6875,
"epoch": 0.11208151382823872,
"grad_norm": 2.2117240462456476,
"kl": 0.033935546875,
"learning_rate": 9.826563508985016e-07,
"loss": -0.0008,
"reward": 0.10059896111488342,
"reward_std": 0.775175929069519,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.591393232345581,
"step": 77
},
{
"completion_length": 84.84375,
"epoch": 0.11353711790393013,
"grad_norm": 2.216126758365653,
"kl": 0.034912109375,
"learning_rate": 9.820351734080754e-07,
"loss": -0.0009,
"reward": 0.46383464336395264,
"reward_std": 0.6518522500991821,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.6175326108932495,
"step": 78
},
{
"completion_length": 71.5,
"epoch": 0.11499272197962154,
"grad_norm": 2.68226792040073,
"kl": 0.04345703125,
"learning_rate": 9.81403270129592e-07,
"loss": -0.0057,
"reward": 1.0123958587646484,
"reward_std": 0.7545869946479797,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.5960806608200073,
"step": 79
},
{
"completion_length": 80.859375,
"epoch": 0.11644832605531295,
"grad_norm": 2.249003158750042,
"kl": 0.03564453125,
"learning_rate": 9.807606551235627e-07,
"loss": -0.0056,
"reward": 0.9187760353088379,
"reward_std": 0.778544545173645,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.620690107345581,
"step": 80
},
{
"completion_length": 81.265625,
"epoch": 0.11790393013100436,
"grad_norm": 2.4482747382151695,
"kl": 0.043212890625,
"learning_rate": 9.801073426888446e-07,
"loss": 0.0019,
"reward": 0.7133723497390747,
"reward_std": 0.7031675577163696,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.6206510663032532,
"step": 81
},
{
"completion_length": 77.8125,
"epoch": 0.11935953420669577,
"grad_norm": 2.2325416864250265,
"kl": 0.04052734375,
"learning_rate": 9.794433473623248e-07,
"loss": 0.0035,
"reward": 0.6323372721672058,
"reward_std": 0.9128393530845642,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.6164127588272095,
"step": 82
},
{
"completion_length": 82.046875,
"epoch": 0.12081513828238719,
"grad_norm": 2.1020592126259126,
"kl": 0.032958984375,
"learning_rate": 9.787686839185954e-07,
"loss": -0.0001,
"reward": 1.0958268642425537,
"reward_std": 0.6903193593025208,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.622962236404419,
"step": 83
},
{
"completion_length": 75.625,
"epoch": 0.1222707423580786,
"grad_norm": 2.225966097945577,
"kl": 0.041015625,
"learning_rate": 9.780833673696254e-07,
"loss": 0.0035,
"reward": 0.7522070407867432,
"reward_std": 0.7671902179718018,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.6155664920806885,
"step": 84
},
{
"completion_length": 83.359375,
"epoch": 0.12372634643377002,
"grad_norm": 2.2693402594500034,
"kl": 0.0458984375,
"learning_rate": 9.773874129644267e-07,
"loss": -0.0006,
"reward": 0.6304752826690674,
"reward_std": 0.8777204751968384,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.6244465708732605,
"step": 85
},
{
"completion_length": 76.90625,
"epoch": 0.12518195050946143,
"grad_norm": 2.138087833605439,
"kl": 0.0478515625,
"learning_rate": 9.766808361887148e-07,
"loss": -0.0009,
"reward": 0.8740299344062805,
"reward_std": 0.823813796043396,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.6129882335662842,
"step": 86
},
{
"completion_length": 78.640625,
"epoch": 0.12663755458515283,
"grad_norm": 2.644901616780376,
"kl": 0.0400390625,
"learning_rate": 9.759636527645632e-07,
"loss": 0.0005,
"reward": 0.7980924248695374,
"reward_std": 0.8134920001029968,
"rewards/accuracy_reward": 0.640625,
"rewards/format_reward": 0.6185481548309326,
"step": 87
},
{
"completion_length": 73.578125,
"epoch": 0.12809315866084425,
"grad_norm": 2.308714342568071,
"kl": 0.047607421875,
"learning_rate": 9.752358786500558e-07,
"loss": 0.0015,
"reward": 0.7030664682388306,
"reward_std": 0.874236524105072,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.6089127659797668,
"step": 88
},
{
"completion_length": 82.390625,
"epoch": 0.12954876273653565,
"grad_norm": 2.2405963594949982,
"kl": 0.038330078125,
"learning_rate": 9.744975300389293e-07,
"loss": 0.0043,
"reward": 0.5240559577941895,
"reward_std": 0.8964687585830688,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.6264387369155884,
"step": 89
},
{
"completion_length": 80.78125,
"epoch": 0.13100436681222707,
"grad_norm": 2.2976726395774385,
"kl": 0.034423828125,
"learning_rate": 9.737486233602147e-07,
"loss": 0.0015,
"reward": 0.5632421970367432,
"reward_std": 0.7829184532165527,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.6213281750679016,
"step": 90
},
{
"completion_length": 78.078125,
"epoch": 0.1324599708879185,
"grad_norm": 2.102179867053138,
"kl": 0.05712890625,
"learning_rate": 9.729891752778711e-07,
"loss": -0.0009,
"reward": 0.9788346290588379,
"reward_std": 0.6263606548309326,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.6095768213272095,
"step": 91
},
{
"completion_length": 89.0625,
"epoch": 0.1339155749636099,
"grad_norm": 2.4407305687279295,
"kl": 0.034912109375,
"learning_rate": 9.722192026904144e-07,
"loss": -0.0027,
"reward": 0.9294661283493042,
"reward_std": 0.9660316109657288,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.6371874809265137,
"step": 92
},
{
"completion_length": 82.0625,
"epoch": 0.13537117903930132,
"grad_norm": 2.0885102072083717,
"kl": 0.0400390625,
"learning_rate": 9.71438722730542e-07,
"loss": 0.0019,
"reward": 1.2173632383346558,
"reward_std": 0.5946205854415894,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.6133268475532532,
"step": 93
},
{
"completion_length": 77.515625,
"epoch": 0.13682678311499272,
"grad_norm": 2.356967419590387,
"kl": 0.038330078125,
"learning_rate": 9.706477527647516e-07,
"loss": -0.0035,
"reward": 0.7038216590881348,
"reward_std": 0.6569962501525879,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.6144986748695374,
"step": 94
},
{
"completion_length": 81.21875,
"epoch": 0.13828238719068414,
"grad_norm": 2.297586439480163,
"kl": 0.03857421875,
"learning_rate": 9.698463103929541e-07,
"loss": 0.0037,
"reward": 0.02611328661441803,
"reward_std": 0.5660784840583801,
"rewards/accuracy_reward": 0.296875,
"rewards/format_reward": 0.6138085722923279,
"step": 95
},
{
"completion_length": 78.6875,
"epoch": 0.13973799126637554,
"grad_norm": 2.183856708817705,
"kl": 0.04736328125,
"learning_rate": 9.69034413448083e-07,
"loss": -0.0024,
"reward": 0.8519986867904663,
"reward_std": 0.8133624792098999,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.6173242330551147,
"step": 96
},
{
"completion_length": 71.03125,
"epoch": 0.14119359534206696,
"grad_norm": 2.497616068899775,
"kl": 0.08154296875,
"learning_rate": 9.682120799956961e-07,
"loss": 0.0024,
"reward": 0.7480989694595337,
"reward_std": 0.7188245058059692,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.59375,
"step": 97
},
{
"completion_length": 75.328125,
"epoch": 0.14264919941775836,
"grad_norm": 2.521282427156024,
"kl": 0.0556640625,
"learning_rate": 9.673793283335756e-07,
"loss": -0.002,
"reward": 1.0954035520553589,
"reward_std": 0.6585423946380615,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.6114453077316284,
"step": 98
},
{
"completion_length": 82.234375,
"epoch": 0.14410480349344978,
"grad_norm": 2.1738986479204794,
"kl": 0.04345703125,
"learning_rate": 9.665361769913186e-07,
"loss": -0.0008,
"reward": 0.48210933804512024,
"reward_std": 0.6862488389015198,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.5999479293823242,
"step": 99
},
{
"completion_length": 85.5625,
"epoch": 0.14556040756914118,
"grad_norm": 2.1837000916722658,
"kl": 0.044189453125,
"learning_rate": 9.656826447299271e-07,
"loss": 0.0001,
"reward": 0.8835351467132568,
"reward_std": 0.5615145564079285,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.6321679949760437,
"step": 100
},
{
"completion_length": 83.546875,
"epoch": 0.1470160116448326,
"grad_norm": 2.087738988339914,
"kl": 0.04052734375,
"learning_rate": 9.648187505413884e-07,
"loss": 0.0008,
"reward": 0.7421875,
"reward_std": 0.6311359405517578,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.6250260472297668,
"step": 101
},
{
"completion_length": 88.03125,
"epoch": 0.14847161572052403,
"grad_norm": 2.1250846849492526,
"kl": 0.04638671875,
"learning_rate": 9.639445136482546e-07,
"loss": -0.0005,
"reward": 0.9741601943969727,
"reward_std": 0.681174635887146,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.6371548771858215,
"step": 102
},
{
"completion_length": 74.359375,
"epoch": 0.14992721979621543,
"grad_norm": 2.409258859083457,
"kl": 0.04150390625,
"learning_rate": 9.63059953503213e-07,
"loss": 0.0013,
"reward": 1.1404622793197632,
"reward_std": 0.38158488273620605,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.6089909672737122,
"step": 103
},
{
"completion_length": 87.359375,
"epoch": 0.15138282387190685,
"grad_norm": 2.088104027671848,
"kl": 0.0380859375,
"learning_rate": 9.621650897886541e-07,
"loss": -0.0018,
"reward": 0.7542252540588379,
"reward_std": 0.6290829181671143,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.6356835961341858,
"step": 104
},
{
"completion_length": 80.703125,
"epoch": 0.15283842794759825,
"grad_norm": 1.965318350453434,
"kl": 0.035400390625,
"learning_rate": 9.612599424162343e-07,
"loss": -0.0008,
"reward": 1.0488801002502441,
"reward_std": 0.48284074664115906,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.6244661808013916,
"step": 105
},
{
"completion_length": 93.875,
"epoch": 0.15429403202328967,
"grad_norm": 2.1219245641446975,
"kl": 0.038818359375,
"learning_rate": 9.603445315264316e-07,
"loss": 0.0014,
"reward": 1.0476692914962769,
"reward_std": 0.5120445489883423,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.6368489265441895,
"step": 106
},
{
"completion_length": 86.890625,
"epoch": 0.15574963609898107,
"grad_norm": 2.036182966894686,
"kl": 0.044189453125,
"learning_rate": 9.59418877488098e-07,
"loss": -0.004,
"reward": 0.955091118812561,
"reward_std": 0.804840624332428,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.6345182657241821,
"step": 107
},
{
"completion_length": 90.484375,
"epoch": 0.1572052401746725,
"grad_norm": 2.042159784075266,
"kl": 0.03173828125,
"learning_rate": 9.584830008980067e-07,
"loss": 0.0002,
"reward": 0.3213476538658142,
"reward_std": 0.6555300354957581,
"rewards/accuracy_reward": 0.421875,
"rewards/format_reward": 0.63895183801651,
"step": 108
},
{
"completion_length": 94.625,
"epoch": 0.1586608442503639,
"grad_norm": 1.9758420259115719,
"kl": 0.040771484375,
"learning_rate": 9.57536922580393e-07,
"loss": -0.0006,
"reward": 1.0781381130218506,
"reward_std": 0.7523989677429199,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.6387500762939453,
"step": 109
},
{
"completion_length": 91.6875,
"epoch": 0.16011644832605532,
"grad_norm": 2.331594748973951,
"kl": 0.0419921875,
"learning_rate": 9.565806635864917e-07,
"loss": -0.0045,
"reward": 0.5411393046379089,
"reward_std": 0.7696890830993652,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.6179623007774353,
"step": 110
},
{
"completion_length": 90.84375,
"epoch": 0.1615720524017467,
"grad_norm": 2.2436142517878648,
"kl": 0.038330078125,
"learning_rate": 9.556142451940679e-07,
"loss": 0.0008,
"reward": 0.2418619841337204,
"reward_std": 0.7631776332855225,
"rewards/accuracy_reward": 0.390625,
"rewards/format_reward": 0.6416015625,
"step": 111
},
{
"completion_length": 90.125,
"epoch": 0.16302765647743814,
"grad_norm": 1.8856149828565558,
"kl": 0.0478515625,
"learning_rate": 9.546376889069443e-07,
"loss": -0.0035,
"reward": 0.37279295921325684,
"reward_std": 0.6837524175643921,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.6407356858253479,
"step": 112
},
{
"completion_length": 98.203125,
"epoch": 0.16448326055312956,
"grad_norm": 2.173251868707325,
"kl": 0.03466796875,
"learning_rate": 9.536510164545222e-07,
"loss": 0.0007,
"reward": 1.06194007396698,
"reward_std": 0.6035024523735046,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.6475651264190674,
"step": 113
},
{
"completion_length": 103.3125,
"epoch": 0.16593886462882096,
"grad_norm": 2.0219223010119887,
"kl": 0.0439453125,
"learning_rate": 9.526542497912983e-07,
"loss": 0.0012,
"reward": 0.4283137917518616,
"reward_std": 0.6393612027168274,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.6627669334411621,
"step": 114
},
{
"completion_length": 100.796875,
"epoch": 0.16739446870451238,
"grad_norm": 2.0798140558182134,
"kl": 0.031005859375,
"learning_rate": 9.516474110963761e-07,
"loss": -0.0026,
"reward": 1.1091991662979126,
"reward_std": 0.4762105345726013,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.66357421875,
"step": 115
},
{
"completion_length": 100.421875,
"epoch": 0.16885007278020378,
"grad_norm": 2.2236939272257445,
"kl": 0.0478515625,
"learning_rate": 9.506305227729723e-07,
"loss": 0.0058,
"reward": 0.6438932418823242,
"reward_std": 0.509583592414856,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.6582422256469727,
"step": 116
},
{
"completion_length": 101.171875,
"epoch": 0.1703056768558952,
"grad_norm": 2.1980519442731468,
"kl": 0.0281982421875,
"learning_rate": 9.496036074479184e-07,
"loss": -0.001,
"reward": 0.827063798904419,
"reward_std": 0.9014047384262085,
"rewards/accuracy_reward": 0.640625,
"rewards/format_reward": 0.65771484375,
"step": 117
},
{
"completion_length": 96.125,
"epoch": 0.1717612809315866,
"grad_norm": 2.037191204255324,
"kl": 0.04150390625,
"learning_rate": 9.48566687971157e-07,
"loss": -0.0004,
"reward": 1.3227994441986084,
"reward_std": 0.5190377235412598,
"rewards/accuracy_reward": 0.859375,
"rewards/format_reward": 0.6540234684944153,
"step": 118
},
{
"completion_length": 109.09375,
"epoch": 0.17321688500727803,
"grad_norm": 2.0272853115442544,
"kl": 0.03466796875,
"learning_rate": 9.475197874152339e-07,
"loss": -0.0,
"reward": -0.18306639790534973,
"reward_std": 0.5607576966285706,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.664681077003479,
"step": 119
},
{
"completion_length": 96.515625,
"epoch": 0.17467248908296942,
"grad_norm": 2.0492268918355374,
"kl": 0.041748046875,
"learning_rate": 9.464629290747842e-07,
"loss": -0.0008,
"reward": 0.7684569954872131,
"reward_std": 0.7265533208847046,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.6427409052848816,
"step": 120
},
{
"completion_length": 102.375,
"epoch": 0.17612809315866085,
"grad_norm": 1.9898855505081523,
"kl": 0.041259765625,
"learning_rate": 9.453961364660142e-07,
"loss": -0.001,
"reward": 0.7294921875,
"reward_std": 0.3104066252708435,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.6654947996139526,
"step": 121
},
{
"completion_length": 100.15625,
"epoch": 0.17758369723435224,
"grad_norm": 2.3735086249364046,
"kl": 0.037109375,
"learning_rate": 9.443194333261779e-07,
"loss": -0.0009,
"reward": 1.1522916555404663,
"reward_std": 0.48791223764419556,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 0.660559892654419,
"step": 122
},
{
"completion_length": 98.1875,
"epoch": 0.17903930131004367,
"grad_norm": 2.2233975652336175,
"kl": 0.02880859375,
"learning_rate": 9.432328436130493e-07,
"loss": -0.0013,
"reward": 1.3009765148162842,
"reward_std": 0.29463040828704834,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 0.656653642654419,
"step": 123
},
{
"completion_length": 114.890625,
"epoch": 0.1804949053857351,
"grad_norm": 2.0022084670374865,
"kl": 0.037841796875,
"learning_rate": 9.421363915043889e-07,
"loss": 0.0007,
"reward": 0.9197134971618652,
"reward_std": 0.3231443166732788,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.6893749833106995,
"step": 124
},
{
"completion_length": 99.171875,
"epoch": 0.1819505094614265,
"grad_norm": 1.8630637664051692,
"kl": 0.029296875,
"learning_rate": 9.410301013974056e-07,
"loss": -0.0012,
"reward": 0.63825523853302,
"reward_std": 0.3934669494628906,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.6578906178474426,
"step": 125
},
{
"completion_length": 100.859375,
"epoch": 0.18340611353711792,
"grad_norm": 1.9824110470356529,
"kl": 0.039794921875,
"learning_rate": 9.399139979082147e-07,
"loss": -0.0003,
"reward": 0.7100846171379089,
"reward_std": 0.5975295901298523,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.6606184244155884,
"step": 126
},
{
"completion_length": 116.953125,
"epoch": 0.1848617176128093,
"grad_norm": 1.8889996265777742,
"kl": 0.027587890625,
"learning_rate": 9.387881058712888e-07,
"loss": 0.005,
"reward": 0.8783528804779053,
"reward_std": 0.8908267021179199,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.6875325441360474,
"step": 127
},
{
"completion_length": 107.375,
"epoch": 0.18631732168850074,
"grad_norm": 2.077089346407135,
"kl": 0.03173828125,
"learning_rate": 9.376524503389065e-07,
"loss": -0.002,
"reward": 0.4938216507434845,
"reward_std": 0.5928758978843689,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.669837236404419,
"step": 128
},
{
"completion_length": 117.109375,
"epoch": 0.18777292576419213,
"grad_norm": 1.8893938122618363,
"kl": 0.0294189453125,
"learning_rate": 9.36507056580594e-07,
"loss": -0.0018,
"reward": 0.7212304472923279,
"reward_std": 0.593620777130127,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.6912304759025574,
"step": 129
},
{
"completion_length": 123.234375,
"epoch": 0.18922852983988356,
"grad_norm": 1.7898651835340285,
"kl": 0.03173828125,
"learning_rate": 9.353519500825637e-07,
"loss": 0.0031,
"reward": 0.4010286331176758,
"reward_std": 0.5465176105499268,
"rewards/accuracy_reward": 0.453125,
"rewards/format_reward": 0.7014323472976685,
"step": 130
},
{
"completion_length": 104.0,
"epoch": 0.19068413391557495,
"grad_norm": 2.2172071907289346,
"kl": 0.0277099609375,
"learning_rate": 9.341871565471463e-07,
"loss": 0.0001,
"reward": 0.7860090732574463,
"reward_std": 0.4027497172355652,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.6570768356323242,
"step": 131
},
{
"completion_length": 114.546875,
"epoch": 0.19213973799126638,
"grad_norm": 1.886899003286242,
"kl": 0.0296630859375,
"learning_rate": 9.330127018922193e-07,
"loss": -0.0045,
"reward": 0.7923893332481384,
"reward_std": 0.5422953367233276,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.6841601133346558,
"step": 132
},
{
"completion_length": 127.171875,
"epoch": 0.19359534206695778,
"grad_norm": 1.940672486847996,
"kl": 0.0286865234375,
"learning_rate": 9.318286122506302e-07,
"loss": 0.0018,
"reward": 0.7649609446525574,
"reward_std": 0.9051897525787354,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.6996744871139526,
"step": 133
},
{
"completion_length": 122.59375,
"epoch": 0.1950509461426492,
"grad_norm": 2.012027311337077,
"kl": 0.03271484375,
"learning_rate": 9.306349139696154e-07,
"loss": 0.0021,
"reward": 0.9356836080551147,
"reward_std": 0.3557165265083313,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.6972851157188416,
"step": 134
},
{
"completion_length": 125.34375,
"epoch": 0.1965065502183406,
"grad_norm": 1.9803313783521628,
"kl": 0.03466796875,
"learning_rate": 9.29431633610213e-07,
"loss": -0.0039,
"reward": 1.0579817295074463,
"reward_std": 0.7984186410903931,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.6721354126930237,
"step": 135
},
{
"completion_length": 151.75,
"epoch": 0.19796215429403202,
"grad_norm": 1.6537670662618684,
"kl": 0.0308837890625,
"learning_rate": 9.282187979466729e-07,
"loss": 0.0042,
"reward": 0.8291862607002258,
"reward_std": 0.5554116368293762,
"rewards/accuracy_reward": 0.640625,
"rewards/format_reward": 0.7382357120513916,
"step": 136
},
{
"completion_length": 147.265625,
"epoch": 0.19941775836972345,
"grad_norm": 1.7201446218582972,
"kl": 0.031982421875,
"learning_rate": 9.269964339658604e-07,
"loss": 0.006,
"reward": 1.283261775970459,
"reward_std": 0.5926542282104492,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 0.7361132502555847,
"step": 137
},
{
"completion_length": 134.375,
"epoch": 0.20087336244541484,
"grad_norm": 1.874315712431397,
"kl": 0.03173828125,
"learning_rate": 9.257645688666555e-07,
"loss": -0.0038,
"reward": 1.069654941558838,
"reward_std": 0.37698203325271606,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.7255665063858032,
"step": 138
},
{
"completion_length": 136.75,
"epoch": 0.20232896652110627,
"grad_norm": 1.814100036642101,
"kl": 0.0301513671875,
"learning_rate": 9.245232300593488e-07,
"loss": 0.0007,
"reward": 0.5383853912353516,
"reward_std": 0.32863178849220276,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.7274609804153442,
"step": 139
},
{
"completion_length": 156.84375,
"epoch": 0.20378457059679767,
"grad_norm": 1.7660157876424847,
"kl": 0.042236328125,
"learning_rate": 9.232724451650302e-07,
"loss": -0.0023,
"reward": 1.042018175125122,
"reward_std": 0.4296218752861023,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.7598958611488342,
"step": 140
},
{
"completion_length": 152.078125,
"epoch": 0.2052401746724891,
"grad_norm": 1.9326818117418496,
"kl": 0.03662109375,
"learning_rate": 9.220122420149752e-07,
"loss": -0.0015,
"reward": 0.7897005081176758,
"reward_std": 0.8649002313613892,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.7523828148841858,
"step": 141
},
{
"completion_length": 146.171875,
"epoch": 0.2066957787481805,
"grad_norm": 2.3170747889179975,
"kl": 0.04150390625,
"learning_rate": 9.207426486500251e-07,
"loss": 0.0029,
"reward": 0.9602214097976685,
"reward_std": 0.48202353715896606,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.7439453601837158,
"step": 142
},
{
"completion_length": 153.78125,
"epoch": 0.2081513828238719,
"grad_norm": 1.8295711779834491,
"kl": 0.0380859375,
"learning_rate": 9.194636933199637e-07,
"loss": -0.0005,
"reward": 0.6159765720367432,
"reward_std": 0.6443432569503784,
"rewards/accuracy_reward": 0.546875,
"rewards/format_reward": 0.7611978650093079,
"step": 143
},
{
"completion_length": 167.6875,
"epoch": 0.2096069868995633,
"grad_norm": 1.8055857818112881,
"kl": 0.03466796875,
"learning_rate": 9.18175404482888e-07,
"loss": -0.0004,
"reward": 0.7853385210037231,
"reward_std": 0.6822813153266907,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.7819271087646484,
"step": 144
},
{
"completion_length": 149.59375,
"epoch": 0.21106259097525473,
"grad_norm": 1.7611225555314238,
"kl": 0.0361328125,
"learning_rate": 9.168778108045758e-07,
"loss": 0.0014,
"reward": 1.3780207633972168,
"reward_std": 0.5230543613433838,
"rewards/accuracy_reward": 0.859375,
"rewards/format_reward": 0.7534895539283752,
"step": 145
},
{
"completion_length": 154.515625,
"epoch": 0.21251819505094613,
"grad_norm": 1.8484664379659073,
"kl": 0.03515625,
"learning_rate": 9.155709411578467e-07,
"loss": -0.0028,
"reward": 0.663769543170929,
"reward_std": 0.8413479328155518,
"rewards/accuracy_reward": 0.578125,
"rewards/format_reward": 0.7554622292518616,
"step": 146
},
{
"completion_length": 169.25,
"epoch": 0.21397379912663755,
"grad_norm": 1.5659465541368485,
"kl": 0.03125,
"learning_rate": 9.14254824621921e-07,
"loss": 0.0025,
"reward": 0.8898242115974426,
"reward_std": 0.61174476146698,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.7810482382774353,
"step": 147
},
{
"completion_length": 183.078125,
"epoch": 0.21542940320232898,
"grad_norm": 1.9240333532034173,
"kl": 0.0299072265625,
"learning_rate": 9.129294904817715e-07,
"loss": 0.0055,
"reward": -0.32472002506256104,
"reward_std": 0.5251954793930054,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.804186224937439,
"step": 148
},
{
"completion_length": 188.078125,
"epoch": 0.21688500727802038,
"grad_norm": 1.584600042627141,
"kl": 0.037353515625,
"learning_rate": 9.115949682274727e-07,
"loss": -0.0006,
"reward": 1.0250911712646484,
"reward_std": 0.7966851592063904,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.7925260066986084,
"step": 149
},
{
"completion_length": 192.75,
"epoch": 0.2183406113537118,
"grad_norm": 1.7968297696360862,
"kl": 0.034912109375,
"learning_rate": 9.102512875535438e-07,
"loss": 0.0024,
"reward": 0.9074022769927979,
"reward_std": 0.5267736315727234,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.8198763132095337,
"step": 150
},
{
"completion_length": 181.109375,
"epoch": 0.2197962154294032,
"grad_norm": 1.487459256780106,
"kl": 0.034912109375,
"learning_rate": 9.088984783582889e-07,
"loss": 0.0008,
"reward": 1.1278971433639526,
"reward_std": 0.6906546950340271,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.771959662437439,
"step": 151
},
{
"completion_length": 168.859375,
"epoch": 0.22125181950509462,
"grad_norm": 1.6688905020817022,
"kl": 0.033203125,
"learning_rate": 9.075365707431311e-07,
"loss": 0.0037,
"reward": 0.38999348878860474,
"reward_std": 0.7020710110664368,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.7868424654006958,
"step": 152
},
{
"completion_length": 189.75,
"epoch": 0.22270742358078602,
"grad_norm": 1.5057560252284368,
"kl": 0.03369140625,
"learning_rate": 9.061655950119429e-07,
"loss": -0.0008,
"reward": 0.7166080474853516,
"reward_std": 0.7406556010246277,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.7738346457481384,
"step": 153
},
{
"completion_length": 195.203125,
"epoch": 0.22416302765647744,
"grad_norm": 1.6050100488071573,
"kl": 0.029296875,
"learning_rate": 9.04785581670372e-07,
"loss": -0.0032,
"reward": 0.00641275942325592,
"reward_std": 0.9503659605979919,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.8109830617904663,
"step": 154
},
{
"completion_length": 209.5625,
"epoch": 0.22561863173216884,
"grad_norm": 1.671842517453717,
"kl": 0.030517578125,
"learning_rate": 9.033965614251622e-07,
"loss": 0.002,
"reward": 1.051744818687439,
"reward_std": 0.6885015964508057,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.8231119513511658,
"step": 155
},
{
"completion_length": 182.359375,
"epoch": 0.22707423580786026,
"grad_norm": 1.7685493760830322,
"kl": 0.03466796875,
"learning_rate": 9.019985651834703e-07,
"loss": 0.0004,
"reward": 1.1568944454193115,
"reward_std": 0.6391496658325195,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.7899413704872131,
"step": 156
},
{
"completion_length": 203.671875,
"epoch": 0.22852983988355166,
"grad_norm": 1.6158229805841648,
"kl": 0.04541015625,
"learning_rate": 9.005916240521787e-07,
"loss": -0.0008,
"reward": 1.3469856977462769,
"reward_std": 0.5584310293197632,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.8243294358253479,
"step": 157
},
{
"completion_length": 198.0,
"epoch": 0.22998544395924309,
"grad_norm": 1.5287888769470326,
"kl": 0.03173828125,
"learning_rate": 8.99175769337203e-07,
"loss": -0.0002,
"reward": 0.5042838454246521,
"reward_std": 0.3807409107685089,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.8300260901451111,
"step": 158
},
{
"completion_length": 198.5625,
"epoch": 0.2314410480349345,
"grad_norm": 1.6980632238166218,
"kl": 0.045654296875,
"learning_rate": 8.97751032542795e-07,
"loss": 0.0048,
"reward": 0.7278580665588379,
"reward_std": 0.8472484350204468,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.809654951095581,
"step": 159
},
{
"completion_length": 209.734375,
"epoch": 0.2328966521106259,
"grad_norm": 1.4763866341269176,
"kl": 0.03759765625,
"learning_rate": 8.963174453708424e-07,
"loss": -0.0002,
"reward": 1.0725455284118652,
"reward_std": 0.9471028447151184,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.8029752969741821,
"step": 160
},
{
"completion_length": 204.84375,
"epoch": 0.23435225618631733,
"grad_norm": 1.509135110734139,
"kl": 0.039794921875,
"learning_rate": 8.94875039720163e-07,
"loss": 0.0021,
"reward": 0.7245508432388306,
"reward_std": 0.28923317790031433,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.8137694597244263,
"step": 161
},
{
"completion_length": 205.6875,
"epoch": 0.23580786026200873,
"grad_norm": 1.670333061374936,
"kl": 0.03466796875,
"learning_rate": 8.934238476857949e-07,
"loss": -0.0015,
"reward": 1.22831392288208,
"reward_std": 0.6582134366035461,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.7960742115974426,
"step": 162
},
{
"completion_length": 219.171875,
"epoch": 0.23726346433770015,
"grad_norm": 1.4108741219708183,
"kl": 0.04345703125,
"learning_rate": 8.919639015582828e-07,
"loss": 0.0007,
"reward": 0.8317968845367432,
"reward_std": 0.8654596209526062,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.8558332920074463,
"step": 163
},
{
"completion_length": 221.921875,
"epoch": 0.23871906841339155,
"grad_norm": 1.5210214995683051,
"kl": 0.038330078125,
"learning_rate": 8.904952338229587e-07,
"loss": 0.0028,
"reward": 1.3629167079925537,
"reward_std": 0.4224995970726013,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 0.8020312786102295,
"step": 164
},
{
"completion_length": 221.109375,
"epoch": 0.24017467248908297,
"grad_norm": 1.570373533347191,
"kl": 0.047119140625,
"learning_rate": 8.890178771592197e-07,
"loss": -0.003,
"reward": 1.1900064945220947,
"reward_std": 0.45047634840011597,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.8544987440109253,
"step": 165
},
{
"completion_length": 226.859375,
"epoch": 0.24163027656477437,
"grad_norm": 1.4369155698170966,
"kl": 0.041748046875,
"learning_rate": 8.875318644398007e-07,
"loss": 0.0001,
"reward": 1.1164518594741821,
"reward_std": 0.6411557793617249,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.8440819978713989,
"step": 166
},
{
"completion_length": 217.828125,
"epoch": 0.2430858806404658,
"grad_norm": 1.5294020027875026,
"kl": 0.04150390625,
"learning_rate": 8.860372287300431e-07,
"loss": -0.0025,
"reward": 0.8133528828620911,
"reward_std": 0.3728664517402649,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.8488085269927979,
"step": 167
},
{
"completion_length": 232.46875,
"epoch": 0.2445414847161572,
"grad_norm": 1.4493765933491527,
"kl": 0.043701171875,
"learning_rate": 8.845340032871583e-07,
"loss": 0.0056,
"reward": 0.8572330474853516,
"reward_std": 0.6895395517349243,
"rewards/accuracy_reward": 0.640625,
"rewards/format_reward": 0.82414710521698,
"step": 168
},
{
"completion_length": 226.5,
"epoch": 0.24599708879184862,
"grad_norm": 1.4764300782122841,
"kl": 0.0556640625,
"learning_rate": 8.83022221559489e-07,
"loss": 0.0009,
"reward": 0.47426432371139526,
"reward_std": 0.8444293737411499,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.8511523604393005,
"step": 169
},
{
"completion_length": 237.8125,
"epoch": 0.24745269286754004,
"grad_norm": 1.6769320026302001,
"kl": 0.0498046875,
"learning_rate": 8.815019171857637e-07,
"loss": -0.0027,
"reward": 1.0890560150146484,
"reward_std": 0.5113028287887573,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.8479622602462769,
"step": 170
},
{
"completion_length": 261.34375,
"epoch": 0.24890829694323144,
"grad_norm": 1.2657644862990098,
"kl": 0.039794921875,
"learning_rate": 8.799731239943487e-07,
"loss": 0.0036,
"reward": 1.0662890672683716,
"reward_std": 0.6503676176071167,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.8455598950386047,
"step": 171
},
{
"completion_length": 242.8125,
"epoch": 0.25036390101892286,
"grad_norm": 1.5140610941306871,
"kl": 0.0439453125,
"learning_rate": 8.784358760024959e-07,
"loss": 0.0,
"reward": 1.0661003589630127,
"reward_std": 0.7506436705589294,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.7991602420806885,
"step": 172
},
{
"completion_length": 272.265625,
"epoch": 0.25181950509461426,
"grad_norm": 1.3002869973291449,
"kl": 0.04345703125,
"learning_rate": 8.768902074155847e-07,
"loss": 0.0012,
"reward": 0.5323176383972168,
"reward_std": 0.7601783275604248,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.8805208206176758,
"step": 173
},
{
"completion_length": 260.171875,
"epoch": 0.25327510917030566,
"grad_norm": 1.2395481937954542,
"kl": 0.04345703125,
"learning_rate": 8.753361526263621e-07,
"loss": 0.0013,
"reward": 1.0828580856323242,
"reward_std": 0.8798565864562988,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.8691210746765137,
"step": 174
},
{
"completion_length": 268.65625,
"epoch": 0.2547307132459971,
"grad_norm": 1.324781469590268,
"kl": 0.046875,
"learning_rate": 8.737737462141768e-07,
"loss": -0.0019,
"reward": 1.4227733612060547,
"reward_std": 0.8888717293739319,
"rewards/accuracy_reward": 0.859375,
"rewards/format_reward": 0.8309636116027832,
"step": 175
},
{
"completion_length": 266.9375,
"epoch": 0.2561863173216885,
"grad_norm": 1.2486364117958004,
"kl": 0.049072265625,
"learning_rate": 8.722030229442095e-07,
"loss": 0.0013,
"reward": 1.1527018547058105,
"reward_std": 0.9395639300346375,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.8161133527755737,
"step": 176
},
{
"completion_length": 287.296875,
"epoch": 0.2576419213973799,
"grad_norm": 1.3449980174695708,
"kl": 0.0458984375,
"learning_rate": 8.706240177667001e-07,
"loss": -0.0028,
"reward": 0.6147265434265137,
"reward_std": 0.7540205717086792,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.8909245133399963,
"step": 177
},
{
"completion_length": 270.5,
"epoch": 0.2590975254730713,
"grad_norm": 1.6585671029978815,
"kl": 0.052490234375,
"learning_rate": 8.690367658161694e-07,
"loss": 0.0004,
"reward": 0.9076562523841858,
"reward_std": 0.698926568031311,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.88106769323349,
"step": 178
},
{
"completion_length": 299.03125,
"epoch": 0.26055312954876275,
"grad_norm": 1.27502354364528,
"kl": 0.04443359375,
"learning_rate": 8.674413024106379e-07,
"loss": 0.0011,
"reward": 0.06214843690395355,
"reward_std": 0.6285444498062134,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.8771094083786011,
"step": 179
},
{
"completion_length": 311.5625,
"epoch": 0.26200873362445415,
"grad_norm": 1.1519559026695043,
"kl": 0.044677734375,
"learning_rate": 8.658376630508391e-07,
"loss": 0.0012,
"reward": 0.49518883228302,
"reward_std": 1.1731948852539062,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.8484830856323242,
"step": 180
},
{
"completion_length": 338.453125,
"epoch": 0.26346433770014555,
"grad_norm": 1.1630608606801263,
"kl": 0.03662109375,
"learning_rate": 8.642258834194305e-07,
"loss": 0.0,
"reward": 0.5747005343437195,
"reward_std": 1.185742974281311,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.855039119720459,
"step": 181
},
{
"completion_length": 298.203125,
"epoch": 0.264919941775837,
"grad_norm": 1.092799001313928,
"kl": 0.05078125,
"learning_rate": 8.626059993801986e-07,
"loss": 0.0013,
"reward": 1.1474609375,
"reward_std": 1.3674826622009277,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 0.7989974021911621,
"step": 182
},
{
"completion_length": 316.59375,
"epoch": 0.2663755458515284,
"grad_norm": 1.1905408752898774,
"kl": 0.03662109375,
"learning_rate": 8.609780469772621e-07,
"loss": -0.0003,
"reward": 0.4997330904006958,
"reward_std": 1.4191782474517822,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.7479101419448853,
"step": 183
},
{
"completion_length": 343.40625,
"epoch": 0.2678311499272198,
"grad_norm": 1.1407997883526075,
"kl": 0.038818359375,
"learning_rate": 8.593420624342691e-07,
"loss": -0.0005,
"reward": -0.2410416603088379,
"reward_std": 1.0026159286499023,
"rewards/accuracy_reward": 0.296875,
"rewards/format_reward": 0.806236982345581,
"step": 184
},
{
"completion_length": 292.59375,
"epoch": 0.2692867540029112,
"grad_norm": 1.3091775192956971,
"kl": 0.057373046875,
"learning_rate": 8.57698082153591e-07,
"loss": 0.0022,
"reward": 0.5668359398841858,
"reward_std": 1.0860553979873657,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.8349218368530273,
"step": 185
},
{
"completion_length": 297.1875,
"epoch": 0.27074235807860264,
"grad_norm": 1.1878094968164294,
"kl": 0.03857421875,
"learning_rate": 8.560461427155128e-07,
"loss": 0.0024,
"reward": 0.8997591137886047,
"reward_std": 1.2781544923782349,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.8541991710662842,
"step": 186
},
{
"completion_length": 283.4375,
"epoch": 0.27219796215429404,
"grad_norm": 1.3178336609529575,
"kl": 0.048828125,
"learning_rate": 8.543862808774191e-07,
"loss": -0.0003,
"reward": 1.5494986772537231,
"reward_std": 0.4223147928714752,
"rewards/accuracy_reward": 0.875,
"rewards/format_reward": 0.9061654210090637,
"step": 187
},
{
"completion_length": 296.890625,
"epoch": 0.27365356622998543,
"grad_norm": 1.2645455173847566,
"kl": 0.04541015625,
"learning_rate": 8.527185335729765e-07,
"loss": 0.0035,
"reward": 0.7015169262886047,
"reward_std": 1.2943627834320068,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.8435481786727905,
"step": 188
},
{
"completion_length": 329.90625,
"epoch": 0.27510917030567683,
"grad_norm": 1.1581867103902632,
"kl": 0.0498046875,
"learning_rate": 8.510429379113113e-07,
"loss": -0.004,
"reward": 0.47220054268836975,
"reward_std": 0.8791263103485107,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.8547526001930237,
"step": 189
},
{
"completion_length": 330.734375,
"epoch": 0.2765647743813683,
"grad_norm": 1.242052928211484,
"kl": 0.0419921875,
"learning_rate": 8.493595311761836e-07,
"loss": 0.0019,
"reward": 0.0584830716252327,
"reward_std": 0.9290468096733093,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.7730534076690674,
"step": 190
},
{
"completion_length": 298.8125,
"epoch": 0.2780203784570597,
"grad_norm": 1.0587431693427292,
"kl": 0.04931640625,
"learning_rate": 8.47668350825159e-07,
"loss": 0.002,
"reward": 0.8388606309890747,
"reward_std": 0.9359092712402344,
"rewards/accuracy_reward": 0.640625,
"rewards/format_reward": 0.8803450465202332,
"step": 191
},
{
"completion_length": 295.765625,
"epoch": 0.2794759825327511,
"grad_norm": 1.2325069141615899,
"kl": 0.044189453125,
"learning_rate": 8.459694344887731e-07,
"loss": -0.0002,
"reward": 1.281217336654663,
"reward_std": 0.7809577584266663,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.8709959983825684,
"step": 192
},
{
"completion_length": 344.203125,
"epoch": 0.28093158660844253,
"grad_norm": 1.0295971882868977,
"kl": 0.03857421875,
"learning_rate": 8.44262819969696e-07,
"loss": 0.0018,
"reward": -0.01013021171092987,
"reward_std": 1.1098777055740356,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.8218880295753479,
"step": 193
},
{
"completion_length": 341.453125,
"epoch": 0.2823871906841339,
"grad_norm": 1.1002232540991268,
"kl": 0.043212890625,
"learning_rate": 8.425485452418905e-07,
"loss": -0.0031,
"reward": 1.3040039539337158,
"reward_std": 1.2101850509643555,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.8196288347244263,
"step": 194
},
{
"completion_length": 313.125,
"epoch": 0.2838427947598253,
"grad_norm": 1.2142916830344008,
"kl": 0.04443359375,
"learning_rate": 8.408266484497664e-07,
"loss": 0.0029,
"reward": 0.5552148222923279,
"reward_std": 0.7991162538528442,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.8420246839523315,
"step": 195
},
{
"completion_length": 295.796875,
"epoch": 0.2852983988355167,
"grad_norm": 1.0476158305891778,
"kl": 0.0439453125,
"learning_rate": 8.39097167907333e-07,
"loss": -0.0038,
"reward": 0.6949739456176758,
"reward_std": 1.1475563049316406,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.8432031273841858,
"step": 196
},
{
"completion_length": 286.109375,
"epoch": 0.2867540029112082,
"grad_norm": 1.326917469312428,
"kl": 0.04931640625,
"learning_rate": 8.373601420973463e-07,
"loss": -0.0043,
"reward": 0.5129296779632568,
"reward_std": 0.5196168422698975,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.9051302075386047,
"step": 197
},
{
"completion_length": 267.21875,
"epoch": 0.28820960698689957,
"grad_norm": 1.4099736885345076,
"kl": 0.054443359375,
"learning_rate": 8.356156096704514e-07,
"loss": 0.0035,
"reward": 0.9967187643051147,
"reward_std": 0.6953111886978149,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.8652864694595337,
"step": 198
},
{
"completion_length": 291.3125,
"epoch": 0.28966521106259097,
"grad_norm": 1.3657308201508438,
"kl": 0.04296875,
"learning_rate": 8.338636094443241e-07,
"loss": 0.0016,
"reward": 0.6036978960037231,
"reward_std": 0.5057134628295898,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.8577994704246521,
"step": 199
},
{
"completion_length": 301.484375,
"epoch": 0.29112081513828236,
"grad_norm": 1.1206020594596815,
"kl": 0.04345703125,
"learning_rate": 8.32104180402807e-07,
"loss": 0.001,
"reward": 0.6353580951690674,
"reward_std": 1.240022897720337,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.8110611438751221,
"step": 200
},
{
"completion_length": 276.0,
"epoch": 0.2925764192139738,
"grad_norm": 1.4814987178126335,
"kl": 0.041015625,
"learning_rate": 8.303373616950406e-07,
"loss": 0.0003,
"reward": 0.9168750047683716,
"reward_std": 0.5409839749336243,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.8954036235809326,
"step": 201
},
{
"completion_length": 284.796875,
"epoch": 0.2940320232896652,
"grad_norm": 1.3014402895334682,
"kl": 0.04296875,
"learning_rate": 8.285631926345943e-07,
"loss": 0.0009,
"reward": 1.4197134971618652,
"reward_std": 0.7836724519729614,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.9230338335037231,
"step": 202
},
{
"completion_length": 303.765625,
"epoch": 0.2954876273653566,
"grad_norm": 1.4100146478905302,
"kl": 0.044921875,
"learning_rate": 8.267817126985897e-07,
"loss": 0.0032,
"reward": 1.148378849029541,
"reward_std": 0.8374971151351929,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.8262174725532532,
"step": 203
},
{
"completion_length": 259.140625,
"epoch": 0.29694323144104806,
"grad_norm": 1.3834197464383897,
"kl": 0.046630859375,
"learning_rate": 8.249929615268233e-07,
"loss": 0.0032,
"reward": 0.3902604579925537,
"reward_std": 0.24537065625190735,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.9234635829925537,
"step": 204
},
{
"completion_length": 309.109375,
"epoch": 0.29839883551673946,
"grad_norm": 1.2269049796652634,
"kl": 0.03857421875,
"learning_rate": 8.231969789208845e-07,
"loss": -0.0007,
"reward": 0.6886327862739563,
"reward_std": 1.174818992614746,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.8217057585716248,
"step": 205
},
{
"completion_length": 285.484375,
"epoch": 0.29985443959243085,
"grad_norm": 1.2547594996135933,
"kl": 0.0458984375,
"learning_rate": 8.213938048432696e-07,
"loss": 0.0074,
"reward": 0.8353320360183716,
"reward_std": 0.9169821739196777,
"rewards/accuracy_reward": 0.640625,
"rewards/format_reward": 0.8625064492225647,
"step": 206
},
{
"completion_length": 269.609375,
"epoch": 0.30131004366812225,
"grad_norm": 1.3754430566964044,
"kl": 0.046142578125,
"learning_rate": 8.195834794164924e-07,
"loss": 0.0009,
"reward": 1.1299283504486084,
"reward_std": 0.8097370862960815,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.91706383228302,
"step": 207
},
{
"completion_length": 278.21875,
"epoch": 0.3027656477438137,
"grad_norm": 1.5758484032719748,
"kl": 0.045166015625,
"learning_rate": 8.17766042922192e-07,
"loss": -0.006,
"reward": 1.1924219131469727,
"reward_std": 0.35831767320632935,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.8785676956176758,
"step": 208
},
{
"completion_length": 255.671875,
"epoch": 0.3042212518195051,
"grad_norm": 1.3050143773992107,
"kl": 0.0458984375,
"learning_rate": 8.15941535800236e-07,
"loss": -0.003,
"reward": 1.0622721910476685,
"reward_std": 0.30105501413345337,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.87548828125,
"step": 209
},
{
"completion_length": 275.90625,
"epoch": 0.3056768558951965,
"grad_norm": 1.1249884714957055,
"kl": 0.041015625,
"learning_rate": 8.141099986478212e-07,
"loss": -0.0028,
"reward": 1.0719857215881348,
"reward_std": 1.051162838935852,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.8484700918197632,
"step": 210
},
{
"completion_length": 285.9375,
"epoch": 0.3071324599708879,
"grad_norm": 1.3343580700065274,
"kl": 0.049560546875,
"learning_rate": 8.122714722185695e-07,
"loss": 0.0006,
"reward": 1.1121940612792969,
"reward_std": 0.45470452308654785,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.8535741567611694,
"step": 211
},
{
"completion_length": 286.96875,
"epoch": 0.30858806404657935,
"grad_norm": 1.2721134321125864,
"kl": 0.04052734375,
"learning_rate": 8.104259974216218e-07,
"loss": 0.0011,
"reward": 1.3401693105697632,
"reward_std": 0.7562965154647827,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.9284895658493042,
"step": 212
},
{
"completion_length": 263.28125,
"epoch": 0.31004366812227074,
"grad_norm": 1.2793045432243102,
"kl": 0.0419921875,
"learning_rate": 8.085736153207276e-07,
"loss": -0.0016,
"reward": 1.68442702293396,
"reward_std": 0.5725850462913513,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 0.8700129985809326,
"step": 213
},
{
"completion_length": 292.578125,
"epoch": 0.31149927219796214,
"grad_norm": 1.1759325546130317,
"kl": 0.039794921875,
"learning_rate": 8.067143671333309e-07,
"loss": -0.0033,
"reward": 0.6993098855018616,
"reward_std": 1.116674542427063,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.8843098878860474,
"step": 214
},
{
"completion_length": 293.84375,
"epoch": 0.3129548762736536,
"grad_norm": 1.3260579782974318,
"kl": 0.036865234375,
"learning_rate": 8.048482942296535e-07,
"loss": -0.0036,
"reward": 0.4943684935569763,
"reward_std": 0.884007453918457,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.8536393642425537,
"step": 215
},
{
"completion_length": 313.453125,
"epoch": 0.314410480349345,
"grad_norm": 1.244589688629832,
"kl": 0.041259765625,
"learning_rate": 8.02975438131774e-07,
"loss": 0.0005,
"reward": 0.7252734899520874,
"reward_std": 1.0751287937164307,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.8259244561195374,
"step": 216
},
{
"completion_length": 320.90625,
"epoch": 0.3158660844250364,
"grad_norm": 1.2906749232909462,
"kl": 0.0341796875,
"learning_rate": 8.010958405127047e-07,
"loss": 0.0034,
"reward": 1.0169856548309326,
"reward_std": 1.000779628753662,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.8830013275146484,
"step": 217
},
{
"completion_length": 255.71875,
"epoch": 0.3173216885007278,
"grad_norm": 1.309866431103487,
"kl": 0.042236328125,
"learning_rate": 7.992095431954634e-07,
"loss": 0.0027,
"reward": 0.8923372626304626,
"reward_std": 0.30078914761543274,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.8763346672058105,
"step": 218
},
{
"completion_length": 258.109375,
"epoch": 0.31877729257641924,
"grad_norm": 1.3975143387297508,
"kl": 0.040771484375,
"learning_rate": 7.973165881521433e-07,
"loss": 0.0012,
"reward": 1.0036718845367432,
"reward_std": 0.9058128595352173,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.8883463740348816,
"step": 219
},
{
"completion_length": 276.6875,
"epoch": 0.32023289665211063,
"grad_norm": 1.2837989231665352,
"kl": 0.03759765625,
"learning_rate": 7.954170175029791e-07,
"loss": -0.0022,
"reward": 0.8400716185569763,
"reward_std": 0.7807621955871582,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.9180272817611694,
"step": 220
},
{
"completion_length": 280.125,
"epoch": 0.32168850072780203,
"grad_norm": 1.4163987682065777,
"kl": 0.04736328125,
"learning_rate": 7.935108735154092e-07,
"loss": 0.0044,
"reward": 0.7034765481948853,
"reward_std": 0.44748401641845703,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.8766406178474426,
"step": 221
},
{
"completion_length": 277.4375,
"epoch": 0.3231441048034934,
"grad_norm": 1.2806875439187504,
"kl": 0.04150390625,
"learning_rate": 7.915981986031366e-07,
"loss": 0.0006,
"reward": 1.4459569454193115,
"reward_std": 0.6417677402496338,
"rewards/accuracy_reward": 0.859375,
"rewards/format_reward": 0.8500716090202332,
"step": 222
},
{
"completion_length": 304.625,
"epoch": 0.3245997088791849,
"grad_norm": 1.1924907907457698,
"kl": 0.037353515625,
"learning_rate": 7.896790353251835e-07,
"loss": 0.0001,
"reward": 0.46823570132255554,
"reward_std": 0.7769339084625244,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.8649283647537231,
"step": 223
},
{
"completion_length": 303.1875,
"epoch": 0.3260553129548763,
"grad_norm": 1.1503304963010326,
"kl": 0.03466796875,
"learning_rate": 7.877534263849451e-07,
"loss": 0.0005,
"reward": 1.0099999904632568,
"reward_std": 0.964565634727478,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.8617057204246521,
"step": 224
},
{
"completion_length": 282.5,
"epoch": 0.32751091703056767,
"grad_norm": 1.1761505740028426,
"kl": 0.039306640625,
"learning_rate": 7.858214146292393e-07,
"loss": -0.0016,
"reward": 0.07833331823348999,
"reward_std": 1.1625947952270508,
"rewards/accuracy_reward": 0.390625,
"rewards/format_reward": 0.8088932037353516,
"step": 225
},
{
"completion_length": 312.515625,
"epoch": 0.3289665211062591,
"grad_norm": 1.1648396839632396,
"kl": 0.0361328125,
"learning_rate": 7.838830430473538e-07,
"loss": 0.001,
"reward": -0.08507812023162842,
"reward_std": 0.9128743410110474,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.8148698210716248,
"step": 226
},
{
"completion_length": 306.5,
"epoch": 0.3304221251819505,
"grad_norm": 0.9816488205657019,
"kl": 0.036376953125,
"learning_rate": 7.819383547700889e-07,
"loss": 0.0021,
"reward": 0.7689843773841858,
"reward_std": 1.1138982772827148,
"rewards/accuracy_reward": 0.640625,
"rewards/format_reward": 0.8328125476837158,
"step": 227
},
{
"completion_length": 266.03125,
"epoch": 0.3318777292576419,
"grad_norm": 1.3503817696996208,
"kl": 0.048828125,
"learning_rate": 7.799873930687977e-07,
"loss": 0.002,
"reward": 0.3949218690395355,
"reward_std": 0.8604871034622192,
"rewards/accuracy_reward": 0.484375,
"rewards/format_reward": 0.8740885257720947,
"step": 228
},
{
"completion_length": 294.15625,
"epoch": 0.3333333333333333,
"grad_norm": 1.0990493246122626,
"kl": 0.03369140625,
"learning_rate": 7.780302013544238e-07,
"loss": 0.0002,
"reward": 1.1642253398895264,
"reward_std": 0.8671401143074036,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.8458007574081421,
"step": 229
},
{
"completion_length": 321.265625,
"epoch": 0.33478893740902477,
"grad_norm": 0.9938258028137048,
"kl": 0.037841796875,
"learning_rate": 7.760668231765351e-07,
"loss": -0.0014,
"reward": 0.6853646039962769,
"reward_std": 1.0251922607421875,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.8308333158493042,
"step": 230
},
{
"completion_length": 309.53125,
"epoch": 0.33624454148471616,
"grad_norm": 1.123171517093054,
"kl": 0.037353515625,
"learning_rate": 7.740973022223549e-07,
"loss": -0.0024,
"reward": 1.0861718654632568,
"reward_std": 1.1104838848114014,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.8241406083106995,
"step": 231
},
{
"completion_length": 309.796875,
"epoch": 0.33770014556040756,
"grad_norm": 1.0934016007582914,
"kl": 0.033935546875,
"learning_rate": 7.721216823157894e-07,
"loss": -0.0063,
"reward": 1.000657558441162,
"reward_std": 0.6190701127052307,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.875084638595581,
"step": 232
},
{
"completion_length": 277.453125,
"epoch": 0.33915574963609896,
"grad_norm": 1.4389877479066715,
"kl": 0.035400390625,
"learning_rate": 7.701400074164535e-07,
"loss": -0.0021,
"reward": 1.146998643875122,
"reward_std": 0.6130830645561218,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.8996028900146484,
"step": 233
},
{
"completion_length": 292.296875,
"epoch": 0.3406113537117904,
"grad_norm": 1.254688662869592,
"kl": 0.031982421875,
"learning_rate": 7.681523216186911e-07,
"loss": -0.0019,
"reward": 1.3471484184265137,
"reward_std": 0.8775838613510132,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.92557293176651,
"step": 234
},
{
"completion_length": 277.265625,
"epoch": 0.3420669577874818,
"grad_norm": 1.1736210621507872,
"kl": 0.037841796875,
"learning_rate": 7.661586691505961e-07,
"loss": 0.0023,
"reward": 0.8513671159744263,
"reward_std": 0.8359072208404541,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.9101953506469727,
"step": 235
},
{
"completion_length": 262.328125,
"epoch": 0.3435225618631732,
"grad_norm": 1.2639347156124685,
"kl": 0.03857421875,
"learning_rate": 7.641590943730258e-07,
"loss": 0.0009,
"reward": 1.421054720878601,
"reward_std": 0.36540117859840393,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.9068489670753479,
"step": 236
},
{
"completion_length": 289.109375,
"epoch": 0.34497816593886466,
"grad_norm": 1.2779446490859168,
"kl": 0.0419921875,
"learning_rate": 7.621536417786158e-07,
"loss": 0.0021,
"reward": 0.836510419845581,
"reward_std": 0.8601652383804321,
"rewards/accuracy_reward": 0.640625,
"rewards/format_reward": 0.8924999833106995,
"step": 237
},
{
"completion_length": 287.0625,
"epoch": 0.34643377001455605,
"grad_norm": 1.255028005763755,
"kl": 0.033935546875,
"learning_rate": 7.601423559907894e-07,
"loss": 0.0035,
"reward": 0.6539322733879089,
"reward_std": 0.5522056818008423,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.9175911545753479,
"step": 238
},
{
"completion_length": 289.578125,
"epoch": 0.34788937409024745,
"grad_norm": 1.1981964024570817,
"kl": 0.035400390625,
"learning_rate": 7.581252817627644e-07,
"loss": -0.0022,
"reward": 1.0136327743530273,
"reward_std": 0.5532514452934265,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.8591275215148926,
"step": 239
},
{
"completion_length": 294.765625,
"epoch": 0.34934497816593885,
"grad_norm": 1.3473651417615715,
"kl": 0.039306640625,
"learning_rate": 7.561024639765571e-07,
"loss": -0.0036,
"reward": 0.2712695300579071,
"reward_std": 0.544538676738739,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.8741991519927979,
"step": 240
},
{
"completion_length": 294.953125,
"epoch": 0.3508005822416303,
"grad_norm": 1.2518247684034733,
"kl": 0.039306640625,
"learning_rate": 7.540739476419846e-07,
"loss": -0.0002,
"reward": 0.4361327886581421,
"reward_std": 0.4436946213245392,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.8888281583786011,
"step": 241
},
{
"completion_length": 285.640625,
"epoch": 0.3522561863173217,
"grad_norm": 1.3471385559147369,
"kl": 0.037109375,
"learning_rate": 7.520397778956622e-07,
"loss": 0.0002,
"reward": 0.8719531297683716,
"reward_std": 0.7764471769332886,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.9443880319595337,
"step": 242
},
{
"completion_length": 278.0625,
"epoch": 0.3537117903930131,
"grad_norm": 1.245756955751304,
"kl": 0.0458984375,
"learning_rate": 7.5e-07,
"loss": 0.0024,
"reward": 0.9515169262886047,
"reward_std": 0.46474969387054443,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.8945116400718689,
"step": 243
},
{
"completion_length": 273.65625,
"epoch": 0.3551673944687045,
"grad_norm": 1.3186357128195956,
"kl": 0.041015625,
"learning_rate": 7.479546593421947e-07,
"loss": 0.0008,
"reward": 0.5497395992279053,
"reward_std": 0.9145887494087219,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.9244270324707031,
"step": 244
},
{
"completion_length": 302.703125,
"epoch": 0.35662299854439594,
"grad_norm": 1.1625630519045944,
"kl": 0.035400390625,
"learning_rate": 7.459038014332209e-07,
"loss": 0.0016,
"reward": 0.05195310711860657,
"reward_std": 0.509819507598877,
"rewards/accuracy_reward": 0.359375,
"rewards/format_reward": 0.9129297733306885,
"step": 245
},
{
"completion_length": 311.40625,
"epoch": 0.35807860262008734,
"grad_norm": 1.3125269152541976,
"kl": 0.0380859375,
"learning_rate": 7.438474719068173e-07,
"loss": -0.0042,
"reward": 1.84199857711792,
"reward_std": 0.3553314805030823,
"rewards/accuracy_reward": 0.96875,
"rewards/format_reward": 0.9321680068969727,
"step": 246
},
{
"completion_length": 295.4375,
"epoch": 0.35953420669577874,
"grad_norm": 1.2573734594613415,
"kl": 0.036865234375,
"learning_rate": 7.417857165184723e-07,
"loss": 0.0018,
"reward": 0.6018099188804626,
"reward_std": 0.4691210389137268,
"rewards/accuracy_reward": 0.546875,
"rewards/format_reward": 0.9123698472976685,
"step": 247
},
{
"completion_length": 306.125,
"epoch": 0.3609898107714702,
"grad_norm": 1.2433464807920045,
"kl": 0.03564453125,
"learning_rate": 7.397185811444049e-07,
"loss": 0.0005,
"reward": 0.889244794845581,
"reward_std": 0.806644856929779,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.8913020491600037,
"step": 248
},
{
"completion_length": 315.3125,
"epoch": 0.3624454148471616,
"grad_norm": 1.1237789290801656,
"kl": 0.042236328125,
"learning_rate": 7.376461117805449e-07,
"loss": -0.001,
"reward": 1.3869401216506958,
"reward_std": 0.9981783628463745,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.8924478888511658,
"step": 249
},
{
"completion_length": 299.75,
"epoch": 0.363901018922853,
"grad_norm": 1.1632799586225018,
"kl": 0.03759765625,
"learning_rate": 7.355683545415089e-07,
"loss": 0.0006,
"reward": 0.85239577293396,
"reward_std": 0.6559892892837524,
"rewards/accuracy_reward": 0.640625,
"rewards/format_reward": 0.8976563215255737,
"step": 250
},
{
"completion_length": 279.09375,
"epoch": 0.3653566229985444,
"grad_norm": 1.1514063608017875,
"kl": 0.04248046875,
"learning_rate": 7.33485355659574e-07,
"loss": -0.0002,
"reward": 0.6367447972297668,
"reward_std": 1.1905419826507568,
"rewards/accuracy_reward": 0.546875,
"rewards/format_reward": 0.913867175579071,
"step": 251
},
{
"completion_length": 319.765625,
"epoch": 0.36681222707423583,
"grad_norm": 0.9469865528331761,
"kl": 0.0361328125,
"learning_rate": 7.313971614836495e-07,
"loss": -0.0009,
"reward": 0.50822913646698,
"reward_std": 0.788324236869812,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.8841667175292969,
"step": 252
},
{
"completion_length": 321.515625,
"epoch": 0.3682678311499272,
"grad_norm": 1.1463942286208362,
"kl": 0.038818359375,
"learning_rate": 7.293038184782454e-07,
"loss": -0.0042,
"reward": 1.2917838096618652,
"reward_std": 0.6375994086265564,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.89473956823349,
"step": 253
},
{
"completion_length": 341.265625,
"epoch": 0.3697234352256186,
"grad_norm": 0.9758868458165839,
"kl": 0.032958984375,
"learning_rate": 7.272053732224387e-07,
"loss": 0.004,
"reward": -0.20222003757953644,
"reward_std": 0.5149335265159607,
"rewards/accuracy_reward": 0.296875,
"rewards/format_reward": 0.8958659172058105,
"step": 254
},
{
"completion_length": 332.5625,
"epoch": 0.37117903930131,
"grad_norm": 1.1030567827011573,
"kl": 0.039794921875,
"learning_rate": 7.251018724088366e-07,
"loss": 0.0035,
"reward": 0.6221875548362732,
"reward_std": 1.2777836322784424,
"rewards/accuracy_reward": 0.578125,
"rewards/format_reward": 0.87088543176651,
"step": 255
},
{
"completion_length": 296.59375,
"epoch": 0.3726346433770015,
"grad_norm": 1.1648513737927273,
"kl": 0.042724609375,
"learning_rate": 7.22993362842538e-07,
"loss": 0.0038,
"reward": 0.18544921278953552,
"reward_std": 0.922585666179657,
"rewards/accuracy_reward": 0.421875,
"rewards/format_reward": 0.8398241996765137,
"step": 256
},
{
"completion_length": 325.078125,
"epoch": 0.37409024745269287,
"grad_norm": 1.063758965452858,
"kl": 0.03662109375,
"learning_rate": 7.208798914400915e-07,
"loss": 0.0011,
"reward": 0.6968294382095337,
"reward_std": 1.144836187362671,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.8460221290588379,
"step": 257
},
{
"completion_length": 315.84375,
"epoch": 0.37554585152838427,
"grad_norm": 1.1772919287263088,
"kl": 0.036376953125,
"learning_rate": 7.187615052284521e-07,
"loss": -0.0012,
"reward": 1.352858066558838,
"reward_std": 0.5259904861450195,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.9533268213272095,
"step": 258
},
{
"completion_length": 317.984375,
"epoch": 0.37700145560407566,
"grad_norm": 1.0657146598710652,
"kl": 0.03759765625,
"learning_rate": 7.166382513439343e-07,
"loss": 0.0,
"reward": 0.8918294310569763,
"reward_std": 0.6458997130393982,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.8966862559318542,
"step": 259
},
{
"completion_length": 319.5,
"epoch": 0.3784570596797671,
"grad_norm": 1.074343844728951,
"kl": 0.037841796875,
"learning_rate": 7.145101770311633e-07,
"loss": -0.0006,
"reward": 0.8149153590202332,
"reward_std": 1.0369267463684082,
"rewards/accuracy_reward": 0.640625,
"rewards/format_reward": 0.86767578125,
"step": 260
},
{
"completion_length": 308.359375,
"epoch": 0.3799126637554585,
"grad_norm": 1.291348430451882,
"kl": 0.03662109375,
"learning_rate": 7.12377329642024e-07,
"loss": 0.0032,
"reward": 1.153606653213501,
"reward_std": 0.9500166773796082,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.8819010853767395,
"step": 261
},
{
"completion_length": 325.78125,
"epoch": 0.3813682678311499,
"grad_norm": 1.0948918878420022,
"kl": 0.0380859375,
"learning_rate": 7.102397566346072e-07,
"loss": -0.0025,
"reward": 0.6883333325386047,
"reward_std": 0.6285832524299622,
"rewards/accuracy_reward": 0.578125,
"rewards/format_reward": 0.9229167103767395,
"step": 262
},
{
"completion_length": 303.671875,
"epoch": 0.38282387190684136,
"grad_norm": 1.2246966458821558,
"kl": 0.039306640625,
"learning_rate": 7.080975055721537e-07,
"loss": 0.0028,
"reward": 1.2784569263458252,
"reward_std": 0.6068631410598755,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 0.9090169072151184,
"step": 263
},
{
"completion_length": 301.515625,
"epoch": 0.38427947598253276,
"grad_norm": 1.274689095724219,
"kl": 0.034423828125,
"learning_rate": 7.059506241219964e-07,
"loss": 0.0038,
"reward": 0.5575065016746521,
"reward_std": 0.2938391864299774,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.9139648079872131,
"step": 264
},
{
"completion_length": 317.65625,
"epoch": 0.38573508005822416,
"grad_norm": 1.1114171001211495,
"kl": 0.037353515625,
"learning_rate": 7.037991600544982e-07,
"loss": 0.0001,
"reward": 0.32826173305511475,
"reward_std": 1.0576179027557373,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.8831444978713989,
"step": 265
},
{
"completion_length": 307.125,
"epoch": 0.38719068413391555,
"grad_norm": 1.0365067938257357,
"kl": 0.03271484375,
"learning_rate": 7.016431612419906e-07,
"loss": 0.0014,
"reward": 0.7509114742279053,
"reward_std": 1.0201146602630615,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.8540234565734863,
"step": 266
},
{
"completion_length": 315.0625,
"epoch": 0.388646288209607,
"grad_norm": 1.1542972359276211,
"kl": 0.039794921875,
"learning_rate": 6.994826756577081e-07,
"loss": 0.0028,
"reward": 0.3422461152076721,
"reward_std": 0.7236604690551758,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.8930273652076721,
"step": 267
},
{
"completion_length": 292.453125,
"epoch": 0.3901018922852984,
"grad_norm": 1.247691385135112,
"kl": 0.03759765625,
"learning_rate": 6.973177513747204e-07,
"loss": 0.004,
"reward": 1.1121549606323242,
"reward_std": 0.2723959684371948,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.9376627206802368,
"step": 268
},
{
"completion_length": 327.25,
"epoch": 0.3915574963609898,
"grad_norm": 1.1547141576656188,
"kl": 0.03662109375,
"learning_rate": 6.951484365648627e-07,
"loss": 0.0021,
"reward": 0.8944270610809326,
"reward_std": 0.8052605390548706,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.8651041984558105,
"step": 269
},
{
"completion_length": 303.03125,
"epoch": 0.3930131004366812,
"grad_norm": 1.3851574923540422,
"kl": 0.03955078125,
"learning_rate": 6.929747794976643e-07,
"loss": -0.0045,
"reward": 1.0955729484558105,
"reward_std": 0.38242411613464355,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.9138151407241821,
"step": 270
},
{
"completion_length": 271.625,
"epoch": 0.39446870451237265,
"grad_norm": 1.3401818240693362,
"kl": 0.037353515625,
"learning_rate": 6.907968285392743e-07,
"loss": -0.0006,
"reward": 1.7740495204925537,
"reward_std": 0.47597813606262207,
"rewards/accuracy_reward": 0.953125,
"rewards/format_reward": 0.9092708826065063,
"step": 271
},
{
"completion_length": 294.453125,
"epoch": 0.39592430858806404,
"grad_norm": 1.2186833540315785,
"kl": 0.036865234375,
"learning_rate": 6.886146321513849e-07,
"loss": -0.0039,
"reward": 0.8727213740348816,
"reward_std": 0.7979456186294556,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.86697918176651,
"step": 272
},
{
"completion_length": 299.578125,
"epoch": 0.39737991266375544,
"grad_norm": 1.1830905690032625,
"kl": 0.0390625,
"learning_rate": 6.864282388901543e-07,
"loss": -0.0064,
"reward": 0.28068357706069946,
"reward_std": 0.5684016942977905,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.9229100942611694,
"step": 273
},
{
"completion_length": 298.40625,
"epoch": 0.3988355167394469,
"grad_norm": 1.1221914279750511,
"kl": 0.035888671875,
"learning_rate": 6.84237697405125e-07,
"loss": -0.003,
"reward": 1.0086263418197632,
"reward_std": 0.8679967522621155,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.8907877206802368,
"step": 274
},
{
"completion_length": 326.9375,
"epoch": 0.4002911208151383,
"grad_norm": 1.0580769064192943,
"kl": 0.0400390625,
"learning_rate": 6.820430564381419e-07,
"loss": -0.0007,
"reward": 0.6745051741600037,
"reward_std": 0.84361732006073,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.8712891340255737,
"step": 275
},
{
"completion_length": 293.09375,
"epoch": 0.4017467248908297,
"grad_norm": 1.2858645148857393,
"kl": 0.04541015625,
"learning_rate": 6.79844364822268e-07,
"loss": -0.0051,
"reward": 0.7683008313179016,
"reward_std": 0.6962195634841919,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.893470048904419,
"step": 276
},
{
"completion_length": 328.234375,
"epoch": 0.4032023289665211,
"grad_norm": 1.0468195864436123,
"kl": 0.038818359375,
"learning_rate": 6.776416714806969e-07,
"loss": -0.0035,
"reward": 1.0561068058013916,
"reward_std": 1.1647756099700928,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.8450260162353516,
"step": 277
},
{
"completion_length": 309.640625,
"epoch": 0.40465793304221254,
"grad_norm": 1.3703569385022722,
"kl": 0.042724609375,
"learning_rate": 6.754350254256652e-07,
"loss": 0.003,
"reward": 1.3729296922683716,
"reward_std": 0.5724613070487976,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 0.9223437309265137,
"step": 278
},
{
"completion_length": 299.453125,
"epoch": 0.40611353711790393,
"grad_norm": 1.1513843198550995,
"kl": 0.0458984375,
"learning_rate": 6.732244757573618e-07,
"loss": 0.0021,
"reward": 0.58970046043396,
"reward_std": 0.6756365299224854,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.8674869537353516,
"step": 279
},
{
"completion_length": 285.890625,
"epoch": 0.40756914119359533,
"grad_norm": 1.293846147841921,
"kl": 0.0390625,
"learning_rate": 6.710100716628344e-07,
"loss": 0.0049,
"reward": 0.7233072519302368,
"reward_std": 0.29892754554748535,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.8979557752609253,
"step": 280
},
{
"completion_length": 266.390625,
"epoch": 0.4090247452692867,
"grad_norm": 1.2289155325973107,
"kl": 0.041259765625,
"learning_rate": 6.687918624148963e-07,
"loss": -0.0007,
"reward": 0.04064452648162842,
"reward_std": 0.2825842499732971,
"rewards/accuracy_reward": 0.359375,
"rewards/format_reward": 0.8914909362792969,
"step": 281
},
{
"completion_length": 297.8125,
"epoch": 0.4104803493449782,
"grad_norm": 1.1879387558526018,
"kl": 0.037109375,
"learning_rate": 6.665698973710288e-07,
"loss": -0.0028,
"reward": 0.9278711080551147,
"reward_std": 0.7110856771469116,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.8445507884025574,
"step": 282
},
{
"completion_length": 300.5625,
"epoch": 0.4119359534206696,
"grad_norm": 1.1205832368306863,
"kl": 0.03759765625,
"learning_rate": 6.643442259722845e-07,
"loss": -0.0021,
"reward": 0.5016796588897705,
"reward_std": 0.8245861530303955,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.8688150644302368,
"step": 283
},
{
"completion_length": 291.46875,
"epoch": 0.413391557496361,
"grad_norm": 1.1232126206960045,
"kl": 0.040771484375,
"learning_rate": 6.621148977421855e-07,
"loss": 0.0054,
"reward": 1.0367382764816284,
"reward_std": 0.8832352757453918,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.9021419286727905,
"step": 284
},
{
"completion_length": 288.328125,
"epoch": 0.4148471615720524,
"grad_norm": 1.186415542575965,
"kl": 0.035888671875,
"learning_rate": 6.598819622856226e-07,
"loss": -0.0035,
"reward": 1.6761784553527832,
"reward_std": 0.69477379322052,
"rewards/accuracy_reward": 0.921875,
"rewards/format_reward": 0.9105534553527832,
"step": 285
},
{
"completion_length": 292.609375,
"epoch": 0.4163027656477438,
"grad_norm": 1.2789098266158088,
"kl": 0.043212890625,
"learning_rate": 6.576454692877512e-07,
"loss": 0.0009,
"reward": 1.4440560340881348,
"reward_std": 0.40633606910705566,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 0.9008529186248779,
"step": 286
},
{
"completion_length": 300.9375,
"epoch": 0.4177583697234352,
"grad_norm": 1.1125232437916257,
"kl": 0.039306640625,
"learning_rate": 6.554054685128856e-07,
"loss": 0.003,
"reward": 0.5055012702941895,
"reward_std": 0.5191164612770081,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.9176367521286011,
"step": 287
},
{
"completion_length": 305.1875,
"epoch": 0.4192139737991266,
"grad_norm": 1.3508336793233238,
"kl": 0.042236328125,
"learning_rate": 6.531620098033918e-07,
"loss": -0.0003,
"reward": 0.727037787437439,
"reward_std": 0.7846024036407471,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.9105533957481384,
"step": 288
},
{
"completion_length": 286.953125,
"epoch": 0.42066957787481807,
"grad_norm": 1.355472255753869,
"kl": 0.0419921875,
"learning_rate": 6.509151430785785e-07,
"loss": 0.0057,
"reward": 1.5655077695846558,
"reward_std": 0.363979309797287,
"rewards/accuracy_reward": 0.875,
"rewards/format_reward": 0.9299349188804626,
"step": 289
},
{
"completion_length": 324.296875,
"epoch": 0.42212518195050946,
"grad_norm": 1.0617875027109431,
"kl": 0.042236328125,
"learning_rate": 6.486649183335862e-07,
"loss": 0.0028,
"reward": 0.3495572805404663,
"reward_std": 0.61403489112854,
"rewards/accuracy_reward": 0.484375,
"rewards/format_reward": 0.8596354126930237,
"step": 290
},
{
"completion_length": 303.375,
"epoch": 0.42358078602620086,
"grad_norm": 1.2306825555758596,
"kl": 0.040283203125,
"learning_rate": 6.464113856382751e-07,
"loss": 0.0013,
"reward": 0.9183528423309326,
"reward_std": 0.4856081008911133,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.88413405418396,
"step": 291
},
{
"completion_length": 339.09375,
"epoch": 0.42503639010189226,
"grad_norm": 0.9937441597737549,
"kl": 0.035400390625,
"learning_rate": 6.441545951361109e-07,
"loss": -0.0029,
"reward": 0.5682356357574463,
"reward_std": 1.1054012775421143,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.8453580737113953,
"step": 292
},
{
"completion_length": 325.109375,
"epoch": 0.4264919941775837,
"grad_norm": 1.1320952087408245,
"kl": 0.03857421875,
"learning_rate": 6.418945970430485e-07,
"loss": -0.0018,
"reward": 1.6307356357574463,
"reward_std": 0.8666630983352661,
"rewards/accuracy_reward": 0.90625,
"rewards/format_reward": 0.9119856357574463,
"step": 293
},
{
"completion_length": 312.953125,
"epoch": 0.4279475982532751,
"grad_norm": 1.0902739141585274,
"kl": 0.042236328125,
"learning_rate": 6.39631441646415e-07,
"loss": -0.003,
"reward": 0.8973242044448853,
"reward_std": 0.8014050722122192,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.8655273914337158,
"step": 294
},
{
"completion_length": 317.765625,
"epoch": 0.4294032023289665,
"grad_norm": 1.1492762649925403,
"kl": 0.04052734375,
"learning_rate": 6.373651793037916e-07,
"loss": -0.001,
"reward": 0.6366666555404663,
"reward_std": 0.6303349733352661,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.9156770706176758,
"step": 295
},
{
"completion_length": 303.640625,
"epoch": 0.43085880640465796,
"grad_norm": 1.1237468628260645,
"kl": 0.044921875,
"learning_rate": 6.35095860441891e-07,
"loss": 0.0027,
"reward": 0.8331836462020874,
"reward_std": 0.38058170676231384,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.9308528900146484,
"step": 296
},
{
"completion_length": 309.765625,
"epoch": 0.43231441048034935,
"grad_norm": 1.079089420677735,
"kl": 0.041015625,
"learning_rate": 6.328235355554381e-07,
"loss": 0.0039,
"reward": 1.14655601978302,
"reward_std": 0.9032782912254333,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.9077669382095337,
"step": 297
},
{
"completion_length": 327.1875,
"epoch": 0.43377001455604075,
"grad_norm": 1.1012850272675234,
"kl": 0.041015625,
"learning_rate": 6.305482552060441e-07,
"loss": -0.0018,
"reward": 0.41550779342651367,
"reward_std": 0.9270626306533813,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.8848437070846558,
"step": 298
},
{
"completion_length": 321.90625,
"epoch": 0.43522561863173215,
"grad_norm": 1.1334862363298608,
"kl": 0.0439453125,
"learning_rate": 6.282700700210826e-07,
"loss": 0.0016,
"reward": 0.8490754961967468,
"reward_std": 0.8852044939994812,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.950390636920929,
"step": 299
},
{
"completion_length": 319.46875,
"epoch": 0.4366812227074236,
"grad_norm": 1.3237664738865698,
"kl": 0.04052734375,
"learning_rate": 6.259890306925626e-07,
"loss": 0.0024,
"reward": 1.1324349641799927,
"reward_std": 0.6891584992408752,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.9130728840827942,
"step": 300
},
{
"completion_length": 340.21875,
"epoch": 0.438136826783115,
"grad_norm": 0.925628291932368,
"kl": 0.039794921875,
"learning_rate": 6.237051879760013e-07,
"loss": -0.0004,
"reward": -0.13870440423488617,
"reward_std": 0.4732888340950012,
"rewards/accuracy_reward": 0.328125,
"rewards/format_reward": 0.8486002683639526,
"step": 301
},
{
"completion_length": 293.5,
"epoch": 0.4395924308588064,
"grad_norm": 1.3848838366572507,
"kl": 0.048583984375,
"learning_rate": 6.214185926892935e-07,
"loss": 0.0011,
"reward": 0.7430468797683716,
"reward_std": 0.5002673864364624,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.9450520873069763,
"step": 302
},
{
"completion_length": 301.5,
"epoch": 0.4410480349344978,
"grad_norm": 1.1287435583620058,
"kl": 0.057373046875,
"learning_rate": 6.191292957115824e-07,
"loss": 0.0004,
"reward": 1.1467642784118652,
"reward_std": 0.2716268002986908,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.9220898151397705,
"step": 303
},
{
"completion_length": 313.65625,
"epoch": 0.44250363901018924,
"grad_norm": 1.1031415899812522,
"kl": 0.0419921875,
"learning_rate": 6.168373479821263e-07,
"loss": 0.0023,
"reward": 0.6278645992279053,
"reward_std": 0.4927278757095337,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.9092448353767395,
"step": 304
},
{
"completion_length": 324.65625,
"epoch": 0.44395924308588064,
"grad_norm": 1.1753628489624894,
"kl": 0.04296875,
"learning_rate": 6.145428004991649e-07,
"loss": 0.0031,
"reward": 1.2421419620513916,
"reward_std": 0.5803054571151733,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.93757164478302,
"step": 305
},
{
"completion_length": 329.421875,
"epoch": 0.44541484716157204,
"grad_norm": 1.0124718909261303,
"kl": 0.042724609375,
"learning_rate": 6.122457043187862e-07,
"loss": 0.0026,
"reward": 1.2116667032241821,
"reward_std": 0.6427367925643921,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.90095055103302,
"step": 306
},
{
"completion_length": 308.265625,
"epoch": 0.4468704512372635,
"grad_norm": 1.0168849937197908,
"kl": 0.047607421875,
"learning_rate": 6.099461105537888e-07,
"loss": -0.0027,
"reward": 0.7654426693916321,
"reward_std": 0.5294202566146851,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.9123828411102295,
"step": 307
},
{
"completion_length": 312.984375,
"epoch": 0.4483260553129549,
"grad_norm": 1.304050444945634,
"kl": 0.04443359375,
"learning_rate": 6.076440703725452e-07,
"loss": -0.0045,
"reward": 0.4798697829246521,
"reward_std": 0.11288902163505554,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.9344010353088379,
"step": 308
},
{
"completion_length": 316.625,
"epoch": 0.4497816593886463,
"grad_norm": 1.0318927611491964,
"kl": 0.047607421875,
"learning_rate": 6.053396349978631e-07,
"loss": -0.0003,
"reward": 0.36076819896698,
"reward_std": 0.37749600410461426,
"rewards/accuracy_reward": 0.484375,
"rewards/format_reward": 0.8674870133399963,
"step": 309
},
{
"completion_length": 321.265625,
"epoch": 0.4512372634643377,
"grad_norm": 1.054780956617541,
"kl": 0.0517578125,
"learning_rate": 6.030328557058463e-07,
"loss": 0.0008,
"reward": 0.5132877826690674,
"reward_std": 0.8486453294754028,
"rewards/accuracy_reward": 0.546875,
"rewards/format_reward": 0.8326627612113953,
"step": 310
},
{
"completion_length": 321.21875,
"epoch": 0.45269286754002913,
"grad_norm": 1.1712340167228774,
"kl": 0.045654296875,
"learning_rate": 6.007237838247525e-07,
"loss": 0.0004,
"reward": 1.3080989122390747,
"reward_std": 0.7990570068359375,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.91447913646698,
"step": 311
},
{
"completion_length": 306.953125,
"epoch": 0.45414847161572053,
"grad_norm": 1.2366218066275552,
"kl": 0.050048828125,
"learning_rate": 5.984124707338527e-07,
"loss": -0.0022,
"reward": 1.22621750831604,
"reward_std": 0.586514413356781,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.9193814396858215,
"step": 312
},
{
"completion_length": 290.796875,
"epoch": 0.4556040756914119,
"grad_norm": 1.4431350657604642,
"kl": 0.051513671875,
"learning_rate": 5.960989678622864e-07,
"loss": 0.0034,
"reward": 1.7498502731323242,
"reward_std": 0.417039155960083,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 0.9267643094062805,
"step": 313
},
{
"completion_length": 324.75,
"epoch": 0.4570596797671033,
"grad_norm": 1.2410969845454607,
"kl": 0.0419921875,
"learning_rate": 5.937833266879186e-07,
"loss": -0.0065,
"reward": 1.30293607711792,
"reward_std": 1.056250810623169,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.9061523675918579,
"step": 314
},
{
"completion_length": 299.71875,
"epoch": 0.4585152838427948,
"grad_norm": 1.206619606919444,
"kl": 0.049072265625,
"learning_rate": 5.914655987361933e-07,
"loss": 0.0001,
"reward": 1.1884570121765137,
"reward_std": 0.744032621383667,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 0.8389909267425537,
"step": 315
},
{
"completion_length": 320.125,
"epoch": 0.45997088791848617,
"grad_norm": 1.3350598933022013,
"kl": 0.048583984375,
"learning_rate": 5.891458355789879e-07,
"loss": 0.0027,
"reward": 0.7094922065734863,
"reward_std": 0.5736180543899536,
"rewards/accuracy_reward": 0.578125,
"rewards/format_reward": 0.9550390839576721,
"step": 316
},
{
"completion_length": 317.203125,
"epoch": 0.46142649199417757,
"grad_norm": 1.1181223383097147,
"kl": 0.05078125,
"learning_rate": 5.868240888334652e-07,
"loss": 0.001,
"reward": 0.6829947829246521,
"reward_std": 0.4590635299682617,
"rewards/accuracy_reward": 0.578125,
"rewards/format_reward": 0.9185676574707031,
"step": 317
},
{
"completion_length": 340.171875,
"epoch": 0.462882096069869,
"grad_norm": 1.0567451773483107,
"kl": 0.052734375,
"learning_rate": 5.845004101609246e-07,
"loss": 0.0011,
"reward": 0.6952344179153442,
"reward_std": 0.4979342818260193,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.9068880677223206,
"step": 318
},
{
"completion_length": 320.375,
"epoch": 0.4643377001455604,
"grad_norm": 1.216157657027118,
"kl": 0.0498046875,
"learning_rate": 5.82174851265653e-07,
"loss": 0.0047,
"reward": 0.7820637822151184,
"reward_std": 0.6226429343223572,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.8854231834411621,
"step": 319
},
{
"completion_length": 324.625,
"epoch": 0.4657933042212518,
"grad_norm": 1.091320761141732,
"kl": 0.04541015625,
"learning_rate": 5.798474638937747e-07,
"loss": 0.0003,
"reward": 1.2951171398162842,
"reward_std": 0.6852482557296753,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 0.9411588907241821,
"step": 320
},
{
"completion_length": 310.015625,
"epoch": 0.4672489082969432,
"grad_norm": 1.305156958844844,
"kl": 0.054443359375,
"learning_rate": 5.775182998320989e-07,
"loss": -0.0021,
"reward": 1.2777929306030273,
"reward_std": 1.1254336833953857,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.8748111724853516,
"step": 321
},
{
"completion_length": 330.21875,
"epoch": 0.46870451237263466,
"grad_norm": 1.2302790562732457,
"kl": 0.045166015625,
"learning_rate": 5.751874109069684e-07,
"loss": -0.0025,
"reward": 1.56145179271698,
"reward_std": 0.5644485950469971,
"rewards/accuracy_reward": 0.875,
"rewards/format_reward": 0.9349414110183716,
"step": 322
},
{
"completion_length": 341.703125,
"epoch": 0.47016011644832606,
"grad_norm": 1.0929022937248165,
"kl": 0.04736328125,
"learning_rate": 5.728548489831057e-07,
"loss": 0.0025,
"reward": 0.21611331403255463,
"reward_std": 0.6904685497283936,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.8760481476783752,
"step": 323
},
{
"completion_length": 316.984375,
"epoch": 0.47161572052401746,
"grad_norm": 1.048375002277041,
"kl": 0.04736328125,
"learning_rate": 5.705206659624596e-07,
"loss": -0.001,
"reward": 1.0557878017425537,
"reward_std": 0.761435866355896,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.9400846362113953,
"step": 324
},
{
"completion_length": 323.625,
"epoch": 0.47307132459970885,
"grad_norm": 1.1891964320438795,
"kl": 0.0498046875,
"learning_rate": 5.6818491378305e-07,
"loss": -0.0003,
"reward": 1.0568814277648926,
"reward_std": 0.422406941652298,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.9388867020606995,
"step": 325
},
{
"completion_length": 358.796875,
"epoch": 0.4745269286754003,
"grad_norm": 1.1285434321384047,
"kl": 0.046630859375,
"learning_rate": 5.658476444178118e-07,
"loss": -0.0037,
"reward": 0.16022136807441711,
"reward_std": 0.4567154049873352,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.9228385090827942,
"step": 326
},
{
"completion_length": 311.484375,
"epoch": 0.4759825327510917,
"grad_norm": 1.2560433618274836,
"kl": 0.05224609375,
"learning_rate": 5.635089098734393e-07,
"loss": 0.0005,
"reward": 1.5854361057281494,
"reward_std": 0.5700551271438599,
"rewards/accuracy_reward": 0.875,
"rewards/format_reward": 0.9499154090881348,
"step": 327
},
{
"completion_length": 350.953125,
"epoch": 0.4774381368267831,
"grad_norm": 1.0742309488389477,
"kl": 0.046142578125,
"learning_rate": 5.611687621892286e-07,
"loss": 0.0009,
"reward": 1.1679883003234863,
"reward_std": 0.7857703566551208,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.9162304401397705,
"step": 328
},
{
"completion_length": 341.96875,
"epoch": 0.47889374090247455,
"grad_norm": 1.1156444737560618,
"kl": 0.046142578125,
"learning_rate": 5.588272534359192e-07,
"loss": 0.005,
"reward": 1.3799349069595337,
"reward_std": 0.4269542694091797,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 0.9424349069595337,
"step": 329
},
{
"completion_length": 329.125,
"epoch": 0.48034934497816595,
"grad_norm": 1.0577263144575106,
"kl": 0.04833984375,
"learning_rate": 5.564844357145364e-07,
"loss": 0.0029,
"reward": 1.4966275691986084,
"reward_std": 0.5279096961021423,
"rewards/accuracy_reward": 0.859375,
"rewards/format_reward": 0.9164062738418579,
"step": 330
},
{
"completion_length": 336.40625,
"epoch": 0.48180494905385735,
"grad_norm": 1.218724139541029,
"kl": 0.051513671875,
"learning_rate": 5.541403611552309e-07,
"loss": 0.0004,
"reward": 0.5468424558639526,
"reward_std": 0.6334984302520752,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.9350846409797668,
"step": 331
},
{
"completion_length": 372.921875,
"epoch": 0.48326055312954874,
"grad_norm": 0.9652643655545916,
"kl": 0.043212890625,
"learning_rate": 5.517950819161196e-07,
"loss": -0.0024,
"reward": 0.44192707538604736,
"reward_std": 1.4124200344085693,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.8328776359558105,
"step": 332
},
{
"completion_length": 353.0,
"epoch": 0.4847161572052402,
"grad_norm": 1.0409879516651472,
"kl": 0.046142578125,
"learning_rate": 5.49448650182125e-07,
"loss": 0.0024,
"reward": 0.6646744608879089,
"reward_std": 0.7230473756790161,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.8712239265441895,
"step": 333
},
{
"completion_length": 333.578125,
"epoch": 0.4861717612809316,
"grad_norm": 1.0907676643802855,
"kl": 0.052978515625,
"learning_rate": 5.47101118163813e-07,
"loss": 0.0032,
"reward": 1.0463411808013916,
"reward_std": 0.6876223087310791,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.9171745181083679,
"step": 334
},
{
"completion_length": 345.171875,
"epoch": 0.487627365356623,
"grad_norm": 1.0768322878334375,
"kl": 0.051025390625,
"learning_rate": 5.447525380962334e-07,
"loss": 0.0012,
"reward": 0.48595699667930603,
"reward_std": 0.8556101322174072,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.8658137917518616,
"step": 335
},
{
"completion_length": 357.90625,
"epoch": 0.4890829694323144,
"grad_norm": 1.0818933763915959,
"kl": 0.0439453125,
"learning_rate": 5.424029622377546e-07,
"loss": -0.0016,
"reward": 0.9220898151397705,
"reward_std": 1.0697258710861206,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.8551627397537231,
"step": 336
},
{
"completion_length": 367.09375,
"epoch": 0.49053857350800584,
"grad_norm": 0.9408245385193819,
"kl": 0.046142578125,
"learning_rate": 5.400524428689035e-07,
"loss": -0.0019,
"reward": 0.23695963621139526,
"reward_std": 0.7709278464317322,
"rewards/accuracy_reward": 0.453125,
"rewards/format_reward": 0.8613867163658142,
"step": 337
},
{
"completion_length": 341.09375,
"epoch": 0.49199417758369723,
"grad_norm": 0.9905746624278071,
"kl": 0.050048828125,
"learning_rate": 5.377010322912008e-07,
"loss": -0.0,
"reward": 0.875012993812561,
"reward_std": 0.9516023397445679,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.8464062213897705,
"step": 338
},
{
"completion_length": 340.9375,
"epoch": 0.49344978165938863,
"grad_norm": 1.1600321561212144,
"kl": 0.052001953125,
"learning_rate": 5.353487828259972e-07,
"loss": 0.0018,
"reward": 0.9952148795127869,
"reward_std": 0.7256425619125366,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.9173762798309326,
"step": 339
},
{
"completion_length": 362.953125,
"epoch": 0.4949053857350801,
"grad_norm": 0.9677169222469415,
"kl": 0.043212890625,
"learning_rate": 5.329957468133103e-07,
"loss": -0.0019,
"reward": 0.9681054353713989,
"reward_std": 0.8885953426361084,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.8975194692611694,
"step": 340
},
{
"completion_length": 348.46875,
"epoch": 0.4963609898107715,
"grad_norm": 0.9992111334554165,
"kl": 0.04541015625,
"learning_rate": 5.306419766106581e-07,
"loss": -0.0002,
"reward": 0.6875976324081421,
"reward_std": 0.5412222146987915,
"rewards/accuracy_reward": 0.578125,
"rewards/format_reward": 0.9448763132095337,
"step": 341
},
{
"completion_length": 339.234375,
"epoch": 0.4978165938864629,
"grad_norm": 1.0798384611489298,
"kl": 0.047119140625,
"learning_rate": 5.282875245918962e-07,
"loss": -0.0015,
"reward": 1.2949869632720947,
"reward_std": 0.6558061838150024,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.8986979722976685,
"step": 342
},
{
"completion_length": 328.71875,
"epoch": 0.4992721979621543,
"grad_norm": 1.1460335335257221,
"kl": 0.056884765625,
"learning_rate": 5.259324431460506e-07,
"loss": 0.0016,
"reward": 1.0809309482574463,
"reward_std": 0.42390185594558716,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.9173502326011658,
"step": 343
},
{
"completion_length": 327.828125,
"epoch": 0.5007278020378457,
"grad_norm": 1.1732063642209178,
"kl": 0.046142578125,
"learning_rate": 5.235767846761529e-07,
"loss": -0.003,
"reward": 1.85539710521698,
"reward_std": 0.26673340797424316,
"rewards/accuracy_reward": 0.96875,
"rewards/format_reward": 0.9491471648216248,
"step": 344
},
{
"completion_length": 351.890625,
"epoch": 0.5021834061135371,
"grad_norm": 1.1152451844872624,
"kl": 0.0478515625,
"learning_rate": 5.212206015980741e-07,
"loss": 0.0033,
"reward": 1.0626237392425537,
"reward_std": 0.6199690103530884,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.9055924415588379,
"step": 345
},
{
"completion_length": 339.921875,
"epoch": 0.5036390101892285,
"grad_norm": 1.1045572732346232,
"kl": 0.046142578125,
"learning_rate": 5.188639463393586e-07,
"loss": 0.0048,
"reward": 1.1775846481323242,
"reward_std": 0.4957619905471802,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.9695507287979126,
"step": 346
},
{
"completion_length": 356.515625,
"epoch": 0.50509461426492,
"grad_norm": 1.0563165828060364,
"kl": 0.04638671875,
"learning_rate": 5.165068713380567e-07,
"loss": 0.0018,
"reward": 1.2808787822723389,
"reward_std": 0.6754826903343201,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.8902539610862732,
"step": 347
},
{
"completion_length": 333.265625,
"epoch": 0.5065502183406113,
"grad_norm": 1.1133538964235559,
"kl": 0.0458984375,
"learning_rate": 5.141494290415591e-07,
"loss": 0.001,
"reward": 1.0276042222976685,
"reward_std": 0.6215536594390869,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.951171875,
"step": 348
},
{
"completion_length": 355.34375,
"epoch": 0.5080058224163028,
"grad_norm": 1.0253948324473836,
"kl": 0.04150390625,
"learning_rate": 5.117916719054285e-07,
"loss": 0.0003,
"reward": 0.8305078744888306,
"reward_std": 0.8258182406425476,
"rewards/accuracy_reward": 0.640625,
"rewards/format_reward": 0.90471351146698,
"step": 349
},
{
"completion_length": 329.4375,
"epoch": 0.5094614264919942,
"grad_norm": 1.1732665473569692,
"kl": 0.049072265625,
"learning_rate": 5.094336523922335e-07,
"loss": 0.0032,
"reward": 0.745130181312561,
"reward_std": 0.2655390501022339,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.948046863079071,
"step": 350
},
{
"completion_length": 358.890625,
"epoch": 0.5109170305676856,
"grad_norm": 1.1759706647822885,
"kl": 0.046142578125,
"learning_rate": 5.07075422970381e-07,
"loss": 0.0022,
"reward": 1.322669267654419,
"reward_std": 0.8540600538253784,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.9266666173934937,
"step": 351
},
{
"completion_length": 340.71875,
"epoch": 0.512372634643377,
"grad_norm": 1.280765883903055,
"kl": 0.04931640625,
"learning_rate": 5.047170361129483e-07,
"loss": -0.0051,
"reward": 0.7140104174613953,
"reward_std": 0.7314082384109497,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.8824348449707031,
"step": 352
},
{
"completion_length": 365.671875,
"epoch": 0.5138282387190685,
"grad_norm": 1.0208673688095518,
"kl": 0.041259765625,
"learning_rate": 5.023585442965162e-07,
"loss": 0.002,
"reward": 0.8170312643051147,
"reward_std": 0.6544123888015747,
"rewards/accuracy_reward": 0.640625,
"rewards/format_reward": 0.8933594226837158,
"step": 353
},
{
"completion_length": 343.65625,
"epoch": 0.5152838427947598,
"grad_norm": 1.0616017021974933,
"kl": 0.044921875,
"learning_rate": 5e-07,
"loss": -0.0017,
"reward": 1.360579490661621,
"reward_std": 0.6262814998626709,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 0.9170117378234863,
"step": 354
},
{
"completion_length": 346.1875,
"epoch": 0.5167394468704513,
"grad_norm": 0.9668421147018488,
"kl": 0.04296875,
"learning_rate": 4.976414557034839e-07,
"loss": 0.0022,
"reward": -0.3669726252555847,
"reward_std": 0.33296334743499756,
"rewards/accuracy_reward": 0.234375,
"rewards/format_reward": 0.9034309387207031,
"step": 355
},
{
"completion_length": 345.4375,
"epoch": 0.5181950509461426,
"grad_norm": 1.2913148050931513,
"kl": 0.041015625,
"learning_rate": 4.952829638870515e-07,
"loss": -0.0009,
"reward": 0.7532292008399963,
"reward_std": 0.7976954579353333,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.9168879985809326,
"step": 356
},
{
"completion_length": 368.5,
"epoch": 0.519650655021834,
"grad_norm": 0.8900198844773329,
"kl": 0.040283203125,
"learning_rate": 4.92924577029619e-07,
"loss": -0.0012,
"reward": 1.5500717163085938,
"reward_std": 1.1232110261917114,
"rewards/accuracy_reward": 0.890625,
"rewards/format_reward": 0.8781965970993042,
"step": 357
},
{
"completion_length": 350.390625,
"epoch": 0.5211062590975255,
"grad_norm": 1.0655510860501003,
"kl": 0.050537109375,
"learning_rate": 4.905663476077665e-07,
"loss": -0.0054,
"reward": 1.3917381763458252,
"reward_std": 0.9621044397354126,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.9068945050239563,
"step": 358
},
{
"completion_length": 329.6875,
"epoch": 0.5225618631732168,
"grad_norm": 1.1100376051956256,
"kl": 0.048828125,
"learning_rate": 4.882083280945716e-07,
"loss": 0.0028,
"reward": 0.8387891054153442,
"reward_std": 0.6230899691581726,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.9546484351158142,
"step": 359
},
{
"completion_length": 346.484375,
"epoch": 0.5240174672489083,
"grad_norm": 1.2301004521682009,
"kl": 0.047607421875,
"learning_rate": 4.85850570958441e-07,
"loss": 0.0047,
"reward": 0.77734375,
"reward_std": 0.2949136197566986,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.93436199426651,
"step": 360
},
{
"completion_length": 324.4375,
"epoch": 0.5254730713245997,
"grad_norm": 1.150417363080493,
"kl": 0.04541015625,
"learning_rate": 4.834931286619432e-07,
"loss": -0.0014,
"reward": 1.0515625476837158,
"reward_std": 0.45344188809394836,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.9319791793823242,
"step": 361
},
{
"completion_length": 338.5625,
"epoch": 0.5269286754002911,
"grad_norm": 1.0871105100886687,
"kl": 0.05224609375,
"learning_rate": 4.811360536606415e-07,
"loss": 0.0006,
"reward": 1.0285286903381348,
"reward_std": 0.5896121263504028,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.9137629866600037,
"step": 362
},
{
"completion_length": 346.4375,
"epoch": 0.5283842794759825,
"grad_norm": 1.16086348005986,
"kl": 0.047119140625,
"learning_rate": 4.787793984019259e-07,
"loss": -0.0023,
"reward": 1.1483073234558105,
"reward_std": 0.1926122009754181,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.9851562976837158,
"step": 363
},
{
"completion_length": 330.25,
"epoch": 0.529839883551674,
"grad_norm": 1.122777391792208,
"kl": 0.04345703125,
"learning_rate": 4.764232153238472e-07,
"loss": -0.0037,
"reward": 1.3178515434265137,
"reward_std": 0.3849244713783264,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 0.9673958420753479,
"step": 364
},
{
"completion_length": 331.015625,
"epoch": 0.5312954876273653,
"grad_norm": 1.2307126175016525,
"kl": 0.047607421875,
"learning_rate": 4.7406755685394943e-07,
"loss": -0.0013,
"reward": 0.8999348878860474,
"reward_std": 0.7801786065101624,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.910559892654419,
"step": 365
},
{
"completion_length": 335.90625,
"epoch": 0.5327510917030568,
"grad_norm": 1.222675950836803,
"kl": 0.04931640625,
"learning_rate": 4.7171247540810377e-07,
"loss": 0.0039,
"reward": 0.7975065112113953,
"reward_std": 0.47923994064331055,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.9561002254486084,
"step": 366
},
{
"completion_length": 335.640625,
"epoch": 0.5342066957787481,
"grad_norm": 1.0798428455730729,
"kl": 0.04443359375,
"learning_rate": 4.693580233893419e-07,
"loss": -0.0001,
"reward": 0.7751432657241821,
"reward_std": 0.8045872449874878,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.9402864575386047,
"step": 367
},
{
"completion_length": 327.671875,
"epoch": 0.5356622998544396,
"grad_norm": 1.2982851617600433,
"kl": 0.052001953125,
"learning_rate": 4.6700425318668983e-07,
"loss": -0.0008,
"reward": 1.1960091590881348,
"reward_std": 0.4194261431694031,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.9822461009025574,
"step": 368
},
{
"completion_length": 311.640625,
"epoch": 0.537117903930131,
"grad_norm": 1.1430814971743037,
"kl": 0.046630859375,
"learning_rate": 4.646512171740027e-07,
"loss": 0.002,
"reward": 0.6666406393051147,
"reward_std": 0.41208669543266296,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.9482681751251221,
"step": 369
},
{
"completion_length": 332.796875,
"epoch": 0.5385735080058224,
"grad_norm": 1.1537373469060968,
"kl": 0.043701171875,
"learning_rate": 4.6229896770879925e-07,
"loss": -0.0008,
"reward": 0.8343229293823242,
"reward_std": 0.35879752039909363,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.9422265291213989,
"step": 370
},
{
"completion_length": 329.578125,
"epoch": 0.5400291120815138,
"grad_norm": 1.117973192100589,
"kl": 0.04541015625,
"learning_rate": 4.599475571310964e-07,
"loss": 0.0006,
"reward": 0.95947265625,
"reward_std": 0.6068175435066223,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.9388346672058105,
"step": 371
},
{
"completion_length": 344.375,
"epoch": 0.5414847161572053,
"grad_norm": 1.196872378128055,
"kl": 0.04931640625,
"learning_rate": 4.5759703776224555e-07,
"loss": 0.0017,
"reward": 1.1496614217758179,
"reward_std": 0.7437654733657837,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.9437239170074463,
"step": 372
},
{
"completion_length": 323.984375,
"epoch": 0.5429403202328966,
"grad_norm": 1.2948089305623915,
"kl": 0.050537109375,
"learning_rate": 4.552474619037668e-07,
"loss": -0.0012,
"reward": 1.1745052337646484,
"reward_std": 0.1955292969942093,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.9643619656562805,
"step": 373
},
{
"completion_length": 334.40625,
"epoch": 0.5443959243085881,
"grad_norm": 1.1752903191077182,
"kl": 0.045166015625,
"learning_rate": 4.528988818361869e-07,
"loss": -0.0005,
"reward": 0.9639453291893005,
"reward_std": 0.7098885774612427,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.9328385591506958,
"step": 374
},
{
"completion_length": 336.34375,
"epoch": 0.5458515283842795,
"grad_norm": 1.0217530778000898,
"kl": 0.0458984375,
"learning_rate": 4.505513498178751e-07,
"loss": 0.0044,
"reward": 1.0537173748016357,
"reward_std": 0.42270171642303467,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.936022162437439,
"step": 375
},
{
"completion_length": 336.140625,
"epoch": 0.5473071324599709,
"grad_norm": 1.1035824214262215,
"kl": 0.041748046875,
"learning_rate": 4.4820491808388035e-07,
"loss": 0.0024,
"reward": 1.3975260257720947,
"reward_std": 0.5420940518379211,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 0.9560156464576721,
"step": 376
},
{
"completion_length": 319.171875,
"epoch": 0.5487627365356623,
"grad_norm": 1.1858924715525574,
"kl": 0.048583984375,
"learning_rate": 4.45859638844769e-07,
"loss": 0.0027,
"reward": 1.609043002128601,
"reward_std": 0.04546473175287247,
"rewards/accuracy_reward": 0.875,
"rewards/format_reward": 0.9743945598602295,
"step": 377
},
{
"completion_length": 332.078125,
"epoch": 0.5502183406113537,
"grad_norm": 1.137595761389025,
"kl": 0.051513671875,
"learning_rate": 4.4351556428546365e-07,
"loss": 0.0042,
"reward": 1.075364589691162,
"reward_std": 0.9671132564544678,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.8985416293144226,
"step": 378
},
{
"completion_length": 365.34375,
"epoch": 0.5516739446870451,
"grad_norm": 1.064698625138544,
"kl": 0.0439453125,
"learning_rate": 4.411727465640808e-07,
"loss": 0.0007,
"reward": 0.705507755279541,
"reward_std": 0.2698853611946106,
"rewards/accuracy_reward": 0.578125,
"rewards/format_reward": 0.958437442779541,
"step": 379
},
{
"completion_length": 332.1875,
"epoch": 0.5531295487627366,
"grad_norm": 0.954227809746772,
"kl": 0.047119140625,
"learning_rate": 4.388312378107714e-07,
"loss": 0.0019,
"reward": 0.017766959965229034,
"reward_std": 0.4671540856361389,
"rewards/accuracy_reward": 0.359375,
"rewards/format_reward": 0.9211132526397705,
"step": 380
},
{
"completion_length": 357.34375,
"epoch": 0.5545851528384279,
"grad_norm": 1.0746234998917534,
"kl": 0.04638671875,
"learning_rate": 4.364910901265606e-07,
"loss": 0.0009,
"reward": 0.9512304663658142,
"reward_std": 0.9008158445358276,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.9242773056030273,
"step": 381
},
{
"completion_length": 353.171875,
"epoch": 0.5560407569141194,
"grad_norm": 1.1000427889048656,
"kl": 0.04638671875,
"learning_rate": 4.341523555821881e-07,
"loss": 0.001,
"reward": 0.6827343702316284,
"reward_std": 0.32222965359687805,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.8887369632720947,
"step": 382
},
{
"completion_length": 330.015625,
"epoch": 0.5574963609898108,
"grad_norm": 1.0062009920301431,
"kl": 0.052734375,
"learning_rate": 4.3181508621695015e-07,
"loss": 0.0044,
"reward": 1.2704167366027832,
"reward_std": 0.3719358444213867,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.9685026407241821,
"step": 383
},
{
"completion_length": 358.90625,
"epoch": 0.5589519650655022,
"grad_norm": 1.0500378444634122,
"kl": 0.055419921875,
"learning_rate": 4.294793340375404e-07,
"loss": 0.0022,
"reward": 0.5693033933639526,
"reward_std": 0.7073459029197693,
"rewards/accuracy_reward": 0.546875,
"rewards/format_reward": 0.9236393570899963,
"step": 384
},
{
"completion_length": 317.875,
"epoch": 0.5604075691411936,
"grad_norm": 1.2585391427250612,
"kl": 0.052978515625,
"learning_rate": 4.271451510168943e-07,
"loss": 0.0019,
"reward": 1.0041340589523315,
"reward_std": 0.6024578809738159,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.9240299463272095,
"step": 385
},
{
"completion_length": 340.671875,
"epoch": 0.5618631732168851,
"grad_norm": 1.171525425235524,
"kl": 0.055419921875,
"learning_rate": 4.248125890930316e-07,
"loss": 0.001,
"reward": 0.5767643451690674,
"reward_std": 0.9831317067146301,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.868079423904419,
"step": 386
},
{
"completion_length": 348.484375,
"epoch": 0.5633187772925764,
"grad_norm": 1.1137932938055821,
"kl": 0.0498046875,
"learning_rate": 4.22481700167901e-07,
"loss": 0.0038,
"reward": 1.6028320789337158,
"reward_std": 0.785223126411438,
"rewards/accuracy_reward": 0.890625,
"rewards/format_reward": 0.9309570789337158,
"step": 387
},
{
"completion_length": 357.015625,
"epoch": 0.5647743813682679,
"grad_norm": 1.1683554196934958,
"kl": 0.04541015625,
"learning_rate": 4.201525361062254e-07,
"loss": 0.002,
"reward": 0.885696530342102,
"reward_std": 0.9379022121429443,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.9048242568969727,
"step": 388
},
{
"completion_length": 349.765625,
"epoch": 0.5662299854439592,
"grad_norm": 1.0285239615931752,
"kl": 0.045654296875,
"learning_rate": 4.17825148734347e-07,
"loss": -0.0016,
"reward": 1.4252278804779053,
"reward_std": 0.555104672908783,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.9401627779006958,
"step": 389
},
{
"completion_length": 344.578125,
"epoch": 0.5676855895196506,
"grad_norm": 0.9824416716745002,
"kl": 0.046875,
"learning_rate": 4.154995898390755e-07,
"loss": 0.0001,
"reward": 0.30329427123069763,
"reward_std": 0.8160994648933411,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.8936979174613953,
"step": 390
},
{
"completion_length": 362.265625,
"epoch": 0.5691411935953421,
"grad_norm": 1.0347605373720594,
"kl": 0.04345703125,
"learning_rate": 4.131759111665348e-07,
"loss": 0.0006,
"reward": 0.3581119775772095,
"reward_std": 0.6576919555664062,
"rewards/accuracy_reward": 0.484375,
"rewards/format_reward": 0.8985155820846558,
"step": 391
},
{
"completion_length": 354.8125,
"epoch": 0.5705967976710334,
"grad_norm": 1.0801343806418962,
"kl": 0.04736328125,
"learning_rate": 4.1085416442101203e-07,
"loss": 0.0022,
"reward": 1.1929036378860474,
"reward_std": 1.0718597173690796,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.8935286402702332,
"step": 392
},
{
"completion_length": 337.828125,
"epoch": 0.5720524017467249,
"grad_norm": 0.9437613999658896,
"kl": 0.042236328125,
"learning_rate": 4.0853440126380666e-07,
"loss": -0.0027,
"reward": 1.2306054830551147,
"reward_std": 0.06505398452281952,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.9732356667518616,
"step": 393
},
{
"completion_length": 353.15625,
"epoch": 0.5735080058224163,
"grad_norm": 1.0240210848060178,
"kl": 0.042724609375,
"learning_rate": 4.0621667331208156e-07,
"loss": 0.0022,
"reward": 1.047447919845581,
"reward_std": 0.5841343402862549,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.9282682538032532,
"step": 394
},
{
"completion_length": 371.28125,
"epoch": 0.5749636098981077,
"grad_norm": 0.9665992608409852,
"kl": 0.04931640625,
"learning_rate": 4.0390103213771363e-07,
"loss": -0.0038,
"reward": 0.1188020408153534,
"reward_std": 0.9848098158836365,
"rewards/accuracy_reward": 0.421875,
"rewards/format_reward": 0.8411197662353516,
"step": 395
},
{
"completion_length": 341.75,
"epoch": 0.5764192139737991,
"grad_norm": 1.2322164783845382,
"kl": 0.04833984375,
"learning_rate": 4.015875292661473e-07,
"loss": 0.0022,
"reward": 0.32570311427116394,
"reward_std": 0.9908155798912048,
"rewards/accuracy_reward": 0.453125,
"rewards/format_reward": 0.9544401168823242,
"step": 396
},
{
"completion_length": 333.5625,
"epoch": 0.5778748180494906,
"grad_norm": 1.082096942344887,
"kl": 0.0498046875,
"learning_rate": 3.9927621617524736e-07,
"loss": 0.0069,
"reward": 1.061464786529541,
"reward_std": 0.4430278539657593,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.8951106667518616,
"step": 397
},
{
"completion_length": 349.640625,
"epoch": 0.5793304221251819,
"grad_norm": 1.1730680460847622,
"kl": 0.046142578125,
"learning_rate": 3.969671442941538e-07,
"loss": 0.0014,
"reward": 0.3286914527416229,
"reward_std": 0.6564339995384216,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.9076367616653442,
"step": 398
},
{
"completion_length": 347.859375,
"epoch": 0.5807860262008734,
"grad_norm": 0.936643056977854,
"kl": 0.048095703125,
"learning_rate": 3.94660365002137e-07,
"loss": 0.001,
"reward": -0.10537109524011612,
"reward_std": 0.5952030420303345,
"rewards/accuracy_reward": 0.328125,
"rewards/format_reward": 0.8903840780258179,
"step": 399
},
{
"completion_length": 361.015625,
"epoch": 0.5822416302765647,
"grad_norm": 0.8588675688237927,
"kl": 0.047119140625,
"learning_rate": 3.923559296274549e-07,
"loss": -0.0018,
"reward": 0.81843101978302,
"reward_std": 0.8495593667030334,
"rewards/accuracy_reward": 0.640625,
"rewards/format_reward": 0.895423173904419,
"step": 400
},
{
"completion_length": 327.78125,
"epoch": 0.5836972343522562,
"grad_norm": 1.1927270347808032,
"kl": 0.046142578125,
"learning_rate": 3.900538894462112e-07,
"loss": -0.0011,
"reward": 0.4946874976158142,
"reward_std": 0.5388225317001343,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.9400129914283752,
"step": 401
},
{
"completion_length": 335.125,
"epoch": 0.5851528384279476,
"grad_norm": 1.1416346509953534,
"kl": 0.050048828125,
"learning_rate": 3.877542956812136e-07,
"loss": -0.0009,
"reward": 1.2370572090148926,
"reward_std": 0.7617213129997253,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.9355729222297668,
"step": 402
},
{
"completion_length": 339.640625,
"epoch": 0.586608442503639,
"grad_norm": 1.2059860449472852,
"kl": 0.048828125,
"learning_rate": 3.8545719950083503e-07,
"loss": 0.0032,
"reward": 1.136875033378601,
"reward_std": 0.3422207832336426,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.9610416293144226,
"step": 403
},
{
"completion_length": 311.203125,
"epoch": 0.5880640465793304,
"grad_norm": 1.2427528264658227,
"kl": 0.057373046875,
"learning_rate": 3.831626520178738e-07,
"loss": 0.0021,
"reward": 1.1694010496139526,
"reward_std": 0.21720562875270844,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.9581640958786011,
"step": 404
},
{
"completion_length": 337.6875,
"epoch": 0.5895196506550219,
"grad_norm": 1.1003634317266355,
"kl": 0.05224609375,
"learning_rate": 3.8087070428841753e-07,
"loss": 0.0002,
"reward": 0.48251304030418396,
"reward_std": 0.9312198758125305,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.9204167127609253,
"step": 405
},
{
"completion_length": 339.484375,
"epoch": 0.5909752547307132,
"grad_norm": 0.9678972898174403,
"kl": 0.044677734375,
"learning_rate": 3.785814073107064e-07,
"loss": 0.0013,
"reward": 1.5340235233306885,
"reward_std": 0.5008847117424011,
"rewards/accuracy_reward": 0.859375,
"rewards/format_reward": 0.94998699426651,
"step": 406
},
{
"completion_length": 308.375,
"epoch": 0.5924308588064047,
"grad_norm": 1.047424956436592,
"kl": 0.047607421875,
"learning_rate": 3.762948120239988e-07,
"loss": -0.0021,
"reward": 0.9827408790588379,
"reward_std": 0.627294659614563,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.9047591090202332,
"step": 407
},
{
"completion_length": 317.78125,
"epoch": 0.5938864628820961,
"grad_norm": 1.31987404266799,
"kl": 0.049560546875,
"learning_rate": 3.7401096930743746e-07,
"loss": -0.0035,
"reward": 1.220130205154419,
"reward_std": 0.5085282325744629,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.9641405940055847,
"step": 408
},
{
"completion_length": 317.328125,
"epoch": 0.5953420669577875,
"grad_norm": 1.258115336162508,
"kl": 0.0458984375,
"learning_rate": 3.717299299789175e-07,
"loss": 0.0015,
"reward": 0.8206315040588379,
"reward_std": 0.16567295789718628,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.980527400970459,
"step": 409
},
{
"completion_length": 311.0625,
"epoch": 0.5967976710334789,
"grad_norm": 1.0797344698183151,
"kl": 0.0400390625,
"learning_rate": 3.6945174479395584e-07,
"loss": 0.0006,
"reward": 0.4124348759651184,
"reward_std": 0.5062814950942993,
"rewards/accuracy_reward": 0.484375,
"rewards/format_reward": 0.9456771612167358,
"step": 410
},
{
"completion_length": 326.015625,
"epoch": 0.5982532751091703,
"grad_norm": 0.9392678790614373,
"kl": 0.051513671875,
"learning_rate": 3.6717646444456193e-07,
"loss": 0.0014,
"reward": 0.35626304149627686,
"reward_std": 0.4074528217315674,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.9396615028381348,
"step": 411
},
{
"completion_length": 320.078125,
"epoch": 0.5997088791848617,
"grad_norm": 1.3610941268678678,
"kl": 0.044921875,
"learning_rate": 3.649041395581089e-07,
"loss": 0.0001,
"reward": 0.6594465970993042,
"reward_std": 0.6716386079788208,
"rewards/accuracy_reward": 0.578125,
"rewards/format_reward": 0.9064517617225647,
"step": 412
},
{
"completion_length": 327.734375,
"epoch": 0.6011644832605532,
"grad_norm": 1.2129906269391977,
"kl": 0.048095703125,
"learning_rate": 3.6263482069620865e-07,
"loss": -0.0036,
"reward": 1.199205756187439,
"reward_std": 0.3400747776031494,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.9805078506469727,
"step": 413
},
{
"completion_length": 322.78125,
"epoch": 0.6026200873362445,
"grad_norm": 1.0044565070885603,
"kl": 0.047607421875,
"learning_rate": 3.6036855835358496e-07,
"loss": 0.0017,
"reward": 0.64725261926651,
"reward_std": 0.4769784212112427,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.9339061975479126,
"step": 414
},
{
"completion_length": 356.59375,
"epoch": 0.604075691411936,
"grad_norm": 1.144784540293772,
"kl": 0.0478515625,
"learning_rate": 3.581054029569516e-07,
"loss": -0.005,
"reward": 0.7913802266120911,
"reward_std": 0.649086058139801,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.9033724069595337,
"step": 415
},
{
"completion_length": 327.5625,
"epoch": 0.6055312954876274,
"grad_norm": 1.1441222735837988,
"kl": 0.050537109375,
"learning_rate": 3.55845404863889e-07,
"loss": 0.0033,
"reward": 1.7637498378753662,
"reward_std": 0.5468021631240845,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 0.9512500166893005,
"step": 416
},
{
"completion_length": 304.46875,
"epoch": 0.6069868995633187,
"grad_norm": 1.2559225472205957,
"kl": 0.04638671875,
"learning_rate": 3.535886143617248e-07,
"loss": -0.0045,
"reward": 0.8564192652702332,
"reward_std": 0.08503374457359314,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.9544661641120911,
"step": 417
},
{
"completion_length": 319.765625,
"epoch": 0.6084425036390102,
"grad_norm": 1.1233374483471503,
"kl": 0.048583984375,
"learning_rate": 3.513350816664138e-07,
"loss": 0.0005,
"reward": 1.250644564628601,
"reward_std": 0.6800768375396729,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.93666011095047,
"step": 418
},
{
"completion_length": 319.390625,
"epoch": 0.6098981077147017,
"grad_norm": 1.1149493029080904,
"kl": 0.04736328125,
"learning_rate": 3.4908485692142164e-07,
"loss": 0.0031,
"reward": 1.9074804782867432,
"reward_std": 0.20882548391819,
"rewards/accuracy_reward": 0.984375,
"rewards/format_reward": 0.9543554782867432,
"step": 419
},
{
"completion_length": 322.484375,
"epoch": 0.611353711790393,
"grad_norm": 1.1771319406793013,
"kl": 0.05078125,
"learning_rate": 3.4683799019660833e-07,
"loss": -0.0012,
"reward": 0.4527343511581421,
"reward_std": 0.1347397118806839,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.9300520420074463,
"step": 420
},
{
"completion_length": 320.984375,
"epoch": 0.6128093158660844,
"grad_norm": 1.253919818449011,
"kl": 0.04541015625,
"learning_rate": 3.4459453148711437e-07,
"loss": -0.0025,
"reward": 1.2054883241653442,
"reward_std": 0.34866058826446533,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.9499544501304626,
"step": 421
},
{
"completion_length": 308.75,
"epoch": 0.6142649199417758,
"grad_norm": 1.2228103194283946,
"kl": 0.0546875,
"learning_rate": 3.423545307122488e-07,
"loss": 0.0045,
"reward": 1.38002610206604,
"reward_std": 0.5425729751586914,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 0.9391015768051147,
"step": 422
},
{
"completion_length": 334.921875,
"epoch": 0.6157205240174672,
"grad_norm": 1.0491711033896147,
"kl": 0.0517578125,
"learning_rate": 3.4011803771437735e-07,
"loss": 0.0027,
"reward": 1.441979169845581,
"reward_std": 0.9929073452949524,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 0.9042838215827942,
"step": 423
},
{
"completion_length": 341.25,
"epoch": 0.6171761280931587,
"grad_norm": 1.1145287911850592,
"kl": 0.046142578125,
"learning_rate": 3.378851022578146e-07,
"loss": -0.0005,
"reward": 0.8648567199707031,
"reward_std": 0.6062259674072266,
"rewards/accuracy_reward": 0.640625,
"rewards/format_reward": 0.94259113073349,
"step": 424
},
{
"completion_length": 323.328125,
"epoch": 0.61863173216885,
"grad_norm": 1.2328706888305836,
"kl": 0.059326171875,
"learning_rate": 3.356557740277156e-07,
"loss": -0.0007,
"reward": 1.2751758098602295,
"reward_std": 0.894574761390686,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 0.9299153685569763,
"step": 425
},
{
"completion_length": 331.3125,
"epoch": 0.6200873362445415,
"grad_norm": 1.047466025286303,
"kl": 0.048828125,
"learning_rate": 3.334301026289712e-07,
"loss": 0.0037,
"reward": 1.5336458683013916,
"reward_std": 0.252202570438385,
"rewards/accuracy_reward": 0.859375,
"rewards/format_reward": 0.9498567581176758,
"step": 426
},
{
"completion_length": 338.671875,
"epoch": 0.6215429403202329,
"grad_norm": 1.0464703790815202,
"kl": 0.04833984375,
"learning_rate": 3.312081375851038e-07,
"loss": 0.0015,
"reward": 1.5034700632095337,
"reward_std": 0.9077455997467041,
"rewards/accuracy_reward": 0.859375,
"rewards/format_reward": 0.9250195026397705,
"step": 427
},
{
"completion_length": 334.8125,
"epoch": 0.6229985443959243,
"grad_norm": 1.1030610799748342,
"kl": 0.047119140625,
"learning_rate": 3.2898992833716563e-07,
"loss": 0.002,
"reward": 1.0385351181030273,
"reward_std": 0.5042393803596497,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.9632617235183716,
"step": 428
},
{
"completion_length": 341.953125,
"epoch": 0.6244541484716157,
"grad_norm": 0.9140662347223634,
"kl": 0.05224609375,
"learning_rate": 3.2677552424263834e-07,
"loss": -0.0018,
"reward": 0.9026367664337158,
"reward_std": 0.9520148038864136,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.8862174153327942,
"step": 429
},
{
"completion_length": 355.328125,
"epoch": 0.6259097525473072,
"grad_norm": 1.0653314224895472,
"kl": 0.046630859375,
"learning_rate": 3.2456497457433475e-07,
"loss": 0.0019,
"reward": 0.8542708158493042,
"reward_std": 0.8604847192764282,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.8829036355018616,
"step": 430
},
{
"completion_length": 353.859375,
"epoch": 0.6273653566229985,
"grad_norm": 1.0396201810405379,
"kl": 0.0478515625,
"learning_rate": 3.2235832851930315e-07,
"loss": -0.0011,
"reward": 1.552734375,
"reward_std": 0.19750574231147766,
"rewards/accuracy_reward": 0.859375,
"rewards/format_reward": 0.974609375,
"step": 431
},
{
"completion_length": 330.3125,
"epoch": 0.62882096069869,
"grad_norm": 1.149398763893242,
"kl": 0.048828125,
"learning_rate": 3.201556351777321e-07,
"loss": 0.0008,
"reward": 1.3697071075439453,
"reward_std": 0.5695977210998535,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 0.9322070479393005,
"step": 432
},
{
"completion_length": 339.59375,
"epoch": 0.6302765647743813,
"grad_norm": 1.2047591749520952,
"kl": 0.045654296875,
"learning_rate": 3.1795694356185797e-07,
"loss": -0.0004,
"reward": 1.2822070121765137,
"reward_std": 0.7462552785873413,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 0.9377539157867432,
"step": 433
},
{
"completion_length": 354.828125,
"epoch": 0.6317321688500728,
"grad_norm": 1.0709344253665385,
"kl": 0.056396484375,
"learning_rate": 3.157623025948751e-07,
"loss": 0.0054,
"reward": 0.5403645634651184,
"reward_std": 0.7718614339828491,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.9401432275772095,
"step": 434
},
{
"completion_length": 367.015625,
"epoch": 0.6331877729257642,
"grad_norm": 0.9081689546614261,
"kl": 0.042236328125,
"learning_rate": 3.135717611098457e-07,
"loss": -0.0028,
"reward": 0.5762760639190674,
"reward_std": 0.9539381265640259,
"rewards/accuracy_reward": 0.546875,
"rewards/format_reward": 0.9339452981948853,
"step": 435
},
{
"completion_length": 361.609375,
"epoch": 0.6346433770014556,
"grad_norm": 0.9935443191736562,
"kl": 0.050048828125,
"learning_rate": 3.11385367848615e-07,
"loss": 0.0029,
"reward": 1.0945442914962769,
"reward_std": 0.364665687084198,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.9768098592758179,
"step": 436
},
{
"completion_length": 365.984375,
"epoch": 0.636098981077147,
"grad_norm": 1.1539983550423447,
"kl": 0.046142578125,
"learning_rate": 3.0920317146072574e-07,
"loss": -0.003,
"reward": 0.9360742568969727,
"reward_std": 0.656904935836792,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.9138607382774353,
"step": 437
},
{
"completion_length": 364.125,
"epoch": 0.6375545851528385,
"grad_norm": 1.0332344243928102,
"kl": 0.04638671875,
"learning_rate": 3.070252205023355e-07,
"loss": -0.0011,
"reward": 0.920188844203949,
"reward_std": 0.4112701714038849,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.9488345980644226,
"step": 438
},
{
"completion_length": 363.140625,
"epoch": 0.6390101892285298,
"grad_norm": 0.8434589833001687,
"kl": 0.04345703125,
"learning_rate": 3.048515634351373e-07,
"loss": -0.0007,
"reward": 1.021744728088379,
"reward_std": 0.9885683655738831,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.9123697876930237,
"step": 439
},
{
"completion_length": 349.859375,
"epoch": 0.6404657933042213,
"grad_norm": 1.100810311524533,
"kl": 0.049072265625,
"learning_rate": 3.026822486252796e-07,
"loss": 0.004,
"reward": 1.1635351181030273,
"reward_std": 0.23715665936470032,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.9604101777076721,
"step": 440
},
{
"completion_length": 361.28125,
"epoch": 0.6419213973799127,
"grad_norm": 0.9996154209580299,
"kl": 0.043701171875,
"learning_rate": 3.005173243422918e-07,
"loss": -0.0018,
"reward": 1.4946484565734863,
"reward_std": 0.36391258239746094,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 0.9633983969688416,
"step": 441
},
{
"completion_length": 355.890625,
"epoch": 0.6433770014556041,
"grad_norm": 1.064092737707726,
"kl": 0.04931640625,
"learning_rate": 2.983568387580093e-07,
"loss": 0.0009,
"reward": 0.9472330808639526,
"reward_std": 1.0713083744049072,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.9316080808639526,
"step": 442
},
{
"completion_length": 352.546875,
"epoch": 0.6448326055312955,
"grad_norm": 1.2213995677397735,
"kl": 0.046142578125,
"learning_rate": 2.9620083994550184e-07,
"loss": -0.0025,
"reward": 1.1790754795074463,
"reward_std": 0.2134915590286255,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.9699999690055847,
"step": 443
},
{
"completion_length": 343.625,
"epoch": 0.6462882096069869,
"grad_norm": 0.9866828975217354,
"kl": 0.046142578125,
"learning_rate": 2.940493758780037e-07,
"loss": 0.0007,
"reward": 1.290442705154419,
"reward_std": 0.4115654230117798,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 0.945742130279541,
"step": 444
},
{
"completion_length": 369.09375,
"epoch": 0.6477438136826783,
"grad_norm": 0.7976157219662836,
"kl": 0.04833984375,
"learning_rate": 2.919024944278462e-07,
"loss": -0.0017,
"reward": 1.1873372793197632,
"reward_std": 0.38416576385498047,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.9347070455551147,
"step": 445
},
{
"completion_length": 349.140625,
"epoch": 0.6491994177583698,
"grad_norm": 1.034098435240967,
"kl": 0.047119140625,
"learning_rate": 2.8976024336539297e-07,
"loss": -0.001,
"reward": 0.949485719203949,
"reward_std": 0.7218962907791138,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.9301497936248779,
"step": 446
},
{
"completion_length": 361.703125,
"epoch": 0.6506550218340611,
"grad_norm": 0.9093087268112631,
"kl": 0.0537109375,
"learning_rate": 2.8762267035797606e-07,
"loss": -0.003,
"reward": 1.5500586032867432,
"reward_std": 0.852545976638794,
"rewards/accuracy_reward": 0.890625,
"rewards/format_reward": 0.8781836032867432,
"step": 447
},
{
"completion_length": 362.65625,
"epoch": 0.6521106259097526,
"grad_norm": 0.9962666508897389,
"kl": 0.047607421875,
"learning_rate": 2.8548982296883685e-07,
"loss": -0.0015,
"reward": 0.7888085842132568,
"reward_std": 0.24111366271972656,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.9591341614723206,
"step": 448
},
{
"completion_length": 365.578125,
"epoch": 0.653566229985444,
"grad_norm": 1.0644973570656397,
"kl": 0.047607421875,
"learning_rate": 2.8336174865606583e-07,
"loss": -0.0012,
"reward": 1.458815097808838,
"reward_std": 0.6925506591796875,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 0.9265494346618652,
"step": 449
},
{
"completion_length": 377.78125,
"epoch": 0.6550218340611353,
"grad_norm": 0.8690653477908721,
"kl": 0.04541015625,
"learning_rate": 2.8123849477154806e-07,
"loss": 0.0022,
"reward": 0.6074088215827942,
"reward_std": 0.6371817588806152,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.91539067029953,
"step": 450
},
{
"completion_length": 343.5625,
"epoch": 0.6564774381368268,
"grad_norm": 0.9959008179108618,
"kl": 0.045166015625,
"learning_rate": 2.791201085599084e-07,
"loss": 0.0026,
"reward": 1.1102409362792969,
"reward_std": 0.37362387776374817,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.9482877254486084,
"step": 451
},
{
"completion_length": 372.015625,
"epoch": 0.6579330422125182,
"grad_norm": 1.1144444930254087,
"kl": 0.0517578125,
"learning_rate": 2.770066371574621e-07,
"loss": 0.0046,
"reward": 0.5106966495513916,
"reward_std": 0.35071802139282227,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.9128841161727905,
"step": 452
},
{
"completion_length": 370.703125,
"epoch": 0.6593886462882096,
"grad_norm": 0.9690064498501554,
"kl": 0.047119140625,
"learning_rate": 2.748981275911633e-07,
"loss": -0.0043,
"reward": 0.20597657561302185,
"reward_std": 0.7211107015609741,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.8844531178474426,
"step": 453
},
{
"completion_length": 364.328125,
"epoch": 0.660844250363901,
"grad_norm": 0.9366506361406456,
"kl": 0.044189453125,
"learning_rate": 2.7279462677756126e-07,
"loss": -0.0036,
"reward": 0.7447395920753479,
"reward_std": 0.6155897974967957,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.9127213954925537,
"step": 454
},
{
"completion_length": 350.484375,
"epoch": 0.6622998544395924,
"grad_norm": 1.0086261231667006,
"kl": 0.047607421875,
"learning_rate": 2.7069618152175464e-07,
"loss": -0.0014,
"reward": 1.0693293809890747,
"reward_std": 0.44240403175354004,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.9077669382095337,
"step": 455
},
{
"completion_length": 352.890625,
"epoch": 0.6637554585152838,
"grad_norm": 1.017662444972447,
"kl": 0.051513671875,
"learning_rate": 2.6860283851635063e-07,
"loss": -0.0016,
"reward": 1.1334245204925537,
"reward_std": 0.8161476850509644,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.9266666769981384,
"step": 456
},
{
"completion_length": 363.09375,
"epoch": 0.6652110625909753,
"grad_norm": 0.9088775376959461,
"kl": 0.049072265625,
"learning_rate": 2.6651464434042596e-07,
"loss": 0.0007,
"reward": 1.1859569549560547,
"reward_std": 0.7904758453369141,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.9305403232574463,
"step": 457
},
{
"completion_length": 358.1875,
"epoch": 0.6666666666666666,
"grad_norm": 0.956917415239815,
"kl": 0.054931640625,
"learning_rate": 2.6443164545849113e-07,
"loss": -0.0023,
"reward": 0.31612628698349,
"reward_std": 0.6449480652809143,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.8944726586341858,
"step": 458
},
{
"completion_length": 346.40625,
"epoch": 0.6681222707423581,
"grad_norm": 1.1043376858496878,
"kl": 0.048828125,
"learning_rate": 2.6235388821945495e-07,
"loss": 0.0022,
"reward": 1.320787787437439,
"reward_std": 0.5146819353103638,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.9273632764816284,
"step": 459
},
{
"completion_length": 358.453125,
"epoch": 0.6695778748180495,
"grad_norm": 1.1203658951352162,
"kl": 0.054443359375,
"learning_rate": 2.602814188555951e-07,
"loss": -0.004,
"reward": 0.97126305103302,
"reward_std": 0.8944222927093506,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.9062368869781494,
"step": 460
},
{
"completion_length": 373.3125,
"epoch": 0.6710334788937409,
"grad_norm": 1.0496443261695805,
"kl": 0.04443359375,
"learning_rate": 2.5821428348152786e-07,
"loss": -0.0001,
"reward": 0.44576171040534973,
"reward_std": 1.2483347654342651,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.8427278399467468,
"step": 461
},
{
"completion_length": 344.953125,
"epoch": 0.6724890829694323,
"grad_norm": 1.053156674987038,
"kl": 0.050048828125,
"learning_rate": 2.561525280931828e-07,
"loss": 0.0003,
"reward": 0.9904752373695374,
"reward_std": 0.34265437722206116,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.9724804759025574,
"step": 462
},
{
"completion_length": 370.65625,
"epoch": 0.6739446870451238,
"grad_norm": 0.940859770356533,
"kl": 0.046875,
"learning_rate": 2.5409619856677913e-07,
"loss": -0.0002,
"reward": 0.28072917461395264,
"reward_std": 0.7043707370758057,
"rewards/accuracy_reward": 0.453125,
"rewards/format_reward": 0.9188151359558105,
"step": 463
},
{
"completion_length": 350.5,
"epoch": 0.6754002911208151,
"grad_norm": 0.9974930451895179,
"kl": 0.052490234375,
"learning_rate": 2.5204534065780533e-07,
"loss": 0.0036,
"reward": 1.0816080570220947,
"reward_std": 0.7134989500045776,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.9230663776397705,
"step": 464
},
{
"completion_length": 344.09375,
"epoch": 0.6768558951965066,
"grad_norm": 1.0005662998651903,
"kl": 0.05419921875,
"learning_rate": 2.500000000000001e-07,
"loss": 0.0036,
"reward": 1.0266536474227905,
"reward_std": 0.7755259871482849,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.909726619720459,
"step": 465
},
{
"completion_length": 373.8125,
"epoch": 0.6783114992721979,
"grad_norm": 0.8697379409789556,
"kl": 0.050048828125,
"learning_rate": 2.4796022210433764e-07,
"loss": 0.0015,
"reward": 0.6482356786727905,
"reward_std": 1.0505759716033936,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.8648502230644226,
"step": 466
},
{
"completion_length": 350.203125,
"epoch": 0.6797671033478894,
"grad_norm": 1.0448464801188178,
"kl": 0.04345703125,
"learning_rate": 2.4592605235801537e-07,
"loss": 0.0024,
"reward": 1.170351505279541,
"reward_std": 0.9689666032791138,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.91190105676651,
"step": 467
},
{
"completion_length": 349.625,
"epoch": 0.6812227074235808,
"grad_norm": 1.0606796035208255,
"kl": 0.048828125,
"learning_rate": 2.438975360234429e-07,
"loss": 0.0004,
"reward": 1.5108983516693115,
"reward_std": 0.5824877023696899,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 0.9749218821525574,
"step": 468
},
{
"completion_length": 334.25,
"epoch": 0.6826783114992722,
"grad_norm": 1.0491904469297986,
"kl": 0.049072265625,
"learning_rate": 2.4187471823723555e-07,
"loss": -0.0005,
"reward": 1.7903971672058105,
"reward_std": 0.5473105907440186,
"rewards/accuracy_reward": 0.953125,
"rewards/format_reward": 0.9310221672058105,
"step": 469
},
{
"completion_length": 368.1875,
"epoch": 0.6841339155749636,
"grad_norm": 0.8795420237835467,
"kl": 0.04296875,
"learning_rate": 2.3985764400921054e-07,
"loss": -0.0016,
"reward": 0.8880664110183716,
"reward_std": 0.7747170329093933,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.9181705713272095,
"step": 470
},
{
"completion_length": 353.734375,
"epoch": 0.6855895196506551,
"grad_norm": 1.0274200723626312,
"kl": 0.051513671875,
"learning_rate": 2.378463582213842e-07,
"loss": 0.0011,
"reward": 1.3674869537353516,
"reward_std": 0.8639088869094849,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 0.9249348640441895,
"step": 471
},
{
"completion_length": 371.578125,
"epoch": 0.6870451237263464,
"grad_norm": 0.9918082918405166,
"kl": 0.04443359375,
"learning_rate": 2.3584090562697424e-07,
"loss": -0.0038,
"reward": 0.47871094942092896,
"reward_std": 0.9248123168945312,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.9306640625,
"step": 472
},
{
"completion_length": 367.421875,
"epoch": 0.6885007278020379,
"grad_norm": 0.9925350173805925,
"kl": 0.053466796875,
"learning_rate": 2.33841330849404e-07,
"loss": -0.0016,
"reward": 0.6097005009651184,
"reward_std": 0.6457391381263733,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.9147526025772095,
"step": 473
},
{
"completion_length": 354.078125,
"epoch": 0.6899563318777293,
"grad_norm": 0.9429812617452892,
"kl": 0.044189453125,
"learning_rate": 2.3184767838130882e-07,
"loss": -0.002,
"reward": 0.7430534362792969,
"reward_std": 1.0212041139602661,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.8673632740974426,
"step": 474
},
{
"completion_length": 350.171875,
"epoch": 0.6914119359534207,
"grad_norm": 1.2082129742972214,
"kl": 0.04638671875,
"learning_rate": 2.298599925835466e-07,
"loss": 0.0004,
"reward": 0.455533891916275,
"reward_std": 0.4722123444080353,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.9366145730018616,
"step": 475
},
{
"completion_length": 366.515625,
"epoch": 0.6928675400291121,
"grad_norm": 1.0274981124427542,
"kl": 0.048828125,
"learning_rate": 2.2787831768421046e-07,
"loss": 0.0033,
"reward": 0.10612629354000092,
"reward_std": 0.7683642506599426,
"rewards/accuracy_reward": 0.390625,
"rewards/format_reward": 0.9080273509025574,
"step": 476
},
{
"completion_length": 369.484375,
"epoch": 0.6943231441048034,
"grad_norm": 1.0175711783602979,
"kl": 0.04052734375,
"learning_rate": 2.2590269777764514e-07,
"loss": -0.0005,
"reward": 1.336686134338379,
"reward_std": 0.7981055974960327,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.9460612535476685,
"step": 477
},
{
"completion_length": 337.78125,
"epoch": 0.6957787481804949,
"grad_norm": 1.1846182441564443,
"kl": 0.048828125,
"learning_rate": 2.2393317682346479e-07,
"loss": 0.0045,
"reward": 0.29799482226371765,
"reward_std": 0.4017283320426941,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.9656379818916321,
"step": 478
},
{
"completion_length": 342.171875,
"epoch": 0.6972343522561864,
"grad_norm": 1.0582752970242983,
"kl": 0.04443359375,
"learning_rate": 2.219697986455762e-07,
"loss": -0.0034,
"reward": 0.9232031106948853,
"reward_std": 0.38019859790802,
"rewards/accuracy_reward": 0.640625,
"rewards/format_reward": 0.988684892654419,
"step": 479
},
{
"completion_length": 331.15625,
"epoch": 0.6986899563318777,
"grad_norm": 0.956549348435419,
"kl": 0.0537109375,
"learning_rate": 2.2001260693120232e-07,
"loss": -0.0002,
"reward": 0.63113933801651,
"reward_std": 0.21865397691726685,
"rewards/accuracy_reward": 0.546875,
"rewards/format_reward": 0.9857747554779053,
"step": 480
},
{
"completion_length": 320.21875,
"epoch": 0.7001455604075691,
"grad_norm": 1.149313470515847,
"kl": 0.056396484375,
"learning_rate": 2.1806164522991115e-07,
"loss": 0.0058,
"reward": 1.1596614122390747,
"reward_std": 0.2571667730808258,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.9488281011581421,
"step": 481
},
{
"completion_length": 347.265625,
"epoch": 0.7016011644832606,
"grad_norm": 1.0533814952805514,
"kl": 0.048095703125,
"learning_rate": 2.1611695695264605e-07,
"loss": -0.0009,
"reward": 0.5740299224853516,
"reward_std": 0.5495303869247437,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.9683268666267395,
"step": 482
},
{
"completion_length": 331.109375,
"epoch": 0.7030567685589519,
"grad_norm": 0.934218888424527,
"kl": 0.049560546875,
"learning_rate": 2.1417858537076067e-07,
"loss": 0.0015,
"reward": 1.4878125190734863,
"reward_std": 0.37252911925315857,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 0.9549609422683716,
"step": 483
},
{
"completion_length": 359.734375,
"epoch": 0.7045123726346434,
"grad_norm": 1.0518061573140363,
"kl": 0.046142578125,
"learning_rate": 2.122465736150549e-07,
"loss": 0.0049,
"reward": 1.4919662475585938,
"reward_std": 0.26884597539901733,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 0.9607161283493042,
"step": 484
},
{
"completion_length": 340.125,
"epoch": 0.7059679767103348,
"grad_norm": 0.9147536840648907,
"kl": 0.046630859375,
"learning_rate": 2.1032096467481664e-07,
"loss": 0.0008,
"reward": 0.4311913847923279,
"reward_std": 0.18409988284111023,
"rewards/accuracy_reward": 0.484375,
"rewards/format_reward": 0.9738216400146484,
"step": 485
},
{
"completion_length": 348.34375,
"epoch": 0.7074235807860262,
"grad_norm": 1.1058446642922939,
"kl": 0.044677734375,
"learning_rate": 2.0840180139686332e-07,
"loss": 0.0021,
"reward": -0.23982422053813934,
"reward_std": 0.7118780612945557,
"rewards/accuracy_reward": 0.265625,
"rewards/format_reward": 0.941347599029541,
"step": 486
},
{
"completion_length": 348.796875,
"epoch": 0.7088791848617176,
"grad_norm": 0.936587157845369,
"kl": 0.044921875,
"learning_rate": 2.0648912648459072e-07,
"loss": 0.0001,
"reward": 0.8201823234558105,
"reward_std": 0.6160249710083008,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.9445312023162842,
"step": 487
},
{
"completion_length": 350.484375,
"epoch": 0.710334788937409,
"grad_norm": 1.0710239125130703,
"kl": 0.0439453125,
"learning_rate": 2.0458298249702095e-07,
"loss": 0.0004,
"reward": 1.0420703887939453,
"reward_std": 0.5851905345916748,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.9298177361488342,
"step": 488
},
{
"completion_length": 356.015625,
"epoch": 0.7117903930131004,
"grad_norm": 0.8863497690353069,
"kl": 0.04296875,
"learning_rate": 2.026834118478567e-07,
"loss": -0.0044,
"reward": 1.2863867282867432,
"reward_std": 0.7339239716529846,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 0.9426367282867432,
"step": 489
},
{
"completion_length": 345.890625,
"epoch": 0.7132459970887919,
"grad_norm": 1.0595443329712118,
"kl": 0.0498046875,
"learning_rate": 2.007904568045366e-07,
"loss": -0.0012,
"reward": 0.6528515815734863,
"reward_std": 0.4416601359844208,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.9612500071525574,
"step": 490
},
{
"completion_length": 340.859375,
"epoch": 0.7147016011644832,
"grad_norm": 1.102733406546571,
"kl": 0.052001953125,
"learning_rate": 1.9890415948729534e-07,
"loss": -0.0017,
"reward": 1.5185351371765137,
"reward_std": 0.5196734666824341,
"rewards/accuracy_reward": 0.859375,
"rewards/format_reward": 0.93973308801651,
"step": 491
},
{
"completion_length": 344.796875,
"epoch": 0.7161572052401747,
"grad_norm": 1.013811804087309,
"kl": 0.04296875,
"learning_rate": 1.9702456186822592e-07,
"loss": 0.0057,
"reward": 1.5015950202941895,
"reward_std": 0.2588142156600952,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 0.9537825584411621,
"step": 492
},
{
"completion_length": 323.671875,
"epoch": 0.7176128093158661,
"grad_norm": 1.1965785844666434,
"kl": 0.06201171875,
"learning_rate": 1.9515170577034657e-07,
"loss": -0.0009,
"reward": 1.3499219417572021,
"reward_std": 0.564841091632843,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.9540885090827942,
"step": 493
},
{
"completion_length": 337.328125,
"epoch": 0.7190684133915575,
"grad_norm": 1.1320885181605083,
"kl": 0.053955078125,
"learning_rate": 1.93285632866669e-07,
"loss": -0.0003,
"reward": 1.0488346815109253,
"reward_std": 0.430465966463089,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.9381445646286011,
"step": 494
},
{
"completion_length": 336.59375,
"epoch": 0.7205240174672489,
"grad_norm": 0.7656987950423686,
"kl": 0.05029296875,
"learning_rate": 1.914263846792725e-07,
"loss": -0.0014,
"reward": 0.2095833271741867,
"reward_std": 0.22649352252483368,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.9746744632720947,
"step": 495
},
{
"completion_length": 345.4375,
"epoch": 0.7219796215429404,
"grad_norm": 1.0169013561590603,
"kl": 0.043701171875,
"learning_rate": 1.895740025783782e-07,
"loss": -0.0001,
"reward": 1.433619737625122,
"reward_std": 0.5415338277816772,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.9466406106948853,
"step": 496
},
{
"completion_length": 358.96875,
"epoch": 0.7234352256186317,
"grad_norm": 1.036810610063429,
"kl": 0.0458984375,
"learning_rate": 1.8772852778143062e-07,
"loss": -0.0019,
"reward": 1.3713606595993042,
"reward_std": 0.6118265390396118,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 0.9330794811248779,
"step": 497
},
{
"completion_length": 347.625,
"epoch": 0.7248908296943232,
"grad_norm": 0.9756569197859266,
"kl": 0.0478515625,
"learning_rate": 1.858900013521788e-07,
"loss": -0.0005,
"reward": 1.2231640815734863,
"reward_std": 0.06950952857732773,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.9696353673934937,
"step": 498
},
{
"completion_length": 342.609375,
"epoch": 0.7263464337700145,
"grad_norm": 1.104737132675653,
"kl": 0.047119140625,
"learning_rate": 1.8405846419976394e-07,
"loss": 0.0046,
"reward": 1.1284375190734863,
"reward_std": 0.30641376972198486,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.9205468893051147,
"step": 499
},
{
"completion_length": 341.171875,
"epoch": 0.727802037845706,
"grad_norm": 1.0685649265760062,
"kl": 0.047119140625,
"learning_rate": 1.8223395707780786e-07,
"loss": -0.0022,
"reward": 0.746009111404419,
"reward_std": 0.37746497988700867,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.95305335521698,
"step": 500
},
{
"completion_length": 339.109375,
"epoch": 0.7292576419213974,
"grad_norm": 1.1494787457478763,
"kl": 0.06201171875,
"learning_rate": 1.8041652058350766e-07,
"loss": 0.0024,
"reward": 0.6165429353713989,
"reward_std": 1.0153552293777466,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.9255794286727905,
"step": 501
},
{
"completion_length": 347.234375,
"epoch": 0.7307132459970888,
"grad_norm": 1.0597503387535725,
"kl": 0.044677734375,
"learning_rate": 1.7860619515673032e-07,
"loss": 0.0031,
"reward": 1.761705756187439,
"reward_std": 0.34419363737106323,
"rewards/accuracy_reward": 0.921875,
"rewards/format_reward": 0.9930989742279053,
"step": 502
},
{
"completion_length": 358.609375,
"epoch": 0.7321688500727802,
"grad_norm": 1.1777505730146933,
"kl": 0.0498046875,
"learning_rate": 1.7680302107911544e-07,
"loss": 0.0014,
"reward": 0.7594987154006958,
"reward_std": 0.6815189123153687,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.9232096672058105,
"step": 503
},
{
"completion_length": 352.9375,
"epoch": 0.7336244541484717,
"grad_norm": 1.0262394911227826,
"kl": 0.051025390625,
"learning_rate": 1.7500703847317662e-07,
"loss": 0.0005,
"reward": 0.17281901836395264,
"reward_std": 0.6659858226776123,
"rewards/accuracy_reward": 0.421875,
"rewards/format_reward": 0.8982356786727905,
"step": 504
},
{
"completion_length": 366.046875,
"epoch": 0.735080058224163,
"grad_norm": 0.8757486128031936,
"kl": 0.046142578125,
"learning_rate": 1.7321828730141037e-07,
"loss": -0.003,
"reward": 0.9162174463272095,
"reward_std": 0.6034565567970276,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.9449284076690674,
"step": 505
},
{
"completion_length": 350.703125,
"epoch": 0.7365356622998545,
"grad_norm": 1.027601535912833,
"kl": 0.048828125,
"learning_rate": 1.7143680736540572e-07,
"loss": -0.0017,
"reward": 0.7806705236434937,
"reward_std": 0.4177248477935791,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.9474283456802368,
"step": 506
},
{
"completion_length": 345.546875,
"epoch": 0.7379912663755459,
"grad_norm": 0.9635347526020929,
"kl": 0.051025390625,
"learning_rate": 1.6966263830495935e-07,
"loss": 0.0,
"reward": 1.5247917175292969,
"reward_std": 0.18780048191547394,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 0.9935416579246521,
"step": 507
},
{
"completion_length": 347.53125,
"epoch": 0.7394468704512372,
"grad_norm": 1.0624576316371095,
"kl": 0.048095703125,
"learning_rate": 1.6789581959719294e-07,
"loss": 0.0018,
"reward": 0.1759960949420929,
"reward_std": 0.6078127026557922,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.9451627731323242,
"step": 508
},
{
"completion_length": 357.21875,
"epoch": 0.7409024745269287,
"grad_norm": 0.9931955691969966,
"kl": 0.045654296875,
"learning_rate": 1.661363905556758e-07,
"loss": 0.0016,
"reward": 0.9983984231948853,
"reward_std": 0.25032860040664673,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.9772396087646484,
"step": 509
},
{
"completion_length": 335.78125,
"epoch": 0.74235807860262,
"grad_norm": 1.1766955680022009,
"kl": 0.056640625,
"learning_rate": 1.6438439032954853e-07,
"loss": -0.0019,
"reward": 0.9316536784172058,
"reward_std": 0.9252973794937134,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.9565234184265137,
"step": 510
},
{
"completion_length": 351.8125,
"epoch": 0.7438136826783115,
"grad_norm": 1.0104315217544244,
"kl": 0.047119140625,
"learning_rate": 1.6263985790265383e-07,
"loss": -0.0008,
"reward": 0.7268945574760437,
"reward_std": 0.5565764904022217,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.9422982335090637,
"step": 511
},
{
"completion_length": 339.59375,
"epoch": 0.745269286754003,
"grad_norm": 1.0913421651153437,
"kl": 0.053955078125,
"learning_rate": 1.609028320926668e-07,
"loss": 0.0001,
"reward": 0.7289843559265137,
"reward_std": 0.42006969451904297,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.9395182132720947,
"step": 512
},
{
"completion_length": 328.71875,
"epoch": 0.7467248908296943,
"grad_norm": 1.3247436820269498,
"kl": 0.056396484375,
"learning_rate": 1.5917335155023366e-07,
"loss": -0.0024,
"reward": 1.460852861404419,
"reward_std": 0.3853009045124054,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.9632877707481384,
"step": 513
},
{
"completion_length": 343.625,
"epoch": 0.7481804949053857,
"grad_norm": 0.8746716302390848,
"kl": 0.04541015625,
"learning_rate": 1.574514547581095e-07,
"loss": 0.0023,
"reward": 0.48206380009651184,
"reward_std": 0.2509358525276184,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.9288216233253479,
"step": 514
},
{
"completion_length": 341.65625,
"epoch": 0.7496360989810772,
"grad_norm": 1.1764333370638496,
"kl": 0.046875,
"learning_rate": 1.557371800303039e-07,
"loss": -0.0049,
"reward": 1.1197460889816284,
"reward_std": 0.38245856761932373,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.9527279138565063,
"step": 515
},
{
"completion_length": 347.953125,
"epoch": 0.7510917030567685,
"grad_norm": 0.9861112517300733,
"kl": 0.055419921875,
"learning_rate": 1.5403056551122694e-07,
"loss": -0.0008,
"reward": 1.1204687356948853,
"reward_std": 0.37004244327545166,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.96037757396698,
"step": 516
},
{
"completion_length": 344.5,
"epoch": 0.75254730713246,
"grad_norm": 1.1969290490481124,
"kl": 0.04833984375,
"learning_rate": 1.5233164917484114e-07,
"loss": -0.0003,
"reward": 1.2382487058639526,
"reward_std": 0.022989844903349876,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.9881705641746521,
"step": 517
},
{
"completion_length": 351.859375,
"epoch": 0.7540029112081513,
"grad_norm": 0.8687338106556315,
"kl": 0.044677734375,
"learning_rate": 1.5064046882381626e-07,
"loss": 0.0009,
"reward": 0.549817681312561,
"reward_std": 0.1450074017047882,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.9933593273162842,
"step": 518
},
{
"completion_length": 342.375,
"epoch": 0.7554585152838428,
"grad_norm": 1.1222413107637277,
"kl": 0.051513671875,
"learning_rate": 1.4895706208868876e-07,
"loss": 0.0008,
"reward": 0.9473632574081421,
"reward_std": 0.6324166655540466,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.9235742092132568,
"step": 519
},
{
"completion_length": 361.09375,
"epoch": 0.7569141193595342,
"grad_norm": 0.8767855411501952,
"kl": 0.04248046875,
"learning_rate": 1.4728146642702338e-07,
"loss": 0.0043,
"reward": 0.6358333826065063,
"reward_std": 0.42567020654678345,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.9458463788032532,
"step": 520
},
{
"completion_length": 339.40625,
"epoch": 0.7583697234352256,
"grad_norm": 1.1957414378411515,
"kl": 0.046875,
"learning_rate": 1.4561371912258098e-07,
"loss": 0.0018,
"reward": 1.0426563024520874,
"reward_std": 0.49093982577323914,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.972434937953949,
"step": 521
},
{
"completion_length": 356.4375,
"epoch": 0.759825327510917,
"grad_norm": 1.1172524921233873,
"kl": 0.05419921875,
"learning_rate": 1.4395385728448727e-07,
"loss": -0.0025,
"reward": 1.7248958349227905,
"reward_std": 0.41973060369491577,
"rewards/accuracy_reward": 0.921875,
"rewards/format_reward": 0.9582551717758179,
"step": 522
},
{
"completion_length": 338.921875,
"epoch": 0.7612809315866085,
"grad_norm": 1.0582473955837757,
"kl": 0.044921875,
"learning_rate": 1.423019178464091e-07,
"loss": 0.0002,
"reward": 1.3815103769302368,
"reward_std": 0.5760092735290527,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 0.9437500238418579,
"step": 523
},
{
"completion_length": 343.59375,
"epoch": 0.7627365356622998,
"grad_norm": 0.9984931203320635,
"kl": 0.046875,
"learning_rate": 1.406579375657308e-07,
"loss": 0.0039,
"reward": 0.6505143642425537,
"reward_std": 0.33663293719291687,
"rewards/accuracy_reward": 0.578125,
"rewards/format_reward": 0.9126237034797668,
"step": 524
},
{
"completion_length": 341.40625,
"epoch": 0.7641921397379913,
"grad_norm": 1.1754051815019624,
"kl": 0.051513671875,
"learning_rate": 1.3902195302273778e-07,
"loss": -0.004,
"reward": 1.7554621696472168,
"reward_std": 0.6817976236343384,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 0.9429622888565063,
"step": 525
},
{
"completion_length": 333.265625,
"epoch": 0.7656477438136827,
"grad_norm": 1.0633972900525368,
"kl": 0.05078125,
"learning_rate": 1.373940006198014e-07,
"loss": 0.0001,
"reward": 1.441927194595337,
"reward_std": 0.5073419809341431,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.9575521349906921,
"step": 526
},
{
"completion_length": 338.953125,
"epoch": 0.7671033478893741,
"grad_norm": 0.98335951118238,
"kl": 0.0478515625,
"learning_rate": 1.3577411658056965e-07,
"loss": 0.0003,
"reward": 0.7285221219062805,
"reward_std": 0.41777336597442627,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.9417122602462769,
"step": 527
},
{
"completion_length": 342.265625,
"epoch": 0.7685589519650655,
"grad_norm": 1.104214294364568,
"kl": 0.048828125,
"learning_rate": 1.3416233694916086e-07,
"loss": -0.0018,
"reward": 1.0809439420700073,
"reward_std": 0.498748779296875,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.9592252969741821,
"step": 528
},
{
"completion_length": 365.03125,
"epoch": 0.7700145560407569,
"grad_norm": 0.8864241710102599,
"kl": 0.05322265625,
"learning_rate": 1.325586975893621e-07,
"loss": -0.001,
"reward": 1.0926563739776611,
"reward_std": 0.29806235432624817,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.9340364933013916,
"step": 529
},
{
"completion_length": 355.90625,
"epoch": 0.7714701601164483,
"grad_norm": 0.8844305655178724,
"kl": 0.045654296875,
"learning_rate": 1.3096323418383043e-07,
"loss": 0.0012,
"reward": 1.4333789348602295,
"reward_std": 0.4198879301548004,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.9490039348602295,
"step": 530
},
{
"completion_length": 343.890625,
"epoch": 0.7729257641921398,
"grad_norm": 1.253496882295048,
"kl": 0.048828125,
"learning_rate": 1.2937598223330005e-07,
"loss": 0.0014,
"reward": 1.2209309339523315,
"reward_std": 0.08089350908994675,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.96630859375,
"step": 531
},
{
"completion_length": 338.375,
"epoch": 0.7743813682678311,
"grad_norm": 1.1782096109944808,
"kl": 0.046630859375,
"learning_rate": 1.2779697705579058e-07,
"loss": 0.0036,
"reward": 0.9942382574081421,
"reward_std": 0.33707913756370544,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.97488933801651,
"step": 532
},
{
"completion_length": 356.609375,
"epoch": 0.7758369723435226,
"grad_norm": 1.0305776858800557,
"kl": 0.04345703125,
"learning_rate": 1.262262537858233e-07,
"loss": -0.0023,
"reward": 0.9166210889816284,
"reward_std": 0.7489937543869019,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.9442252516746521,
"step": 533
},
{
"completion_length": 366.984375,
"epoch": 0.777292576419214,
"grad_norm": 1.0309599470321495,
"kl": 0.043212890625,
"learning_rate": 1.2466384737363779e-07,
"loss": 0.0028,
"reward": 0.9351236820220947,
"reward_std": 1.2396140098571777,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.9184830784797668,
"step": 534
},
{
"completion_length": 347.921875,
"epoch": 0.7787481804949054,
"grad_norm": 0.9649435296818741,
"kl": 0.05078125,
"learning_rate": 1.231097925844153e-07,
"loss": -0.003,
"reward": 1.10358726978302,
"reward_std": 0.4084942936897278,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.94621741771698,
"step": 535
},
{
"completion_length": 353.625,
"epoch": 0.7802037845705968,
"grad_norm": 1.2290305710541891,
"kl": 0.050537109375,
"learning_rate": 1.215641239975042e-07,
"loss": 0.004,
"reward": 0.35907554626464844,
"reward_std": 0.4438338279724121,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.9410285949707031,
"step": 536
},
{
"completion_length": 356.546875,
"epoch": 0.7816593886462883,
"grad_norm": 0.7192201796065968,
"kl": 0.046142578125,
"learning_rate": 1.2002687600565137e-07,
"loss": -0.0018,
"reward": 1.6950325965881348,
"reward_std": 0.20082132518291473,
"rewards/accuracy_reward": 0.90625,
"rewards/format_reward": 0.97628253698349,
"step": 537
},
{
"completion_length": 348.703125,
"epoch": 0.7831149927219796,
"grad_norm": 1.0661868149196754,
"kl": 0.052734375,
"learning_rate": 1.1849808281423635e-07,
"loss": 0.002,
"reward": 1.1826952695846558,
"reward_std": 0.5324006080627441,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.9778385162353516,
"step": 538
},
{
"completion_length": 354.0625,
"epoch": 0.784570596797671,
"grad_norm": 1.0553040942110303,
"kl": 0.049072265625,
"learning_rate": 1.1697777844051104e-07,
"loss": -0.0017,
"reward": 1.595253825187683,
"reward_std": 0.9109958410263062,
"rewards/accuracy_reward": 0.890625,
"rewards/format_reward": 0.9233788847923279,
"step": 539
},
{
"completion_length": 352.875,
"epoch": 0.7860262008733624,
"grad_norm": 0.8135854803568479,
"kl": 0.052734375,
"learning_rate": 1.1546599671284158e-07,
"loss": 0.0048,
"reward": 0.06436197459697723,
"reward_std": 0.49560049176216125,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.9329817295074463,
"step": 540
},
{
"completion_length": 356.609375,
"epoch": 0.7874818049490538,
"grad_norm": 1.098756609309082,
"kl": 0.043701171875,
"learning_rate": 1.1396277126995707e-07,
"loss": -0.0004,
"reward": 0.7316992282867432,
"reward_std": 0.6466116905212402,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.9482096433639526,
"step": 541
},
{
"completion_length": 360.78125,
"epoch": 0.7889374090247453,
"grad_norm": 1.0121213548186219,
"kl": 0.046630859375,
"learning_rate": 1.1246813556019924e-07,
"loss": -0.0013,
"reward": 1.4149739742279053,
"reward_std": 0.7345938086509705,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.9305989742279053,
"step": 542
},
{
"completion_length": 351.8125,
"epoch": 0.7903930131004366,
"grad_norm": 1.1589095869023704,
"kl": 0.04736328125,
"learning_rate": 1.1098212284078035e-07,
"loss": -0.0025,
"reward": 1.481673240661621,
"reward_std": 0.5085718631744385,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.9943815469741821,
"step": 543
},
{
"completion_length": 345.8125,
"epoch": 0.7918486171761281,
"grad_norm": 0.9205851416631115,
"kl": 0.048583984375,
"learning_rate": 1.0950476617704124e-07,
"loss": 0.0009,
"reward": 1.3531510829925537,
"reward_std": 0.41115716099739075,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.9585286378860474,
"step": 544
},
{
"completion_length": 364.125,
"epoch": 0.7933042212518195,
"grad_norm": 0.9830413575226257,
"kl": 0.041748046875,
"learning_rate": 1.0803609844171719e-07,
"loss": -0.0028,
"reward": 0.51806640625,
"reward_std": 0.560853123664856,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.9600846767425537,
"step": 545
},
{
"completion_length": 363.234375,
"epoch": 0.7947598253275109,
"grad_norm": 0.9801296629236323,
"kl": 0.04736328125,
"learning_rate": 1.0657615231420491e-07,
"loss": 0.0013,
"reward": 0.36118483543395996,
"reward_std": 0.406131386756897,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.9475390315055847,
"step": 546
},
{
"completion_length": 372.859375,
"epoch": 0.7962154294032023,
"grad_norm": 0.8933107929367656,
"kl": 0.0458984375,
"learning_rate": 1.0512496027983714e-07,
"loss": 0.0033,
"reward": 0.27970701456069946,
"reward_std": 0.969652533531189,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.871985673904419,
"step": 547
},
{
"completion_length": 376.1875,
"epoch": 0.7976710334788938,
"grad_norm": 0.8824515938688806,
"kl": 0.04541015625,
"learning_rate": 1.0368255462915765e-07,
"loss": 0.0006,
"reward": 0.76199871301651,
"reward_std": 0.5525078773498535,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.9308398365974426,
"step": 548
},
{
"completion_length": 372.171875,
"epoch": 0.7991266375545851,
"grad_norm": 0.9171428811714127,
"kl": 0.04541015625,
"learning_rate": 1.0224896745720512e-07,
"loss": 0.002,
"reward": 1.1354882717132568,
"reward_std": 1.028761863708496,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.8847330808639526,
"step": 549
},
{
"completion_length": 374.25,
"epoch": 0.8005822416302766,
"grad_norm": 0.9349834701869527,
"kl": 0.04931640625,
"learning_rate": 1.00824230662797e-07,
"loss": 0.0021,
"reward": 0.27811199426651,
"reward_std": 1.2048439979553223,
"rewards/accuracy_reward": 0.453125,
"rewards/format_reward": 0.9164583683013916,
"step": 550
},
{
"completion_length": 354.34375,
"epoch": 0.8020378457059679,
"grad_norm": 1.0746749824810193,
"kl": 0.04443359375,
"learning_rate": 9.940837594782125e-08,
"loss": 0.001,
"reward": 0.9130924344062805,
"reward_std": 0.9886895418167114,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.9405273199081421,
"step": 551
},
{
"completion_length": 369.28125,
"epoch": 0.8034934497816594,
"grad_norm": 0.9674452242381952,
"kl": 0.04443359375,
"learning_rate": 9.800143481652979e-08,
"loss": -0.0007,
"reward": 0.9438866972923279,
"reward_std": 0.6287566423416138,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.9270898103713989,
"step": 552
},
{
"completion_length": 375.828125,
"epoch": 0.8049490538573508,
"grad_norm": 0.9793357755444789,
"kl": 0.042236328125,
"learning_rate": 9.660343857483799e-08,
"loss": -0.0034,
"reward": 1.215846300125122,
"reward_std": 1.2285206317901611,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 0.8707551956176758,
"step": 553
},
{
"completion_length": 355.03125,
"epoch": 0.8064046579330422,
"grad_norm": 0.9372538556562692,
"kl": 0.045166015625,
"learning_rate": 9.521441832962801e-08,
"loss": -0.0035,
"reward": 0.5855793952941895,
"reward_std": 0.4810252785682678,
"rewards/accuracy_reward": 0.546875,
"rewards/format_reward": 0.9439908862113953,
"step": 554
},
{
"completion_length": 340.40625,
"epoch": 0.8078602620087336,
"grad_norm": 1.131677567848908,
"kl": 0.0478515625,
"learning_rate": 9.383440498805712e-08,
"loss": 0.0006,
"reward": 1.6682096719741821,
"reward_std": 0.6599315404891968,
"rewards/accuracy_reward": 0.921875,
"rewards/format_reward": 0.9025846719741821,
"step": 555
},
{
"completion_length": 360.125,
"epoch": 0.8093158660844251,
"grad_norm": 0.9570116291494392,
"kl": 0.048583984375,
"learning_rate": 9.246342925686884e-08,
"loss": -0.0017,
"reward": 0.6679362058639526,
"reward_std": 0.5891071557998657,
"rewards/accuracy_reward": 0.578125,
"rewards/format_reward": 0.9264127612113953,
"step": 556
},
{
"completion_length": 350.296875,
"epoch": 0.8107714701601164,
"grad_norm": 1.0041594196588526,
"kl": 0.047607421875,
"learning_rate": 9.110152164171125e-08,
"loss": 0.0003,
"reward": 1.5817317962646484,
"reward_std": 0.5429558157920837,
"rewards/accuracy_reward": 0.875,
"rewards/format_reward": 0.9567317962646484,
"step": 557
},
{
"completion_length": 346.484375,
"epoch": 0.8122270742358079,
"grad_norm": 1.1612136223210474,
"kl": 0.045654296875,
"learning_rate": 8.974871244645626e-08,
"loss": -0.0012,
"reward": 1.318763017654419,
"reward_std": 0.8850969672203064,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.9250651001930237,
"step": 558
},
{
"completion_length": 346.96875,
"epoch": 0.8136826783114993,
"grad_norm": 0.9594471396023049,
"kl": 0.05224609375,
"learning_rate": 8.840503177252745e-08,
"loss": -0.0016,
"reward": 1.168027400970459,
"reward_std": 0.8090516328811646,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.9134830832481384,
"step": 559
},
{
"completion_length": 355.625,
"epoch": 0.8151382823871907,
"grad_norm": 1.165829749030139,
"kl": 0.0458984375,
"learning_rate": 8.707050951822842e-08,
"loss": -0.0029,
"reward": 0.23499347269535065,
"reward_std": 0.6717128753662109,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.9124023914337158,
"step": 560
},
{
"completion_length": 348.375,
"epoch": 0.8165938864628821,
"grad_norm": 0.8797964228562034,
"kl": 0.0439453125,
"learning_rate": 8.574517537807896e-08,
"loss": 0.0035,
"reward": 0.8617708086967468,
"reward_std": 0.06106572225689888,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.9768619537353516,
"step": 561
},
{
"completion_length": 356.109375,
"epoch": 0.8180494905385735,
"grad_norm": 0.9540926767615193,
"kl": 0.04833984375,
"learning_rate": 8.442905884215329e-08,
"loss": 0.0001,
"reward": 0.7149023413658142,
"reward_std": 0.884148895740509,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.9297460913658142,
"step": 562
},
{
"completion_length": 337.484375,
"epoch": 0.8195050946142649,
"grad_norm": 1.0541518957324363,
"kl": 0.0498046875,
"learning_rate": 8.31221891954243e-08,
"loss": -0.0055,
"reward": 0.9072656631469727,
"reward_std": 0.35776764154434204,
"rewards/accuracy_reward": 0.640625,
"rewards/format_reward": 0.9742317199707031,
"step": 563
},
{
"completion_length": 349.96875,
"epoch": 0.8209606986899564,
"grad_norm": 1.0803926660638092,
"kl": 0.047119140625,
"learning_rate": 8.182459551711197e-08,
"loss": 0.001,
"reward": 1.44936203956604,
"reward_std": 0.49504512548446655,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.9640494585037231,
"step": 564
},
{
"completion_length": 350.421875,
"epoch": 0.8224163027656477,
"grad_norm": 1.0513767136216559,
"kl": 0.0576171875,
"learning_rate": 8.053630668003642e-08,
"loss": 0.0045,
"reward": 0.4982747435569763,
"reward_std": 0.013658922165632248,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.9944205284118652,
"step": 565
},
{
"completion_length": 372.59375,
"epoch": 0.8238719068413392,
"grad_norm": 0.9692434119961149,
"kl": 0.04833984375,
"learning_rate": 7.925735134997491e-08,
"loss": 0.0001,
"reward": 0.5592772960662842,
"reward_std": 0.5283293724060059,
"rewards/accuracy_reward": 0.546875,
"rewards/format_reward": 0.9178320169448853,
"step": 566
},
{
"completion_length": 342.453125,
"epoch": 0.8253275109170306,
"grad_norm": 1.0913320663341668,
"kl": 0.048095703125,
"learning_rate": 7.798775798502482e-08,
"loss": 0.0006,
"reward": 1.8661328554153442,
"reward_std": 0.2512561082839966,
"rewards/accuracy_reward": 0.96875,
"rewards/format_reward": 0.9598828554153442,
"step": 567
},
{
"completion_length": 344.15625,
"epoch": 0.826783114992722,
"grad_norm": 1.0413698797681266,
"kl": 0.04638671875,
"learning_rate": 7.672755483496979e-08,
"loss": 0.0034,
"reward": 1.9249870777130127,
"reward_std": 0.19554658234119415,
"rewards/accuracy_reward": 0.984375,
"rewards/format_reward": 0.9718619585037231,
"step": 568
},
{
"completion_length": 355.34375,
"epoch": 0.8282387190684134,
"grad_norm": 1.1642983816622974,
"kl": 0.04541015625,
"learning_rate": 7.547676994065116e-08,
"loss": -0.0025,
"reward": 1.4583983421325684,
"reward_std": 0.6337254643440247,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.9729557037353516,
"step": 569
},
{
"completion_length": 358.421875,
"epoch": 0.8296943231441049,
"grad_norm": 0.8768810528600219,
"kl": 0.045654296875,
"learning_rate": 7.423543113334435e-08,
"loss": -0.0034,
"reward": 0.9223567843437195,
"reward_std": 0.24935288727283478,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.9516797065734863,
"step": 570
},
{
"completion_length": 377.25,
"epoch": 0.8311499272197962,
"grad_norm": 0.9846837227637683,
"kl": 0.04443359375,
"learning_rate": 7.300356603413965e-08,
"loss": 0.0001,
"reward": 1.3689582347869873,
"reward_std": 1.1683030128479004,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.8845832347869873,
"step": 571
},
{
"completion_length": 360.484375,
"epoch": 0.8326055312954876,
"grad_norm": 1.008771276578655,
"kl": 0.051025390625,
"learning_rate": 7.178120205332716e-08,
"loss": -0.0002,
"reward": 1.635097622871399,
"reward_std": 0.7983198165893555,
"rewards/accuracy_reward": 0.90625,
"rewards/format_reward": 0.9163476228713989,
"step": 572
},
{
"completion_length": 352.296875,
"epoch": 0.834061135371179,
"grad_norm": 1.1642594254533514,
"kl": 0.0498046875,
"learning_rate": 7.056836638978696e-08,
"loss": 0.0036,
"reward": 0.44958335161209106,
"reward_std": 0.6141500473022461,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.9442839026451111,
"step": 573
},
{
"completion_length": 336.0625,
"epoch": 0.8355167394468704,
"grad_norm": 1.0723516782661382,
"kl": 0.05029296875,
"learning_rate": 6.936508603038465e-08,
"loss": 0.0009,
"reward": 1.4923112392425537,
"reward_std": 0.3686758875846863,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 0.9570898413658142,
"step": 574
},
{
"completion_length": 362.203125,
"epoch": 0.8369723435225619,
"grad_norm": 0.8489752039238883,
"kl": 0.04443359375,
"learning_rate": 6.817138774936975e-08,
"loss": -0.0025,
"reward": 1.5311849117279053,
"reward_std": 0.48614656925201416,
"rewards/accuracy_reward": 0.859375,
"rewards/format_reward": 0.952552080154419,
"step": 575
},
{
"completion_length": 321.90625,
"epoch": 0.8384279475982532,
"grad_norm": 1.2381358943286065,
"kl": 0.057861328125,
"learning_rate": 6.698729810778064e-08,
"loss": 0.0007,
"reward": 1.2007226943969727,
"reward_std": 0.3871755599975586,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.9507226943969727,
"step": 576
},
{
"completion_length": 345.765625,
"epoch": 0.8398835516739447,
"grad_norm": 0.9963944986671528,
"kl": 0.04638671875,
"learning_rate": 6.58128434528537e-08,
"loss": -0.0006,
"reward": 0.7609505653381348,
"reward_std": 0.811994194984436,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.9278906583786011,
"step": 577
},
{
"completion_length": 345.453125,
"epoch": 0.8413391557496361,
"grad_norm": 0.9963445120426506,
"kl": 0.0498046875,
"learning_rate": 6.464804991743628e-08,
"loss": 0.0036,
"reward": 1.5395898818969727,
"reward_std": 0.23713621497154236,
"rewards/accuracy_reward": 0.859375,
"rewards/format_reward": 0.959876298904419,
"step": 578
},
{
"completion_length": 357.3125,
"epoch": 0.8427947598253275,
"grad_norm": 0.9805282214375126,
"kl": 0.048095703125,
"learning_rate": 6.349294341940592e-08,
"loss": 0.0006,
"reward": 1.4974348545074463,
"reward_std": 0.36625921726226807,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 0.9642578363418579,
"step": 579
},
{
"completion_length": 368.546875,
"epoch": 0.8442503639010189,
"grad_norm": 0.9270265739588998,
"kl": 0.043701171875,
"learning_rate": 6.234754966109351e-08,
"loss": -0.001,
"reward": 1.1266862154006958,
"reward_std": 0.35941964387893677,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.9662825465202332,
"step": 580
},
{
"completion_length": 338.53125,
"epoch": 0.8457059679767104,
"grad_norm": 1.0906414405939546,
"kl": 0.049072265625,
"learning_rate": 6.12118941287112e-08,
"loss": 0.002,
"reward": 1.5338281393051147,
"reward_std": 0.24283231794834137,
"rewards/accuracy_reward": 0.859375,
"rewards/format_reward": 0.9543749690055847,
"step": 581
},
{
"completion_length": 343.84375,
"epoch": 0.8471615720524017,
"grad_norm": 0.9818201939354958,
"kl": 0.04638671875,
"learning_rate": 6.008600209178538e-08,
"loss": -0.0039,
"reward": 1.5875390768051147,
"reward_std": 0.33346858620643616,
"rewards/accuracy_reward": 0.875,
"rewards/format_reward": 0.9625390768051147,
"step": 582
},
{
"completion_length": 375.671875,
"epoch": 0.8486171761280932,
"grad_norm": 1.0416004517786164,
"kl": 0.04736328125,
"learning_rate": 5.8969898602594325e-08,
"loss": 0.0057,
"reward": 0.5021549463272095,
"reward_std": 0.6197409629821777,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.9480664134025574,
"step": 583
},
{
"completion_length": 368.109375,
"epoch": 0.8500727802037845,
"grad_norm": 0.8470887591348754,
"kl": 0.048828125,
"learning_rate": 5.786360849561117e-08,
"loss": 0.0019,
"reward": 1.3838281631469727,
"reward_std": 0.6718348264694214,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 0.9463281631469727,
"step": 584
},
{
"completion_length": 379.875,
"epoch": 0.851528384279476,
"grad_norm": 0.9181167404523447,
"kl": 0.044921875,
"learning_rate": 5.676715638695062e-08,
"loss": 0.0009,
"reward": 0.4963216185569763,
"reward_std": 0.826927900314331,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.9025716185569763,
"step": 585
},
{
"completion_length": 366.78125,
"epoch": 0.8529839883551674,
"grad_norm": 1.2129962706817803,
"kl": 0.05419921875,
"learning_rate": 5.5680566673822096e-08,
"loss": -0.0009,
"reward": 0.641100287437439,
"reward_std": 1.051483154296875,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.9454361796379089,
"step": 586
},
{
"completion_length": 352.34375,
"epoch": 0.8544395924308588,
"grad_norm": 0.9410634324985974,
"kl": 0.04931640625,
"learning_rate": 5.4603863533985825e-08,
"loss": -0.0023,
"reward": 1.0430793762207031,
"reward_std": 0.46832895278930664,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.9298372268676758,
"step": 587
},
{
"completion_length": 362.78125,
"epoch": 0.8558951965065502,
"grad_norm": 1.0843006668744697,
"kl": 0.044921875,
"learning_rate": 5.353707092521581e-08,
"loss": 0.0068,
"reward": 1.073815107345581,
"reward_std": 0.4660176932811737,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.9153515100479126,
"step": 588
},
{
"completion_length": 348.21875,
"epoch": 0.8573508005822417,
"grad_norm": 1.0292956885097058,
"kl": 0.048095703125,
"learning_rate": 5.2480212584766035e-08,
"loss": -0.0002,
"reward": 1.5367252826690674,
"reward_std": 0.8215746879577637,
"rewards/accuracy_reward": 0.875,
"rewards/format_reward": 0.9117252826690674,
"step": 589
},
{
"completion_length": 356.265625,
"epoch": 0.858806404657933,
"grad_norm": 0.9099297684040104,
"kl": 0.04638671875,
"learning_rate": 5.143331202884299e-08,
"loss": -0.0024,
"reward": 1.44970703125,
"reward_std": 0.39304301142692566,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.9629882574081421,
"step": 590
},
{
"completion_length": 364.65625,
"epoch": 0.8602620087336245,
"grad_norm": 0.9309941077444687,
"kl": 0.048583984375,
"learning_rate": 5.039639255208156e-08,
"loss": -0.0002,
"reward": 1.375429630279541,
"reward_std": 0.5939683318138123,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 0.9332812428474426,
"step": 591
},
{
"completion_length": 367.625,
"epoch": 0.8617176128093159,
"grad_norm": 0.8096322535425639,
"kl": 0.044921875,
"learning_rate": 4.9369477227027614e-08,
"loss": -0.0,
"reward": 1.422376275062561,
"reward_std": 0.4559106230735779,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.9346680045127869,
"step": 592
},
{
"completion_length": 348.34375,
"epoch": 0.8631732168850073,
"grad_norm": 0.9888972852645406,
"kl": 0.047119140625,
"learning_rate": 4.835258890362387e-08,
"loss": 0.0014,
"reward": 0.8151302337646484,
"reward_std": 0.5511770844459534,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.9372526407241821,
"step": 593
},
{
"completion_length": 359.03125,
"epoch": 0.8646288209606987,
"grad_norm": 1.0504733411057308,
"kl": 0.050537109375,
"learning_rate": 4.7345750208701684e-08,
"loss": -0.0021,
"reward": 1.6032031774520874,
"reward_std": 0.6642707586288452,
"rewards/accuracy_reward": 0.890625,
"rewards/format_reward": 0.9313281178474426,
"step": 594
},
{
"completion_length": 360.5625,
"epoch": 0.86608442503639,
"grad_norm": 0.9767718704060626,
"kl": 0.047119140625,
"learning_rate": 4.634898354547778e-08,
"loss": 0.0029,
"reward": 0.6824283599853516,
"reward_std": 0.3918249011039734,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.8971809148788452,
"step": 595
},
{
"completion_length": 365.375,
"epoch": 0.8675400291120815,
"grad_norm": 0.7220183880376431,
"kl": 0.0419921875,
"learning_rate": 4.536231109305577e-08,
"loss": -0.0012,
"reward": 1.7962956428527832,
"reward_std": 0.3715449273586273,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 0.9828841686248779,
"step": 596
},
{
"completion_length": 369.734375,
"epoch": 0.868995633187773,
"grad_norm": 1.013373247031297,
"kl": 0.048828125,
"learning_rate": 4.4385754805932095e-08,
"loss": 0.0003,
"reward": 1.1727409362792969,
"reward_std": 0.9622257351875305,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.9174544215202332,
"step": 597
},
{
"completion_length": 373.765625,
"epoch": 0.8704512372634643,
"grad_norm": 0.9611521678835144,
"kl": 0.0478515625,
"learning_rate": 4.341933641350842e-08,
"loss": 0.0004,
"reward": 0.015071600675582886,
"reward_std": 0.68045574426651,
"rewards/accuracy_reward": 0.359375,
"rewards/format_reward": 0.9341601729393005,
"step": 598
},
{
"completion_length": 354.03125,
"epoch": 0.8719068413391557,
"grad_norm": 1.1525733336623423,
"kl": 0.046630859375,
"learning_rate": 4.2463077419606976e-08,
"loss": 0.0017,
"reward": 1.3347785472869873,
"reward_std": 0.567120373249054,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.9418619871139526,
"step": 599
},
{
"completion_length": 363.328125,
"epoch": 0.8733624454148472,
"grad_norm": 1.0773160423814951,
"kl": 0.048095703125,
"learning_rate": 4.151699910199336e-08,
"loss": 0.0036,
"reward": 0.6707291603088379,
"reward_std": 0.8057493567466736,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.8792187571525574,
"step": 600
},
{
"completion_length": 359.28125,
"epoch": 0.8748180494905385,
"grad_norm": 0.9746476457350777,
"kl": 0.0537109375,
"learning_rate": 4.058112251190193e-08,
"loss": -0.0024,
"reward": 0.4454817771911621,
"reward_std": 0.7263391613960266,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.8914192318916321,
"step": 601
},
{
"completion_length": 380.09375,
"epoch": 0.87627365356623,
"grad_norm": 0.9946978532938755,
"kl": 0.046630859375,
"learning_rate": 3.9655468473568435e-08,
"loss": -0.0026,
"reward": 0.7158983945846558,
"reward_std": 0.7142089009284973,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.9306380152702332,
"step": 602
},
{
"completion_length": 370.9375,
"epoch": 0.8777292576419214,
"grad_norm": 1.0502943403626153,
"kl": 0.0498046875,
"learning_rate": 3.8740057583765694e-08,
"loss": -0.0026,
"reward": 1.6357030868530273,
"reward_std": 0.7968278527259827,
"rewards/accuracy_reward": 0.90625,
"rewards/format_reward": 0.9169531464576721,
"step": 603
},
{
"completion_length": 367.421875,
"epoch": 0.8791848617176128,
"grad_norm": 0.8640177808219943,
"kl": 0.048828125,
"learning_rate": 3.783491021134588e-08,
"loss": 0.0039,
"reward": 1.4072265625,
"reward_std": 0.551764965057373,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.9208593368530273,
"step": 604
},
{
"completion_length": 366.359375,
"epoch": 0.8806404657933042,
"grad_norm": 0.8946058550020072,
"kl": 0.053466796875,
"learning_rate": 3.694004649678706e-08,
"loss": -0.0015,
"reward": 1.261816382408142,
"reward_std": 0.89951491355896,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 0.9160351753234863,
"step": 605
},
{
"completion_length": 346.609375,
"epoch": 0.8820960698689956,
"grad_norm": 0.9508950509942107,
"kl": 0.046142578125,
"learning_rate": 3.6055486351745324e-08,
"loss": 0.0035,
"reward": 1.8074610233306885,
"reward_std": 0.5373015999794006,
"rewards/accuracy_reward": 0.953125,
"rewards/format_reward": 0.9480859041213989,
"step": 606
},
{
"completion_length": 359.171875,
"epoch": 0.883551673944687,
"grad_norm": 1.0635120558945663,
"kl": 0.050537109375,
"learning_rate": 3.51812494586114e-08,
"loss": 0.0011,
"reward": -0.21584634482860565,
"reward_std": 0.69797682762146,
"rewards/accuracy_reward": 0.296875,
"rewards/format_reward": 0.8804166316986084,
"step": 607
},
{
"completion_length": 359.65625,
"epoch": 0.8850072780203785,
"grad_norm": 0.9616362685575608,
"kl": 0.0478515625,
"learning_rate": 3.4317355270072954e-08,
"loss": 0.0031,
"reward": 0.5914518237113953,
"reward_std": 0.7840473651885986,
"rewards/accuracy_reward": 0.546875,
"rewards/format_reward": 0.9474804997444153,
"step": 608
},
{
"completion_length": 374.515625,
"epoch": 0.8864628820960698,
"grad_norm": 0.9665570781215977,
"kl": 0.0537109375,
"learning_rate": 3.3463823008681334e-08,
"loss": 0.0009,
"reward": 0.6901432275772095,
"reward_std": 0.44793906807899475,
"rewards/accuracy_reward": 0.578125,
"rewards/format_reward": 0.9485155940055847,
"step": 609
},
{
"completion_length": 348.96875,
"epoch": 0.8879184861717613,
"grad_norm": 1.1514296655136664,
"kl": 0.056396484375,
"learning_rate": 3.2620671666424515e-08,
"loss": -0.0005,
"reward": 1.0574414730072021,
"reward_std": 0.2184952199459076,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.9922200441360474,
"step": 610
},
{
"completion_length": 386.796875,
"epoch": 0.8893740902474527,
"grad_norm": 0.8572317909478371,
"kl": 0.046630859375,
"learning_rate": 3.17879200043038e-08,
"loss": -0.0004,
"reward": 0.48298177123069763,
"reward_std": 0.4573482573032379,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.93610680103302,
"step": 611
},
{
"completion_length": 364.4375,
"epoch": 0.8908296943231441,
"grad_norm": 0.80677467402995,
"kl": 0.046142578125,
"learning_rate": 3.0965586551917054e-08,
"loss": 0.0023,
"reward": 0.9373893141746521,
"reward_std": 0.7911602258682251,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.9207747578620911,
"step": 612
},
{
"completion_length": 377.734375,
"epoch": 0.8922852983988355,
"grad_norm": 0.7229029396190814,
"kl": 0.046875,
"learning_rate": 3.015368960704584e-08,
"loss": 0.0026,
"reward": 1.031211018562317,
"reward_std": 0.7297381162643433,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.91817706823349,
"step": 613
},
{
"completion_length": 348.046875,
"epoch": 0.893740902474527,
"grad_norm": 1.2890732275734214,
"kl": 0.056640625,
"learning_rate": 2.935224723524843e-08,
"loss": -0.0016,
"reward": -0.2784309983253479,
"reward_std": 0.33096808195114136,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.9581446051597595,
"step": 614
},
{
"completion_length": 341.65625,
"epoch": 0.8951965065502183,
"grad_norm": 1.1372194523462842,
"kl": 0.04638671875,
"learning_rate": 2.8561277269457895e-08,
"loss": -0.0019,
"reward": 1.5267903804779053,
"reward_std": 0.19310006499290466,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 0.99211585521698,
"step": 615
},
{
"completion_length": 340.84375,
"epoch": 0.8966521106259098,
"grad_norm": 1.1633831512202182,
"kl": 0.054443359375,
"learning_rate": 2.7780797309585603e-08,
"loss": 0.0015,
"reward": 1.0949218273162842,
"reward_std": 0.4195671081542969,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.9380208253860474,
"step": 616
},
{
"completion_length": 340.375,
"epoch": 0.8981077147016011,
"grad_norm": 1.0572682704568244,
"kl": 0.0546875,
"learning_rate": 2.701082472212879e-08,
"loss": 0.0005,
"reward": 1.088769555091858,
"reward_std": 1.067856788635254,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.9237044453620911,
"step": 617
},
{
"completion_length": 365.3125,
"epoch": 0.8995633187772926,
"grad_norm": 0.8282223286971002,
"kl": 0.054931640625,
"learning_rate": 2.625137663978516e-08,
"loss": 0.0001,
"reward": 1.419173240661621,
"reward_std": 0.7184128761291504,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.9299284219741821,
"step": 618
},
{
"completion_length": 354.203125,
"epoch": 0.901018922852984,
"grad_norm": 1.0373995243480785,
"kl": 0.046142578125,
"learning_rate": 2.5502469961070637e-08,
"loss": -0.0006,
"reward": 0.5479947924613953,
"reward_std": 0.4919097423553467,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.9511458873748779,
"step": 619
},
{
"completion_length": 370.484375,
"epoch": 0.9024745269286754,
"grad_norm": 0.9212810693479083,
"kl": 0.046142578125,
"learning_rate": 2.4764121349944265e-08,
"loss": 0.0015,
"reward": 0.3472330868244171,
"reward_std": 0.448018878698349,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.9308269023895264,
"step": 620
},
{
"completion_length": 358.921875,
"epoch": 0.9039301310043668,
"grad_norm": 0.9860633655951645,
"kl": 0.0478515625,
"learning_rate": 2.4036347235436738e-08,
"loss": 0.0069,
"reward": 0.37333983182907104,
"reward_std": 0.5936962962150574,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.9631054997444153,
"step": 621
},
{
"completion_length": 351.390625,
"epoch": 0.9053857350800583,
"grad_norm": 0.9338847956064392,
"kl": 0.04736328125,
"learning_rate": 2.331916381128535e-08,
"loss": 0.0042,
"reward": 1.354824185371399,
"reward_std": 0.553050696849823,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.9633399248123169,
"step": 622
},
{
"completion_length": 363.1875,
"epoch": 0.9068413391557496,
"grad_norm": 0.877438715066185,
"kl": 0.04443359375,
"learning_rate": 2.2612587035573226e-08,
"loss": 0.0024,
"reward": 0.77873694896698,
"reward_std": 0.24482090771198273,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.9492838382720947,
"step": 623
},
{
"completion_length": 377.453125,
"epoch": 0.9082969432314411,
"grad_norm": 0.9221142915068276,
"kl": 0.04150390625,
"learning_rate": 2.1916632630374577e-08,
"loss": 0.0007,
"reward": 1.2329556941986084,
"reward_std": 0.8266670107841492,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 0.8883593082427979,
"step": 624
},
{
"completion_length": 357.9375,
"epoch": 0.9097525473071325,
"grad_norm": 1.0177312459104024,
"kl": 0.048583984375,
"learning_rate": 2.123131608140455e-08,
"loss": -0.001,
"reward": 1.74072265625,
"reward_std": 0.59820157289505,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 0.92822265625,
"step": 625
},
{
"completion_length": 352.171875,
"epoch": 0.9112081513828238,
"grad_norm": 0.8169726421475969,
"kl": 0.051025390625,
"learning_rate": 2.0556652637675144e-08,
"loss": -0.0005,
"reward": 1.4498176574707031,
"reward_std": 0.48735594749450684,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.9654426574707031,
"step": 626
},
{
"completion_length": 347.40625,
"epoch": 0.9126637554585153,
"grad_norm": 1.043701865873264,
"kl": 0.049072265625,
"learning_rate": 1.989265731115525e-08,
"loss": 0.003,
"reward": 1.333925724029541,
"reward_std": 0.8411956429481506,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.9422720670700073,
"step": 627
},
{
"completion_length": 363.640625,
"epoch": 0.9141193595342066,
"grad_norm": 0.9420184278378598,
"kl": 0.047607421875,
"learning_rate": 1.9239344876437248e-08,
"loss": 0.0015,
"reward": 1.0000911951065063,
"reward_std": 0.5910583138465881,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.9331510663032532,
"step": 628
},
{
"completion_length": 355.1875,
"epoch": 0.9155749636098981,
"grad_norm": 0.9554452056503712,
"kl": 0.052978515625,
"learning_rate": 1.8596729870407835e-08,
"loss": 0.0032,
"reward": 1.8222005367279053,
"reward_std": 0.49411827325820923,
"rewards/accuracy_reward": 0.953125,
"rewards/format_reward": 0.9628255367279053,
"step": 629
},
{
"completion_length": 347.140625,
"epoch": 0.9170305676855895,
"grad_norm": 0.9306151510914769,
"kl": 0.046142578125,
"learning_rate": 1.796482659192472e-08,
"loss": 0.0041,
"reward": 1.479524850845337,
"reward_std": 0.41134506464004517,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 0.9424023032188416,
"step": 630
},
{
"completion_length": 374.625,
"epoch": 0.9184861717612809,
"grad_norm": 0.8480676856677177,
"kl": 0.047119140625,
"learning_rate": 1.7343649101498327e-08,
"loss": 0.0028,
"reward": 1.3734700679779053,
"reward_std": 0.3788111209869385,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 0.9359700679779053,
"step": 631
},
{
"completion_length": 353.765625,
"epoch": 0.9199417758369723,
"grad_norm": 0.8894056614696153,
"kl": 0.0595703125,
"learning_rate": 1.6733211220979315e-08,
"loss": -0.0011,
"reward": 0.7032226324081421,
"reward_std": 0.6665799617767334,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.9133919477462769,
"step": 632
},
{
"completion_length": 354.5,
"epoch": 0.9213973799126638,
"grad_norm": 1.2302918226628299,
"kl": 0.0478515625,
"learning_rate": 1.6133526533250563e-08,
"loss": -0.0005,
"reward": 1.0562500953674316,
"reward_std": 0.6852624416351318,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.9466797113418579,
"step": 633
},
{
"completion_length": 364.296875,
"epoch": 0.9228529839883551,
"grad_norm": 0.794210762993565,
"kl": 0.042236328125,
"learning_rate": 1.5544608381925285e-08,
"loss": 0.0023,
"reward": 1.3560612201690674,
"reward_std": 0.5529812574386597,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.9654362201690674,
"step": 634
},
{
"completion_length": 375.125,
"epoch": 0.9243085880640466,
"grad_norm": 1.1478859293196242,
"kl": 0.046630859375,
"learning_rate": 1.4966469871049604e-08,
"loss": -0.0025,
"reward": 0.9822721481323242,
"reward_std": 0.7605947256088257,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.9166861772537231,
"step": 635
},
{
"completion_length": 367.328125,
"epoch": 0.925764192139738,
"grad_norm": 0.9763592041679303,
"kl": 0.0458984375,
"learning_rate": 1.4399123864811902e-08,
"loss": 0.0015,
"reward": 1.4207422733306885,
"reward_std": 0.8127451539039612,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.9341145753860474,
"step": 636
},
{
"completion_length": 350.515625,
"epoch": 0.9272197962154294,
"grad_norm": 1.133505236634761,
"kl": 0.053466796875,
"learning_rate": 1.384258298725549e-08,
"loss": -0.002,
"reward": 1.8829882144927979,
"reward_std": 0.3248525857925415,
"rewards/accuracy_reward": 0.96875,
"rewards/format_reward": 0.9767382740974426,
"step": 637
},
{
"completion_length": 362.71875,
"epoch": 0.9286754002911208,
"grad_norm": 0.9584500222009057,
"kl": 0.0439453125,
"learning_rate": 1.3296859621998668e-08,
"loss": 0.0008,
"reward": 1.1840624809265137,
"reward_std": 0.1869007647037506,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.9802994728088379,
"step": 638
},
{
"completion_length": 351.375,
"epoch": 0.9301310043668122,
"grad_norm": 1.10049930360626,
"kl": 0.051513671875,
"learning_rate": 1.2761965911958384e-08,
"loss": 0.0002,
"reward": 1.2603776454925537,
"reward_std": 0.6707699298858643,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 0.9109375476837158,
"step": 639
},
{
"completion_length": 366.0625,
"epoch": 0.9315866084425036,
"grad_norm": 1.1123121973401144,
"kl": 0.047607421875,
"learning_rate": 1.2237913759080676e-08,
"loss": 0.001,
"reward": 0.48548179864883423,
"reward_std": 0.06013864278793335,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.9767447710037231,
"step": 640
},
{
"completion_length": 366.96875,
"epoch": 0.9330422125181951,
"grad_norm": 1.0164203486015158,
"kl": 0.047607421875,
"learning_rate": 1.1724714824075332e-08,
"loss": -0.0036,
"reward": 0.3951367139816284,
"reward_std": 0.7576174736022949,
"rewards/accuracy_reward": 0.484375,
"rewards/format_reward": 0.931347668170929,
"step": 641
},
{
"completion_length": 360.703125,
"epoch": 0.9344978165938864,
"grad_norm": 0.899137742081826,
"kl": 0.045654296875,
"learning_rate": 1.1222380526156927e-08,
"loss": -0.0016,
"reward": 0.7644987106323242,
"reward_std": 0.2871595323085785,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.9332357048988342,
"step": 642
},
{
"completion_length": 343.53125,
"epoch": 0.9359534206695779,
"grad_norm": 1.1461565307561832,
"kl": 0.05078125,
"learning_rate": 1.073092204279019e-08,
"loss": -0.0061,
"reward": 1.073411464691162,
"reward_std": 0.4045974314212799,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.9620703458786011,
"step": 643
},
{
"completion_length": 359.09375,
"epoch": 0.9374090247452693,
"grad_norm": 0.9684956712684953,
"kl": 0.05224609375,
"learning_rate": 1.0250350309441825e-08,
"loss": 0.0023,
"reward": 1.4904427528381348,
"reward_std": 0.36797118186950684,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 0.9588932394981384,
"step": 644
},
{
"completion_length": 355.78125,
"epoch": 0.9388646288209607,
"grad_norm": 1.0295786196933356,
"kl": 0.0498046875,
"learning_rate": 9.780676019336632e-09,
"loss": -0.0004,
"reward": 1.1840624809265137,
"reward_std": 0.1880166232585907,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.9797786474227905,
"step": 645
},
{
"completion_length": 341.78125,
"epoch": 0.9403202328966521,
"grad_norm": 0.8966605070859263,
"kl": 0.048095703125,
"learning_rate": 9.32190962322027e-09,
"loss": 0.0022,
"reward": 1.1638997793197632,
"reward_std": 0.4949903190135956,
"rewards/accuracy_reward": 0.734375,
"rewards/format_reward": 0.9582747220993042,
"step": 646
},
{
"completion_length": 350.515625,
"epoch": 0.9417758369723436,
"grad_norm": 1.1365206921282076,
"kl": 0.050537109375,
"learning_rate": 8.874061329125936e-09,
"loss": 0.0033,
"reward": 0.8706445097923279,
"reward_std": 0.02185887284576893,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.9920898675918579,
"step": 647
},
{
"completion_length": 368.203125,
"epoch": 0.9432314410480349,
"grad_norm": 0.9073495313031953,
"kl": 0.048095703125,
"learning_rate": 8.437141102147882e-09,
"loss": -0.0029,
"reward": 0.3735416531562805,
"reward_std": 0.5898939967155457,
"rewards/accuracy_reward": 0.484375,
"rewards/format_reward": 0.9196093678474426,
"step": 648
},
{
"completion_length": 355.296875,
"epoch": 0.9446870451237264,
"grad_norm": 0.9644518486493937,
"kl": 0.04638671875,
"learning_rate": 8.011158664219253e-09,
"loss": 0.0028,
"reward": 1.4891471862792969,
"reward_std": 0.37131497263908386,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 0.9561783671379089,
"step": 649
},
{
"completion_length": 357.859375,
"epoch": 0.9461426491994177,
"grad_norm": 0.915855189506075,
"kl": 0.052001953125,
"learning_rate": 7.59612349389599e-09,
"loss": -0.0002,
"reward": 1.8685481548309326,
"reward_std": 0.24537202715873718,
"rewards/accuracy_reward": 0.96875,
"rewards/format_reward": 0.9622981548309326,
"step": 650
},
{
"completion_length": 344.15625,
"epoch": 0.9475982532751092,
"grad_norm": 0.9405891712802695,
"kl": 0.050537109375,
"learning_rate": 7.1920448261457715e-09,
"loss": -0.0006,
"reward": 0.40004557371139526,
"reward_std": 0.2768644690513611,
"rewards/accuracy_reward": 0.484375,
"rewards/format_reward": 0.9441732168197632,
"step": 651
},
{
"completion_length": 374.21875,
"epoch": 0.9490538573508006,
"grad_norm": 0.9649816496242495,
"kl": 0.05078125,
"learning_rate": 6.798931652142737e-09,
"loss": -0.0019,
"reward": 0.29408854246139526,
"reward_std": 0.7213122248649597,
"rewards/accuracy_reward": 0.453125,
"rewards/format_reward": 0.9326822757720947,
"step": 652
},
{
"completion_length": 367.28125,
"epoch": 0.950509461426492,
"grad_norm": 0.9385056166107855,
"kl": 0.05224609375,
"learning_rate": 6.416792719067143e-09,
"loss": 0.0026,
"reward": 0.6902929544448853,
"reward_std": 0.43664127588272095,
"rewards/accuracy_reward": 0.578125,
"rewards/format_reward": 0.9491862058639526,
"step": 653
},
{
"completion_length": 349.265625,
"epoch": 0.9519650655021834,
"grad_norm": 1.1227123204211185,
"kl": 0.05419921875,
"learning_rate": 6.045636529911025e-09,
"loss": 0.0015,
"reward": 0.9635221362113953,
"reward_std": 0.18498189747333527,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.9879622459411621,
"step": 654
},
{
"completion_length": 356.90625,
"epoch": 0.9534206695778749,
"grad_norm": 0.9808379639308646,
"kl": 0.05859375,
"learning_rate": 5.685471343288672e-09,
"loss": 0.0018,
"reward": 1.1685742139816284,
"reward_std": 0.8473724722862244,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.8689258098602295,
"step": 655
},
{
"completion_length": 367.015625,
"epoch": 0.9548762736535662,
"grad_norm": 0.9461780907460997,
"kl": 0.05078125,
"learning_rate": 5.33630517325323e-09,
"loss": 0.0026,
"reward": 0.7643359303474426,
"reward_std": 0.6334630250930786,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.9341406226158142,
"step": 656
},
{
"completion_length": 354.828125,
"epoch": 0.9563318777292577,
"grad_norm": 1.0187044029454269,
"kl": 0.056396484375,
"learning_rate": 4.998145789118114e-09,
"loss": 0.0004,
"reward": 0.34561195969581604,
"reward_std": 0.3423462212085724,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.9248437285423279,
"step": 657
},
{
"completion_length": 356.8125,
"epoch": 0.9577874818049491,
"grad_norm": 0.9264971002008862,
"kl": 0.0478515625,
"learning_rate": 4.671000715284146e-09,
"loss": 0.0036,
"reward": 1.7602018117904663,
"reward_std": 0.6684818267822266,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 0.9477018117904663,
"step": 658
},
{
"completion_length": 360.484375,
"epoch": 0.9592430858806404,
"grad_norm": 0.899630251698082,
"kl": 0.048583984375,
"learning_rate": 4.354877231072307e-09,
"loss": 0.0017,
"reward": 0.73576819896698,
"reward_std": 0.8185403347015381,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.8940234184265137,
"step": 659
},
{
"completion_length": 368.375,
"epoch": 0.9606986899563319,
"grad_norm": 0.7875268897862577,
"kl": 0.04345703125,
"learning_rate": 4.049782370561583e-09,
"loss": 0.0017,
"reward": 0.9524999856948853,
"reward_std": 0.8121271133422852,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.9330078363418579,
"step": 660
},
{
"completion_length": 359.25,
"epoch": 0.9621542940320232,
"grad_norm": 1.0118312486353827,
"kl": 0.046630859375,
"learning_rate": 3.755722922432481e-09,
"loss": -0.0042,
"reward": 1.806471347808838,
"reward_std": 0.41752493381500244,
"rewards/accuracy_reward": 0.953125,
"rewards/format_reward": 0.9470964074134827,
"step": 661
},
{
"completion_length": 365.0,
"epoch": 0.9636098981077147,
"grad_norm": 1.029349953751044,
"kl": 0.048828125,
"learning_rate": 3.4727054298161473e-09,
"loss": 0.0019,
"reward": 0.17289060354232788,
"reward_std": 0.7982980012893677,
"rewards/accuracy_reward": 0.421875,
"rewards/format_reward": 0.8995833396911621,
"step": 662
},
{
"completion_length": 381.40625,
"epoch": 0.9650655021834061,
"grad_norm": 0.8922375425023275,
"kl": 0.0478515625,
"learning_rate": 3.200736190148545e-09,
"loss": 0.0037,
"reward": 1.12339186668396,
"reward_std": 0.8738340139389038,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.8733919262886047,
"step": 663
},
{
"completion_length": 347.40625,
"epoch": 0.9665211062590975,
"grad_norm": 1.046079535967673,
"kl": 0.050537109375,
"learning_rate": 2.9398212550303945e-09,
"loss": 0.0013,
"reward": 1.3840559720993042,
"reward_std": 0.5832023024559021,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 0.9441732168197632,
"step": 664
},
{
"completion_length": 339.375,
"epoch": 0.9679767103347889,
"grad_norm": 0.9349120246936703,
"kl": 0.048095703125,
"learning_rate": 2.6899664300925607e-09,
"loss": -0.0006,
"reward": 0.6614192724227905,
"reward_std": 0.7153116464614868,
"rewards/accuracy_reward": 0.578125,
"rewards/format_reward": 0.9226171970367432,
"step": 665
},
{
"completion_length": 369.828125,
"epoch": 0.9694323144104804,
"grad_norm": 0.8189860245778907,
"kl": 0.046875,
"learning_rate": 2.451177274866989e-09,
"loss": -0.0006,
"reward": 0.96561199426651,
"reward_std": 0.43986329436302185,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.9498568177223206,
"step": 666
},
{
"completion_length": 359.171875,
"epoch": 0.9708879184861717,
"grad_norm": 0.9437761108308588,
"kl": 0.0478515625,
"learning_rate": 2.2234591026626946e-09,
"loss": -0.0038,
"reward": 0.9382357001304626,
"reward_std": 0.76799476146698,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.9165430068969727,
"step": 667
},
{
"completion_length": 378.453125,
"epoch": 0.9723435225618632,
"grad_norm": 0.9460750716630619,
"kl": 0.044677734375,
"learning_rate": 2.0068169804478564e-09,
"loss": 0.0022,
"reward": 0.8572070002555847,
"reward_std": 0.639157772064209,
"rewards/accuracy_reward": 0.640625,
"rewards/format_reward": 0.9332357048988342,
"step": 668
},
{
"completion_length": 324.75,
"epoch": 0.9737991266375546,
"grad_norm": 1.0371341680952215,
"kl": 0.04833984375,
"learning_rate": 1.8012557287367391e-09,
"loss": -0.0001,
"reward": 0.5965365171432495,
"reward_std": 0.1833423525094986,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.9920833706855774,
"step": 669
},
{
"completion_length": 347.953125,
"epoch": 0.975254730713246,
"grad_norm": 1.00932152995975,
"kl": 0.056640625,
"learning_rate": 1.6067799214828926e-09,
"loss": 0.0008,
"reward": 1.5535807609558105,
"reward_std": 0.20138989388942719,
"rewards/accuracy_reward": 0.859375,
"rewards/format_reward": 0.9731640815734863,
"step": 670
},
{
"completion_length": 342.890625,
"epoch": 0.9767103347889374,
"grad_norm": 0.9803179005691086,
"kl": 0.05224609375,
"learning_rate": 1.4233938859767868e-09,
"loss": 0.0033,
"reward": 1.337246060371399,
"reward_std": 0.5977544784545898,
"rewards/accuracy_reward": 0.796875,
"rewards/format_reward": 0.9449414610862732,
"step": 671
},
{
"completion_length": 361.375,
"epoch": 0.9781659388646288,
"grad_norm": 0.9610606033508311,
"kl": 0.0498046875,
"learning_rate": 1.251101702750168e-09,
"loss": 0.004,
"reward": 0.9087890386581421,
"reward_std": 0.7360955476760864,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.9337239265441895,
"step": 672
},
{
"completion_length": 357.65625,
"epoch": 0.9796215429403202,
"grad_norm": 0.9783219664641567,
"kl": 0.048095703125,
"learning_rate": 1.0899072054846303e-09,
"loss": 0.0021,
"reward": 1.361875057220459,
"reward_std": 0.5983988046646118,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 0.9243749380111694,
"step": 673
},
{
"completion_length": 353.78125,
"epoch": 0.9810771470160117,
"grad_norm": 0.8663994288006219,
"kl": 0.05712890625,
"learning_rate": 9.398139809268514e-10,
"loss": 0.0056,
"reward": 1.0991926193237305,
"reward_std": 0.4083336889743805,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.9410547018051147,
"step": 674
},
{
"completion_length": 369.703125,
"epoch": 0.982532751091703,
"grad_norm": 1.0224494770074284,
"kl": 0.047119140625,
"learning_rate": 8.008253688084887e-10,
"loss": -0.0016,
"reward": 0.15850260853767395,
"reward_std": 0.9653068780899048,
"rewards/accuracy_reward": 0.421875,
"rewards/format_reward": 0.8858072757720947,
"step": 675
},
{
"completion_length": 349.8125,
"epoch": 0.9839883551673945,
"grad_norm": 1.0418234894905996,
"kl": 0.047607421875,
"learning_rate": 6.729444617717961e-10,
"loss": -0.0003,
"reward": 1.2996224164962769,
"reward_std": 0.4023718535900116,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.9947004914283752,
"step": 676
},
{
"completion_length": 363.703125,
"epoch": 0.9854439592430859,
"grad_norm": 1.0444763973789726,
"kl": 0.046875,
"learning_rate": 5.56174105301066e-10,
"loss": 0.0024,
"reward": 0.9435481429100037,
"reward_std": 0.7777630090713501,
"rewards/accuracy_reward": 0.671875,
"rewards/format_reward": 0.9277669191360474,
"step": 677
},
{
"completion_length": 343.484375,
"epoch": 0.9868995633187773,
"grad_norm": 0.97449213723852,
"kl": 0.052734375,
"learning_rate": 4.5051689765929213e-10,
"loss": -0.0009,
"reward": 0.13971352577209473,
"reward_std": 0.23464931547641754,
"rewards/accuracy_reward": 0.390625,
"rewards/format_reward": 0.9606120586395264,
"step": 678
},
{
"completion_length": 360.53125,
"epoch": 0.9883551673944687,
"grad_norm": 0.9192762084564559,
"kl": 0.050048828125,
"learning_rate": 3.559751898299934e-10,
"loss": 0.0019,
"reward": 1.4475326538085938,
"reward_std": 0.4938472509384155,
"rewards/accuracy_reward": 0.828125,
"rewards/format_reward": 0.9631575345993042,
"step": 679
},
{
"completion_length": 362.625,
"epoch": 0.9898107714701602,
"grad_norm": 0.7829504697419019,
"kl": 0.046875,
"learning_rate": 2.725510854653668e-10,
"loss": 0.0006,
"reward": 0.9168750047683716,
"reward_std": 0.1419457346200943,
"rewards/accuracy_reward": 0.640625,
"rewards/format_reward": 0.9950000047683716,
"step": 680
},
{
"completion_length": 345.0625,
"epoch": 0.9912663755458515,
"grad_norm": 1.0399738160256335,
"kl": 0.060302734375,
"learning_rate": 2.002464408392135e-10,
"loss": 0.0024,
"reward": 1.6794726848602295,
"reward_std": 0.7667683959007263,
"rewards/accuracy_reward": 0.921875,
"rewards/format_reward": 0.9138476848602295,
"step": 681
},
{
"completion_length": 361.859375,
"epoch": 0.992721979621543,
"grad_norm": 0.9175441007019236,
"kl": 0.048828125,
"learning_rate": 1.390628648056391e-10,
"loss": 0.001,
"reward": 0.7809114456176758,
"reward_std": 0.5863924622535706,
"rewards/accuracy_reward": 0.609375,
"rewards/format_reward": 0.9467839002609253,
"step": 682
},
{
"completion_length": 361.796875,
"epoch": 0.9941775836972343,
"grad_norm": 0.9367681869995809,
"kl": 0.044677734375,
"learning_rate": 8.900171876341511e-11,
"loss": 0.0024,
"reward": 0.8453580737113953,
"reward_std": 0.8822904229164124,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.9622981548309326,
"step": 683
},
{
"completion_length": 360.484375,
"epoch": 0.9956331877729258,
"grad_norm": 0.9988321881169316,
"kl": 0.046630859375,
"learning_rate": 5.006411662555887e-11,
"loss": -0.0066,
"reward": 0.6559830904006958,
"reward_std": 0.5935498476028442,
"rewards/accuracy_reward": 0.578125,
"rewards/format_reward": 0.9169206023216248,
"step": 684
},
{
"completion_length": 344.28125,
"epoch": 0.9970887918486172,
"grad_norm": 1.0544057765799288,
"kl": 0.049072265625,
"learning_rate": 2.2250924794520175e-11,
"loss": 0.0023,
"reward": 1.2797396183013916,
"reward_std": 0.35647517442703247,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.9755468368530273,
"step": 685
},
{
"completion_length": 370.171875,
"epoch": 0.9985443959243085,
"grad_norm": 0.872227207794643,
"kl": 0.04541015625,
"learning_rate": 5.562762142974353e-12,
"loss": 0.0014,
"reward": 1.0113476514816284,
"reward_std": 0.7452950477600098,
"rewards/accuracy_reward": 0.703125,
"rewards/format_reward": 0.9019725918769836,
"step": 686
},
{
"completion_length": 382.890625,
"epoch": 1.0,
"grad_norm": 0.9830707932910157,
"kl": 0.042724609375,
"learning_rate": 0.0,
"loss": 0.0014,
"reward": 1.15053391456604,
"reward_std": 1.0555355548858643,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.8536588549613953,
"step": 687
},
{
"epoch": 1.0,
"step": 687,
"total_flos": 0.0,
"train_loss": 0.00033314188768961123,
"train_runtime": 116282.9812,
"train_samples_per_second": 0.047,
"train_steps_per_second": 0.006
}
],
"logging_steps": 1.0,
"max_steps": 687,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}