security_model / trainer_state.json
eddywu's picture
Upload trainer_state.json with huggingface_hub
db8817a verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 653,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 166.125,
"epoch": 0.0015313935681470138,
"grad_norm": 14.935008926918243,
"kl": 0.0,
"learning_rate": 9.999942135453495e-07,
"loss": -0.0,
"reward": 3.6875,
"reward_std": 0.5246413946151733,
"rewards/accuracy_reward": 2.3874998092651367,
"rewards/format_reward": 1.0,
"step": 1,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 221.96875,
"epoch": 0.0030627871362940277,
"grad_norm": 9.579596700958447,
"kl": 0.000728607177734375,
"learning_rate": 9.999768543153299e-07,
"loss": 0.0,
"reward": 3.549999952316284,
"reward_std": 0.5580562353134155,
"rewards/accuracy_reward": 2.25,
"rewards/format_reward": 1.0,
"step": 2,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 221.6875,
"epoch": 0.004594180704441042,
"grad_norm": 10.756952732551833,
"kl": 0.00072479248046875,
"learning_rate": 9.99947922711735e-07,
"loss": 0.0,
"reward": 3.4437499046325684,
"reward_std": 0.3714633882045746,
"rewards/accuracy_reward": 2.1437501907348633,
"rewards/format_reward": 1.0,
"step": 3,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 244.75,
"epoch": 0.006125574272588055,
"grad_norm": 8.487559202337772,
"kl": 0.0007781982421875,
"learning_rate": 9.999074194042105e-07,
"loss": 0.0,
"reward": 3.0625,
"reward_std": 0.4976257085800171,
"rewards/accuracy_reward": 1.7625000476837158,
"rewards/format_reward": 1.0,
"step": 4,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 231.3125,
"epoch": 0.007656967840735069,
"grad_norm": 4.6294295976229085,
"kl": 0.000553131103515625,
"learning_rate": 9.998553453302385e-07,
"loss": 0.0,
"reward": 3.59375,
"reward_std": 0.5659699440002441,
"rewards/accuracy_reward": 2.2937498092651367,
"rewards/format_reward": 1.0,
"step": 5,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 198.9375,
"epoch": 0.009188361408882083,
"grad_norm": 6.183404279543758,
"kl": 0.000736236572265625,
"learning_rate": 9.997917016951161e-07,
"loss": 0.0,
"reward": 3.4624998569488525,
"reward_std": 0.5215482711791992,
"rewards/accuracy_reward": 2.1624999046325684,
"rewards/format_reward": 1.0,
"step": 6,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 236.15625,
"epoch": 0.010719754977029096,
"grad_norm": 7.995581870293477,
"kl": 0.000652313232421875,
"learning_rate": 9.997164899719272e-07,
"loss": 0.0,
"reward": 3.1812500953674316,
"reward_std": 0.6596391201019287,
"rewards/accuracy_reward": 1.9562499523162842,
"rewards/format_reward": 1.0,
"step": 7,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 223.53125,
"epoch": 0.01225114854517611,
"grad_norm": 5.97513336353613,
"kl": 0.0009613037109375,
"learning_rate": 9.996297119015088e-07,
"loss": 0.0,
"reward": 3.0562500953674316,
"reward_std": 0.38864773511886597,
"rewards/accuracy_reward": 1.7562501430511475,
"rewards/format_reward": 1.0,
"step": 8,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 193.3125,
"epoch": 0.013782542113323124,
"grad_norm": 21.108476928401185,
"kl": 0.00077056884765625,
"learning_rate": 9.995313694924106e-07,
"loss": 0.0,
"reward": 3.418750047683716,
"reward_std": 0.5562530755996704,
"rewards/accuracy_reward": 2.1187498569488525,
"rewards/format_reward": 1.0,
"step": 9,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 226.625,
"epoch": 0.015313935681470138,
"grad_norm": 13.570920274198468,
"kl": 0.00099945068359375,
"learning_rate": 9.99421465020848e-07,
"loss": 0.0,
"reward": 2.5812501907348633,
"reward_std": 0.4478328227996826,
"rewards/accuracy_reward": 1.3562500476837158,
"rewards/format_reward": 1.0,
"step": 10,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 209.25,
"epoch": 0.016845329249617153,
"grad_norm": 7.2683619536034545,
"kl": 0.001190185546875,
"learning_rate": 9.9930000103065e-07,
"loss": 0.0,
"reward": 3.0875000953674316,
"reward_std": 0.3895391523838043,
"rewards/accuracy_reward": 1.787500023841858,
"rewards/format_reward": 1.0,
"step": 11,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 203.15625,
"epoch": 0.018376722817764167,
"grad_norm": 6.1372904663686905,
"kl": 0.0010833740234375,
"learning_rate": 9.991669803331996e-07,
"loss": 0.0,
"reward": 3.2937498092651367,
"reward_std": 0.5000158548355103,
"rewards/accuracy_reward": 1.993749976158142,
"rewards/format_reward": 1.0,
"step": 12,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 210.75,
"epoch": 0.019908116385911178,
"grad_norm": 12.89544676032573,
"kl": 0.0013580322265625,
"learning_rate": 9.990224060073705e-07,
"loss": 0.0,
"reward": 2.84375,
"reward_std": 0.4483214318752289,
"rewards/accuracy_reward": 1.5437499284744263,
"rewards/format_reward": 1.0,
"step": 13,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 171.71875,
"epoch": 0.021439509954058193,
"grad_norm": 4.748966564107283,
"kl": 0.0010833740234375,
"learning_rate": 9.988662813994532e-07,
"loss": 0.0,
"reward": 3.2125000953674316,
"reward_std": 0.4343854486942291,
"rewards/accuracy_reward": 1.9124999046325684,
"rewards/format_reward": 1.0,
"step": 14,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 216.1875,
"epoch": 0.022970903522205207,
"grad_norm": 4.904787481992538,
"kl": 0.00148773193359375,
"learning_rate": 9.9869861012308e-07,
"loss": 0.0,
"reward": 2.924999952316284,
"reward_std": 0.39761877059936523,
"rewards/accuracy_reward": 1.625,
"rewards/format_reward": 1.0,
"step": 15,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 237.59375,
"epoch": 0.02450229709035222,
"grad_norm": 8.580698497931925,
"kl": 0.0013275146484375,
"learning_rate": 9.985193960591395e-07,
"loss": 0.0,
"reward": 2.3187499046325684,
"reward_std": 0.21549977362155914,
"rewards/accuracy_reward": 1.1687500476837158,
"rewards/format_reward": 1.0,
"step": 16,
"temporal_rewards": 0.5
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 237.59375,
"epoch": 0.026033690658499236,
"grad_norm": 5.153356819387367,
"kl": 0.00135040283203125,
"learning_rate": 9.98328643355688e-07,
"loss": 0.0,
"reward": 3.5249998569488525,
"reward_std": 0.46054166555404663,
"rewards/accuracy_reward": 2.2249999046325684,
"rewards/format_reward": 1.0,
"step": 17,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 210.59375,
"epoch": 0.027565084226646247,
"grad_norm": 10.800790084406668,
"kl": 0.001556396484375,
"learning_rate": 9.981263564278534e-07,
"loss": 0.0,
"reward": 3.2437500953674316,
"reward_std": 0.31860852241516113,
"rewards/accuracy_reward": 1.943750023841858,
"rewards/format_reward": 1.0,
"step": 18,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 203.4375,
"epoch": 0.02909647779479326,
"grad_norm": 8.032416154857067,
"kl": 0.00142669677734375,
"learning_rate": 9.979125399577318e-07,
"loss": 0.0,
"reward": 3.5812501907348633,
"reward_std": 0.592538595199585,
"rewards/accuracy_reward": 2.28125,
"rewards/format_reward": 1.0,
"step": 19,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 216.40625,
"epoch": 0.030627871362940276,
"grad_norm": 8.373934147836081,
"kl": 0.001373291015625,
"learning_rate": 9.976871988942804e-07,
"loss": 0.0,
"reward": 3.6812500953674316,
"reward_std": 0.5628457069396973,
"rewards/accuracy_reward": 2.3812499046325684,
"rewards/format_reward": 1.0,
"step": 20,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 188.21875,
"epoch": 0.03215926493108729,
"grad_norm": 8.031005962405207,
"kl": 0.0020599365234375,
"learning_rate": 9.974503384532027e-07,
"loss": 0.0,
"reward": 3.0062499046325684,
"reward_std": 0.6598080396652222,
"rewards/accuracy_reward": 1.8562499284744263,
"rewards/format_reward": 1.0,
"step": 21,
"temporal_rewards": 0.5
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 233.875,
"epoch": 0.033690658499234305,
"grad_norm": 7.26237019193215,
"kl": 0.00171661376953125,
"learning_rate": 9.972019641168275e-07,
"loss": 0.0,
"reward": 3.6875,
"reward_std": 0.6160791516304016,
"rewards/accuracy_reward": 2.3874998092651367,
"rewards/format_reward": 1.0,
"step": 22,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 210.90625,
"epoch": 0.03522205206738132,
"grad_norm": 22.405745531866504,
"kl": 0.001953125,
"learning_rate": 9.969420816339821e-07,
"loss": 0.0,
"reward": 3.2437498569488525,
"reward_std": 0.40731462836265564,
"rewards/accuracy_reward": 1.943750023841858,
"rewards/format_reward": 1.0,
"step": 23,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 247.03125,
"epoch": 0.036753445635528334,
"grad_norm": 5.848799650690625,
"kl": 0.0017852783203125,
"learning_rate": 9.966706970198596e-07,
"loss": 0.0,
"reward": 3.5562500953674316,
"reward_std": 0.51361083984375,
"rewards/accuracy_reward": 2.2562499046325684,
"rewards/format_reward": 1.0,
"step": 24,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 194.09375,
"epoch": 0.03828483920367534,
"grad_norm": 7.357989272890045,
"kl": 0.0019989013671875,
"learning_rate": 9.963878165558785e-07,
"loss": 0.0,
"reward": 3.987499952316284,
"reward_std": 0.3582419753074646,
"rewards/accuracy_reward": 2.6875,
"rewards/format_reward": 1.0,
"step": 25,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 196.65625,
"epoch": 0.039816232771822356,
"grad_norm": 9.846159640012448,
"kl": 0.0020751953125,
"learning_rate": 9.960934467895391e-07,
"loss": 0.0,
"reward": 3.1875,
"reward_std": 0.37332883477211,
"rewards/accuracy_reward": 1.8875000476837158,
"rewards/format_reward": 1.0,
"step": 26,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 173.78125,
"epoch": 0.04134762633996937,
"grad_norm": 11.76376909133194,
"kl": 0.00213623046875,
"learning_rate": 9.957875945342706e-07,
"loss": 0.0,
"reward": 3.487499952316284,
"reward_std": 0.6070871949195862,
"rewards/accuracy_reward": 2.1875,
"rewards/format_reward": 1.0,
"step": 27,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 195.40625,
"epoch": 0.042879019908116385,
"grad_norm": 5.532470814281292,
"kl": 0.00244140625,
"learning_rate": 9.954702668692737e-07,
"loss": 0.0,
"reward": 3.231250047683716,
"reward_std": 0.4620515704154968,
"rewards/accuracy_reward": 1.931249976158142,
"rewards/format_reward": 1.0,
"step": 28,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 199.0,
"epoch": 0.0444104134762634,
"grad_norm": 5.1077668198943265,
"kl": 0.001983642578125,
"learning_rate": 9.951414711393568e-07,
"loss": 0.0,
"reward": 2.8375000953674316,
"reward_std": 0.5319792032241821,
"rewards/accuracy_reward": 1.6124999523162842,
"rewards/format_reward": 1.0,
"step": 29,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 196.28125,
"epoch": 0.045941807044410414,
"grad_norm": 9.955982570369333,
"kl": 0.0024566650390625,
"learning_rate": 9.948012149547666e-07,
"loss": 0.0,
"reward": 3.6875,
"reward_std": 0.4107499122619629,
"rewards/accuracy_reward": 2.387500047683716,
"rewards/format_reward": 1.0,
"step": 30,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 190.34375,
"epoch": 0.04747320061255743,
"grad_norm": 14.926260894095993,
"kl": 0.0030517578125,
"learning_rate": 9.94449506191011e-07,
"loss": 0.0,
"reward": 3.34375,
"reward_std": 0.5044288635253906,
"rewards/accuracy_reward": 2.043750047683716,
"rewards/format_reward": 1.0,
"step": 31,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 230.78125,
"epoch": 0.04900459418070444,
"grad_norm": 5.519507890472418,
"kl": 0.0035400390625,
"learning_rate": 9.94086352988677e-07,
"loss": 0.0,
"reward": 3.4124999046325684,
"reward_std": 0.4231412410736084,
"rewards/accuracy_reward": 2.112499952316284,
"rewards/format_reward": 1.0,
"step": 32,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 235.5,
"epoch": 0.05053598774885146,
"grad_norm": 17.77903651092269,
"kl": 0.003143310546875,
"learning_rate": 9.937117637532426e-07,
"loss": 0.0,
"reward": 2.9937500953674316,
"reward_std": 0.3739195168018341,
"rewards/accuracy_reward": 1.6937501430511475,
"rewards/format_reward": 1.0,
"step": 33,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 208.59375,
"epoch": 0.05206738131699847,
"grad_norm": 7.665468500582905,
"kl": 0.0030517578125,
"learning_rate": 9.933257471548827e-07,
"loss": 0.0,
"reward": 3.4187498092651367,
"reward_std": 0.36628347635269165,
"rewards/accuracy_reward": 2.1187498569488525,
"rewards/format_reward": 1.0,
"step": 34,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 271.78125,
"epoch": 0.05359877488514548,
"grad_norm": 8.63810081218864,
"kl": 0.0032196044921875,
"learning_rate": 9.929283121282675e-07,
"loss": 0.0,
"reward": 3.25,
"reward_std": 0.484809547662735,
"rewards/accuracy_reward": 1.9500000476837158,
"rewards/format_reward": 1.0,
"step": 35,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 246.90625,
"epoch": 0.055130168453292494,
"grad_norm": 5.75915846803757,
"kl": 0.004119873046875,
"learning_rate": 9.925194678723557e-07,
"loss": 0.0,
"reward": 3.1187500953674316,
"reward_std": 0.3506025969982147,
"rewards/accuracy_reward": 1.8187501430511475,
"rewards/format_reward": 1.0,
"step": 36,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 217.3125,
"epoch": 0.05666156202143951,
"grad_norm": 21.5613186498873,
"kl": 0.004913330078125,
"learning_rate": 9.920992238501823e-07,
"loss": 0.0,
"reward": 3.9250001907348633,
"reward_std": 0.48906704783439636,
"rewards/accuracy_reward": 2.625,
"rewards/format_reward": 1.0,
"step": 37,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 244.5,
"epoch": 0.05819295558958652,
"grad_norm": 46.91695755260845,
"kl": 0.00445556640625,
"learning_rate": 9.916675897886394e-07,
"loss": 0.0,
"reward": 3.518749952316284,
"reward_std": 0.4832785129547119,
"rewards/accuracy_reward": 2.293750047683716,
"rewards/format_reward": 1.0,
"step": 38,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 227.8125,
"epoch": 0.05972434915773354,
"grad_norm": 7.145992982612995,
"kl": 0.006683349609375,
"learning_rate": 9.912245756782507e-07,
"loss": 0.0,
"reward": 3.1312499046325684,
"reward_std": 0.4973461627960205,
"rewards/accuracy_reward": 1.9062501192092896,
"rewards/format_reward": 1.0,
"step": 39,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 269.8125,
"epoch": 0.06125574272588055,
"grad_norm": 4.172187313214933,
"kl": 0.0057373046875,
"learning_rate": 9.9077019177294e-07,
"loss": 0.0,
"reward": 2.9437499046325684,
"reward_std": 0.392461895942688,
"rewards/accuracy_reward": 1.6437499523162842,
"rewards/format_reward": 1.0,
"step": 40,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 229.15625,
"epoch": 0.06278713629402756,
"grad_norm": 10.998221333714971,
"kl": 0.007354736328125,
"learning_rate": 9.903044485897955e-07,
"loss": 0.0,
"reward": 3.3499999046325684,
"reward_std": 0.46540987491607666,
"rewards/accuracy_reward": 2.049999952316284,
"rewards/format_reward": 1.0,
"step": 41,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 192.8125,
"epoch": 0.06431852986217458,
"grad_norm": 12.155267553825752,
"kl": 0.00628662109375,
"learning_rate": 9.89827356908824e-07,
"loss": 0.0,
"reward": 3.424999952316284,
"reward_std": 0.5948097109794617,
"rewards/accuracy_reward": 2.125,
"rewards/format_reward": 1.0,
"step": 42,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 250.28125,
"epoch": 0.06584992343032159,
"grad_norm": 5.325364528226017,
"kl": 0.0048828125,
"learning_rate": 9.89338927772703e-07,
"loss": 0.0,
"reward": 2.84375,
"reward_std": 0.35478055477142334,
"rewards/accuracy_reward": 1.5437500476837158,
"rewards/format_reward": 1.0,
"step": 43,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 212.8125,
"epoch": 0.06738131699846861,
"grad_norm": 16.303494875871205,
"kl": 0.00958251953125,
"learning_rate": 9.888391724865245e-07,
"loss": 0.0,
"reward": 2.9375,
"reward_std": 0.3671438694000244,
"rewards/accuracy_reward": 1.712499976158142,
"rewards/format_reward": 1.0,
"step": 44,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 207.59375,
"epoch": 0.06891271056661562,
"grad_norm": 10.183512800331151,
"kl": 0.0079345703125,
"learning_rate": 9.88328102617534e-07,
"loss": 0.0,
"reward": 3.2562499046325684,
"reward_std": 0.332864373922348,
"rewards/accuracy_reward": 2.03125,
"rewards/format_reward": 1.0,
"step": 45,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 271.9375,
"epoch": 0.07044410413476264,
"grad_norm": 5.120622299709726,
"kl": 0.007080078125,
"learning_rate": 9.87805729994862e-07,
"loss": 0.0,
"reward": 3.112499952316284,
"reward_std": 0.562711238861084,
"rewards/accuracy_reward": 1.8125,
"rewards/format_reward": 1.0,
"step": 46,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 237.6875,
"epoch": 0.07197549770290965,
"grad_norm": 4.367373681303007,
"kl": 0.00701904296875,
"learning_rate": 9.872720667092505e-07,
"loss": 0.0,
"reward": 2.9937500953674316,
"reward_std": 0.48836949467658997,
"rewards/accuracy_reward": 1.693750023841858,
"rewards/format_reward": 1.0,
"step": 47,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 256.25,
"epoch": 0.07350689127105667,
"grad_norm": 16.103884519005483,
"kl": 0.006805419921875,
"learning_rate": 9.867271251127727e-07,
"loss": 0.0,
"reward": 3.3125,
"reward_std": 0.3035011291503906,
"rewards/accuracy_reward": 2.012500047683716,
"rewards/format_reward": 1.0,
"step": 48,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 231.03125,
"epoch": 0.07503828483920368,
"grad_norm": 8.788502438995774,
"kl": 0.0089111328125,
"learning_rate": 9.861709178185483e-07,
"loss": 0.0,
"reward": 3.2437498569488525,
"reward_std": 0.5186760425567627,
"rewards/accuracy_reward": 2.018749952316284,
"rewards/format_reward": 1.0,
"step": 49,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 243.125,
"epoch": 0.07656967840735068,
"grad_norm": 4.596890246898075,
"kl": 0.0096435546875,
"learning_rate": 9.856034577004504e-07,
"loss": 0.0,
"reward": 3.081249713897705,
"reward_std": 0.4919166564941406,
"rewards/accuracy_reward": 1.78125,
"rewards/format_reward": 1.0,
"step": 50,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 305.6875,
"epoch": 0.0781010719754977,
"grad_norm": 13.961517391453988,
"kl": 0.006011962890625,
"learning_rate": 9.850247578928079e-07,
"loss": 0.0,
"reward": 3.53125,
"reward_std": 0.5289031267166138,
"rewards/accuracy_reward": 2.2312498092651367,
"rewards/format_reward": 1.0,
"step": 51,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 291.0625,
"epoch": 0.07963246554364471,
"grad_norm": 6.3692603502129845,
"kl": 0.00714111328125,
"learning_rate": 9.844348317901016e-07,
"loss": 0.0,
"reward": 3.0749998092651367,
"reward_std": 0.36224496364593506,
"rewards/accuracy_reward": 1.774999976158142,
"rewards/format_reward": 1.0,
"step": 52,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 252.3125,
"epoch": 0.08116385911179173,
"grad_norm": 6.412747585458219,
"kl": 0.00823974609375,
"learning_rate": 9.838336930466539e-07,
"loss": 0.0,
"reward": 3.1937499046325684,
"reward_std": 0.4673565626144409,
"rewards/accuracy_reward": 1.8937498331069946,
"rewards/format_reward": 1.0,
"step": 53,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 231.6875,
"epoch": 0.08269525267993874,
"grad_norm": 5.18721005404993,
"kl": 0.0108642578125,
"learning_rate": 9.832213555763134e-07,
"loss": 0.0,
"reward": 3.0187501907348633,
"reward_std": 0.469946026802063,
"rewards/accuracy_reward": 1.71875,
"rewards/format_reward": 1.0,
"step": 54,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 282.15625,
"epoch": 0.08422664624808576,
"grad_norm": 7.62980944648513,
"kl": 0.00665283203125,
"learning_rate": 9.82597833552132e-07,
"loss": 0.0,
"reward": 3.081249952316284,
"reward_std": 0.5897158980369568,
"rewards/accuracy_reward": 1.78125,
"rewards/format_reward": 1.0,
"step": 55,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 270.53125,
"epoch": 0.08575803981623277,
"grad_norm": 12.90972700975486,
"kl": 0.0084228515625,
"learning_rate": 9.819631414060372e-07,
"loss": 0.0,
"reward": 3.3812499046325684,
"reward_std": 0.5733855962753296,
"rewards/accuracy_reward": 2.081249952316284,
"rewards/format_reward": 1.0,
"step": 56,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 236.21875,
"epoch": 0.08728943338437979,
"grad_norm": 8.789318251801276,
"kl": 0.00860595703125,
"learning_rate": 9.813172938284986e-07,
"loss": 0.0,
"reward": 3.424999713897705,
"reward_std": 0.5403081774711609,
"rewards/accuracy_reward": 2.125,
"rewards/format_reward": 1.0,
"step": 57,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 256.125,
"epoch": 0.0888208269525268,
"grad_norm": 4.920611432887529,
"kl": 0.0101318359375,
"learning_rate": 9.806603057681868e-07,
"loss": 0.0,
"reward": 3.0999999046325684,
"reward_std": 0.3320375978946686,
"rewards/accuracy_reward": 1.7999999523162842,
"rewards/format_reward": 1.0,
"step": 58,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 266.78125,
"epoch": 0.0903522205206738,
"grad_norm": 6.933811561610319,
"kl": 0.0098876953125,
"learning_rate": 9.799921924316283e-07,
"loss": 0.0,
"reward": 3.5062499046325684,
"reward_std": 0.5279309749603271,
"rewards/accuracy_reward": 2.206249952316284,
"rewards/format_reward": 1.0,
"step": 59,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 250.03125,
"epoch": 0.09188361408882083,
"grad_norm": 6.437867895100921,
"kl": 0.01080322265625,
"learning_rate": 9.793129692828533e-07,
"loss": 0.0,
"reward": 2.9499998092651367,
"reward_std": 0.3246353566646576,
"rewards/accuracy_reward": 1.649999976158142,
"rewards/format_reward": 1.0,
"step": 60,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 275.25,
"epoch": 0.09341500765696784,
"grad_norm": 9.798775021650894,
"kl": 0.009765625,
"learning_rate": 9.786226520430374e-07,
"loss": 0.0,
"reward": 3.406249761581421,
"reward_std": 0.5513333082199097,
"rewards/accuracy_reward": 2.1812498569488525,
"rewards/format_reward": 1.0,
"step": 61,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 287.96875,
"epoch": 0.09494640122511486,
"grad_norm": 13.876179036973484,
"kl": 0.01104736328125,
"learning_rate": 9.779212566901385e-07,
"loss": 0.0,
"reward": 4.0625,
"reward_std": 0.5208388566970825,
"rewards/accuracy_reward": 2.762500047683716,
"rewards/format_reward": 1.0,
"step": 62,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 246.78125,
"epoch": 0.09647779479326186,
"grad_norm": 26.754853896720867,
"kl": 0.0223388671875,
"learning_rate": 9.77208799458526e-07,
"loss": 0.0,
"reward": 3.125,
"reward_std": 0.3128255605697632,
"rewards/accuracy_reward": 1.8249999284744263,
"rewards/format_reward": 1.0,
"step": 63,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 243.84375,
"epoch": 0.09800918836140889,
"grad_norm": 7.174348819697357,
"kl": 0.01190185546875,
"learning_rate": 9.76485296838606e-07,
"loss": 0.0,
"reward": 3.0687499046325684,
"reward_std": 0.35120201110839844,
"rewards/accuracy_reward": 1.7687500715255737,
"rewards/format_reward": 1.0,
"step": 64,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 244.0625,
"epoch": 0.0995405819295559,
"grad_norm": 5.899616368592248,
"kl": 0.0130615234375,
"learning_rate": 9.757507655764384e-07,
"loss": 0.0,
"reward": 2.96875,
"reward_std": 0.3577200174331665,
"rewards/accuracy_reward": 1.6687500476837158,
"rewards/format_reward": 1.0,
"step": 65,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 280.125,
"epoch": 0.10107197549770292,
"grad_norm": 5.308949567718103,
"kl": 0.01043701171875,
"learning_rate": 9.75005222673351e-07,
"loss": 0.0,
"reward": 3.4625000953674316,
"reward_std": 0.5306740403175354,
"rewards/accuracy_reward": 2.1624999046325684,
"rewards/format_reward": 1.0,
"step": 66,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 248.09375,
"epoch": 0.10260336906584992,
"grad_norm": 9.621365518129148,
"kl": 0.01171875,
"learning_rate": 9.742486853855444e-07,
"loss": 0.0,
"reward": 3.84375,
"reward_std": 0.5204290151596069,
"rewards/accuracy_reward": 2.5437498092651367,
"rewards/format_reward": 1.0,
"step": 67,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 247.0625,
"epoch": 0.10413476263399694,
"grad_norm": 15.544461775860299,
"kl": 0.0135498046875,
"learning_rate": 9.734811712236936e-07,
"loss": 0.0,
"reward": 3.549999952316284,
"reward_std": 0.57508784532547,
"rewards/accuracy_reward": 2.25,
"rewards/format_reward": 1.0,
"step": 68,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 282.90625,
"epoch": 0.10566615620214395,
"grad_norm": 6.616923976719133,
"kl": 0.01116943359375,
"learning_rate": 9.727026979525419e-07,
"loss": 0.0,
"reward": 2.893749952316284,
"reward_std": 0.30293795466423035,
"rewards/accuracy_reward": 1.5937498807907104,
"rewards/format_reward": 1.0,
"step": 69,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 253.1875,
"epoch": 0.10719754977029096,
"grad_norm": 5.12310463136158,
"kl": 0.0162353515625,
"learning_rate": 9.719132835904906e-07,
"loss": 0.0,
"reward": 3.28125,
"reward_std": 0.566928505897522,
"rewards/accuracy_reward": 1.9812498092651367,
"rewards/format_reward": 1.0,
"step": 70,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 270.625,
"epoch": 0.10872894333843798,
"grad_norm": 14.726605569458583,
"kl": 0.01287841796875,
"learning_rate": 9.711129464091814e-07,
"loss": 0.0,
"reward": 3.7312498092651367,
"reward_std": 0.5318872928619385,
"rewards/accuracy_reward": 2.4312500953674316,
"rewards/format_reward": 1.0,
"step": 71,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 293.78125,
"epoch": 0.11026033690658499,
"grad_norm": 4.47517871239952,
"kl": 0.0120849609375,
"learning_rate": 9.703017049330734e-07,
"loss": 0.0,
"reward": 2.768749952316284,
"reward_std": 0.4168527126312256,
"rewards/accuracy_reward": 1.46875,
"rewards/format_reward": 1.0,
"step": 72,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 234.3125,
"epoch": 0.11179173047473201,
"grad_norm": 7.696156798679741,
"kl": 0.0164794921875,
"learning_rate": 9.694795779390145e-07,
"loss": 0.0,
"reward": 3.0,
"reward_std": 0.35017162561416626,
"rewards/accuracy_reward": 1.7000000476837158,
"rewards/format_reward": 1.0,
"step": 73,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 285.28125,
"epoch": 0.11332312404287902,
"grad_norm": 27.537434750426346,
"kl": 0.01446533203125,
"learning_rate": 9.686465844558072e-07,
"loss": 0.0,
"reward": 3.125,
"reward_std": 0.4312995672225952,
"rewards/accuracy_reward": 1.8249999284744263,
"rewards/format_reward": 1.0,
"step": 74,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 285.96875,
"epoch": 0.11485451761102604,
"grad_norm": 5.471264754308217,
"kl": 0.03466796875,
"learning_rate": 9.678027437637677e-07,
"loss": 0.0,
"reward": 2.893749713897705,
"reward_std": 0.3074049949645996,
"rewards/accuracy_reward": 1.59375,
"rewards/format_reward": 1.0,
"step": 75,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 260.46875,
"epoch": 0.11638591117917305,
"grad_norm": 11.665474325673282,
"kl": 0.018310546875,
"learning_rate": 9.669480753942792e-07,
"loss": 0.0,
"reward": 4.068750381469727,
"reward_std": 0.4616953730583191,
"rewards/accuracy_reward": 2.7687501907348633,
"rewards/format_reward": 1.0,
"step": 76,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 278.4375,
"epoch": 0.11791730474732007,
"grad_norm": 6.153184210645685,
"kl": 0.01348876953125,
"learning_rate": 9.66082599129341e-07,
"loss": 0.0,
"reward": 3.28125,
"reward_std": 0.4848785996437073,
"rewards/accuracy_reward": 1.9812500476837158,
"rewards/format_reward": 1.0,
"step": 77,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 294.46875,
"epoch": 0.11944869831546708,
"grad_norm": 5.2430639141452415,
"kl": 0.01300048828125,
"learning_rate": 9.652063350011093e-07,
"loss": 0.0,
"reward": 2.9625000953674316,
"reward_std": 0.3452790677547455,
"rewards/accuracy_reward": 1.6625001430511475,
"rewards/format_reward": 1.0,
"step": 78,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 291.8125,
"epoch": 0.12098009188361408,
"grad_norm": 119.37951094933942,
"kl": 0.01708984375,
"learning_rate": 9.643193032914353e-07,
"loss": 0.0,
"reward": 3.1624999046325684,
"reward_std": 0.44167351722717285,
"rewards/accuracy_reward": 1.8624999523162842,
"rewards/format_reward": 1.0,
"step": 79,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 273.53125,
"epoch": 0.1225114854517611,
"grad_norm": 44.72540517746883,
"kl": 0.0125732421875,
"learning_rate": 9.634215245313939e-07,
"loss": 0.0,
"reward": 3.325000047683716,
"reward_std": 0.4470377266407013,
"rewards/accuracy_reward": 2.0250000953674316,
"rewards/format_reward": 1.0,
"step": 80,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 302.15625,
"epoch": 0.12404287901990811,
"grad_norm": 8.886709452383517,
"kl": 0.017333984375,
"learning_rate": 9.62513019500809e-07,
"loss": 0.0,
"reward": 3.3062498569488525,
"reward_std": 0.4055883288383484,
"rewards/accuracy_reward": 2.0062501430511475,
"rewards/format_reward": 1.0,
"step": 81,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 273.40625,
"epoch": 0.12557427258805512,
"grad_norm": 7.433627344353906,
"kl": 0.01422119140625,
"learning_rate": 9.615938092277739e-07,
"loss": 0.0,
"reward": 3.731250047683716,
"reward_std": 0.6632749438285828,
"rewards/accuracy_reward": 2.5062499046325684,
"rewards/format_reward": 1.0,
"step": 82,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 295.53125,
"epoch": 0.12710566615620214,
"grad_norm": 5.928349284260465,
"kl": 0.01544189453125,
"learning_rate": 9.606639149881621e-07,
"loss": 0.0,
"reward": 3.174999952316284,
"reward_std": 0.4570466876029968,
"rewards/accuracy_reward": 1.9500000476837158,
"rewards/format_reward": 1.0,
"step": 83,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 264.09375,
"epoch": 0.12863705972434916,
"grad_norm": 24.753344046181738,
"kl": 0.0159912109375,
"learning_rate": 9.597233583051376e-07,
"loss": 0.0,
"reward": 3.0749998092651367,
"reward_std": 0.4688183665275574,
"rewards/accuracy_reward": 1.850000023841858,
"rewards/format_reward": 1.0,
"step": 84,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 230.8125,
"epoch": 0.13016845329249618,
"grad_norm": 16.606939449922436,
"kl": 0.0205078125,
"learning_rate": 9.587721609486543e-07,
"loss": 0.0,
"reward": 3.59375,
"reward_std": 0.5050134658813477,
"rewards/accuracy_reward": 2.2937498092651367,
"rewards/format_reward": 1.0,
"step": 85,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 276.125,
"epoch": 0.13169984686064318,
"grad_norm": 6.494004286855659,
"kl": 0.0130615234375,
"learning_rate": 9.57810344934954e-07,
"loss": 0.0,
"reward": 3.40625,
"reward_std": 0.4114701449871063,
"rewards/accuracy_reward": 2.106250047683716,
"rewards/format_reward": 1.0,
"step": 86,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 283.84375,
"epoch": 0.1332312404287902,
"grad_norm": 9.009922489811581,
"kl": 0.0123291015625,
"learning_rate": 9.568379325260556e-07,
"loss": 0.0,
"reward": 3.5874998569488525,
"reward_std": 0.48859333992004395,
"rewards/accuracy_reward": 2.2875001430511475,
"rewards/format_reward": 1.0,
"step": 87,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 265.0,
"epoch": 0.13476263399693722,
"grad_norm": 9.739186172548235,
"kl": 0.0159912109375,
"learning_rate": 9.558549462292402e-07,
"loss": 0.0,
"reward": 3.5812501907348633,
"reward_std": 0.47442397475242615,
"rewards/accuracy_reward": 2.28125,
"rewards/format_reward": 1.0,
"step": 88,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 262.96875,
"epoch": 0.1362940275650842,
"grad_norm": 7.990348369284193,
"kl": 0.0179443359375,
"learning_rate": 9.548614087965304e-07,
"loss": 0.0,
"reward": 3.5249998569488525,
"reward_std": 0.5649663209915161,
"rewards/accuracy_reward": 2.299999952316284,
"rewards/format_reward": 1.0,
"step": 89,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 284.6875,
"epoch": 0.13782542113323124,
"grad_norm": 19.39633992166886,
"kl": 0.01519775390625,
"learning_rate": 9.538573432241637e-07,
"loss": 0.0,
"reward": 2.956249952316284,
"reward_std": 0.3850908875465393,
"rewards/accuracy_reward": 1.65625,
"rewards/format_reward": 1.0,
"step": 90,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 295.28125,
"epoch": 0.13935681470137826,
"grad_norm": 7.662501804425234,
"kl": 0.01470947265625,
"learning_rate": 9.528427727520591e-07,
"loss": 0.0,
"reward": 3.4937498569488525,
"reward_std": 0.5627257227897644,
"rewards/accuracy_reward": 2.1937501430511475,
"rewards/format_reward": 1.0,
"step": 91,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 237.25,
"epoch": 0.14088820826952528,
"grad_norm": 8.613477189912425,
"kl": 0.020263671875,
"learning_rate": 9.518177208632812e-07,
"loss": 0.0,
"reward": 3.6999998092651367,
"reward_std": 0.6943286657333374,
"rewards/accuracy_reward": 2.3999998569488525,
"rewards/format_reward": 1.0,
"step": 92,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 266.53125,
"epoch": 0.14241960183767227,
"grad_norm": 5.553773740185143,
"kl": 0.0169677734375,
"learning_rate": 9.507822112834946e-07,
"loss": 0.0,
"reward": 2.90625,
"reward_std": 0.40452033281326294,
"rewards/accuracy_reward": 1.681249976158142,
"rewards/format_reward": 1.0,
"step": 93,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 286.71875,
"epoch": 0.1439509954058193,
"grad_norm": 4.150148471818182,
"kl": 0.016357421875,
"learning_rate": 9.497362679804168e-07,
"loss": 0.0,
"reward": 3.4937500953674316,
"reward_std": 0.4619887173175812,
"rewards/accuracy_reward": 2.1937501430511475,
"rewards/format_reward": 1.0,
"step": 94,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 255.875,
"epoch": 0.14548238897396631,
"grad_norm": 32.823331085292175,
"kl": 0.017578125,
"learning_rate": 9.486799151632612e-07,
"loss": 0.0,
"reward": 3.5562498569488525,
"reward_std": 0.8575383424758911,
"rewards/accuracy_reward": 2.3312501907348633,
"rewards/format_reward": 1.0,
"step": 95,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 303.8125,
"epoch": 0.14701378254211334,
"grad_norm": 3.9194088008606944,
"kl": 0.0166015625,
"learning_rate": 9.47613177282179e-07,
"loss": 0.0,
"reward": 3.8000001907348633,
"reward_std": 0.5585123300552368,
"rewards/accuracy_reward": 2.5,
"rewards/format_reward": 1.0,
"step": 96,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 274.09375,
"epoch": 0.14854517611026033,
"grad_norm": 14.355949768888078,
"kl": 0.019287109375,
"learning_rate": 9.465360790276911e-07,
"loss": 0.0,
"reward": 3.3999998569488525,
"reward_std": 0.5401572585105896,
"rewards/accuracy_reward": 2.0999999046325684,
"rewards/format_reward": 1.0,
"step": 97,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 273.28125,
"epoch": 0.15007656967840735,
"grad_norm": 102.92002021277533,
"kl": 0.01806640625,
"learning_rate": 9.454486453301189e-07,
"loss": 0.0,
"reward": 3.231250047683716,
"reward_std": 0.5214991569519043,
"rewards/accuracy_reward": 1.931249976158142,
"rewards/format_reward": 1.0,
"step": 98,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 262.78125,
"epoch": 0.15160796324655437,
"grad_norm": 4.0363473222679,
"kl": 0.017578125,
"learning_rate": 9.44350901359005e-07,
"loss": 0.0,
"reward": 3.3812499046325684,
"reward_std": 0.5377869606018066,
"rewards/accuracy_reward": 2.15625,
"rewards/format_reward": 1.0,
"step": 99,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 336.25,
"epoch": 0.15313935681470137,
"grad_norm": 5.493115558081385,
"kl": 0.01470947265625,
"learning_rate": 9.432428725225326e-07,
"loss": 0.0,
"reward": 2.8812499046325684,
"reward_std": 0.5311998128890991,
"rewards/accuracy_reward": 1.5812499523162842,
"rewards/format_reward": 1.0,
"step": 100,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 280.71875,
"epoch": 0.1546707503828484,
"grad_norm": 16.20136610259414,
"kl": 0.020263671875,
"learning_rate": 9.421245844669361e-07,
"loss": 0.0,
"reward": 3.03125,
"reward_std": 0.361322820186615,
"rewards/accuracy_reward": 1.7312500476837158,
"rewards/format_reward": 1.0,
"step": 101,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 247.65625,
"epoch": 0.1562021439509954,
"grad_norm": 12.026042915386354,
"kl": 0.0234375,
"learning_rate": 9.409960630759078e-07,
"loss": 0.0,
"reward": 3.75,
"reward_std": 0.6079727411270142,
"rewards/accuracy_reward": 2.450000047683716,
"rewards/format_reward": 1.0,
"step": 102,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 237.5625,
"epoch": 0.15773353751914243,
"grad_norm": 56.156360481293405,
"kl": 0.021240234375,
"learning_rate": 9.398573344699992e-07,
"loss": 0.0,
"reward": 3.706249713897705,
"reward_std": 0.4153931140899658,
"rewards/accuracy_reward": 2.40625,
"rewards/format_reward": 1.0,
"step": 103,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 240.21875,
"epoch": 0.15926493108728942,
"grad_norm": 9.033204445215723,
"kl": 0.0238037109375,
"learning_rate": 9.387084250060162e-07,
"loss": 0.0,
"reward": 3.581249952316284,
"reward_std": 0.43426191806793213,
"rewards/accuracy_reward": 2.3562498092651367,
"rewards/format_reward": 1.0,
"step": 104,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 258.65625,
"epoch": 0.16079632465543645,
"grad_norm": 9.214387456879582,
"kl": 0.022705078125,
"learning_rate": 9.375493612764085e-07,
"loss": 0.0,
"reward": 3.1312499046325684,
"reward_std": 0.5742160081863403,
"rewards/accuracy_reward": 1.8312499523162842,
"rewards/format_reward": 1.0,
"step": 105,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 274.09375,
"epoch": 0.16232771822358347,
"grad_norm": 6.527925149967669,
"kl": 0.02392578125,
"learning_rate": 9.363801701086554e-07,
"loss": 0.0,
"reward": 3.1937499046325684,
"reward_std": 0.3148888349533081,
"rewards/accuracy_reward": 1.8937500715255737,
"rewards/format_reward": 1.0,
"step": 106,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 286.9375,
"epoch": 0.1638591117917305,
"grad_norm": 9.874956521245124,
"kl": 0.0213623046875,
"learning_rate": 9.35200878564643e-07,
"loss": 0.0,
"reward": 3.081249952316284,
"reward_std": 0.5441991090774536,
"rewards/accuracy_reward": 1.8562500476837158,
"rewards/format_reward": 1.0,
"step": 107,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 243.1875,
"epoch": 0.16539050535987748,
"grad_norm": 30.178604262517098,
"kl": 0.02294921875,
"learning_rate": 9.340115139400399e-07,
"loss": 0.0,
"reward": 3.0562500953674316,
"reward_std": 0.2912874221801758,
"rewards/accuracy_reward": 1.756250023841858,
"rewards/format_reward": 1.0,
"step": 108,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 264.375,
"epoch": 0.1669218989280245,
"grad_norm": 11.310326494662677,
"kl": 0.019775390625,
"learning_rate": 9.32812103763664e-07,
"loss": 0.0,
"reward": 3.3687498569488525,
"reward_std": 0.4435497522354126,
"rewards/accuracy_reward": 2.0687499046325684,
"rewards/format_reward": 1.0,
"step": 109,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 247.0625,
"epoch": 0.16845329249617153,
"grad_norm": 4.252098043893597,
"kl": 0.019287109375,
"learning_rate": 9.316026757968454e-07,
"loss": 0.0,
"reward": 3.831249952316284,
"reward_std": 0.5273396372795105,
"rewards/accuracy_reward": 2.53125,
"rewards/format_reward": 1.0,
"step": 110,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 268.46875,
"epoch": 0.16998468606431852,
"grad_norm": 6.7966917684785395,
"kl": 0.0205078125,
"learning_rate": 9.303832580327844e-07,
"loss": 0.0,
"reward": 3.90625,
"reward_std": 0.5165694952011108,
"rewards/accuracy_reward": 2.606250047683716,
"rewards/format_reward": 1.0,
"step": 111,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 267.90625,
"epoch": 0.17151607963246554,
"grad_norm": 9.197438213715376,
"kl": 0.019775390625,
"learning_rate": 9.291538786959037e-07,
"loss": 0.0,
"reward": 3.1499998569488525,
"reward_std": 0.4710281193256378,
"rewards/accuracy_reward": 1.9250000715255737,
"rewards/format_reward": 1.0,
"step": 112,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 291.0,
"epoch": 0.17304747320061256,
"grad_norm": 9.499800643366168,
"kl": 0.0203857421875,
"learning_rate": 9.279145662411941e-07,
"loss": 0.0,
"reward": 2.8625001907348633,
"reward_std": 0.39233559370040894,
"rewards/accuracy_reward": 1.6375000476837158,
"rewards/format_reward": 1.0,
"step": 113,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 256.46875,
"epoch": 0.17457886676875958,
"grad_norm": 9.624750095041502,
"kl": 0.0224609375,
"learning_rate": 9.26665349353557e-07,
"loss": 0.0,
"reward": 3.843750238418579,
"reward_std": 0.6045280694961548,
"rewards/accuracy_reward": 2.6187500953674316,
"rewards/format_reward": 1.0,
"step": 114,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 351.21875,
"epoch": 0.17611026033690658,
"grad_norm": 4.098513181220432,
"kl": 0.0166015625,
"learning_rate": 9.2540625694714e-07,
"loss": 0.0,
"reward": 3.2125000953674316,
"reward_std": 0.3509003221988678,
"rewards/accuracy_reward": 1.9125001430511475,
"rewards/format_reward": 1.0,
"step": 115,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 261.84375,
"epoch": 0.1776416539050536,
"grad_norm": 5.603893898192362,
"kl": 0.02197265625,
"learning_rate": 9.241373181646671e-07,
"loss": 0.0,
"reward": 2.9312498569488525,
"reward_std": 0.3804709315299988,
"rewards/accuracy_reward": 1.631250023841858,
"rewards/format_reward": 1.0,
"step": 116,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 314.75,
"epoch": 0.17917304747320062,
"grad_norm": 11.740130042508287,
"kl": 0.0185546875,
"learning_rate": 9.228585623767658e-07,
"loss": 0.0,
"reward": 2.8812499046325684,
"reward_std": 0.4420754909515381,
"rewards/accuracy_reward": 1.5812499523162842,
"rewards/format_reward": 1.0,
"step": 117,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 275.53125,
"epoch": 0.1807044410413476,
"grad_norm": 4.922266982609114,
"kl": 0.021240234375,
"learning_rate": 9.21570019181285e-07,
"loss": 0.0,
"reward": 3.624999761581421,
"reward_std": 0.5486506819725037,
"rewards/accuracy_reward": 2.3249998092651367,
"rewards/format_reward": 1.0,
"step": 118,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 272.8125,
"epoch": 0.18223583460949463,
"grad_norm": 4.493970548638196,
"kl": 0.0233154296875,
"learning_rate": 9.202717184026123e-07,
"loss": 0.0,
"reward": 3.7437500953674316,
"reward_std": 0.3937029242515564,
"rewards/accuracy_reward": 2.4437499046325684,
"rewards/format_reward": 1.0,
"step": 119,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 286.40625,
"epoch": 0.18376722817764166,
"grad_norm": 4.869676485468556,
"kl": 0.019287109375,
"learning_rate": 9.189636900909817e-07,
"loss": 0.0,
"reward": 3.9000000953674316,
"reward_std": 0.49319151043891907,
"rewards/accuracy_reward": 2.5999999046325684,
"rewards/format_reward": 1.0,
"step": 120,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 387.625,
"epoch": 0.18529862174578868,
"grad_norm": 6.776203629581503,
"kl": 0.01708984375,
"learning_rate": 9.176459645217794e-07,
"loss": 0.0,
"reward": 3.0999999046325684,
"reward_std": 0.3274829685688019,
"rewards/accuracy_reward": 1.7999999523162842,
"rewards/format_reward": 1.0,
"step": 121,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 288.625,
"epoch": 0.18683001531393567,
"grad_norm": 5.656254719786936,
"kl": 0.0234375,
"learning_rate": 9.163185721948421e-07,
"loss": 0.0,
"reward": 3.1187498569488525,
"reward_std": 0.35278022289276123,
"rewards/accuracy_reward": 1.818750023841858,
"rewards/format_reward": 1.0,
"step": 122,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 253.0,
"epoch": 0.1883614088820827,
"grad_norm": 4.6916404290395075,
"kl": 0.0240478515625,
"learning_rate": 9.14981543833752e-07,
"loss": 0.0,
"reward": 4.212500095367432,
"reward_std": 0.45350727438926697,
"rewards/accuracy_reward": 2.9124999046325684,
"rewards/format_reward": 1.0,
"step": 123,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 289.3125,
"epoch": 0.18989280245022971,
"grad_norm": 14.783330097461596,
"kl": 0.020263671875,
"learning_rate": 9.136349103851252e-07,
"loss": 0.0,
"reward": 2.8999998569488525,
"reward_std": 0.4208742380142212,
"rewards/accuracy_reward": 1.600000023841858,
"rewards/format_reward": 1.0,
"step": 124,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 316.3125,
"epoch": 0.19142419601837674,
"grad_norm": 3.294759506674053,
"kl": 0.0238037109375,
"learning_rate": 9.122787030178949e-07,
"loss": 0.0,
"reward": 3.4937500953674316,
"reward_std": 0.5771903991699219,
"rewards/accuracy_reward": 2.1937499046325684,
"rewards/format_reward": 1.0,
"step": 125,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 270.0,
"epoch": 0.19295558958652373,
"grad_norm": 5.780643239892506,
"kl": 0.022705078125,
"learning_rate": 9.10912953122591e-07,
"loss": 0.0,
"reward": 3.4499998092651367,
"reward_std": 0.5633983016014099,
"rewards/accuracy_reward": 2.1500000953674316,
"rewards/format_reward": 1.0,
"step": 126,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 324.75,
"epoch": 0.19448698315467075,
"grad_norm": 4.234196031996337,
"kl": 0.0208740234375,
"learning_rate": 9.095376923106129e-07,
"loss": 0.0,
"reward": 3.0562500953674316,
"reward_std": 0.3289462924003601,
"rewards/accuracy_reward": 1.756250023841858,
"rewards/format_reward": 1.0,
"step": 127,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 270.90625,
"epoch": 0.19601837672281777,
"grad_norm": 11.190260547131228,
"kl": 0.0262451171875,
"learning_rate": 9.081529524134975e-07,
"loss": 0.0,
"reward": 3.4937500953674316,
"reward_std": 0.47610414028167725,
"rewards/accuracy_reward": 2.1937499046325684,
"rewards/format_reward": 1.0,
"step": 128,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 288.0,
"epoch": 0.19754977029096477,
"grad_norm": 6.360658284575685,
"kl": 0.0240478515625,
"learning_rate": 9.067587654821837e-07,
"loss": 0.0,
"reward": 3.043750047683716,
"reward_std": 0.4341242015361786,
"rewards/accuracy_reward": 1.743749976158142,
"rewards/format_reward": 1.0,
"step": 129,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 296.5625,
"epoch": 0.1990811638591118,
"grad_norm": 4.296191720894978,
"kl": 0.020263671875,
"learning_rate": 9.053551637862692e-07,
"loss": 0.0,
"reward": 3.4124999046325684,
"reward_std": 0.5241235494613647,
"rewards/accuracy_reward": 2.1125001907348633,
"rewards/format_reward": 1.0,
"step": 130,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 303.53125,
"epoch": 0.2006125574272588,
"grad_norm": 8.514387720273588,
"kl": 0.0238037109375,
"learning_rate": 9.03942179813264e-07,
"loss": 0.0,
"reward": 3.393749713897705,
"reward_std": 0.42237094044685364,
"rewards/accuracy_reward": 2.09375,
"rewards/format_reward": 1.0,
"step": 131,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 303.21875,
"epoch": 0.20214395099540583,
"grad_norm": 41.89008793742567,
"kl": 0.0228271484375,
"learning_rate": 9.025198462678392e-07,
"loss": 0.0,
"reward": 3.0437498092651367,
"reward_std": 0.44920703768730164,
"rewards/accuracy_reward": 1.743749976158142,
"rewards/format_reward": 1.0,
"step": 132,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 300.75,
"epoch": 0.20367534456355282,
"grad_norm": 7.194974348985283,
"kl": 0.0244140625,
"learning_rate": 9.010881960710688e-07,
"loss": 0.0,
"reward": 3.4437499046325684,
"reward_std": 0.43165701627731323,
"rewards/accuracy_reward": 2.21875,
"rewards/format_reward": 1.0,
"step": 133,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 283.5,
"epoch": 0.20520673813169985,
"grad_norm": 10.06890479049601,
"kl": 0.0252685546875,
"learning_rate": 8.996472623596687e-07,
"loss": 0.0,
"reward": 3.6125001907348633,
"reward_std": 0.7025853395462036,
"rewards/accuracy_reward": 2.3874998092651367,
"rewards/format_reward": 1.0,
"step": 134,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 296.3125,
"epoch": 0.20673813169984687,
"grad_norm": 12.37502990164126,
"kl": 0.0247802734375,
"learning_rate": 8.98197078485229e-07,
"loss": 0.0,
"reward": 3.6374998092651367,
"reward_std": 0.5983169078826904,
"rewards/accuracy_reward": 2.4125001430511475,
"rewards/format_reward": 1.0,
"step": 135,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 323.75,
"epoch": 0.2082695252679939,
"grad_norm": 5.657135150692953,
"kl": 0.0257568359375,
"learning_rate": 8.967376780134426e-07,
"loss": 0.0,
"reward": 2.8312501907348633,
"reward_std": 0.35583776235580444,
"rewards/accuracy_reward": 1.53125,
"rewards/format_reward": 1.0,
"step": 136,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 281.625,
"epoch": 0.20980091883614088,
"grad_norm": 4.380349757952363,
"kl": 0.029541015625,
"learning_rate": 8.952690947233284e-07,
"loss": 0.0,
"reward": 3.4437501430511475,
"reward_std": 0.32703521847724915,
"rewards/accuracy_reward": 2.21875,
"rewards/format_reward": 1.0,
"step": 137,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 312.25,
"epoch": 0.2113323124042879,
"grad_norm": 8.823992736109716,
"kl": 0.02587890625,
"learning_rate": 8.937913626064486e-07,
"loss": 0.0,
"reward": 3.2937498092651367,
"reward_std": 0.3964410424232483,
"rewards/accuracy_reward": 2.0687499046325684,
"rewards/format_reward": 1.0,
"step": 138,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 308.3125,
"epoch": 0.21286370597243492,
"grad_norm": 12.102631268395426,
"kl": 0.0272216796875,
"learning_rate": 8.923045158661226e-07,
"loss": 0.0,
"reward": 3.6312499046325684,
"reward_std": 0.4014958143234253,
"rewards/accuracy_reward": 2.331249952316284,
"rewards/format_reward": 1.0,
"step": 139,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 271.65625,
"epoch": 0.21439509954058192,
"grad_norm": 7.913428989372605,
"kl": 0.0242919921875,
"learning_rate": 8.908085889166357e-07,
"loss": 0.0,
"reward": 4.025000095367432,
"reward_std": 0.4050452709197998,
"rewards/accuracy_reward": 2.7250001430511475,
"rewards/format_reward": 1.0,
"step": 140,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 284.5,
"epoch": 0.21592649310872894,
"grad_norm": 4.051622799305994,
"kl": 0.031494140625,
"learning_rate": 8.893036163824414e-07,
"loss": 0.0,
"reward": 3.5250000953674316,
"reward_std": 0.2405874878168106,
"rewards/accuracy_reward": 2.2249999046325684,
"rewards/format_reward": 1.0,
"step": 141,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 282.21875,
"epoch": 0.21745788667687596,
"grad_norm": 15.525922723951407,
"kl": 0.029052734375,
"learning_rate": 8.877896330973611e-07,
"loss": 0.0,
"reward": 3.7874999046325684,
"reward_std": 0.6023781299591064,
"rewards/accuracy_reward": 2.562500238418579,
"rewards/format_reward": 1.0,
"step": 142,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 290.625,
"epoch": 0.21898928024502298,
"grad_norm": 11.53714498993186,
"kl": 0.025390625,
"learning_rate": 8.862666741037772e-07,
"loss": 0.0,
"reward": 3.2562499046325684,
"reward_std": 0.3409149646759033,
"rewards/accuracy_reward": 1.9562500715255737,
"rewards/format_reward": 1.0,
"step": 143,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 321.5,
"epoch": 0.22052067381316998,
"grad_norm": 6.5118421682140655,
"kl": 0.0262451171875,
"learning_rate": 8.847347746518226e-07,
"loss": 0.0,
"reward": 3.53125,
"reward_std": 0.5107113122940063,
"rewards/accuracy_reward": 2.2312498092651367,
"rewards/format_reward": 1.0,
"step": 144,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 301.53125,
"epoch": 0.222052067381317,
"grad_norm": 9.837705570386271,
"kl": 0.03076171875,
"learning_rate": 8.831939701985636e-07,
"loss": 0.0,
"reward": 3.5249998569488525,
"reward_std": 0.5034339427947998,
"rewards/accuracy_reward": 2.2249999046325684,
"rewards/format_reward": 1.0,
"step": 145,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 310.84375,
"epoch": 0.22358346094946402,
"grad_norm": 5.862107982116596,
"kl": 0.03173828125,
"learning_rate": 8.81644296407181e-07,
"loss": 0.0,
"reward": 3.0875000953674316,
"reward_std": 0.3950956463813782,
"rewards/accuracy_reward": 1.787500023841858,
"rewards/format_reward": 1.0,
"step": 146,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 345.40625,
"epoch": 0.225114854517611,
"grad_norm": 8.708867217195756,
"kl": 0.028076171875,
"learning_rate": 8.800857891461433e-07,
"loss": 0.0,
"reward": 3.0999999046325684,
"reward_std": 0.3657301366329193,
"rewards/accuracy_reward": 1.7999999523162842,
"rewards/format_reward": 1.0,
"step": 147,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 400.0,
"epoch": 0.22664624808575803,
"grad_norm": 6.510285798481118,
"kl": 0.022216796875,
"learning_rate": 8.785184844883766e-07,
"loss": 0.0,
"reward": 3.487499952316284,
"reward_std": 0.3941337466239929,
"rewards/accuracy_reward": 2.1875,
"rewards/format_reward": 1.0,
"step": 148,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 264.15625,
"epoch": 0.22817764165390506,
"grad_norm": 6.298316940368753,
"kl": 0.031494140625,
"learning_rate": 8.769424187104302e-07,
"loss": 0.0,
"reward": 3.8187499046325684,
"reward_std": 0.49747228622436523,
"rewards/accuracy_reward": 2.518749713897705,
"rewards/format_reward": 1.0,
"step": 149,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 327.9375,
"epoch": 0.22970903522205208,
"grad_norm": 80.96139230971517,
"kl": 0.02978515625,
"learning_rate": 8.75357628291637e-07,
"loss": 0.0,
"reward": 3.637500047683716,
"reward_std": 0.42090892791748047,
"rewards/accuracy_reward": 2.3375000953674316,
"rewards/format_reward": 1.0,
"step": 150,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 294.09375,
"epoch": 0.23124042879019907,
"grad_norm": 5.201330390482463,
"kl": 0.0311279296875,
"learning_rate": 8.737641499132681e-07,
"loss": 0.0,
"reward": 2.9437499046325684,
"reward_std": 0.14110496640205383,
"rewards/accuracy_reward": 1.6437499523162842,
"rewards/format_reward": 1.0,
"step": 151,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 291.09375,
"epoch": 0.2327718223583461,
"grad_norm": 6.115410440044349,
"kl": 0.0281982421875,
"learning_rate": 8.721620204576856e-07,
"loss": 0.0,
"reward": 3.1937499046325684,
"reward_std": 0.3890814781188965,
"rewards/accuracy_reward": 1.96875,
"rewards/format_reward": 1.0,
"step": 152,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 320.3125,
"epoch": 0.2343032159264931,
"grad_norm": 5.4394369395935565,
"kl": 0.032470703125,
"learning_rate": 8.705512770074868e-07,
"loss": 0.0,
"reward": 3.4375,
"reward_std": 0.4925556480884552,
"rewards/accuracy_reward": 2.1374998092651367,
"rewards/format_reward": 1.0,
"step": 153,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 293.59375,
"epoch": 0.23583460949464014,
"grad_norm": 17.260831202111817,
"kl": 0.028564453125,
"learning_rate": 8.689319568446474e-07,
"loss": 0.0,
"reward": 3.5124998092651367,
"reward_std": 0.4656377136707306,
"rewards/accuracy_reward": 2.2874999046325684,
"rewards/format_reward": 1.0,
"step": 154,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 281.90625,
"epoch": 0.23736600306278713,
"grad_norm": 3.8749629849633522,
"kl": 0.033203125,
"learning_rate": 8.673040974496584e-07,
"loss": 0.0,
"reward": 3.6999998092651367,
"reward_std": 0.3916766345500946,
"rewards/accuracy_reward": 2.4000000953674316,
"rewards/format_reward": 1.0,
"step": 155,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 282.59375,
"epoch": 0.23889739663093415,
"grad_norm": 139.35810615334432,
"kl": 0.0322265625,
"learning_rate": 8.656677365006579e-07,
"loss": 0.0,
"reward": 3.768749952316284,
"reward_std": 0.4740249514579773,
"rewards/accuracy_reward": 2.46875,
"rewards/format_reward": 1.0,
"step": 156,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 244.125,
"epoch": 0.24042879019908117,
"grad_norm": 10.567534138609691,
"kl": 0.03125,
"learning_rate": 8.640229118725595e-07,
"loss": 0.0,
"reward": 3.8187499046325684,
"reward_std": 0.6347061395645142,
"rewards/accuracy_reward": 2.5187501907348633,
"rewards/format_reward": 1.0,
"step": 157,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 300.125,
"epoch": 0.24196018376722817,
"grad_norm": 8.106437182792291,
"kl": 0.029541015625,
"learning_rate": 8.62369661636176e-07,
"loss": 0.0,
"reward": 2.75,
"reward_std": 0.27621468901634216,
"rewards/accuracy_reward": 1.600000023841858,
"rewards/format_reward": 1.0,
"step": 158,
"temporal_rewards": 0.5
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 298.53125,
"epoch": 0.2434915773353752,
"grad_norm": 4.066291402520282,
"kl": 0.023681640625,
"learning_rate": 8.607080240573372e-07,
"loss": 0.0,
"reward": 4.181249618530273,
"reward_std": 0.4398733973503113,
"rewards/accuracy_reward": 2.8812501430511475,
"rewards/format_reward": 1.0,
"step": 159,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 264.6875,
"epoch": 0.2450229709035222,
"grad_norm": 9.494494792809737,
"kl": 0.03173828125,
"learning_rate": 8.590380375960053e-07,
"loss": 0.0,
"reward": 3.8125,
"reward_std": 0.47410985827445984,
"rewards/accuracy_reward": 2.5124998092651367,
"rewards/format_reward": 1.0,
"step": 160,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 336.1875,
"epoch": 0.24655436447166923,
"grad_norm": 5.777129184291776,
"kl": 0.0279541015625,
"learning_rate": 8.573597409053837e-07,
"loss": 0.0,
"reward": 3.28125,
"reward_std": 0.570157527923584,
"rewards/accuracy_reward": 1.9812499284744263,
"rewards/format_reward": 1.0,
"step": 161,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 313.21875,
"epoch": 0.24808575803981622,
"grad_norm": 13.152237866980391,
"kl": 0.02734375,
"learning_rate": 8.556731728310234e-07,
"loss": 0.0,
"reward": 3.3499999046325684,
"reward_std": 0.29259437322616577,
"rewards/accuracy_reward": 2.049999952316284,
"rewards/format_reward": 1.0,
"step": 162,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 293.625,
"epoch": 0.24961715160796324,
"grad_norm": 10.763469615081304,
"kl": 0.031005859375,
"learning_rate": 8.53978372409923e-07,
"loss": 0.0,
"reward": 3.34375,
"reward_std": 0.37417662143707275,
"rewards/accuracy_reward": 2.0437498092651367,
"rewards/format_reward": 1.0,
"step": 163,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 283.375,
"epoch": 0.25114854517611024,
"grad_norm": 5.323834123530134,
"kl": 0.0283203125,
"learning_rate": 8.522753788696258e-07,
"loss": 0.0,
"reward": 3.4937500953674316,
"reward_std": 0.5279685854911804,
"rewards/accuracy_reward": 2.268749952316284,
"rewards/format_reward": 1.0,
"step": 164,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 321.75,
"epoch": 0.25267993874425726,
"grad_norm": 6.031806927830471,
"kl": 0.027099609375,
"learning_rate": 8.505642316273111e-07,
"loss": 0.0,
"reward": 3.0625,
"reward_std": 0.4299696683883667,
"rewards/accuracy_reward": 1.8375000953674316,
"rewards/format_reward": 1.0,
"step": 165,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 286.40625,
"epoch": 0.2542113323124043,
"grad_norm": 11.240961051487723,
"kl": 0.0291748046875,
"learning_rate": 8.488449702888827e-07,
"loss": 0.0,
"reward": 3.2125000953674316,
"reward_std": 0.40743768215179443,
"rewards/accuracy_reward": 1.912500023841858,
"rewards/format_reward": 1.0,
"step": 166,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 258.0625,
"epoch": 0.2557427258805513,
"grad_norm": 9.195739213624087,
"kl": 0.03271484375,
"learning_rate": 8.471176346480517e-07,
"loss": 0.0,
"reward": 3.893749713897705,
"reward_std": 0.4122272729873657,
"rewards/accuracy_reward": 2.59375,
"rewards/format_reward": 1.0,
"step": 167,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 318.25,
"epoch": 0.2572741194486983,
"grad_norm": 8.215813439305336,
"kl": 0.02978515625,
"learning_rate": 8.453822646854154e-07,
"loss": 0.0,
"reward": 3.856250047683716,
"reward_std": 0.3475147485733032,
"rewards/accuracy_reward": 2.5562500953674316,
"rewards/format_reward": 1.0,
"step": 168,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 295.34375,
"epoch": 0.25880551301684535,
"grad_norm": 5.114773777833558,
"kl": 0.030517578125,
"learning_rate": 8.436389005675324e-07,
"loss": 0.0,
"reward": 3.737499952316284,
"reward_std": 0.6522217988967896,
"rewards/accuracy_reward": 2.4375,
"rewards/format_reward": 1.0,
"step": 169,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 288.25,
"epoch": 0.26033690658499237,
"grad_norm": 7.109534980092122,
"kl": 0.03515625,
"learning_rate": 8.418875826459919e-07,
"loss": 0.0,
"reward": 3.9000000953674316,
"reward_std": 0.40476852655410767,
"rewards/accuracy_reward": 2.5999999046325684,
"rewards/format_reward": 1.0,
"step": 170,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 307.28125,
"epoch": 0.26186830015313933,
"grad_norm": 4.142263790273646,
"kl": 0.033935546875,
"learning_rate": 8.401283514564815e-07,
"loss": 0.0,
"reward": 3.5562498569488525,
"reward_std": 0.41218000650405884,
"rewards/accuracy_reward": 2.2562499046325684,
"rewards/format_reward": 1.0,
"step": 171,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 348.0625,
"epoch": 0.26339969372128635,
"grad_norm": 9.922744313017875,
"kl": 0.0281982421875,
"learning_rate": 8.383612477178464e-07,
"loss": 0.0,
"reward": 2.8625001907348633,
"reward_std": 0.5244588851928711,
"rewards/accuracy_reward": 1.6375000476837158,
"rewards/format_reward": 1.0,
"step": 172,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 287.40625,
"epoch": 0.2649310872894334,
"grad_norm": 4.091456334995777,
"kl": 0.0341796875,
"learning_rate": 8.365863123311497e-07,
"loss": 0.0,
"reward": 2.6312499046325684,
"reward_std": 0.1589444875717163,
"rewards/accuracy_reward": 1.40625,
"rewards/format_reward": 1.0,
"step": 173,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 283.09375,
"epoch": 0.2664624808575804,
"grad_norm": 4.940813443369881,
"kl": 0.033203125,
"learning_rate": 8.348035863787237e-07,
"loss": 0.0,
"reward": 3.46875,
"reward_std": 0.2943500876426697,
"rewards/accuracy_reward": 2.168750047683716,
"rewards/format_reward": 1.0,
"step": 174,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 257.03125,
"epoch": 0.2679938744257274,
"grad_norm": 6.586326309088319,
"kl": 0.033203125,
"learning_rate": 8.330131111232201e-07,
"loss": 0.0,
"reward": 4.03125,
"reward_std": 0.6235748529434204,
"rewards/accuracy_reward": 2.8062498569488525,
"rewards/format_reward": 1.0,
"step": 175,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 290.8125,
"epoch": 0.26952526799387444,
"grad_norm": 6.164855709585961,
"kl": 0.03564453125,
"learning_rate": 8.312149280066542e-07,
"loss": 0.0,
"reward": 3.612499713897705,
"reward_std": 0.36475399136543274,
"rewards/accuracy_reward": 2.3125,
"rewards/format_reward": 1.0,
"step": 176,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 297.90625,
"epoch": 0.27105666156202146,
"grad_norm": 12.646560626155859,
"kl": 0.0306396484375,
"learning_rate": 8.294090786494463e-07,
"loss": 0.0,
"reward": 3.674999952316284,
"reward_std": 0.4887303411960602,
"rewards/accuracy_reward": 2.375,
"rewards/format_reward": 1.0,
"step": 177,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 305.96875,
"epoch": 0.2725880551301684,
"grad_norm": 10.184859910871095,
"kl": 0.031494140625,
"learning_rate": 8.275956048494579e-07,
"loss": 0.0,
"reward": 3.518749952316284,
"reward_std": 0.5078155398368835,
"rewards/accuracy_reward": 2.293750047683716,
"rewards/format_reward": 1.0,
"step": 178,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 290.0,
"epoch": 0.27411944869831545,
"grad_norm": 15.727667383001375,
"kl": 0.031005859375,
"learning_rate": 8.257745485810249e-07,
"loss": 0.0,
"reward": 3.875,
"reward_std": 0.5017106533050537,
"rewards/accuracy_reward": 2.575000047683716,
"rewards/format_reward": 1.0,
"step": 179,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 307.75,
"epoch": 0.27565084226646247,
"grad_norm": 3.1981726512133464,
"kl": 0.03564453125,
"learning_rate": 8.239459519939851e-07,
"loss": 0.0,
"reward": 3.28125,
"reward_std": 0.3276711106300354,
"rewards/accuracy_reward": 2.0562500953674316,
"rewards/format_reward": 1.0,
"step": 180,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 322.71875,
"epoch": 0.2771822358346095,
"grad_norm": 13.088209195159147,
"kl": 0.0306396484375,
"learning_rate": 8.221098574127035e-07,
"loss": 0.0,
"reward": 3.8062500953674316,
"reward_std": 0.7018867135047913,
"rewards/accuracy_reward": 2.5812501907348633,
"rewards/format_reward": 1.0,
"step": 181,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 327.125,
"epoch": 0.2787136294027565,
"grad_norm": 5.994825764520661,
"kl": 0.032958984375,
"learning_rate": 8.202663073350921e-07,
"loss": 0.0,
"reward": 3.4124999046325684,
"reward_std": 0.49544626474380493,
"rewards/accuracy_reward": 2.112499952316284,
"rewards/format_reward": 1.0,
"step": 182,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 298.78125,
"epoch": 0.28024502297090353,
"grad_norm": 6.843693897252769,
"kl": 0.034912109375,
"learning_rate": 8.184153444316269e-07,
"loss": 0.0,
"reward": 3.59375,
"reward_std": 0.4298456907272339,
"rewards/accuracy_reward": 2.293750047683716,
"rewards/format_reward": 1.0,
"step": 183,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 359.34375,
"epoch": 0.28177641653905056,
"grad_norm": 5.53671809463008,
"kl": 0.02978515625,
"learning_rate": 8.165570115443592e-07,
"loss": 0.0,
"reward": 3.5062499046325684,
"reward_std": 0.4097582697868347,
"rewards/accuracy_reward": 2.206249952316284,
"rewards/format_reward": 1.0,
"step": 184,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 319.4375,
"epoch": 0.2833078101071976,
"grad_norm": 5.028956492644307,
"kl": 0.030029296875,
"learning_rate": 8.14691351685925e-07,
"loss": 0.0,
"reward": 2.981250047683716,
"reward_std": 0.20719552040100098,
"rewards/accuracy_reward": 1.681249976158142,
"rewards/format_reward": 1.0,
"step": 185,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 332.28125,
"epoch": 0.28483920367534454,
"grad_norm": 14.870145066767432,
"kl": 0.03076171875,
"learning_rate": 8.12818408038549e-07,
"loss": 0.0,
"reward": 2.887500047683716,
"reward_std": 0.3261798322200775,
"rewards/accuracy_reward": 1.662500023841858,
"rewards/format_reward": 1.0,
"step": 186,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 241.71875,
"epoch": 0.28637059724349156,
"grad_norm": 7.261751728440713,
"kl": 0.03369140625,
"learning_rate": 8.109382239530451e-07,
"loss": 0.0,
"reward": 3.5500001907348633,
"reward_std": 0.6096799373626709,
"rewards/accuracy_reward": 2.25,
"rewards/format_reward": 1.0,
"step": 187,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 255.0,
"epoch": 0.2879019908116386,
"grad_norm": 9.417177086278453,
"kl": 0.037841796875,
"learning_rate": 8.090508429478129e-07,
"loss": 0.0,
"reward": 3.612499952316284,
"reward_std": 0.589754581451416,
"rewards/accuracy_reward": 2.3125,
"rewards/format_reward": 1.0,
"step": 188,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 326.84375,
"epoch": 0.2894333843797856,
"grad_norm": 12.28812335790627,
"kl": 0.032958984375,
"learning_rate": 8.07156308707831e-07,
"loss": 0.0,
"reward": 3.4625000953674316,
"reward_std": 0.4379882514476776,
"rewards/accuracy_reward": 2.1624999046325684,
"rewards/format_reward": 1.0,
"step": 189,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 296.5,
"epoch": 0.29096477794793263,
"grad_norm": 8.95424949929567,
"kl": 0.029296875,
"learning_rate": 8.052546650836453e-07,
"loss": 0.0,
"reward": 3.987499952316284,
"reward_std": 0.4414653480052948,
"rewards/accuracy_reward": 2.6875,
"rewards/format_reward": 1.0,
"step": 190,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 289.1875,
"epoch": 0.29249617151607965,
"grad_norm": 16.95609593725675,
"kl": 0.034423828125,
"learning_rate": 8.033459560903539e-07,
"loss": 0.0,
"reward": 3.3812499046325684,
"reward_std": 0.32076090574264526,
"rewards/accuracy_reward": 2.081249952316284,
"rewards/format_reward": 1.0,
"step": 191,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 289.15625,
"epoch": 0.29402756508422667,
"grad_norm": 7.171670881715125,
"kl": 0.03076171875,
"learning_rate": 8.014302259065892e-07,
"loss": 0.0,
"reward": 3.325000047683716,
"reward_std": 0.5578684210777283,
"rewards/accuracy_reward": 2.0250000953674316,
"rewards/format_reward": 1.0,
"step": 192,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 275.40625,
"epoch": 0.29555895865237364,
"grad_norm": 7.072965681156847,
"kl": 0.03271484375,
"learning_rate": 7.995075188734946e-07,
"loss": 0.0,
"reward": 3.6312499046325684,
"reward_std": 0.2462092638015747,
"rewards/accuracy_reward": 2.331249952316284,
"rewards/format_reward": 1.0,
"step": 193,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 361.96875,
"epoch": 0.29709035222052066,
"grad_norm": 5.763751054733127,
"kl": 0.025634765625,
"learning_rate": 7.975778794936978e-07,
"loss": 0.0,
"reward": 3.4562501907348633,
"reward_std": 0.33579856157302856,
"rewards/accuracy_reward": 2.231250047683716,
"rewards/format_reward": 1.0,
"step": 194,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 282.84375,
"epoch": 0.2986217457886677,
"grad_norm": 5.903003459098319,
"kl": 0.0294189453125,
"learning_rate": 7.956413524302823e-07,
"loss": 0.0,
"reward": 3.40625,
"reward_std": 0.4575210213661194,
"rewards/accuracy_reward": 2.1812500953674316,
"rewards/format_reward": 1.0,
"step": 195,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 244.40625,
"epoch": 0.3001531393568147,
"grad_norm": 23.64457466042576,
"kl": 0.042236328125,
"learning_rate": 7.93697982505752e-07,
"loss": 0.0,
"reward": 2.8500001430511475,
"reward_std": 0.33267366886138916,
"rewards/accuracy_reward": 1.5499999523162842,
"rewards/format_reward": 1.0,
"step": 196,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 268.59375,
"epoch": 0.3016845329249617,
"grad_norm": 9.620171967989569,
"kl": 0.03515625,
"learning_rate": 7.917478147009949e-07,
"loss": 0.0,
"reward": 3.518749713897705,
"reward_std": 0.4448161721229553,
"rewards/accuracy_reward": 2.3687500953674316,
"rewards/format_reward": 1.0,
"step": 197,
"temporal_rewards": 0.5
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 331.03125,
"epoch": 0.30321592649310875,
"grad_norm": 6.234928506561371,
"kl": 0.036376953125,
"learning_rate": 7.89790894154241e-07,
"loss": 0.0,
"reward": 3.2437500953674316,
"reward_std": 0.4087636470794678,
"rewards/accuracy_reward": 2.018749952316284,
"rewards/format_reward": 1.0,
"step": 198,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 296.15625,
"epoch": 0.30474732006125577,
"grad_norm": 9.715940321636149,
"kl": 0.0361328125,
"learning_rate": 7.878272661600185e-07,
"loss": 0.0,
"reward": 3.5875000953674316,
"reward_std": 0.2343311309814453,
"rewards/accuracy_reward": 2.2874999046325684,
"rewards/format_reward": 1.0,
"step": 199,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 322.125,
"epoch": 0.30627871362940273,
"grad_norm": 5.672785055414545,
"kl": 0.031494140625,
"learning_rate": 7.858569761681047e-07,
"loss": 0.0,
"reward": 2.875,
"reward_std": 0.3694216012954712,
"rewards/accuracy_reward": 1.5749999284744263,
"rewards/format_reward": 1.0,
"step": 200,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 328.4375,
"epoch": 0.30781010719754975,
"grad_norm": 6.123192750528283,
"kl": 0.03271484375,
"learning_rate": 7.838800697824743e-07,
"loss": 0.0,
"reward": 3.0625,
"reward_std": 0.516420304775238,
"rewards/accuracy_reward": 1.7625000476837158,
"rewards/format_reward": 1.0,
"step": 201,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 291.34375,
"epoch": 0.3093415007656968,
"grad_norm": 9.512927288541976,
"kl": 0.03173828125,
"learning_rate": 7.818965927602436e-07,
"loss": 0.0,
"reward": 4.099999904632568,
"reward_std": 0.5014769434928894,
"rewards/accuracy_reward": 2.799999952316284,
"rewards/format_reward": 1.0,
"step": 202,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 315.75,
"epoch": 0.3108728943338438,
"grad_norm": 5.366517575068269,
"kl": 0.031005859375,
"learning_rate": 7.799065910106126e-07,
"loss": 0.0,
"reward": 3.3937501907348633,
"reward_std": 0.5237241983413696,
"rewards/accuracy_reward": 2.093750238418579,
"rewards/format_reward": 1.0,
"step": 203,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 271.90625,
"epoch": 0.3124042879019908,
"grad_norm": 12.699080352281662,
"kl": 0.04345703125,
"learning_rate": 7.779101105938004e-07,
"loss": 0.0,
"reward": 3.8375000953674316,
"reward_std": 0.4205377995967865,
"rewards/accuracy_reward": 2.5375001430511475,
"rewards/format_reward": 1.0,
"step": 204,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 331.4375,
"epoch": 0.31393568147013784,
"grad_norm": 7.7656380006752155,
"kl": 0.035888671875,
"learning_rate": 7.759071977199806e-07,
"loss": 0.0,
"reward": 3.4749999046325684,
"reward_std": 0.35376298427581787,
"rewards/accuracy_reward": 2.174999952316284,
"rewards/format_reward": 1.0,
"step": 205,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 312.46875,
"epoch": 0.31546707503828486,
"grad_norm": 4.391130785851663,
"kl": 0.0341796875,
"learning_rate": 7.738978987482112e-07,
"loss": 0.0,
"reward": 3.5,
"reward_std": 0.4183518886566162,
"rewards/accuracy_reward": 2.1999998092651367,
"rewards/format_reward": 1.0,
"step": 206,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 245.40625,
"epoch": 0.3169984686064318,
"grad_norm": 5.540719912300597,
"kl": 0.03955078125,
"learning_rate": 7.71882260185362e-07,
"loss": 0.0,
"reward": 3.40625,
"reward_std": 0.31065988540649414,
"rewards/accuracy_reward": 2.1062498092651367,
"rewards/format_reward": 1.0,
"step": 207,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 313.1875,
"epoch": 0.31852986217457885,
"grad_norm": 17.420886036265625,
"kl": 0.03515625,
"learning_rate": 7.698603286850374e-07,
"loss": 0.0,
"reward": 3.8500001430511475,
"reward_std": 0.30487963557243347,
"rewards/accuracy_reward": 2.549999952316284,
"rewards/format_reward": 1.0,
"step": 208,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 256.6875,
"epoch": 0.32006125574272587,
"grad_norm": 7.933847200772637,
"kl": 0.0458984375,
"learning_rate": 7.678321510464971e-07,
"loss": 0.0,
"reward": 3.856250047683716,
"reward_std": 0.29905903339385986,
"rewards/accuracy_reward": 2.5562500953674316,
"rewards/format_reward": 1.0,
"step": 209,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 336.65625,
"epoch": 0.3215926493108729,
"grad_norm": 9.297095513483827,
"kl": 0.034912109375,
"learning_rate": 7.657977742135725e-07,
"loss": 0.0,
"reward": 2.9124999046325684,
"reward_std": 0.4157797396183014,
"rewards/accuracy_reward": 1.6124999523162842,
"rewards/format_reward": 1.0,
"step": 210,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 302.03125,
"epoch": 0.3231240428790199,
"grad_norm": 5.541167825333942,
"kl": 0.036376953125,
"learning_rate": 7.637572452735813e-07,
"loss": 0.0,
"reward": 3.7750000953674316,
"reward_std": 0.49859824776649475,
"rewards/accuracy_reward": 2.4749999046325684,
"rewards/format_reward": 1.0,
"step": 211,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 255.40625,
"epoch": 0.32465543644716693,
"grad_norm": 9.168998819803033,
"kl": 0.04052734375,
"learning_rate": 7.617106114562359e-07,
"loss": 0.0,
"reward": 4.243749618530273,
"reward_std": 0.4166516363620758,
"rewards/accuracy_reward": 2.9437501430511475,
"rewards/format_reward": 1.0,
"step": 212,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 264.71875,
"epoch": 0.32618683001531396,
"grad_norm": 9.531857908625076,
"kl": 0.042236328125,
"learning_rate": 7.596579201325515e-07,
"loss": 0.0,
"reward": 3.4749999046325684,
"reward_std": 0.377795547246933,
"rewards/accuracy_reward": 2.174999952316284,
"rewards/format_reward": 1.0,
"step": 213,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 326.0625,
"epoch": 0.327718223583461,
"grad_norm": 8.31464966245884,
"kl": 0.041015625,
"learning_rate": 7.57599218813749e-07,
"loss": 0.0,
"reward": 3.6500000953674316,
"reward_std": 0.378510445356369,
"rewards/accuracy_reward": 2.3500001430511475,
"rewards/format_reward": 1.0,
"step": 214,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 286.34375,
"epoch": 0.32924961715160794,
"grad_norm": 4.287495478359939,
"kl": 0.0380859375,
"learning_rate": 7.555345551501557e-07,
"loss": 0.0,
"reward": 3.1812498569488525,
"reward_std": 0.38055357336997986,
"rewards/accuracy_reward": 1.881250023841858,
"rewards/format_reward": 1.0,
"step": 215,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 324.65625,
"epoch": 0.33078101071975496,
"grad_norm": 15.056876642486671,
"kl": 0.03662109375,
"learning_rate": 7.534639769301024e-07,
"loss": 0.0,
"reward": 4.268750190734863,
"reward_std": 0.2899988889694214,
"rewards/accuracy_reward": 2.96875,
"rewards/format_reward": 1.0,
"step": 216,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 244.90625,
"epoch": 0.332312404287902,
"grad_norm": 9.069801680915958,
"kl": 0.04345703125,
"learning_rate": 7.513875320788165e-07,
"loss": 0.0,
"reward": 4.099999904632568,
"reward_std": 0.3825719654560089,
"rewards/accuracy_reward": 2.799999713897705,
"rewards/format_reward": 1.0,
"step": 217,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 305.1875,
"epoch": 0.333843797856049,
"grad_norm": 10.955229211982534,
"kl": 0.04052734375,
"learning_rate": 7.493052686573147e-07,
"loss": 0.0,
"reward": 3.731250047683716,
"reward_std": 0.6039197444915771,
"rewards/accuracy_reward": 2.4312500953674316,
"rewards/format_reward": 1.0,
"step": 218,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 273.15625,
"epoch": 0.33537519142419603,
"grad_norm": 5.926525708351621,
"kl": 0.039794921875,
"learning_rate": 7.472172348612876e-07,
"loss": 0.0,
"reward": 4.28125,
"reward_std": 0.6196683049201965,
"rewards/accuracy_reward": 2.9812498092651367,
"rewards/format_reward": 1.0,
"step": 219,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 353.5625,
"epoch": 0.33690658499234305,
"grad_norm": 7.464320668891594,
"kl": 0.035400390625,
"learning_rate": 7.451234790199871e-07,
"loss": 0.0,
"reward": 3.049999952316284,
"reward_std": 0.4355461001396179,
"rewards/accuracy_reward": 1.75,
"rewards/format_reward": 1.0,
"step": 220,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 361.40625,
"epoch": 0.33843797856049007,
"grad_norm": 12.12432567991499,
"kl": 0.036865234375,
"learning_rate": 7.430240495951062e-07,
"loss": 0.0,
"reward": 2.9812498092651367,
"reward_std": 0.4996665120124817,
"rewards/accuracy_reward": 1.7562501430511475,
"rewards/format_reward": 1.0,
"step": 221,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 274.78125,
"epoch": 0.33996937212863704,
"grad_norm": 5.359147879862915,
"kl": 0.042236328125,
"learning_rate": 7.409189951796574e-07,
"loss": 0.0,
"reward": 3.4187498092651367,
"reward_std": 0.6283072233200073,
"rewards/accuracy_reward": 2.1937501430511475,
"rewards/format_reward": 1.0,
"step": 222,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 283.03125,
"epoch": 0.34150076569678406,
"grad_norm": 12.760200628929272,
"kl": 0.05078125,
"learning_rate": 7.388083644968481e-07,
"loss": 0.0001,
"reward": 3.90625,
"reward_std": 0.5293543338775635,
"rewards/accuracy_reward": 2.6062498092651367,
"rewards/format_reward": 1.0,
"step": 223,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 351.90625,
"epoch": 0.3430321592649311,
"grad_norm": 5.413212423004192,
"kl": 0.037109375,
"learning_rate": 7.366922063989535e-07,
"loss": 0.0,
"reward": 3.793750047683716,
"reward_std": 0.37553951144218445,
"rewards/accuracy_reward": 2.4937500953674316,
"rewards/format_reward": 1.0,
"step": 224,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 282.875,
"epoch": 0.3445635528330781,
"grad_norm": 10.152321573196087,
"kl": 0.044921875,
"learning_rate": 7.345705698661852e-07,
"loss": 0.0,
"reward": 3.393749713897705,
"reward_std": 0.518860936164856,
"rewards/accuracy_reward": 2.168750047683716,
"rewards/format_reward": 1.0,
"step": 225,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 280.65625,
"epoch": 0.3460949464012251,
"grad_norm": 6.901158008710162,
"kl": 0.04345703125,
"learning_rate": 7.324435040055571e-07,
"loss": 0.0,
"reward": 3.3812499046325684,
"reward_std": 0.4390850067138672,
"rewards/accuracy_reward": 2.081249952316284,
"rewards/format_reward": 1.0,
"step": 226,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 348.625,
"epoch": 0.34762633996937214,
"grad_norm": 4.907181453263694,
"kl": 0.033935546875,
"learning_rate": 7.303110580497501e-07,
"loss": 0.0,
"reward": 3.012500047683716,
"reward_std": 0.3685019910335541,
"rewards/accuracy_reward": 1.787500023841858,
"rewards/format_reward": 1.0,
"step": 227,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 288.0,
"epoch": 0.34915773353751917,
"grad_norm": 8.5707111135805,
"kl": 0.0556640625,
"learning_rate": 7.281732813559713e-07,
"loss": 0.0001,
"reward": 3.3187499046325684,
"reward_std": 0.5146178007125854,
"rewards/accuracy_reward": 2.018749952316284,
"rewards/format_reward": 1.0,
"step": 228,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 287.5,
"epoch": 0.35068912710566613,
"grad_norm": 11.27891495949733,
"kl": 0.042236328125,
"learning_rate": 7.260302234048125e-07,
"loss": 0.0,
"reward": 3.575000047683716,
"reward_std": 0.3730372190475464,
"rewards/accuracy_reward": 2.2750000953674316,
"rewards/format_reward": 1.0,
"step": 229,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 293.9375,
"epoch": 0.35222052067381315,
"grad_norm": 19.476901011082536,
"kl": 0.04150390625,
"learning_rate": 7.23881933799104e-07,
"loss": 0.0,
"reward": 3.924999952316284,
"reward_std": 0.5146142244338989,
"rewards/accuracy_reward": 2.625,
"rewards/format_reward": 1.0,
"step": 230,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 307.25,
"epoch": 0.3537519142419602,
"grad_norm": 6.55022815298153,
"kl": 0.044921875,
"learning_rate": 7.217284622627674e-07,
"loss": 0.0,
"reward": 3.968749761581421,
"reward_std": 0.6387063264846802,
"rewards/accuracy_reward": 2.668750286102295,
"rewards/format_reward": 1.0,
"step": 231,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 330.0625,
"epoch": 0.3552833078101072,
"grad_norm": 11.903816051107185,
"kl": 0.041259765625,
"learning_rate": 7.195698586396645e-07,
"loss": 0.0,
"reward": 2.6374998092651367,
"reward_std": 0.28559741377830505,
"rewards/accuracy_reward": 1.412500023841858,
"rewards/format_reward": 1.0,
"step": 232,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 318.21875,
"epoch": 0.3568147013782542,
"grad_norm": 7.891011771111216,
"kl": 0.04052734375,
"learning_rate": 7.174061728924428e-07,
"loss": 0.0,
"reward": 3.1499998569488525,
"reward_std": 0.5327916145324707,
"rewards/accuracy_reward": 1.8499999046325684,
"rewards/format_reward": 1.0,
"step": 233,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 304.3125,
"epoch": 0.35834609494640124,
"grad_norm": 14.640401008280097,
"kl": 0.03955078125,
"learning_rate": 7.152374551013804e-07,
"loss": 0.0,
"reward": 3.3499999046325684,
"reward_std": 0.5537967681884766,
"rewards/accuracy_reward": 2.049999952316284,
"rewards/format_reward": 1.0,
"step": 234,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 305.40625,
"epoch": 0.35987748851454826,
"grad_norm": 4.292297714421554,
"kl": 0.04443359375,
"learning_rate": 7.130637554632257e-07,
"loss": 0.0,
"reward": 2.9937498569488525,
"reward_std": 0.1590990275144577,
"rewards/accuracy_reward": 1.6937499046325684,
"rewards/format_reward": 1.0,
"step": 235,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 333.28125,
"epoch": 0.3614088820826952,
"grad_norm": 36.0396195163604,
"kl": 0.038330078125,
"learning_rate": 7.108851242900364e-07,
"loss": 0.0,
"reward": 2.9250001907348633,
"reward_std": 0.4584749937057495,
"rewards/accuracy_reward": 1.625,
"rewards/format_reward": 1.0,
"step": 236,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 280.28125,
"epoch": 0.36294027565084225,
"grad_norm": 10.543504458410956,
"kl": 0.048095703125,
"learning_rate": 7.087016120080145e-07,
"loss": 0.0,
"reward": 3.687499761581421,
"reward_std": 0.4067375659942627,
"rewards/accuracy_reward": 2.387500047683716,
"rewards/format_reward": 1.0,
"step": 237,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 279.4375,
"epoch": 0.36447166921898927,
"grad_norm": 12.020212644340816,
"kl": 0.05517578125,
"learning_rate": 7.065132691563388e-07,
"loss": 0.0001,
"reward": 4.162500381469727,
"reward_std": 0.6134676933288574,
"rewards/accuracy_reward": 2.937500238418579,
"rewards/format_reward": 1.0,
"step": 238,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 281.03125,
"epoch": 0.3660030627871363,
"grad_norm": 6.445427231037523,
"kl": 0.04638671875,
"learning_rate": 7.043201463859963e-07,
"loss": 0.0,
"reward": 4.006249904632568,
"reward_std": 0.39609289169311523,
"rewards/accuracy_reward": 2.706249952316284,
"rewards/format_reward": 1.0,
"step": 239,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 282.4375,
"epoch": 0.3675344563552833,
"grad_norm": 12.36139816672738,
"kl": 0.04443359375,
"learning_rate": 7.021222944586088e-07,
"loss": 0.0,
"reward": 3.0812501907348633,
"reward_std": 0.21749000251293182,
"rewards/accuracy_reward": 1.78125,
"rewards/format_reward": 1.0,
"step": 240,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 305.96875,
"epoch": 0.36906584992343033,
"grad_norm": 46.70492156510754,
"kl": 0.04248046875,
"learning_rate": 6.999197642452583e-07,
"loss": 0.0,
"reward": 3.53125,
"reward_std": 0.339226096868515,
"rewards/accuracy_reward": 2.231250047683716,
"rewards/format_reward": 1.0,
"step": 241,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 300.75,
"epoch": 0.37059724349157736,
"grad_norm": 5.008941210138734,
"kl": 0.044189453125,
"learning_rate": 6.977126067253095e-07,
"loss": 0.0,
"reward": 3.143749952316284,
"reward_std": 0.5060650110244751,
"rewards/accuracy_reward": 1.8437498807907104,
"rewards/format_reward": 1.0,
"step": 242,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 255.09375,
"epoch": 0.3721286370597244,
"grad_norm": 7.851328072810821,
"kl": 0.04931640625,
"learning_rate": 6.9550087298523e-07,
"loss": 0.0,
"reward": 3.9749999046325684,
"reward_std": 0.39178720116615295,
"rewards/accuracy_reward": 2.6750001907348633,
"rewards/format_reward": 1.0,
"step": 243,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 332.6875,
"epoch": 0.37366003062787134,
"grad_norm": 15.778103385034653,
"kl": 0.04052734375,
"learning_rate": 6.93284614217408e-07,
"loss": 0.0,
"reward": 2.856250047683716,
"reward_std": 0.3301621079444885,
"rewards/accuracy_reward": 1.5562498569488525,
"rewards/format_reward": 1.0,
"step": 244,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 272.21875,
"epoch": 0.37519142419601836,
"grad_norm": 11.303882433021103,
"kl": 0.0458984375,
"learning_rate": 6.910638817189664e-07,
"loss": 0.0,
"reward": 3.8062500953674316,
"reward_std": 0.46951526403427124,
"rewards/accuracy_reward": 2.5062499046325684,
"rewards/format_reward": 1.0,
"step": 245,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 334.25,
"epoch": 0.3767228177641654,
"grad_norm": 10.26008198653885,
"kl": 0.04296875,
"learning_rate": 6.888387268905773e-07,
"loss": 0.0,
"reward": 3.0749998092651367,
"reward_std": 0.4924160838127136,
"rewards/accuracy_reward": 1.7750000953674316,
"rewards/format_reward": 1.0,
"step": 246,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 351.40625,
"epoch": 0.3782542113323124,
"grad_norm": 31.96761888686875,
"kl": 0.0380859375,
"learning_rate": 6.866092012352705e-07,
"loss": 0.0,
"reward": 3.331249952316284,
"reward_std": 0.6083425283432007,
"rewards/accuracy_reward": 2.03125,
"rewards/format_reward": 1.0,
"step": 247,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 298.28125,
"epoch": 0.37978560490045943,
"grad_norm": 6.5775361765625595,
"kl": 0.04345703125,
"learning_rate": 6.843753563572423e-07,
"loss": 0.0,
"reward": 3.9000000953674316,
"reward_std": 0.659449577331543,
"rewards/accuracy_reward": 2.674999952316284,
"rewards/format_reward": 1.0,
"step": 248,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 361.53125,
"epoch": 0.38131699846860645,
"grad_norm": 6.36601489337084,
"kl": 0.0400390625,
"learning_rate": 6.821372439606611e-07,
"loss": 0.0,
"reward": 2.9499998092651367,
"reward_std": 0.4043930172920227,
"rewards/accuracy_reward": 1.6500000953674316,
"rewards/format_reward": 1.0,
"step": 249,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 318.0,
"epoch": 0.38284839203675347,
"grad_norm": 3.1943067429090424,
"kl": 0.04541015625,
"learning_rate": 6.798949158484705e-07,
"loss": 0.0,
"reward": 3.0249998569488525,
"reward_std": 0.38181358575820923,
"rewards/accuracy_reward": 1.7999999523162842,
"rewards/format_reward": 1.0,
"step": 250,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 302.09375,
"epoch": 0.38437978560490044,
"grad_norm": 5.713129404217661,
"kl": 0.046630859375,
"learning_rate": 6.776484239211903e-07,
"loss": 0.0,
"reward": 3.268749952316284,
"reward_std": 0.26601967215538025,
"rewards/accuracy_reward": 1.96875,
"rewards/format_reward": 1.0,
"step": 251,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 321.125,
"epoch": 0.38591117917304746,
"grad_norm": 6.335821394393776,
"kl": 0.052490234375,
"learning_rate": 6.753978201757149e-07,
"loss": 0.0001,
"reward": 3.456249952316284,
"reward_std": 0.3842371702194214,
"rewards/accuracy_reward": 2.15625,
"rewards/format_reward": 1.0,
"step": 252,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 276.71875,
"epoch": 0.3874425727411945,
"grad_norm": 8.679817303939133,
"kl": 0.046875,
"learning_rate": 6.731431567041106e-07,
"loss": 0.0,
"reward": 3.7562499046325684,
"reward_std": 0.5177336931228638,
"rewards/accuracy_reward": 2.456249952316284,
"rewards/format_reward": 1.0,
"step": 253,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 263.9375,
"epoch": 0.3889739663093415,
"grad_norm": 5.808317301193834,
"kl": 0.04833984375,
"learning_rate": 6.708844856924088e-07,
"loss": 0.0,
"reward": 4.125,
"reward_std": 0.4449988007545471,
"rewards/accuracy_reward": 2.9000000953674316,
"rewards/format_reward": 1.0,
"step": 254,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 341.1875,
"epoch": 0.3905053598774885,
"grad_norm": 8.34282922172143,
"kl": 0.041015625,
"learning_rate": 6.686218594193993e-07,
"loss": 0.0,
"reward": 3.1062498092651367,
"reward_std": 0.3678450584411621,
"rewards/accuracy_reward": 1.8062500953674316,
"rewards/format_reward": 1.0,
"step": 255,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 285.0,
"epoch": 0.39203675344563554,
"grad_norm": 6.993127416049526,
"kl": 0.053955078125,
"learning_rate": 6.663553302554193e-07,
"loss": 0.0001,
"reward": 3.6812500953674316,
"reward_std": 0.26855015754699707,
"rewards/accuracy_reward": 2.4562501907348633,
"rewards/format_reward": 1.0,
"step": 256,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 320.6875,
"epoch": 0.39356814701378257,
"grad_norm": 21.057020289534563,
"kl": 0.04638671875,
"learning_rate": 6.640849506611417e-07,
"loss": 0.0,
"reward": 3.9124999046325684,
"reward_std": 0.4169827699661255,
"rewards/accuracy_reward": 2.6125001907348633,
"rewards/format_reward": 1.0,
"step": 257,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 304.1875,
"epoch": 0.39509954058192953,
"grad_norm": 7.14663466029652,
"kl": 0.044677734375,
"learning_rate": 6.618107731863608e-07,
"loss": 0.0,
"reward": 4.231250286102295,
"reward_std": 0.5161499977111816,
"rewards/accuracy_reward": 2.9312500953674316,
"rewards/format_reward": 1.0,
"step": 258,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 341.25,
"epoch": 0.39663093415007655,
"grad_norm": 9.2743529009992,
"kl": 0.04150390625,
"learning_rate": 6.595328504687757e-07,
"loss": 0.0,
"reward": 3.231250047683716,
"reward_std": 0.40922269225120544,
"rewards/accuracy_reward": 2.0062499046325684,
"rewards/format_reward": 1.0,
"step": 259,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 326.375,
"epoch": 0.3981623277182236,
"grad_norm": 4.79212098522223,
"kl": 0.03955078125,
"learning_rate": 6.572512352327726e-07,
"loss": 0.0,
"reward": 3.6687498092651367,
"reward_std": 0.3076220154762268,
"rewards/accuracy_reward": 2.3687500953674316,
"rewards/format_reward": 1.0,
"step": 260,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 302.53125,
"epoch": 0.3996937212863706,
"grad_norm": 16.65575325809638,
"kl": 0.0478515625,
"learning_rate": 6.549659802882038e-07,
"loss": 0.0,
"reward": 3.3812501430511475,
"reward_std": 0.3832942843437195,
"rewards/accuracy_reward": 2.081249952316284,
"rewards/format_reward": 1.0,
"step": 261,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 297.375,
"epoch": 0.4012251148545176,
"grad_norm": 6.152326499680604,
"kl": 0.04443359375,
"learning_rate": 6.526771385291656e-07,
"loss": 0.0,
"reward": 3.84375,
"reward_std": 0.38622021675109863,
"rewards/accuracy_reward": 2.5437498092651367,
"rewards/format_reward": 1.0,
"step": 262,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 282.40625,
"epoch": 0.40275650842266464,
"grad_norm": 10.417553039855553,
"kl": 0.050048828125,
"learning_rate": 6.503847629327744e-07,
"loss": 0.0,
"reward": 4.006250381469727,
"reward_std": 0.583352267742157,
"rewards/accuracy_reward": 2.7062501907348633,
"rewards/format_reward": 1.0,
"step": 263,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 331.0,
"epoch": 0.40428790199081166,
"grad_norm": 3.669963233981849,
"kl": 0.045166015625,
"learning_rate": 6.480889065579398e-07,
"loss": 0.0,
"reward": 2.7249999046325684,
"reward_std": 0.2972213923931122,
"rewards/accuracy_reward": 1.4250000715255737,
"rewards/format_reward": 1.0,
"step": 264,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 259.90625,
"epoch": 0.4058192955589586,
"grad_norm": 4.88282000130352,
"kl": 0.0556640625,
"learning_rate": 6.457896225441371e-07,
"loss": 0.0001,
"reward": 3.1875,
"reward_std": 0.2949331998825073,
"rewards/accuracy_reward": 1.962499976158142,
"rewards/format_reward": 1.0,
"step": 265,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 265.125,
"epoch": 0.40735068912710565,
"grad_norm": 22.004499913124683,
"kl": 0.046875,
"learning_rate": 6.434869641101768e-07,
"loss": 0.0,
"reward": 3.950000047683716,
"reward_std": 0.6295482516288757,
"rewards/accuracy_reward": 2.6500000953674316,
"rewards/format_reward": 1.0,
"step": 266,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 293.8125,
"epoch": 0.40888208269525267,
"grad_norm": 6.6840837429320485,
"kl": 0.050048828125,
"learning_rate": 6.411809845529734e-07,
"loss": 0.0,
"reward": 3.893749713897705,
"reward_std": 0.5396034717559814,
"rewards/accuracy_reward": 2.668750047683716,
"rewards/format_reward": 1.0,
"step": 267,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 286.5625,
"epoch": 0.4104134762633997,
"grad_norm": 7.143023845506581,
"kl": 0.05029296875,
"learning_rate": 6.388717372463115e-07,
"loss": 0.0001,
"reward": 3.731250047683716,
"reward_std": 0.3467658758163452,
"rewards/accuracy_reward": 2.4312498569488525,
"rewards/format_reward": 1.0,
"step": 268,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 298.4375,
"epoch": 0.4119448698315467,
"grad_norm": 12.633736339498368,
"kl": 0.05224609375,
"learning_rate": 6.365592756396101e-07,
"loss": 0.0001,
"reward": 2.9250001907348633,
"reward_std": 0.4054605960845947,
"rewards/accuracy_reward": 1.6250001192092896,
"rewards/format_reward": 1.0,
"step": 269,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 297.40625,
"epoch": 0.41347626339969373,
"grad_norm": 4.355690686839267,
"kl": 0.05078125,
"learning_rate": 6.342436532566865e-07,
"loss": 0.0001,
"reward": 3.5749998092651367,
"reward_std": 0.2884190082550049,
"rewards/accuracy_reward": 2.2750000953674316,
"rewards/format_reward": 1.0,
"step": 270,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 298.34375,
"epoch": 0.41500765696784075,
"grad_norm": 5.447937890611708,
"kl": 0.04736328125,
"learning_rate": 6.319249236945161e-07,
"loss": 0.0,
"reward": 3.674999713897705,
"reward_std": 0.5889295339584351,
"rewards/accuracy_reward": 2.450000047683716,
"rewards/format_reward": 1.0,
"step": 271,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 255.6875,
"epoch": 0.4165390505359878,
"grad_norm": 15.587262473448611,
"kl": 0.05419921875,
"learning_rate": 6.296031406219926e-07,
"loss": 0.0001,
"reward": 3.5562500953674316,
"reward_std": 0.4689219295978546,
"rewards/accuracy_reward": 2.2562499046325684,
"rewards/format_reward": 1.0,
"step": 272,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 314.78125,
"epoch": 0.41807044410413474,
"grad_norm": 5.16002952322231,
"kl": 0.052978515625,
"learning_rate": 6.272783577786862e-07,
"loss": 0.0001,
"reward": 3.4562501907348633,
"reward_std": 0.5224058628082275,
"rewards/accuracy_reward": 2.15625,
"rewards/format_reward": 1.0,
"step": 273,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 311.28125,
"epoch": 0.41960183767228176,
"grad_norm": 15.23123318426172,
"kl": 0.047119140625,
"learning_rate": 6.249506289735984e-07,
"loss": 0.0,
"reward": 3.299999952316284,
"reward_std": 0.44371646642684937,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"step": 274,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 326.875,
"epoch": 0.4211332312404288,
"grad_norm": 6.07627204223583,
"kl": 0.043212890625,
"learning_rate": 6.226200080839182e-07,
"loss": 0.0,
"reward": 4.193749904632568,
"reward_std": 0.41577982902526855,
"rewards/accuracy_reward": 2.8937501907348633,
"rewards/format_reward": 1.0,
"step": 275,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 304.46875,
"epoch": 0.4226646248085758,
"grad_norm": 3.736918599404993,
"kl": 0.05078125,
"learning_rate": 6.202865490537739e-07,
"loss": 0.0001,
"reward": 3.531249761581421,
"reward_std": 0.3116908073425293,
"rewards/accuracy_reward": 2.231250047683716,
"rewards/format_reward": 1.0,
"step": 276,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 284.71875,
"epoch": 0.4241960183767228,
"grad_norm": 37.59291744193968,
"kl": 0.05029296875,
"learning_rate": 6.179503058929849e-07,
"loss": 0.0,
"reward": 3.6687498092651367,
"reward_std": 0.4390817880630493,
"rewards/accuracy_reward": 2.3687500953674316,
"rewards/format_reward": 1.0,
"step": 277,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 336.9375,
"epoch": 0.42572741194486985,
"grad_norm": 6.1493970390526105,
"kl": 0.044677734375,
"learning_rate": 6.156113326758118e-07,
"loss": 0.0,
"reward": 3.5687499046325684,
"reward_std": 0.3175239861011505,
"rewards/accuracy_reward": 2.268749952316284,
"rewards/format_reward": 1.0,
"step": 278,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 270.46875,
"epoch": 0.42725880551301687,
"grad_norm": 4.579166257528243,
"kl": 0.043701171875,
"learning_rate": 6.132696835397038e-07,
"loss": 0.0,
"reward": 3.5999999046325684,
"reward_std": 0.3979983329772949,
"rewards/accuracy_reward": 2.299999952316284,
"rewards/format_reward": 1.0,
"step": 279,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 290.9375,
"epoch": 0.42879019908116384,
"grad_norm": 11.69894583370795,
"kl": 0.052978515625,
"learning_rate": 6.109254126840479e-07,
"loss": 0.0001,
"reward": 2.9124999046325684,
"reward_std": 0.40907424688339233,
"rewards/accuracy_reward": 1.6875,
"rewards/format_reward": 1.0,
"step": 280,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 291.75,
"epoch": 0.43032159264931086,
"grad_norm": 4.623021086674321,
"kl": 0.04345703125,
"learning_rate": 6.085785743689113e-07,
"loss": 0.0,
"reward": 3.46875,
"reward_std": 0.39231395721435547,
"rewards/accuracy_reward": 2.2437498569488525,
"rewards/format_reward": 1.0,
"step": 281,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 313.28125,
"epoch": 0.4318529862174579,
"grad_norm": 6.303506543948684,
"kl": 0.048095703125,
"learning_rate": 6.062292229137885e-07,
"loss": 0.0,
"reward": 3.3687500953674316,
"reward_std": 0.46298372745513916,
"rewards/accuracy_reward": 2.0687499046325684,
"rewards/format_reward": 1.0,
"step": 282,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 264.5,
"epoch": 0.4333843797856049,
"grad_norm": 13.679405544800808,
"kl": 0.04833984375,
"learning_rate": 6.038774126963416e-07,
"loss": 0.0,
"reward": 4.356250286102295,
"reward_std": 0.37773382663726807,
"rewards/accuracy_reward": 3.0562498569488525,
"rewards/format_reward": 1.0,
"step": 283,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 262.21875,
"epoch": 0.4349157733537519,
"grad_norm": 14.30067593344122,
"kl": 0.055908203125,
"learning_rate": 6.015231981511439e-07,
"loss": 0.0001,
"reward": 3.5749998092651367,
"reward_std": 0.29896894097328186,
"rewards/accuracy_reward": 2.2749998569488525,
"rewards/format_reward": 1.0,
"step": 284,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 265.78125,
"epoch": 0.43644716692189894,
"grad_norm": 7.190634169332564,
"kl": 0.05908203125,
"learning_rate": 5.991666337684176e-07,
"loss": 0.0001,
"reward": 4.068750381469727,
"reward_std": 0.6775935292243958,
"rewards/accuracy_reward": 2.84375,
"rewards/format_reward": 1.0,
"step": 285,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 263.59375,
"epoch": 0.43797856049004597,
"grad_norm": 6.305888489386578,
"kl": 0.051513671875,
"learning_rate": 5.968077740927748e-07,
"loss": 0.0001,
"reward": 3.53125,
"reward_std": 0.5097336769104004,
"rewards/accuracy_reward": 2.231250047683716,
"rewards/format_reward": 1.0,
"step": 286,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 349.59375,
"epoch": 0.43950995405819293,
"grad_norm": 8.02368291703908,
"kl": 0.03857421875,
"learning_rate": 5.944466737219536e-07,
"loss": 0.0,
"reward": 3.393749952316284,
"reward_std": 0.4041438102722168,
"rewards/accuracy_reward": 2.09375,
"rewards/format_reward": 1.0,
"step": 287,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 322.9375,
"epoch": 0.44104134762633995,
"grad_norm": 3.7358796421010387,
"kl": 0.04443359375,
"learning_rate": 5.920833873055546e-07,
"loss": 0.0,
"reward": 3.65625,
"reward_std": 0.33253127336502075,
"rewards/accuracy_reward": 2.4312500953674316,
"rewards/format_reward": 1.0,
"step": 288,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 289.84375,
"epoch": 0.442572741194487,
"grad_norm": 5.746987770848019,
"kl": 0.052001953125,
"learning_rate": 5.89717969543777e-07,
"loss": 0.0001,
"reward": 3.46875,
"reward_std": 0.4438609480857849,
"rewards/accuracy_reward": 2.168750047683716,
"rewards/format_reward": 1.0,
"step": 289,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 282.3125,
"epoch": 0.444104134762634,
"grad_norm": 14.501094390969634,
"kl": 0.05712890625,
"learning_rate": 5.873504751861507e-07,
"loss": 0.0001,
"reward": 3.731250047683716,
"reward_std": 0.41762256622314453,
"rewards/accuracy_reward": 2.4312500953674316,
"rewards/format_reward": 1.0,
"step": 290,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 266.1875,
"epoch": 0.445635528330781,
"grad_norm": 5.515171753260493,
"kl": 0.055419921875,
"learning_rate": 5.849809590302712e-07,
"loss": 0.0001,
"reward": 3.71875,
"reward_std": 0.46692323684692383,
"rewards/accuracy_reward": 2.418750047683716,
"rewards/format_reward": 1.0,
"step": 291,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 242.8125,
"epoch": 0.44716692189892804,
"grad_norm": 17.253150012005655,
"kl": 0.0517578125,
"learning_rate": 5.826094759205293e-07,
"loss": 0.0001,
"reward": 4.137499809265137,
"reward_std": 0.41192424297332764,
"rewards/accuracy_reward": 2.8375000953674316,
"rewards/format_reward": 1.0,
"step": 292,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 307.90625,
"epoch": 0.44869831546707506,
"grad_norm": 6.706786195251721,
"kl": 0.04931640625,
"learning_rate": 5.802360807468427e-07,
"loss": 0.0,
"reward": 3.3249998092651367,
"reward_std": 0.3380519151687622,
"rewards/accuracy_reward": 2.0999999046325684,
"rewards/format_reward": 1.0,
"step": 293,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 310.625,
"epoch": 0.450229709035222,
"grad_norm": 8.494514075528341,
"kl": 0.047119140625,
"learning_rate": 5.778608284433862e-07,
"loss": 0.0,
"reward": 3.325000047683716,
"reward_std": 0.48659753799438477,
"rewards/accuracy_reward": 2.0999999046325684,
"rewards/format_reward": 1.0,
"step": 294,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 350.125,
"epoch": 0.45176110260336905,
"grad_norm": 5.536688254453823,
"kl": 0.04296875,
"learning_rate": 5.754837739873178e-07,
"loss": 0.0,
"reward": 2.856250047683716,
"reward_std": 0.25302496552467346,
"rewards/accuracy_reward": 1.7062499523162842,
"rewards/format_reward": 1.0,
"step": 295,
"temporal_rewards": 0.5
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 337.0,
"epoch": 0.45329249617151607,
"grad_norm": 4.692889551300066,
"kl": 0.0458984375,
"learning_rate": 5.731049723975096e-07,
"loss": 0.0,
"reward": 3.25,
"reward_std": 0.5270546674728394,
"rewards/accuracy_reward": 2.0250000953674316,
"rewards/format_reward": 1.0,
"step": 296,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 288.34375,
"epoch": 0.4548238897396631,
"grad_norm": 6.310747002522716,
"kl": 0.0458984375,
"learning_rate": 5.707244787332711e-07,
"loss": 0.0,
"reward": 3.643749952316284,
"reward_std": 0.46277567744255066,
"rewards/accuracy_reward": 2.343749761581421,
"rewards/format_reward": 1.0,
"step": 297,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 332.96875,
"epoch": 0.4563552833078101,
"grad_norm": 7.199689739207391,
"kl": 0.04541015625,
"learning_rate": 5.683423480930774e-07,
"loss": 0.0,
"reward": 3.6312499046325684,
"reward_std": 0.42347651720046997,
"rewards/accuracy_reward": 2.331249952316284,
"rewards/format_reward": 1.0,
"step": 298,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 296.21875,
"epoch": 0.45788667687595713,
"grad_norm": 11.877389622998688,
"kl": 0.0458984375,
"learning_rate": 5.659586356132917e-07,
"loss": 0.0,
"reward": 3.768749952316284,
"reward_std": 0.5539591312408447,
"rewards/accuracy_reward": 2.5437498092651367,
"rewards/format_reward": 1.0,
"step": 299,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 261.78125,
"epoch": 0.45941807044410415,
"grad_norm": 4.089729197678542,
"kl": 0.04931640625,
"learning_rate": 5.635733964668909e-07,
"loss": 0.0,
"reward": 3.71875,
"reward_std": 0.40402692556381226,
"rewards/accuracy_reward": 2.418750047683716,
"rewards/format_reward": 1.0,
"step": 300,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 269.84375,
"epoch": 0.4609494640122512,
"grad_norm": 8.83236964834344,
"kl": 0.051513671875,
"learning_rate": 5.611866858621874e-07,
"loss": 0.0001,
"reward": 3.9187498092651367,
"reward_std": 0.31364038586616516,
"rewards/accuracy_reward": 2.6187500953674316,
"rewards/format_reward": 1.0,
"step": 301,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 256.34375,
"epoch": 0.46248085758039814,
"grad_norm": 10.261742972214364,
"kl": 0.05078125,
"learning_rate": 5.587985590415523e-07,
"loss": 0.0001,
"reward": 4.306249618530273,
"reward_std": 0.3834628462791443,
"rewards/accuracy_reward": 3.0062499046325684,
"rewards/format_reward": 1.0,
"step": 302,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 296.9375,
"epoch": 0.46401225114854516,
"grad_norm": 10.77009600273013,
"kl": 0.049072265625,
"learning_rate": 5.564090712801355e-07,
"loss": 0.0,
"reward": 4.081250190734863,
"reward_std": 0.5357294678688049,
"rewards/accuracy_reward": 2.78125,
"rewards/format_reward": 1.0,
"step": 303,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 319.1875,
"epoch": 0.4655436447166922,
"grad_norm": 10.077594491655711,
"kl": 0.047607421875,
"learning_rate": 5.540182778845871e-07,
"loss": 0.0,
"reward": 3.25,
"reward_std": 0.4022381901741028,
"rewards/accuracy_reward": 2.0250000953674316,
"rewards/format_reward": 1.0,
"step": 304,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 273.46875,
"epoch": 0.4670750382848392,
"grad_norm": 23.880086686835266,
"kl": 0.047119140625,
"learning_rate": 5.516262341917778e-07,
"loss": 0.0,
"reward": 3.8312501907348633,
"reward_std": 0.577486515045166,
"rewards/accuracy_reward": 2.53125,
"rewards/format_reward": 1.0,
"step": 305,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 279.71875,
"epoch": 0.4686064318529862,
"grad_norm": 5.6859935156058725,
"kl": 0.052978515625,
"learning_rate": 5.492329955675166e-07,
"loss": 0.0001,
"reward": 3.481250047683716,
"reward_std": 0.4928224980831146,
"rewards/accuracy_reward": 2.2562499046325684,
"rewards/format_reward": 1.0,
"step": 306,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 308.375,
"epoch": 0.47013782542113325,
"grad_norm": 16.540677042156723,
"kl": 0.0458984375,
"learning_rate": 5.468386174052709e-07,
"loss": 0.0,
"reward": 3.4187498092651367,
"reward_std": 0.4784466326236725,
"rewards/accuracy_reward": 2.1187500953674316,
"rewards/format_reward": 1.0,
"step": 307,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 281.3125,
"epoch": 0.47166921898928027,
"grad_norm": 4.992857126648013,
"kl": 0.050048828125,
"learning_rate": 5.444431551248831e-07,
"loss": 0.0,
"reward": 3.450000047683716,
"reward_std": 0.5468271970748901,
"rewards/accuracy_reward": 2.1499998569488525,
"rewards/format_reward": 1.0,
"step": 308,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 350.28125,
"epoch": 0.47320061255742724,
"grad_norm": 7.387730644270765,
"kl": 0.04345703125,
"learning_rate": 5.420466641712886e-07,
"loss": 0.0,
"reward": 3.362499952316284,
"reward_std": 0.44643062353134155,
"rewards/accuracy_reward": 2.0625,
"rewards/format_reward": 1.0,
"step": 309,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 295.28125,
"epoch": 0.47473200612557426,
"grad_norm": 4.8298307691048175,
"kl": 0.0517578125,
"learning_rate": 5.396492000132325e-07,
"loss": 0.0001,
"reward": 3.28125,
"reward_std": 0.6318193674087524,
"rewards/accuracy_reward": 2.0562500953674316,
"rewards/format_reward": 1.0,
"step": 310,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 345.125,
"epoch": 0.4762633996937213,
"grad_norm": 4.827377648738647,
"kl": 0.04833984375,
"learning_rate": 5.372508181419851e-07,
"loss": 0.0,
"reward": 3.706249952316284,
"reward_std": 0.4461830258369446,
"rewards/accuracy_reward": 2.40625,
"rewards/format_reward": 1.0,
"step": 311,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 281.5,
"epoch": 0.4777947932618683,
"grad_norm": 6.296930290616834,
"kl": 0.05078125,
"learning_rate": 5.348515740700582e-07,
"loss": 0.0001,
"reward": 3.6624999046325684,
"reward_std": 0.4589795470237732,
"rewards/accuracy_reward": 2.362499952316284,
"rewards/format_reward": 1.0,
"step": 312,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 272.1875,
"epoch": 0.4793261868300153,
"grad_norm": 8.929722302544173,
"kl": 0.0595703125,
"learning_rate": 5.324515233299199e-07,
"loss": 0.0001,
"reward": 4.399999618530273,
"reward_std": 0.37918606400489807,
"rewards/accuracy_reward": 3.0999999046325684,
"rewards/format_reward": 1.0,
"step": 313,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 329.28125,
"epoch": 0.48085758039816234,
"grad_norm": 5.059035226884364,
"kl": 0.04736328125,
"learning_rate": 5.300507214727092e-07,
"loss": 0.0,
"reward": 3.0625,
"reward_std": 0.4319705069065094,
"rewards/accuracy_reward": 1.7624999284744263,
"rewards/format_reward": 1.0,
"step": 314,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 312.28125,
"epoch": 0.48238897396630936,
"grad_norm": 4.2357305471468045,
"kl": 0.0517578125,
"learning_rate": 5.276492240669503e-07,
"loss": 0.0001,
"reward": 3.4187498092651367,
"reward_std": 0.5939319133758545,
"rewards/accuracy_reward": 2.1187500953674316,
"rewards/format_reward": 1.0,
"step": 315,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 310.125,
"epoch": 0.48392036753445633,
"grad_norm": 5.052922851946215,
"kl": 0.05029296875,
"learning_rate": 5.252470866972668e-07,
"loss": 0.0,
"reward": 3.46875,
"reward_std": 0.20276173949241638,
"rewards/accuracy_reward": 2.2437498569488525,
"rewards/format_reward": 1.0,
"step": 316,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 287.53125,
"epoch": 0.48545176110260335,
"grad_norm": 14.798643962141302,
"kl": 0.044921875,
"learning_rate": 5.228443649630945e-07,
"loss": 0.0,
"reward": 4.143750190734863,
"reward_std": 0.5091822743415833,
"rewards/accuracy_reward": 2.84375,
"rewards/format_reward": 1.0,
"step": 317,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 286.1875,
"epoch": 0.4869831546707504,
"grad_norm": 3.342287669075317,
"kl": 0.04541015625,
"learning_rate": 5.204411144773944e-07,
"loss": 0.0,
"reward": 3.6812500953674316,
"reward_std": 0.3361450731754303,
"rewards/accuracy_reward": 2.3812496662139893,
"rewards/format_reward": 1.0,
"step": 318,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 295.25,
"epoch": 0.4885145482388974,
"grad_norm": 4.492148845651719,
"kl": 0.05078125,
"learning_rate": 5.180373908653667e-07,
"loss": 0.0001,
"reward": 3.2937498092651367,
"reward_std": 0.43178924918174744,
"rewards/accuracy_reward": 1.993749976158142,
"rewards/format_reward": 1.0,
"step": 319,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 278.25,
"epoch": 0.4900459418070444,
"grad_norm": 13.039966488350624,
"kl": 0.04931640625,
"learning_rate": 5.156332497631621e-07,
"loss": 0.0,
"reward": 3.456249952316284,
"reward_std": 0.38298481702804565,
"rewards/accuracy_reward": 2.15625,
"rewards/format_reward": 1.0,
"step": 320,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 279.875,
"epoch": 0.49157733537519144,
"grad_norm": 12.642622696044361,
"kl": 0.055419921875,
"learning_rate": 5.13228746816594e-07,
"loss": 0.0001,
"reward": 3.887500286102295,
"reward_std": 0.4415132403373718,
"rewards/accuracy_reward": 2.5874998569488525,
"rewards/format_reward": 1.0,
"step": 321,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 273.46875,
"epoch": 0.49310872894333846,
"grad_norm": 4.286766253570016,
"kl": 0.053466796875,
"learning_rate": 5.10823937679852e-07,
"loss": 0.0001,
"reward": 4.28125,
"reward_std": 0.3476155996322632,
"rewards/accuracy_reward": 3.0562498569488525,
"rewards/format_reward": 1.0,
"step": 322,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 331.03125,
"epoch": 0.4946401225114854,
"grad_norm": 5.229987914897258,
"kl": 0.05224609375,
"learning_rate": 5.084188780142118e-07,
"loss": 0.0001,
"reward": 3.9250001907348633,
"reward_std": 0.6279107928276062,
"rewards/accuracy_reward": 2.6999998092651367,
"rewards/format_reward": 1.0,
"step": 323,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 316.9375,
"epoch": 0.49617151607963245,
"grad_norm": 28.49902177125843,
"kl": 0.05517578125,
"learning_rate": 5.060136234867484e-07,
"loss": 0.0001,
"reward": 3.2312498092651367,
"reward_std": 0.4152737259864807,
"rewards/accuracy_reward": 1.9312498569488525,
"rewards/format_reward": 1.0,
"step": 324,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 289.59375,
"epoch": 0.49770290964777947,
"grad_norm": 7.1439008267200235,
"kl": 0.051025390625,
"learning_rate": 5.036082297690464e-07,
"loss": 0.0001,
"reward": 3.6312499046325684,
"reward_std": 0.3471168279647827,
"rewards/accuracy_reward": 2.331249952316284,
"rewards/format_reward": 1.0,
"step": 325,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 273.46875,
"epoch": 0.4992343032159265,
"grad_norm": 11.072867676579405,
"kl": 0.046875,
"learning_rate": 5.012027525359129e-07,
"loss": 0.0,
"reward": 3.5,
"reward_std": 0.30619415640830994,
"rewards/accuracy_reward": 2.200000047683716,
"rewards/format_reward": 1.0,
"step": 326,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 281.125,
"epoch": 0.5007656967840735,
"grad_norm": 21.889693980675588,
"kl": 0.054443359375,
"learning_rate": 4.987972474640873e-07,
"loss": 0.0001,
"reward": 4.412499904632568,
"reward_std": 0.32305750250816345,
"rewards/accuracy_reward": 3.112499713897705,
"rewards/format_reward": 1.0,
"step": 327,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 337.53125,
"epoch": 0.5022970903522205,
"grad_norm": 3.952527777748008,
"kl": 0.045654296875,
"learning_rate": 4.963917702309536e-07,
"loss": 0.0,
"reward": 3.8562498092651367,
"reward_std": 0.39377397298812866,
"rewards/accuracy_reward": 2.5562500953674316,
"rewards/format_reward": 1.0,
"step": 328,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 341.21875,
"epoch": 0.5038284839203675,
"grad_norm": 5.346886118235814,
"kl": 0.047607421875,
"learning_rate": 4.939863765132519e-07,
"loss": 0.0,
"reward": 2.9749999046325684,
"reward_std": 0.42998963594436646,
"rewards/accuracy_reward": 1.75,
"rewards/format_reward": 1.0,
"step": 329,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 276.375,
"epoch": 0.5053598774885145,
"grad_norm": 6.679344822369119,
"kl": 0.049072265625,
"learning_rate": 4.915811219857882e-07,
"loss": 0.0,
"reward": 3.7437500953674316,
"reward_std": 0.5443640947341919,
"rewards/accuracy_reward": 2.4437499046325684,
"rewards/format_reward": 1.0,
"step": 330,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 300.8125,
"epoch": 0.5068912710566615,
"grad_norm": 34.57022446692435,
"kl": 0.047119140625,
"learning_rate": 4.891760623201481e-07,
"loss": 0.0,
"reward": 3.65625,
"reward_std": 0.5190573930740356,
"rewards/accuracy_reward": 2.4312498569488525,
"rewards/format_reward": 1.0,
"step": 331,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 330.8125,
"epoch": 0.5084226646248086,
"grad_norm": 8.057551366527573,
"kl": 0.0458984375,
"learning_rate": 4.86771253183406e-07,
"loss": 0.0,
"reward": 3.9812498092651367,
"reward_std": 0.4452638328075409,
"rewards/accuracy_reward": 2.6812498569488525,
"rewards/format_reward": 1.0,
"step": 332,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 277.6875,
"epoch": 0.5099540581929556,
"grad_norm": 26.453879255218297,
"kl": 0.0546875,
"learning_rate": 4.84366750236838e-07,
"loss": 0.0001,
"reward": 3.8249998092651367,
"reward_std": 0.46865373849868774,
"rewards/accuracy_reward": 2.5250000953674316,
"rewards/format_reward": 1.0,
"step": 333,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 310.875,
"epoch": 0.5114854517611026,
"grad_norm": 4.902733023855285,
"kl": 0.054443359375,
"learning_rate": 4.819626091346333e-07,
"loss": 0.0001,
"reward": 3.0437498092651367,
"reward_std": 0.2876538038253784,
"rewards/accuracy_reward": 1.743749976158142,
"rewards/format_reward": 1.0,
"step": 334,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 290.28125,
"epoch": 0.5130168453292496,
"grad_norm": 58.51949600354415,
"kl": 0.046142578125,
"learning_rate": 4.795588855226055e-07,
"loss": 0.0,
"reward": 3.5437498092651367,
"reward_std": 0.41970500349998474,
"rewards/accuracy_reward": 2.2437500953674316,
"rewards/format_reward": 1.0,
"step": 335,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 267.0625,
"epoch": 0.5145482388973966,
"grad_norm": 5.658384395157894,
"kl": 0.05126953125,
"learning_rate": 4.771556350369056e-07,
"loss": 0.0001,
"reward": 4.050000190734863,
"reward_std": 0.347625732421875,
"rewards/accuracy_reward": 2.825000047683716,
"rewards/format_reward": 1.0,
"step": 336,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 305.5625,
"epoch": 0.5160796324655437,
"grad_norm": 10.144687298640607,
"kl": 0.04638671875,
"learning_rate": 4.7475291330273314e-07,
"loss": 0.0,
"reward": 4.212500095367432,
"reward_std": 0.3171128034591675,
"rewards/accuracy_reward": 2.9124999046325684,
"rewards/format_reward": 1.0,
"step": 337,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 308.96875,
"epoch": 0.5176110260336907,
"grad_norm": 6.512792003516793,
"kl": 0.050048828125,
"learning_rate": 4.7235077593304954e-07,
"loss": 0.0001,
"reward": 3.1687498092651367,
"reward_std": 0.5290573835372925,
"rewards/accuracy_reward": 2.018749952316284,
"rewards/format_reward": 1.0,
"step": 338,
"temporal_rewards": 0.5
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 273.3125,
"epoch": 0.5191424196018377,
"grad_norm": 5.588824053582611,
"kl": 0.04931640625,
"learning_rate": 4.6994927852729085e-07,
"loss": 0.0,
"reward": 3.418750047683716,
"reward_std": 0.39333152770996094,
"rewards/accuracy_reward": 2.1937499046325684,
"rewards/format_reward": 1.0,
"step": 339,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 312.875,
"epoch": 0.5206738131699847,
"grad_norm": 15.184768822679787,
"kl": 0.044189453125,
"learning_rate": 4.6754847667008004e-07,
"loss": 0.0,
"reward": 4.112500190734863,
"reward_std": 0.5456335544586182,
"rewards/accuracy_reward": 2.8125,
"rewards/format_reward": 1.0,
"step": 340,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 348.65625,
"epoch": 0.5222052067381318,
"grad_norm": 6.794546630661044,
"kl": 0.046630859375,
"learning_rate": 4.6514842592994176e-07,
"loss": 0.0,
"reward": 3.862499952316284,
"reward_std": 0.4044283926486969,
"rewards/accuracy_reward": 2.5625,
"rewards/format_reward": 1.0,
"step": 341,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 268.4375,
"epoch": 0.5237366003062787,
"grad_norm": 7.093920182211075,
"kl": 0.0546875,
"learning_rate": 4.627491818580149e-07,
"loss": 0.0001,
"reward": 4.349999904632568,
"reward_std": 0.43522968888282776,
"rewards/accuracy_reward": 3.0500001907348633,
"rewards/format_reward": 1.0,
"step": 342,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 317.21875,
"epoch": 0.5252679938744257,
"grad_norm": 7.521140980470885,
"kl": 0.052490234375,
"learning_rate": 4.6035079998676755e-07,
"loss": 0.0001,
"reward": 3.8499999046325684,
"reward_std": 0.3446093201637268,
"rewards/accuracy_reward": 2.549999952316284,
"rewards/format_reward": 1.0,
"step": 343,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 300.375,
"epoch": 0.5267993874425727,
"grad_norm": 48.83516267703524,
"kl": 0.05517578125,
"learning_rate": 4.5795333582871133e-07,
"loss": 0.0001,
"reward": 3.549999952316284,
"reward_std": 0.5778024196624756,
"rewards/accuracy_reward": 2.25,
"rewards/format_reward": 1.0,
"step": 344,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 268.28125,
"epoch": 0.5283307810107197,
"grad_norm": 5.5659331458819015,
"kl": 0.05859375,
"learning_rate": 4.5555684487511693e-07,
"loss": 0.0001,
"reward": 4.050000190734863,
"reward_std": 0.3617284893989563,
"rewards/accuracy_reward": 2.75,
"rewards/format_reward": 1.0,
"step": 345,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 315.125,
"epoch": 0.5298621745788668,
"grad_norm": 8.836341008055525,
"kl": 0.05810546875,
"learning_rate": 4.5316138259472915e-07,
"loss": 0.0001,
"reward": 4.099999904632568,
"reward_std": 0.3962700366973877,
"rewards/accuracy_reward": 2.799999952316284,
"rewards/format_reward": 1.0,
"step": 346,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 281.40625,
"epoch": 0.5313935681470138,
"grad_norm": 6.7857182701070995,
"kl": 0.060302734375,
"learning_rate": 4.507670044324833e-07,
"loss": 0.0001,
"reward": 4.0625,
"reward_std": 0.5822941064834595,
"rewards/accuracy_reward": 2.762500047683716,
"rewards/format_reward": 1.0,
"step": 347,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 272.65625,
"epoch": 0.5329249617151608,
"grad_norm": 21.70038600264373,
"kl": 0.052001953125,
"learning_rate": 4.483737658082223e-07,
"loss": 0.0001,
"reward": 3.5875000953674316,
"reward_std": 0.5105255842208862,
"rewards/accuracy_reward": 2.362499952316284,
"rewards/format_reward": 1.0,
"step": 348,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 309.6875,
"epoch": 0.5344563552833078,
"grad_norm": 8.03758245428532,
"kl": 0.0537109375,
"learning_rate": 4.459817221154129e-07,
"loss": 0.0001,
"reward": 3.5187501907348633,
"reward_std": 0.49836862087249756,
"rewards/accuracy_reward": 2.21875,
"rewards/format_reward": 1.0,
"step": 349,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 294.71875,
"epoch": 0.5359877488514548,
"grad_norm": 26.176585081247165,
"kl": 0.05419921875,
"learning_rate": 4.435909287198646e-07,
"loss": 0.0001,
"reward": 3.3312501907348633,
"reward_std": 0.25713980197906494,
"rewards/accuracy_reward": 2.1812500953674316,
"rewards/format_reward": 1.0,
"step": 350,
"temporal_rewards": 0.5
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 363.59375,
"epoch": 0.5375191424196019,
"grad_norm": 4.189754049368454,
"kl": 0.050048828125,
"learning_rate": 4.4120144095844773e-07,
"loss": 0.0,
"reward": 3.03125,
"reward_std": 0.27744585275650024,
"rewards/accuracy_reward": 1.806249976158142,
"rewards/format_reward": 1.0,
"step": 351,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 267.625,
"epoch": 0.5390505359877489,
"grad_norm": 5.923836672176543,
"kl": 0.0546875,
"learning_rate": 4.3881331413781247e-07,
"loss": 0.0001,
"reward": 3.987499713897705,
"reward_std": 0.5389834046363831,
"rewards/accuracy_reward": 2.6875,
"rewards/format_reward": 1.0,
"step": 352,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 267.5625,
"epoch": 0.5405819295558959,
"grad_norm": 24.676473640178664,
"kl": 0.05615234375,
"learning_rate": 4.364266035331091e-07,
"loss": 0.0001,
"reward": 4.131249904632568,
"reward_std": 0.5211171507835388,
"rewards/accuracy_reward": 2.90625,
"rewards/format_reward": 1.0,
"step": 353,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 286.75,
"epoch": 0.5421133231240429,
"grad_norm": 5.219576885609563,
"kl": 0.05859375,
"learning_rate": 4.340413643867083e-07,
"loss": 0.0001,
"reward": 3.3687500953674316,
"reward_std": 0.5183711647987366,
"rewards/accuracy_reward": 2.1437501907348633,
"rewards/format_reward": 1.0,
"step": 354,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 293.78125,
"epoch": 0.5436447166921899,
"grad_norm": 8.935709493601047,
"kl": 0.0498046875,
"learning_rate": 4.316576519069226e-07,
"loss": 0.0,
"reward": 3.8562498092651367,
"reward_std": 0.37087583541870117,
"rewards/accuracy_reward": 2.5562498569488525,
"rewards/format_reward": 1.0,
"step": 355,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 330.53125,
"epoch": 0.5451761102603369,
"grad_norm": 8.829545099860825,
"kl": 0.05810546875,
"learning_rate": 4.2927552126672887e-07,
"loss": 0.0001,
"reward": 3.6812498569488525,
"reward_std": 0.4220343232154846,
"rewards/accuracy_reward": 2.3812499046325684,
"rewards/format_reward": 1.0,
"step": 356,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 278.65625,
"epoch": 0.5467075038284839,
"grad_norm": 6.219717164837179,
"kl": 0.057861328125,
"learning_rate": 4.2689502760249057e-07,
"loss": 0.0001,
"reward": 3.418750047683716,
"reward_std": 0.44134071469306946,
"rewards/accuracy_reward": 2.1187500953674316,
"rewards/format_reward": 1.0,
"step": 357,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 353.75,
"epoch": 0.5482388973966309,
"grad_norm": 5.556766815792866,
"kl": 0.05029296875,
"learning_rate": 4.245162260126823e-07,
"loss": 0.0001,
"reward": 3.625,
"reward_std": 0.4050983190536499,
"rewards/accuracy_reward": 2.3249998092651367,
"rewards/format_reward": 1.0,
"step": 358,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 344.8125,
"epoch": 0.5497702909647779,
"grad_norm": 3.676604375125933,
"kl": 0.05078125,
"learning_rate": 4.2213917155661405e-07,
"loss": 0.0001,
"reward": 2.8499999046325684,
"reward_std": 0.29664111137390137,
"rewards/accuracy_reward": 1.625,
"rewards/format_reward": 1.0,
"step": 359,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 291.8125,
"epoch": 0.5513016845329249,
"grad_norm": 6.86923911826066,
"kl": 0.06005859375,
"learning_rate": 4.197639192531573e-07,
"loss": 0.0001,
"reward": 3.606250047683716,
"reward_std": 0.3868841528892517,
"rewards/accuracy_reward": 2.3062500953674316,
"rewards/format_reward": 1.0,
"step": 360,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 312.5,
"epoch": 0.552833078101072,
"grad_norm": 10.259674346863441,
"kl": 0.056884765625,
"learning_rate": 4.1739052407947075e-07,
"loss": 0.0001,
"reward": 3.1999998092651367,
"reward_std": 0.24718201160430908,
"rewards/accuracy_reward": 1.975000023841858,
"rewards/format_reward": 1.0,
"step": 361,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 322.84375,
"epoch": 0.554364471669219,
"grad_norm": 5.8426688029571485,
"kl": 0.05517578125,
"learning_rate": 4.150190409697288e-07,
"loss": 0.0001,
"reward": 4.425000190734863,
"reward_std": 0.3833528757095337,
"rewards/accuracy_reward": 3.125,
"rewards/format_reward": 1.0,
"step": 362,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 309.9375,
"epoch": 0.555895865237366,
"grad_norm": 5.259636552958595,
"kl": 0.053466796875,
"learning_rate": 4.126495248138492e-07,
"loss": 0.0001,
"reward": 4.081250190734863,
"reward_std": 0.38400453329086304,
"rewards/accuracy_reward": 2.781250238418579,
"rewards/format_reward": 1.0,
"step": 363,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 329.0,
"epoch": 0.557427258805513,
"grad_norm": 6.60110747813211,
"kl": 0.054931640625,
"learning_rate": 4.10282030456223e-07,
"loss": 0.0001,
"reward": 3.062499761581421,
"reward_std": 0.5393983721733093,
"rewards/accuracy_reward": 1.837499976158142,
"rewards/format_reward": 1.0,
"step": 364,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 299.03125,
"epoch": 0.55895865237366,
"grad_norm": 4.253141502103047,
"kl": 0.054931640625,
"learning_rate": 4.079166126944453e-07,
"loss": 0.0001,
"reward": 4.524999618530273,
"reward_std": 0.1969119757413864,
"rewards/accuracy_reward": 3.2249999046325684,
"rewards/format_reward": 1.0,
"step": 365,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 311.625,
"epoch": 0.5604900459418071,
"grad_norm": 6.796068903920509,
"kl": 0.05126953125,
"learning_rate": 4.055533262780464e-07,
"loss": 0.0001,
"reward": 2.768749952316284,
"reward_std": 0.2892647087574005,
"rewards/accuracy_reward": 1.46875,
"rewards/format_reward": 1.0,
"step": 366,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 311.21875,
"epoch": 0.5620214395099541,
"grad_norm": 7.758597041925383,
"kl": 0.0537109375,
"learning_rate": 4.031922259072252e-07,
"loss": 0.0001,
"reward": 3.7750000953674316,
"reward_std": 0.529064953327179,
"rewards/accuracy_reward": 2.4749999046325684,
"rewards/format_reward": 1.0,
"step": 367,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 282.59375,
"epoch": 0.5635528330781011,
"grad_norm": 44.275697473879234,
"kl": 0.06298828125,
"learning_rate": 4.0083336623158236e-07,
"loss": 0.0001,
"reward": 4.168749809265137,
"reward_std": 0.43980512022972107,
"rewards/accuracy_reward": 2.9437499046325684,
"rewards/format_reward": 1.0,
"step": 368,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 308.46875,
"epoch": 0.5650842266462481,
"grad_norm": 9.086010791037292,
"kl": 0.059326171875,
"learning_rate": 3.9847680184885613e-07,
"loss": 0.0001,
"reward": 4.131250381469727,
"reward_std": 0.32010617852211,
"rewards/accuracy_reward": 2.831249952316284,
"rewards/format_reward": 1.0,
"step": 369,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 308.0,
"epoch": 0.5666156202143952,
"grad_norm": 11.913129345112655,
"kl": 0.04638671875,
"learning_rate": 3.9612258730365823e-07,
"loss": 0.0,
"reward": 3.6187498569488525,
"reward_std": 0.41490620374679565,
"rewards/accuracy_reward": 2.3187499046325684,
"rewards/format_reward": 1.0,
"step": 370,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 337.5625,
"epoch": 0.5681470137825421,
"grad_norm": 6.288735778589191,
"kl": 0.0546875,
"learning_rate": 3.9377077708621167e-07,
"loss": 0.0001,
"reward": 3.6999998092651367,
"reward_std": 0.41836118698120117,
"rewards/accuracy_reward": 2.3999998569488525,
"rewards/format_reward": 1.0,
"step": 371,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 314.375,
"epoch": 0.5696784073506891,
"grad_norm": 9.996390173583332,
"kl": 0.05712890625,
"learning_rate": 3.914214256310887e-07,
"loss": 0.0001,
"reward": 3.3249998092651367,
"reward_std": 0.7744901180267334,
"rewards/accuracy_reward": 2.1000001430511475,
"rewards/format_reward": 1.0,
"step": 372,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 323.28125,
"epoch": 0.5712098009188361,
"grad_norm": 5.431898052655735,
"kl": 0.060791015625,
"learning_rate": 3.8907458731595223e-07,
"loss": 0.0001,
"reward": 3.200000047683716,
"reward_std": 0.3381873369216919,
"rewards/accuracy_reward": 1.899999976158142,
"rewards/format_reward": 1.0,
"step": 373,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 271.53125,
"epoch": 0.5727411944869831,
"grad_norm": 3.868776732403,
"kl": 0.0634765625,
"learning_rate": 3.867303164602961e-07,
"loss": 0.0001,
"reward": 3.875,
"reward_std": 0.3560502529144287,
"rewards/accuracy_reward": 2.6499998569488525,
"rewards/format_reward": 1.0,
"step": 374,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 298.96875,
"epoch": 0.5742725880551302,
"grad_norm": 6.940948762708857,
"kl": 0.0615234375,
"learning_rate": 3.843886673241883e-07,
"loss": 0.0001,
"reward": 3.7124998569488525,
"reward_std": 0.4306986927986145,
"rewards/accuracy_reward": 2.4124999046325684,
"rewards/format_reward": 1.0,
"step": 375,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 316.3125,
"epoch": 0.5758039816232772,
"grad_norm": 53.90165318932112,
"kl": 0.0615234375,
"learning_rate": 3.8204969410701505e-07,
"loss": 0.0001,
"reward": 4.125,
"reward_std": 0.49573537707328796,
"rewards/accuracy_reward": 2.8999998569488525,
"rewards/format_reward": 1.0,
"step": 376,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 326.34375,
"epoch": 0.5773353751914242,
"grad_norm": 4.124093408449859,
"kl": 0.051513671875,
"learning_rate": 3.797134509462261e-07,
"loss": 0.0001,
"reward": 3.7750000953674316,
"reward_std": 0.30634891986846924,
"rewards/accuracy_reward": 2.4749999046325684,
"rewards/format_reward": 1.0,
"step": 377,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 313.03125,
"epoch": 0.5788667687595712,
"grad_norm": 12.732241959905021,
"kl": 0.05859375,
"learning_rate": 3.773799919160817e-07,
"loss": 0.0001,
"reward": 3.5,
"reward_std": 0.43782657384872437,
"rewards/accuracy_reward": 2.200000047683716,
"rewards/format_reward": 1.0,
"step": 378,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 339.40625,
"epoch": 0.5803981623277182,
"grad_norm": 30.76472108606479,
"kl": 0.055419921875,
"learning_rate": 3.750493710264016e-07,
"loss": 0.0001,
"reward": 3.081249952316284,
"reward_std": 0.2554660439491272,
"rewards/accuracy_reward": 1.78125,
"rewards/format_reward": 1.0,
"step": 379,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 333.8125,
"epoch": 0.5819295558958653,
"grad_norm": 7.012893283715469,
"kl": 0.052001953125,
"learning_rate": 3.7272164222131387e-07,
"loss": 0.0001,
"reward": 3.731250047683716,
"reward_std": 0.4654560089111328,
"rewards/accuracy_reward": 2.4312500953674316,
"rewards/format_reward": 1.0,
"step": 380,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 331.09375,
"epoch": 0.5834609494640123,
"grad_norm": 7.483991726584263,
"kl": 0.052978515625,
"learning_rate": 3.703968593780074e-07,
"loss": 0.0001,
"reward": 3.937499761581421,
"reward_std": 0.6161357760429382,
"rewards/accuracy_reward": 2.637500047683716,
"rewards/format_reward": 1.0,
"step": 381,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 328.9375,
"epoch": 0.5849923430321593,
"grad_norm": 7.708594456755339,
"kl": 0.04833984375,
"learning_rate": 3.6807507630548394e-07,
"loss": 0.0,
"reward": 3.4187498092651367,
"reward_std": 0.3879605531692505,
"rewards/accuracy_reward": 2.2687501907348633,
"rewards/format_reward": 1.0,
"step": 382,
"temporal_rewards": 0.5
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 338.21875,
"epoch": 0.5865237366003063,
"grad_norm": 7.932311484776607,
"kl": 0.053955078125,
"learning_rate": 3.657563467433134e-07,
"loss": 0.0001,
"reward": 3.518749713897705,
"reward_std": 0.5936441421508789,
"rewards/accuracy_reward": 2.21875,
"rewards/format_reward": 1.0,
"step": 383,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 279.375,
"epoch": 0.5880551301684533,
"grad_norm": 6.986880473373834,
"kl": 0.0673828125,
"learning_rate": 3.6344072436038976e-07,
"loss": 0.0001,
"reward": 5.175000190734863,
"reward_std": 0.2715410590171814,
"rewards/accuracy_reward": 3.874999761581421,
"rewards/format_reward": 1.0,
"step": 384,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 315.75,
"epoch": 0.5895865237366003,
"grad_norm": 8.544800046398805,
"kl": 0.06103515625,
"learning_rate": 3.611282627536887e-07,
"loss": 0.0001,
"reward": 4.256249904632568,
"reward_std": 0.48312920331954956,
"rewards/accuracy_reward": 2.956249952316284,
"rewards/format_reward": 1.0,
"step": 385,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 318.1875,
"epoch": 0.5911179173047473,
"grad_norm": 96.50426243207977,
"kl": 0.06396484375,
"learning_rate": 3.5881901544702673e-07,
"loss": 0.0001,
"reward": 3.856250047683716,
"reward_std": 0.43018585443496704,
"rewards/accuracy_reward": 2.5562498569488525,
"rewards/format_reward": 1.0,
"step": 386,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 284.0,
"epoch": 0.5926493108728943,
"grad_norm": 4.379736134724126,
"kl": 0.0693359375,
"learning_rate": 3.565130358898233e-07,
"loss": 0.0001,
"reward": 3.737499713897705,
"reward_std": 0.45637887716293335,
"rewards/accuracy_reward": 2.5124998092651367,
"rewards/format_reward": 1.0,
"step": 387,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 295.25,
"epoch": 0.5941807044410413,
"grad_norm": 19.658835383842874,
"kl": 0.0634765625,
"learning_rate": 3.54210377455863e-07,
"loss": 0.0001,
"reward": 3.9562501907348633,
"reward_std": 0.4577101171016693,
"rewards/accuracy_reward": 2.65625,
"rewards/format_reward": 1.0,
"step": 388,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 332.34375,
"epoch": 0.5957120980091883,
"grad_norm": 8.319861598093148,
"kl": 0.05908203125,
"learning_rate": 3.519110934420602e-07,
"loss": 0.0001,
"reward": 4.018750190734863,
"reward_std": 0.49576184153556824,
"rewards/accuracy_reward": 2.71875,
"rewards/format_reward": 1.0,
"step": 389,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 343.59375,
"epoch": 0.5972434915773354,
"grad_norm": 8.981467534724874,
"kl": 0.05517578125,
"learning_rate": 3.496152370672255e-07,
"loss": 0.0001,
"reward": 3.637500047683716,
"reward_std": 0.5412981510162354,
"rewards/accuracy_reward": 2.3375000953674316,
"rewards/format_reward": 1.0,
"step": 390,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 294.8125,
"epoch": 0.5987748851454824,
"grad_norm": 4.408603122067736,
"kl": 0.0634765625,
"learning_rate": 3.4732286147083435e-07,
"loss": 0.0001,
"reward": 4.600000381469727,
"reward_std": 0.4068170189857483,
"rewards/accuracy_reward": 3.375,
"rewards/format_reward": 1.0,
"step": 391,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 356.6875,
"epoch": 0.6003062787136294,
"grad_norm": 5.306185280043634,
"kl": 0.051513671875,
"learning_rate": 3.450340197117962e-07,
"loss": 0.0001,
"reward": 3.6500000953674316,
"reward_std": 0.31280529499053955,
"rewards/accuracy_reward": 2.3500001430511475,
"rewards/format_reward": 1.0,
"step": 392,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 318.65625,
"epoch": 0.6018376722817764,
"grad_norm": 19.013497776702604,
"kl": 0.06298828125,
"learning_rate": 3.427487647672274e-07,
"loss": 0.0001,
"reward": 3.7750000953674316,
"reward_std": 0.4099277853965759,
"rewards/accuracy_reward": 2.4749999046325684,
"rewards/format_reward": 1.0,
"step": 393,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 301.21875,
"epoch": 0.6033690658499234,
"grad_norm": 7.498947178310738,
"kl": 0.07080078125,
"learning_rate": 3.4046714953122435e-07,
"loss": 0.0001,
"reward": 3.9312498569488525,
"reward_std": 0.3217979073524475,
"rewards/accuracy_reward": 2.6312499046325684,
"rewards/format_reward": 1.0,
"step": 394,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 277.21875,
"epoch": 0.6049004594180705,
"grad_norm": 5.258483288911297,
"kl": 0.0673828125,
"learning_rate": 3.381892268136392e-07,
"loss": 0.0001,
"reward": 3.950000047683716,
"reward_std": 0.4231208562850952,
"rewards/accuracy_reward": 2.6500000953674316,
"rewards/format_reward": 1.0,
"step": 395,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 283.46875,
"epoch": 0.6064318529862175,
"grad_norm": 11.648109176660908,
"kl": 0.06884765625,
"learning_rate": 3.359150493388583e-07,
"loss": 0.0001,
"reward": 4.356249809265137,
"reward_std": 0.43625977635383606,
"rewards/accuracy_reward": 3.0562498569488525,
"rewards/format_reward": 1.0,
"step": 396,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 322.71875,
"epoch": 0.6079632465543645,
"grad_norm": 4.297923922438784,
"kl": 0.0654296875,
"learning_rate": 3.3364466974458056e-07,
"loss": 0.0001,
"reward": 3.96875,
"reward_std": 0.3672224283218384,
"rewards/accuracy_reward": 2.6687498092651367,
"rewards/format_reward": 1.0,
"step": 397,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 332.8125,
"epoch": 0.6094946401225115,
"grad_norm": 7.58076316133248,
"kl": 0.05615234375,
"learning_rate": 3.313781405806006e-07,
"loss": 0.0001,
"reward": 3.0874998569488525,
"reward_std": 0.44267088174819946,
"rewards/accuracy_reward": 1.787500023841858,
"rewards/format_reward": 1.0,
"step": 398,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 356.5625,
"epoch": 0.6110260336906586,
"grad_norm": 5.811198431695378,
"kl": 0.0595703125,
"learning_rate": 3.291155143075912e-07,
"loss": 0.0001,
"reward": 3.2750000953674316,
"reward_std": 0.3249671161174774,
"rewards/accuracy_reward": 1.9749999046325684,
"rewards/format_reward": 1.0,
"step": 399,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 297.8125,
"epoch": 0.6125574272588055,
"grad_norm": 6.320383107288802,
"kl": 0.0595703125,
"learning_rate": 3.2685684329588956e-07,
"loss": 0.0001,
"reward": 4.068749904632568,
"reward_std": 0.4821315407752991,
"rewards/accuracy_reward": 2.768749952316284,
"rewards/format_reward": 1.0,
"step": 400,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 286.09375,
"epoch": 0.6140888208269525,
"grad_norm": 7.273549949269243,
"kl": 0.06201171875,
"learning_rate": 3.2460217982428513e-07,
"loss": 0.0001,
"reward": 4.587500095367432,
"reward_std": 0.5616779327392578,
"rewards/accuracy_reward": 3.2875001430511475,
"rewards/format_reward": 1.0,
"step": 401,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 327.53125,
"epoch": 0.6156202143950995,
"grad_norm": 5.96611061635415,
"kl": 0.056884765625,
"learning_rate": 3.223515760788098e-07,
"loss": 0.0001,
"reward": 3.7437498569488525,
"reward_std": 0.345234751701355,
"rewards/accuracy_reward": 2.4437499046325684,
"rewards/format_reward": 1.0,
"step": 402,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 302.375,
"epoch": 0.6171516079632465,
"grad_norm": 13.750305659870863,
"kl": 0.061279296875,
"learning_rate": 3.2010508415152946e-07,
"loss": 0.0001,
"reward": 3.6125001907348633,
"reward_std": 0.502585768699646,
"rewards/accuracy_reward": 2.3125,
"rewards/format_reward": 1.0,
"step": 403,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 273.46875,
"epoch": 0.6186830015313936,
"grad_norm": 17.49120172103442,
"kl": 0.06396484375,
"learning_rate": 3.1786275603933886e-07,
"loss": 0.0001,
"reward": 3.46875,
"reward_std": 0.2607581615447998,
"rewards/accuracy_reward": 2.168750047683716,
"rewards/format_reward": 1.0,
"step": 404,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 330.09375,
"epoch": 0.6202143950995406,
"grad_norm": 11.160363536103876,
"kl": 0.068359375,
"learning_rate": 3.1562464364275774e-07,
"loss": 0.0001,
"reward": 4.081250190734863,
"reward_std": 0.49857833981513977,
"rewards/accuracy_reward": 2.78125,
"rewards/format_reward": 1.0,
"step": 405,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 320.03125,
"epoch": 0.6217457886676876,
"grad_norm": 16.135854626517443,
"kl": 0.05712890625,
"learning_rate": 3.133907987647295e-07,
"loss": 0.0001,
"reward": 2.843749761581421,
"reward_std": 0.350276917219162,
"rewards/accuracy_reward": 1.5437500476837158,
"rewards/format_reward": 1.0,
"step": 406,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 316.0,
"epoch": 0.6232771822358346,
"grad_norm": 8.558951314586174,
"kl": 0.06103515625,
"learning_rate": 3.1116127310942263e-07,
"loss": 0.0001,
"reward": 3.8687498569488525,
"reward_std": 0.3773455321788788,
"rewards/accuracy_reward": 2.5687499046325684,
"rewards/format_reward": 1.0,
"step": 407,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 295.0,
"epoch": 0.6248085758039816,
"grad_norm": 5.9580071935367345,
"kl": 0.0625,
"learning_rate": 3.089361182810335e-07,
"loss": 0.0001,
"reward": 3.0812501907348633,
"reward_std": 0.28288936614990234,
"rewards/accuracy_reward": 1.78125,
"rewards/format_reward": 1.0,
"step": 408,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 327.6875,
"epoch": 0.6263399693721287,
"grad_norm": 7.083464778039577,
"kl": 0.06787109375,
"learning_rate": 3.0671538578259203e-07,
"loss": 0.0001,
"reward": 3.65625,
"reward_std": 0.4088667929172516,
"rewards/accuracy_reward": 2.3562498092651367,
"rewards/format_reward": 1.0,
"step": 409,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 318.25,
"epoch": 0.6278713629402757,
"grad_norm": 4.587386738325241,
"kl": 0.06494140625,
"learning_rate": 3.044991270147699e-07,
"loss": 0.0001,
"reward": 3.706249952316284,
"reward_std": 0.45332545042037964,
"rewards/accuracy_reward": 2.406249761581421,
"rewards/format_reward": 1.0,
"step": 410,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 356.5625,
"epoch": 0.6294027565084227,
"grad_norm": 4.485073681832358,
"kl": 0.05419921875,
"learning_rate": 3.0228739327469046e-07,
"loss": 0.0001,
"reward": 3.6312499046325684,
"reward_std": 0.539216935634613,
"rewards/accuracy_reward": 2.331249952316284,
"rewards/format_reward": 1.0,
"step": 411,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 287.75,
"epoch": 0.6309341500765697,
"grad_norm": 9.519163207799423,
"kl": 0.059326171875,
"learning_rate": 3.000802357547417e-07,
"loss": 0.0001,
"reward": 3.9312498569488525,
"reward_std": 0.4099936783313751,
"rewards/accuracy_reward": 2.6312499046325684,
"rewards/format_reward": 1.0,
"step": 412,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 295.625,
"epoch": 0.6324655436447167,
"grad_norm": 9.958954384986654,
"kl": 0.07080078125,
"learning_rate": 2.978777055413911e-07,
"loss": 0.0001,
"reward": 3.0687499046325684,
"reward_std": 0.13203126192092896,
"rewards/accuracy_reward": 1.7687499523162842,
"rewards/format_reward": 1.0,
"step": 413,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 291.625,
"epoch": 0.6339969372128637,
"grad_norm": 6.745076028703393,
"kl": 0.06591796875,
"learning_rate": 2.9567985361400376e-07,
"loss": 0.0001,
"reward": 3.793750047683716,
"reward_std": 0.5402119755744934,
"rewards/accuracy_reward": 2.4937500953674316,
"rewards/format_reward": 1.0,
"step": 414,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 346.5,
"epoch": 0.6355283307810107,
"grad_norm": 5.744964983707697,
"kl": 0.0625,
"learning_rate": 2.934867308436613e-07,
"loss": 0.0001,
"reward": 4.387499809265137,
"reward_std": 0.40273937582969666,
"rewards/accuracy_reward": 3.0874998569488525,
"rewards/format_reward": 1.0,
"step": 415,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 316.28125,
"epoch": 0.6370597243491577,
"grad_norm": 40.656643318281574,
"kl": 0.064453125,
"learning_rate": 2.912983879919857e-07,
"loss": 0.0001,
"reward": 3.78125,
"reward_std": 0.4405216574668884,
"rewards/accuracy_reward": 2.481250047683716,
"rewards/format_reward": 1.0,
"step": 416,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 300.3125,
"epoch": 0.6385911179173047,
"grad_norm": 4.359120589772653,
"kl": 0.0595703125,
"learning_rate": 2.891148757099636e-07,
"loss": 0.0001,
"reward": 4.112500190734863,
"reward_std": 0.46361613273620605,
"rewards/accuracy_reward": 2.8125,
"rewards/format_reward": 1.0,
"step": 417,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 293.15625,
"epoch": 0.6401225114854517,
"grad_norm": 13.317110893017306,
"kl": 0.06494140625,
"learning_rate": 2.8693624453677434e-07,
"loss": 0.0001,
"reward": 3.9437499046325684,
"reward_std": 0.5202068090438843,
"rewards/accuracy_reward": 2.643749952316284,
"rewards/format_reward": 1.0,
"step": 418,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 310.03125,
"epoch": 0.6416539050535988,
"grad_norm": 4.914353351589785,
"kl": 0.056884765625,
"learning_rate": 2.847625448986196e-07,
"loss": 0.0001,
"reward": 4.299999713897705,
"reward_std": 0.5695346593856812,
"rewards/accuracy_reward": 2.999999761581421,
"rewards/format_reward": 1.0,
"step": 419,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 335.9375,
"epoch": 0.6431852986217458,
"grad_norm": 11.187489124968776,
"kl": 0.0625,
"learning_rate": 2.825938271075572e-07,
"loss": 0.0001,
"reward": 2.9749999046325684,
"reward_std": 0.6231825947761536,
"rewards/accuracy_reward": 1.75,
"rewards/format_reward": 1.0,
"step": 420,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 305.625,
"epoch": 0.6447166921898928,
"grad_norm": 10.59483786175929,
"kl": 0.064453125,
"learning_rate": 2.804301413603356e-07,
"loss": 0.0001,
"reward": 3.7437500953674316,
"reward_std": 0.4473724365234375,
"rewards/accuracy_reward": 2.4437499046325684,
"rewards/format_reward": 1.0,
"step": 421,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 301.96875,
"epoch": 0.6462480857580398,
"grad_norm": 8.851351539181564,
"kl": 0.078125,
"learning_rate": 2.782715377372326e-07,
"loss": 0.0001,
"reward": 4.03125,
"reward_std": 0.1525237262248993,
"rewards/accuracy_reward": 2.731250047683716,
"rewards/format_reward": 1.0,
"step": 422,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 324.34375,
"epoch": 0.6477794793261868,
"grad_norm": 7.808986838862892,
"kl": 0.058349609375,
"learning_rate": 2.761180662008961e-07,
"loss": 0.0001,
"reward": 3.5562500953674316,
"reward_std": 0.6510157585144043,
"rewards/accuracy_reward": 2.2562499046325684,
"rewards/format_reward": 1.0,
"step": 423,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 329.15625,
"epoch": 0.6493108728943339,
"grad_norm": 7.364051668334138,
"kl": 0.0654296875,
"learning_rate": 2.7396977659518744e-07,
"loss": 0.0001,
"reward": 3.7937498092651367,
"reward_std": 0.37267881631851196,
"rewards/accuracy_reward": 2.4937500953674316,
"rewards/format_reward": 1.0,
"step": 424,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 305.53125,
"epoch": 0.6508422664624809,
"grad_norm": 16.69692399722496,
"kl": 0.0595703125,
"learning_rate": 2.7182671864402856e-07,
"loss": 0.0001,
"reward": 4.074999809265137,
"reward_std": 0.3937183916568756,
"rewards/accuracy_reward": 2.7749998569488525,
"rewards/format_reward": 1.0,
"step": 425,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 301.59375,
"epoch": 0.6523736600306279,
"grad_norm": 5.936387236448735,
"kl": 0.07470703125,
"learning_rate": 2.6968894195024984e-07,
"loss": 0.0001,
"reward": 3.237499713897705,
"reward_std": 0.22437289357185364,
"rewards/accuracy_reward": 1.9375,
"rewards/format_reward": 1.0,
"step": 426,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 310.71875,
"epoch": 0.6539050535987749,
"grad_norm": 6.076679157286225,
"kl": 0.068359375,
"learning_rate": 2.6755649599444287e-07,
"loss": 0.0001,
"reward": 3.950000047683716,
"reward_std": 0.6021788120269775,
"rewards/accuracy_reward": 2.6500000953674316,
"rewards/format_reward": 1.0,
"step": 427,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 277.0625,
"epoch": 0.655436447166922,
"grad_norm": 4.482861061619246,
"kl": 0.060546875,
"learning_rate": 2.654294301338149e-07,
"loss": 0.0001,
"reward": 3.831249952316284,
"reward_std": 0.39099207520484924,
"rewards/accuracy_reward": 2.53125,
"rewards/format_reward": 1.0,
"step": 428,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 271.9375,
"epoch": 0.6569678407350689,
"grad_norm": 4.806567872654773,
"kl": 0.080078125,
"learning_rate": 2.633077936010465e-07,
"loss": 0.0001,
"reward": 2.9499998092651367,
"reward_std": 0.2927432060241699,
"rewards/accuracy_reward": 1.7999999523162842,
"rewards/format_reward": 1.0,
"step": 429,
"temporal_rewards": 0.5
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 341.78125,
"epoch": 0.6584992343032159,
"grad_norm": 4.716096742653952,
"kl": 0.060546875,
"learning_rate": 2.6119163550315194e-07,
"loss": 0.0001,
"reward": 3.1875,
"reward_std": 0.3171377182006836,
"rewards/accuracy_reward": 1.8875000476837158,
"rewards/format_reward": 1.0,
"step": 430,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 327.25,
"epoch": 0.6600306278713629,
"grad_norm": 10.997674271653253,
"kl": 0.060546875,
"learning_rate": 2.590810048203428e-07,
"loss": 0.0001,
"reward": 3.90625,
"reward_std": 0.3396279215812683,
"rewards/accuracy_reward": 2.6812500953674316,
"rewards/format_reward": 1.0,
"step": 431,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 336.75,
"epoch": 0.6615620214395099,
"grad_norm": 6.121337508508913,
"kl": 0.06494140625,
"learning_rate": 2.5697595040489386e-07,
"loss": 0.0001,
"reward": 4.09375,
"reward_std": 0.541084349155426,
"rewards/accuracy_reward": 2.793750047683716,
"rewards/format_reward": 1.0,
"step": 432,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 295.03125,
"epoch": 0.663093415007657,
"grad_norm": 14.90890760110821,
"kl": 0.06689453125,
"learning_rate": 2.5487652098001267e-07,
"loss": 0.0001,
"reward": 3.512500047683716,
"reward_std": 0.3367306590080261,
"rewards/accuracy_reward": 2.2874999046325684,
"rewards/format_reward": 1.0,
"step": 433,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 329.4375,
"epoch": 0.664624808575804,
"grad_norm": 4.949525751331033,
"kl": 0.06787109375,
"learning_rate": 2.5278276513871233e-07,
"loss": 0.0001,
"reward": 3.6937501430511475,
"reward_std": 0.27078691124916077,
"rewards/accuracy_reward": 2.3937501907348633,
"rewards/format_reward": 1.0,
"step": 434,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 269.40625,
"epoch": 0.666156202143951,
"grad_norm": 5.943192955867212,
"kl": 0.0732421875,
"learning_rate": 2.506947313426854e-07,
"loss": 0.0001,
"reward": 4.125,
"reward_std": 0.36376625299453735,
"rewards/accuracy_reward": 2.825000047683716,
"rewards/format_reward": 1.0,
"step": 435,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 269.0,
"epoch": 0.667687595712098,
"grad_norm": 19.813187254646646,
"kl": 0.06591796875,
"learning_rate": 2.486124679211834e-07,
"loss": 0.0001,
"reward": 4.425000190734863,
"reward_std": 0.4399248957633972,
"rewards/accuracy_reward": 3.125,
"rewards/format_reward": 1.0,
"step": 436,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 258.03125,
"epoch": 0.669218989280245,
"grad_norm": 6.357727250158085,
"kl": 0.07080078125,
"learning_rate": 2.465360230698978e-07,
"loss": 0.0001,
"reward": 4.699999809265137,
"reward_std": 0.3344690203666687,
"rewards/accuracy_reward": 3.4000000953674316,
"rewards/format_reward": 1.0,
"step": 437,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 290.46875,
"epoch": 0.6707503828483921,
"grad_norm": 6.171421947220471,
"kl": 0.072265625,
"learning_rate": 2.444654448498442e-07,
"loss": 0.0001,
"reward": 3.6437501907348633,
"reward_std": 0.37767261266708374,
"rewards/accuracy_reward": 2.418750047683716,
"rewards/format_reward": 1.0,
"step": 438,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 322.4375,
"epoch": 0.6722817764165391,
"grad_norm": 7.171340223302997,
"kl": 0.062255859375,
"learning_rate": 2.42400781186251e-07,
"loss": 0.0001,
"reward": 3.6812500953674316,
"reward_std": 0.5633392333984375,
"rewards/accuracy_reward": 2.3812499046325684,
"rewards/format_reward": 1.0,
"step": 439,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 357.0,
"epoch": 0.6738131699846861,
"grad_norm": 9.313462256754683,
"kl": 0.06201171875,
"learning_rate": 2.4034207986744847e-07,
"loss": 0.0001,
"reward": 3.856250047683716,
"reward_std": 0.3225916028022766,
"rewards/accuracy_reward": 2.5562500953674316,
"rewards/format_reward": 1.0,
"step": 440,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 293.28125,
"epoch": 0.6753445635528331,
"grad_norm": 7.9431002089449505,
"kl": 0.06103515625,
"learning_rate": 2.3828938854376408e-07,
"loss": 0.0001,
"reward": 3.9812498092651367,
"reward_std": 0.4964829087257385,
"rewards/accuracy_reward": 2.6812498569488525,
"rewards/format_reward": 1.0,
"step": 441,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 305.375,
"epoch": 0.6768759571209801,
"grad_norm": 4.236738187315109,
"kl": 0.0673828125,
"learning_rate": 2.362427547264187e-07,
"loss": 0.0001,
"reward": 3.762500286102295,
"reward_std": 0.3254941701889038,
"rewards/accuracy_reward": 2.4625000953674316,
"rewards/format_reward": 1.0,
"step": 442,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 340.375,
"epoch": 0.678407350689127,
"grad_norm": 7.778748180413386,
"kl": 0.06689453125,
"learning_rate": 2.3420222578642747e-07,
"loss": 0.0001,
"reward": 2.90625,
"reward_std": 0.2956770658493042,
"rewards/accuracy_reward": 1.6062499284744263,
"rewards/format_reward": 1.0,
"step": 443,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 322.59375,
"epoch": 0.6799387442572741,
"grad_norm": 14.038766966652355,
"kl": 0.072265625,
"learning_rate": 2.321678489535031e-07,
"loss": 0.0001,
"reward": 3.6374998092651367,
"reward_std": 0.5000779628753662,
"rewards/accuracy_reward": 2.3375000953674316,
"rewards/format_reward": 1.0,
"step": 444,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 313.46875,
"epoch": 0.6814701378254211,
"grad_norm": 5.017466913183842,
"kl": 0.068359375,
"learning_rate": 2.301396713149627e-07,
"loss": 0.0001,
"reward": 3.59375,
"reward_std": 0.47141778469085693,
"rewards/accuracy_reward": 2.293750047683716,
"rewards/format_reward": 1.0,
"step": 445,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 367.71875,
"epoch": 0.6830015313935681,
"grad_norm": 12.681507283176876,
"kl": 0.06396484375,
"learning_rate": 2.2811773981463805e-07,
"loss": 0.0001,
"reward": 3.6312501430511475,
"reward_std": 0.4401986598968506,
"rewards/accuracy_reward": 2.331249952316284,
"rewards/format_reward": 1.0,
"step": 446,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 322.84375,
"epoch": 0.6845329249617151,
"grad_norm": 18.330301394432702,
"kl": 0.06396484375,
"learning_rate": 2.2610210125178863e-07,
"loss": 0.0001,
"reward": 3.1875,
"reward_std": 0.43023771047592163,
"rewards/accuracy_reward": 1.8875000476837158,
"rewards/format_reward": 1.0,
"step": 447,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 326.0,
"epoch": 0.6860643185298622,
"grad_norm": 17.894755319302757,
"kl": 0.06689453125,
"learning_rate": 2.2409280228001937e-07,
"loss": 0.0001,
"reward": 3.4375,
"reward_std": 0.44989362359046936,
"rewards/accuracy_reward": 2.2125000953674316,
"rewards/format_reward": 1.0,
"step": 448,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 318.0625,
"epoch": 0.6875957120980092,
"grad_norm": 6.212149538425316,
"kl": 0.0693359375,
"learning_rate": 2.220898894061996e-07,
"loss": 0.0001,
"reward": 3.8812501430511475,
"reward_std": 0.5551595091819763,
"rewards/accuracy_reward": 2.581249952316284,
"rewards/format_reward": 1.0,
"step": 449,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 364.53125,
"epoch": 0.6891271056661562,
"grad_norm": 68.47456626232325,
"kl": 0.06298828125,
"learning_rate": 2.2009340898938738e-07,
"loss": 0.0001,
"reward": 3.5625,
"reward_std": 0.31192710995674133,
"rewards/accuracy_reward": 2.2624998092651367,
"rewards/format_reward": 1.0,
"step": 450,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 351.0625,
"epoch": 0.6906584992343032,
"grad_norm": 10.434349640876311,
"kl": 0.060302734375,
"learning_rate": 2.1810340723975635e-07,
"loss": 0.0001,
"reward": 4.143749713897705,
"reward_std": 0.34711429476737976,
"rewards/accuracy_reward": 2.84375,
"rewards/format_reward": 1.0,
"step": 451,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 274.3125,
"epoch": 0.6921898928024502,
"grad_norm": 4.79448716086322,
"kl": 0.07470703125,
"learning_rate": 2.1611993021752589e-07,
"loss": 0.0001,
"reward": 3.7124998569488525,
"reward_std": 0.3149541914463043,
"rewards/accuracy_reward": 2.4124999046325684,
"rewards/format_reward": 1.0,
"step": 452,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 343.03125,
"epoch": 0.6937212863705973,
"grad_norm": 5.386247307227581,
"kl": 0.0634765625,
"learning_rate": 2.1414302383189524e-07,
"loss": 0.0001,
"reward": 3.9937498569488525,
"reward_std": 0.507817268371582,
"rewards/accuracy_reward": 2.6937499046325684,
"rewards/format_reward": 1.0,
"step": 453,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 350.71875,
"epoch": 0.6952526799387443,
"grad_norm": 4.072342990706237,
"kl": 0.06298828125,
"learning_rate": 2.121727338399814e-07,
"loss": 0.0001,
"reward": 3.081249952316284,
"reward_std": 0.3716287612915039,
"rewards/accuracy_reward": 1.78125,
"rewards/format_reward": 1.0,
"step": 454,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 319.65625,
"epoch": 0.6967840735068913,
"grad_norm": 7.167623199805889,
"kl": 0.0634765625,
"learning_rate": 2.1020910584575891e-07,
"loss": 0.0001,
"reward": 3.1687498092651367,
"reward_std": 0.4107249975204468,
"rewards/accuracy_reward": 1.943750023841858,
"rewards/format_reward": 1.0,
"step": 455,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 307.84375,
"epoch": 0.6983154670750383,
"grad_norm": 9.46426779830963,
"kl": 0.07275390625,
"learning_rate": 2.0825218529900508e-07,
"loss": 0.0001,
"reward": 3.84375,
"reward_std": 0.5541188716888428,
"rewards/accuracy_reward": 2.6187498569488525,
"rewards/format_reward": 1.0,
"step": 456,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 323.375,
"epoch": 0.6998468606431854,
"grad_norm": 7.578977726220703,
"kl": 0.06689453125,
"learning_rate": 2.0630201749424796e-07,
"loss": 0.0001,
"reward": 3.9875001907348633,
"reward_std": 0.6910339593887329,
"rewards/accuracy_reward": 2.6875,
"rewards/format_reward": 1.0,
"step": 457,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 353.0,
"epoch": 0.7013782542113323,
"grad_norm": 5.199030719704364,
"kl": 0.06494140625,
"learning_rate": 2.0435864756971778e-07,
"loss": 0.0001,
"reward": 3.5437498092651367,
"reward_std": 0.589798092842102,
"rewards/accuracy_reward": 2.2437500953674316,
"rewards/format_reward": 1.0,
"step": 458,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 347.25,
"epoch": 0.7029096477794793,
"grad_norm": 22.602000771073744,
"kl": 0.0625,
"learning_rate": 2.0242212050630232e-07,
"loss": 0.0001,
"reward": 3.5437498092651367,
"reward_std": 0.4135865271091461,
"rewards/accuracy_reward": 2.2437498569488525,
"rewards/format_reward": 1.0,
"step": 459,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 322.78125,
"epoch": 0.7044410413476263,
"grad_norm": 9.367843330693155,
"kl": 0.0703125,
"learning_rate": 2.0049248112650563e-07,
"loss": 0.0001,
"reward": 4.237500190734863,
"reward_std": 0.5218789577484131,
"rewards/accuracy_reward": 2.9375,
"rewards/format_reward": 1.0,
"step": 460,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 343.8125,
"epoch": 0.7059724349157733,
"grad_norm": 15.50592945792412,
"kl": 0.058349609375,
"learning_rate": 1.9856977409341086e-07,
"loss": 0.0001,
"reward": 3.5625,
"reward_std": 0.25935813784599304,
"rewards/accuracy_reward": 2.2624998092651367,
"rewards/format_reward": 1.0,
"step": 461,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 305.78125,
"epoch": 0.7075038284839203,
"grad_norm": 11.738727534172867,
"kl": 0.0615234375,
"learning_rate": 1.9665404390964597e-07,
"loss": 0.0001,
"reward": 4.300000190734863,
"reward_std": 0.417613685131073,
"rewards/accuracy_reward": 2.999999761581421,
"rewards/format_reward": 1.0,
"step": 462,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 284.6875,
"epoch": 0.7090352220520674,
"grad_norm": 9.156696648441521,
"kl": 0.07275390625,
"learning_rate": 1.947453349163547e-07,
"loss": 0.0001,
"reward": 4.09375,
"reward_std": 0.3098074197769165,
"rewards/accuracy_reward": 2.7937498092651367,
"rewards/format_reward": 1.0,
"step": 463,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 321.75,
"epoch": 0.7105666156202144,
"grad_norm": 11.321426847522837,
"kl": 0.061767578125,
"learning_rate": 1.9284369129216892e-07,
"loss": 0.0001,
"reward": 3.2124998569488525,
"reward_std": 0.4076748788356781,
"rewards/accuracy_reward": 1.912500023841858,
"rewards/format_reward": 1.0,
"step": 464,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 306.65625,
"epoch": 0.7120980091883614,
"grad_norm": 5.638771275332884,
"kl": 0.06494140625,
"learning_rate": 1.9094915705218711e-07,
"loss": 0.0001,
"reward": 3.3812499046325684,
"reward_std": 0.32593533396720886,
"rewards/accuracy_reward": 2.081249952316284,
"rewards/format_reward": 1.0,
"step": 465,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 314.96875,
"epoch": 0.7136294027565084,
"grad_norm": 14.944883534220684,
"kl": 0.07568359375,
"learning_rate": 1.89061776046955e-07,
"loss": 0.0001,
"reward": 3.9000000953674316,
"reward_std": 0.5437703132629395,
"rewards/accuracy_reward": 2.5999999046325684,
"rewards/format_reward": 1.0,
"step": 466,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 327.71875,
"epoch": 0.7151607963246555,
"grad_norm": 5.386863078488171,
"kl": 0.059326171875,
"learning_rate": 1.8718159196145089e-07,
"loss": 0.0001,
"reward": 3.25,
"reward_std": 0.34646961092948914,
"rewards/accuracy_reward": 2.0250000953674316,
"rewards/format_reward": 1.0,
"step": 467,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 334.9375,
"epoch": 0.7166921898928025,
"grad_norm": 9.443240303349306,
"kl": 0.0693359375,
"learning_rate": 1.853086483140749e-07,
"loss": 0.0001,
"reward": 3.3187499046325684,
"reward_std": 0.3992847502231598,
"rewards/accuracy_reward": 2.018749952316284,
"rewards/format_reward": 1.0,
"step": 468,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 315.9375,
"epoch": 0.7182235834609495,
"grad_norm": 8.370818598421605,
"kl": 0.0732421875,
"learning_rate": 1.8344298845564072e-07,
"loss": 0.0001,
"reward": 3.481250047683716,
"reward_std": 0.41231366991996765,
"rewards/accuracy_reward": 2.2562499046325684,
"rewards/format_reward": 1.0,
"step": 469,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 311.0625,
"epoch": 0.7197549770290965,
"grad_norm": 9.241300531058982,
"kl": 0.06982421875,
"learning_rate": 1.8158465556837304e-07,
"loss": 0.0001,
"reward": 3.331249952316284,
"reward_std": 0.44833511114120483,
"rewards/accuracy_reward": 2.106250047683716,
"rewards/format_reward": 1.0,
"step": 470,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 279.6875,
"epoch": 0.7212863705972435,
"grad_norm": 7.054289319864945,
"kl": 0.07080078125,
"learning_rate": 1.797336926649078e-07,
"loss": 0.0001,
"reward": 4.387499809265137,
"reward_std": 0.4259355068206787,
"rewards/accuracy_reward": 3.0875003337860107,
"rewards/format_reward": 1.0,
"step": 471,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 320.40625,
"epoch": 0.7228177641653905,
"grad_norm": 4.2441745477087895,
"kl": 0.072265625,
"learning_rate": 1.7789014258729657e-07,
"loss": 0.0001,
"reward": 3.875,
"reward_std": 0.535858154296875,
"rewards/accuracy_reward": 2.575000047683716,
"rewards/format_reward": 1.0,
"step": 472,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 310.90625,
"epoch": 0.7243491577335375,
"grad_norm": 19.64611885820091,
"kl": 0.0634765625,
"learning_rate": 1.7605404800601498e-07,
"loss": 0.0001,
"reward": 3.53125,
"reward_std": 0.4168233573436737,
"rewards/accuracy_reward": 2.231250047683716,
"rewards/format_reward": 1.0,
"step": 473,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 347.59375,
"epoch": 0.7258805513016845,
"grad_norm": 6.744904142124907,
"kl": 0.06201171875,
"learning_rate": 1.7422545141897522e-07,
"loss": 0.0001,
"reward": 3.7312498092651367,
"reward_std": 0.4904130697250366,
"rewards/accuracy_reward": 2.4312498569488525,
"rewards/format_reward": 1.0,
"step": 474,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 302.1875,
"epoch": 0.7274119448698315,
"grad_norm": 7.837268220751343,
"kl": 0.0703125,
"learning_rate": 1.7240439515054218e-07,
"loss": 0.0001,
"reward": 3.3312501907348633,
"reward_std": 0.13335174322128296,
"rewards/accuracy_reward": 2.03125,
"rewards/format_reward": 1.0,
"step": 475,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 348.21875,
"epoch": 0.7289433384379785,
"grad_norm": 5.818240284000521,
"kl": 0.062255859375,
"learning_rate": 1.705909213505537e-07,
"loss": 0.0001,
"reward": 3.231250047683716,
"reward_std": 0.3217264711856842,
"rewards/accuracy_reward": 1.9312500953674316,
"rewards/format_reward": 1.0,
"step": 476,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 346.71875,
"epoch": 0.7304747320061256,
"grad_norm": 7.677825777713008,
"kl": 0.06396484375,
"learning_rate": 1.687850719933458e-07,
"loss": 0.0001,
"reward": 3.887500047683716,
"reward_std": 0.5294345617294312,
"rewards/accuracy_reward": 2.5875000953674316,
"rewards/format_reward": 1.0,
"step": 477,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 314.5625,
"epoch": 0.7320061255742726,
"grad_norm": 7.475514037163483,
"kl": 0.0673828125,
"learning_rate": 1.6698688887677993e-07,
"loss": 0.0001,
"reward": 3.518749952316284,
"reward_std": 0.284912109375,
"rewards/accuracy_reward": 2.21875,
"rewards/format_reward": 1.0,
"step": 478,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 362.65625,
"epoch": 0.7335375191424196,
"grad_norm": 12.349104248475921,
"kl": 0.0595703125,
"learning_rate": 1.6519641362127628e-07,
"loss": 0.0001,
"reward": 3.3125,
"reward_std": 0.5435852408409119,
"rewards/accuracy_reward": 2.0124998092651367,
"rewards/format_reward": 1.0,
"step": 479,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 319.25,
"epoch": 0.7350689127105666,
"grad_norm": 4.048801751798312,
"kl": 0.0751953125,
"learning_rate": 1.634136876688504e-07,
"loss": 0.0001,
"reward": 3.543750047683716,
"reward_std": 0.42121684551239014,
"rewards/accuracy_reward": 2.2437500953674316,
"rewards/format_reward": 1.0,
"step": 480,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 321.75,
"epoch": 0.7366003062787136,
"grad_norm": 6.4005433756299785,
"kl": 0.06787109375,
"learning_rate": 1.6163875228215351e-07,
"loss": 0.0001,
"reward": 3.5999999046325684,
"reward_std": 0.42963576316833496,
"rewards/accuracy_reward": 2.3000001907348633,
"rewards/format_reward": 1.0,
"step": 481,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 292.5,
"epoch": 0.7381316998468607,
"grad_norm": 9.877103419548662,
"kl": 0.07421875,
"learning_rate": 1.5987164854351858e-07,
"loss": 0.0001,
"reward": 4.78125,
"reward_std": 0.40715551376342773,
"rewards/accuracy_reward": 3.481250047683716,
"rewards/format_reward": 1.0,
"step": 482,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 302.5,
"epoch": 0.7396630934150077,
"grad_norm": 6.164020418879224,
"kl": 0.068359375,
"learning_rate": 1.5811241735400793e-07,
"loss": 0.0001,
"reward": 3.34375,
"reward_std": 0.49206188321113586,
"rewards/accuracy_reward": 2.1187498569488525,
"rewards/format_reward": 1.0,
"step": 483,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 335.5,
"epoch": 0.7411944869831547,
"grad_norm": 9.137767557498421,
"kl": 0.0634765625,
"learning_rate": 1.5636109943246762e-07,
"loss": 0.0001,
"reward": 4.506250381469727,
"reward_std": 0.4679912328720093,
"rewards/accuracy_reward": 3.206249952316284,
"rewards/format_reward": 1.0,
"step": 484,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 338.28125,
"epoch": 0.7427258805513017,
"grad_norm": 6.2818164268747285,
"kl": 0.06005859375,
"learning_rate": 1.5461773531458455e-07,
"loss": 0.0001,
"reward": 3.362499952316284,
"reward_std": 0.48934221267700195,
"rewards/accuracy_reward": 2.0625,
"rewards/format_reward": 1.0,
"step": 485,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 275.0625,
"epoch": 0.7442572741194488,
"grad_norm": 16.37769015232479,
"kl": 0.078125,
"learning_rate": 1.5288236535194815e-07,
"loss": 0.0001,
"reward": 3.4437499046325684,
"reward_std": 0.33781734108924866,
"rewards/accuracy_reward": 2.143749952316284,
"rewards/format_reward": 1.0,
"step": 486,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 321.6875,
"epoch": 0.7457886676875957,
"grad_norm": 12.209190420784717,
"kl": 0.061767578125,
"learning_rate": 1.5115502971111733e-07,
"loss": 0.0001,
"reward": 3.8812501430511475,
"reward_std": 0.5911651849746704,
"rewards/accuracy_reward": 2.581249952316284,
"rewards/format_reward": 1.0,
"step": 487,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 283.90625,
"epoch": 0.7473200612557427,
"grad_norm": 9.204315985416914,
"kl": 0.06640625,
"learning_rate": 1.4943576837268896e-07,
"loss": 0.0001,
"reward": 4.5,
"reward_std": 0.5566960573196411,
"rewards/accuracy_reward": 3.2749998569488525,
"rewards/format_reward": 1.0,
"step": 488,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 338.875,
"epoch": 0.7488514548238897,
"grad_norm": 6.523657451114991,
"kl": 0.0654296875,
"learning_rate": 1.4772462113037431e-07,
"loss": 0.0001,
"reward": 4.318749904632568,
"reward_std": 0.24353675544261932,
"rewards/accuracy_reward": 3.018749952316284,
"rewards/format_reward": 1.0,
"step": 489,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 278.6875,
"epoch": 0.7503828483920367,
"grad_norm": 5.213619993552357,
"kl": 0.06982421875,
"learning_rate": 1.460216275900769e-07,
"loss": 0.0001,
"reward": 4.1875,
"reward_std": 0.5140166282653809,
"rewards/accuracy_reward": 2.887500047683716,
"rewards/format_reward": 1.0,
"step": 490,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 286.8125,
"epoch": 0.7519142419601837,
"grad_norm": 6.670540126650134,
"kl": 0.0712890625,
"learning_rate": 1.443268271689766e-07,
"loss": 0.0001,
"reward": 3.96875,
"reward_std": 0.3987843990325928,
"rewards/accuracy_reward": 2.6687498092651367,
"rewards/format_reward": 1.0,
"step": 491,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 289.875,
"epoch": 0.7534456355283308,
"grad_norm": 7.026601512632626,
"kl": 0.076171875,
"learning_rate": 1.426402590946163e-07,
"loss": 0.0001,
"reward": 3.625,
"reward_std": 0.35273683071136475,
"rewards/accuracy_reward": 2.325000047683716,
"rewards/format_reward": 1.0,
"step": 492,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 295.5,
"epoch": 0.7549770290964778,
"grad_norm": 7.503226154358219,
"kl": 0.0634765625,
"learning_rate": 1.4096196240399478e-07,
"loss": 0.0001,
"reward": 4.400000095367432,
"reward_std": 0.39059334993362427,
"rewards/accuracy_reward": 3.0999999046325684,
"rewards/format_reward": 1.0,
"step": 493,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 333.71875,
"epoch": 0.7565084226646248,
"grad_norm": 4.980978402620444,
"kl": 0.078125,
"learning_rate": 1.392919759426628e-07,
"loss": 0.0001,
"reward": 3.6062498092651367,
"reward_std": 0.5807459354400635,
"rewards/accuracy_reward": 2.3062500953674316,
"rewards/format_reward": 1.0,
"step": 494,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 352.5625,
"epoch": 0.7580398162327718,
"grad_norm": 7.798753483587729,
"kl": 0.0693359375,
"learning_rate": 1.3763033836382392e-07,
"loss": 0.0001,
"reward": 2.887500047683716,
"reward_std": 0.23938804864883423,
"rewards/accuracy_reward": 1.5875000953674316,
"rewards/format_reward": 1.0,
"step": 495,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 334.9375,
"epoch": 0.7595712098009189,
"grad_norm": 10.083985352107655,
"kl": 0.06591796875,
"learning_rate": 1.3597708812744034e-07,
"loss": 0.0001,
"reward": 3.4312498569488525,
"reward_std": 0.23019403219223022,
"rewards/accuracy_reward": 2.1312499046325684,
"rewards/format_reward": 1.0,
"step": 496,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 325.8125,
"epoch": 0.7611026033690659,
"grad_norm": 9.050885384906376,
"kl": 0.0634765625,
"learning_rate": 1.343322634993421e-07,
"loss": 0.0001,
"reward": 3.887500047683716,
"reward_std": 0.32818514108657837,
"rewards/accuracy_reward": 2.5875000953674316,
"rewards/format_reward": 1.0,
"step": 497,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 389.875,
"epoch": 0.7626339969372129,
"grad_norm": 5.378885028637031,
"kl": 0.06298828125,
"learning_rate": 1.3269590255034163e-07,
"loss": 0.0001,
"reward": 3.2937498092651367,
"reward_std": 0.30960196256637573,
"rewards/accuracy_reward": 1.993749976158142,
"rewards/format_reward": 1.0,
"step": 498,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 309.15625,
"epoch": 0.7641653905053599,
"grad_norm": 22.904067920632766,
"kl": 0.058349609375,
"learning_rate": 1.3106804315535264e-07,
"loss": 0.0001,
"reward": 4.3125,
"reward_std": 0.5876265168190002,
"rewards/accuracy_reward": 3.012500047683716,
"rewards/format_reward": 1.0,
"step": 499,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 340.78125,
"epoch": 0.7656967840735069,
"grad_norm": 9.043834005585135,
"kl": 0.0751953125,
"learning_rate": 1.294487229925132e-07,
"loss": 0.0001,
"reward": 2.7874999046325684,
"reward_std": 0.36112481355667114,
"rewards/accuracy_reward": 1.5625,
"rewards/format_reward": 1.0,
"step": 500,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 359.5,
"epoch": 0.7672281776416539,
"grad_norm": 5.84942052999165,
"kl": 0.076171875,
"learning_rate": 1.278379795423145e-07,
"loss": 0.0001,
"reward": 3.3374998569488525,
"reward_std": 0.6417471766471863,
"rewards/accuracy_reward": 2.0375001430511475,
"rewards/format_reward": 1.0,
"step": 501,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 315.59375,
"epoch": 0.7687595712098009,
"grad_norm": 8.322484842813823,
"kl": 0.0791015625,
"learning_rate": 1.262358500867318e-07,
"loss": 0.0001,
"reward": 3.875,
"reward_std": 0.3749288320541382,
"rewards/accuracy_reward": 2.575000047683716,
"rewards/format_reward": 1.0,
"step": 502,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 279.40625,
"epoch": 0.7702909647779479,
"grad_norm": 6.2770678767323105,
"kl": 0.072265625,
"learning_rate": 1.2464237170836313e-07,
"loss": 0.0001,
"reward": 3.7624998092651367,
"reward_std": 0.2802865505218506,
"rewards/accuracy_reward": 2.4625000953674316,
"rewards/format_reward": 1.0,
"step": 503,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 344.09375,
"epoch": 0.7718223583460949,
"grad_norm": 4.445773768777764,
"kl": 0.0654296875,
"learning_rate": 1.2305758128956973e-07,
"loss": 0.0001,
"reward": 3.5625,
"reward_std": 0.491230309009552,
"rewards/accuracy_reward": 2.2624998092651367,
"rewards/format_reward": 1.0,
"step": 504,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 325.53125,
"epoch": 0.7733537519142419,
"grad_norm": 4.920107986633787,
"kl": 0.0654296875,
"learning_rate": 1.2148151551162345e-07,
"loss": 0.0001,
"reward": 3.7249999046325684,
"reward_std": 0.39738136529922485,
"rewards/accuracy_reward": 2.424999952316284,
"rewards/format_reward": 1.0,
"step": 505,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 331.09375,
"epoch": 0.774885145482389,
"grad_norm": 6.548906578710591,
"kl": 0.064453125,
"learning_rate": 1.1991421085385672e-07,
"loss": 0.0001,
"reward": 4.168749809265137,
"reward_std": 0.5465906262397766,
"rewards/accuracy_reward": 2.8687500953674316,
"rewards/format_reward": 1.0,
"step": 506,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 298.6875,
"epoch": 0.776416539050536,
"grad_norm": 7.141153189502872,
"kl": 0.0673828125,
"learning_rate": 1.1835570359281893e-07,
"loss": 0.0001,
"reward": 3.6812500953674316,
"reward_std": 0.43552249670028687,
"rewards/accuracy_reward": 2.3812499046325684,
"rewards/format_reward": 1.0,
"step": 507,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 315.0625,
"epoch": 0.777947932618683,
"grad_norm": 6.2961568088398705,
"kl": 0.06982421875,
"learning_rate": 1.1680602980143639e-07,
"loss": 0.0001,
"reward": 4.149999618530273,
"reward_std": 0.4107191562652588,
"rewards/accuracy_reward": 2.8500001430511475,
"rewards/format_reward": 1.0,
"step": 508,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 306.5,
"epoch": 0.77947932618683,
"grad_norm": 5.438986938486748,
"kl": 0.0634765625,
"learning_rate": 1.152652253481774e-07,
"loss": 0.0001,
"reward": 3.78125,
"reward_std": 0.5216482877731323,
"rewards/accuracy_reward": 2.5562500953674316,
"rewards/format_reward": 1.0,
"step": 509,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 312.5625,
"epoch": 0.781010719754977,
"grad_norm": 8.038160550760889,
"kl": 0.064453125,
"learning_rate": 1.137333258962227e-07,
"loss": 0.0001,
"reward": 3.5,
"reward_std": 0.43106070160865784,
"rewards/accuracy_reward": 2.1999998092651367,
"rewards/format_reward": 1.0,
"step": 510,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 336.34375,
"epoch": 0.7825421133231241,
"grad_norm": 5.433265521527188,
"kl": 0.0654296875,
"learning_rate": 1.1221036690263885e-07,
"loss": 0.0001,
"reward": 3.9749999046325684,
"reward_std": 0.25459665060043335,
"rewards/accuracy_reward": 2.674999952316284,
"rewards/format_reward": 1.0,
"step": 511,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 340.25,
"epoch": 0.7840735068912711,
"grad_norm": 8.717682371268383,
"kl": 0.064453125,
"learning_rate": 1.1069638361755857e-07,
"loss": 0.0001,
"reward": 3.831249952316284,
"reward_std": 0.4424981474876404,
"rewards/accuracy_reward": 2.53125,
"rewards/format_reward": 1.0,
"step": 512,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 319.125,
"epoch": 0.7856049004594181,
"grad_norm": 14.452852305560832,
"kl": 0.0673828125,
"learning_rate": 1.0919141108336433e-07,
"loss": 0.0001,
"reward": 4.125,
"reward_std": 0.6297014355659485,
"rewards/accuracy_reward": 2.8249998092651367,
"rewards/format_reward": 1.0,
"step": 513,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 318.25,
"epoch": 0.7871362940275651,
"grad_norm": 8.46706644871554,
"kl": 0.07177734375,
"learning_rate": 1.0769548413387719e-07,
"loss": 0.0001,
"reward": 4.143750190734863,
"reward_std": 0.3131358325481415,
"rewards/accuracy_reward": 2.843750238418579,
"rewards/format_reward": 1.0,
"step": 514,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 320.6875,
"epoch": 0.7886676875957122,
"grad_norm": 6.559956586845839,
"kl": 0.06787109375,
"learning_rate": 1.0620863739355135e-07,
"loss": 0.0001,
"reward": 3.7562499046325684,
"reward_std": 0.4320494532585144,
"rewards/accuracy_reward": 2.456249952316284,
"rewards/format_reward": 1.0,
"step": 515,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 311.0,
"epoch": 0.7901990811638591,
"grad_norm": 20.319001292719538,
"kl": 0.0712890625,
"learning_rate": 1.0473090527667166e-07,
"loss": 0.0001,
"reward": 3.5812501907348633,
"reward_std": 0.46948492527008057,
"rewards/accuracy_reward": 2.28125,
"rewards/format_reward": 1.0,
"step": 516,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 351.1875,
"epoch": 0.7917304747320061,
"grad_norm": 11.42060678226176,
"kl": 0.072265625,
"learning_rate": 1.0326232198655738e-07,
"loss": 0.0001,
"reward": 3.5124998092651367,
"reward_std": 0.30364906787872314,
"rewards/accuracy_reward": 2.2124998569488525,
"rewards/format_reward": 1.0,
"step": 517,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 341.53125,
"epoch": 0.7932618683001531,
"grad_norm": 10.373305893594383,
"kl": 0.0625,
"learning_rate": 1.0180292151477099e-07,
"loss": 0.0001,
"reward": 3.6750001907348633,
"reward_std": 0.5693778991699219,
"rewards/accuracy_reward": 2.375,
"rewards/format_reward": 1.0,
"step": 518,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 372.0,
"epoch": 0.7947932618683001,
"grad_norm": 5.20394549436034,
"kl": 0.060302734375,
"learning_rate": 1.0035273764033131e-07,
"loss": 0.0001,
"reward": 3.9124996662139893,
"reward_std": 0.3591833710670471,
"rewards/accuracy_reward": 2.612499952316284,
"rewards/format_reward": 1.0,
"step": 519,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 350.375,
"epoch": 0.7963246554364471,
"grad_norm": 5.9918206526844955,
"kl": 0.064453125,
"learning_rate": 9.891180392893117e-08,
"loss": 0.0001,
"reward": 3.9749999046325684,
"reward_std": 0.3856534957885742,
"rewards/accuracy_reward": 2.674999952316284,
"rewards/format_reward": 1.0,
"step": 520,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 312.625,
"epoch": 0.7978560490045942,
"grad_norm": 3.749658955032504,
"kl": 0.07177734375,
"learning_rate": 9.748015373216078e-08,
"loss": 0.0001,
"reward": 4.337499618530273,
"reward_std": 0.4496949315071106,
"rewards/accuracy_reward": 3.0375001430511475,
"rewards/format_reward": 1.0,
"step": 521,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 353.625,
"epoch": 0.7993874425727412,
"grad_norm": 4.004240398257631,
"kl": 0.0615234375,
"learning_rate": 9.605782018673591e-08,
"loss": 0.0001,
"reward": 3.1937499046325684,
"reward_std": 0.45035600662231445,
"rewards/accuracy_reward": 1.8937500715255737,
"rewards/format_reward": 1.0,
"step": 522,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 340.8125,
"epoch": 0.8009188361408882,
"grad_norm": 4.644865902306151,
"kl": 0.068359375,
"learning_rate": 9.464483621373076e-08,
"loss": 0.0001,
"reward": 3.6187500953674316,
"reward_std": 0.6933550834655762,
"rewards/accuracy_reward": 2.3937501907348633,
"rewards/format_reward": 1.0,
"step": 523,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 316.75,
"epoch": 0.8024502297090352,
"grad_norm": 6.067119684325836,
"kl": 0.06640625,
"learning_rate": 9.324123451781618e-08,
"loss": 0.0001,
"reward": 3.78125,
"reward_std": 0.3978561460971832,
"rewards/accuracy_reward": 2.4812498092651367,
"rewards/format_reward": 1.0,
"step": 524,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 351.84375,
"epoch": 0.8039816232771823,
"grad_norm": 6.624482449249475,
"kl": 0.064453125,
"learning_rate": 9.184704758650241e-08,
"loss": 0.0001,
"reward": 3.606250047683716,
"reward_std": 0.8364351987838745,
"rewards/accuracy_reward": 2.3062500953674316,
"rewards/format_reward": 1.0,
"step": 525,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 303.375,
"epoch": 0.8055130168453293,
"grad_norm": 6.280530978489687,
"kl": 0.0830078125,
"learning_rate": 9.046230768938718e-08,
"loss": 0.0001,
"reward": 4.34375,
"reward_std": 0.5486670136451721,
"rewards/accuracy_reward": 3.043750047683716,
"rewards/format_reward": 1.0,
"step": 526,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 320.90625,
"epoch": 0.8070444104134763,
"grad_norm": 10.186624617343176,
"kl": 0.07763671875,
"learning_rate": 8.908704687740898e-08,
"loss": 0.0001,
"reward": 3.606250047683716,
"reward_std": 0.20874956250190735,
"rewards/accuracy_reward": 2.3062498569488525,
"rewards/format_reward": 1.0,
"step": 527,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 320.59375,
"epoch": 0.8085758039816233,
"grad_norm": 7.133477662889388,
"kl": 0.072265625,
"learning_rate": 8.772129698210495e-08,
"loss": 0.0001,
"reward": 3.5687499046325684,
"reward_std": 0.3917831778526306,
"rewards/accuracy_reward": 2.268749952316284,
"rewards/format_reward": 1.0,
"step": 528,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 299.6875,
"epoch": 0.8101071975497703,
"grad_norm": 5.393522043538054,
"kl": 0.07568359375,
"learning_rate": 8.636508961487471e-08,
"loss": 0.0001,
"reward": 4.518750190734863,
"reward_std": 0.48873287439346313,
"rewards/accuracy_reward": 3.21875,
"rewards/format_reward": 1.0,
"step": 529,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 345.46875,
"epoch": 0.8116385911179173,
"grad_norm": 6.620096667191019,
"kl": 0.064453125,
"learning_rate": 8.501845616624798e-08,
"loss": 0.0001,
"reward": 3.6937501430511475,
"reward_std": 0.3262782394886017,
"rewards/accuracy_reward": 2.393749952316284,
"rewards/format_reward": 1.0,
"step": 530,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 297.40625,
"epoch": 0.8131699846860643,
"grad_norm": 8.442037927760953,
"kl": 0.06982421875,
"learning_rate": 8.368142780515796e-08,
"loss": 0.0001,
"reward": 4.737500190734863,
"reward_std": 0.48170897364616394,
"rewards/accuracy_reward": 3.437499761581421,
"rewards/format_reward": 1.0,
"step": 531,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 340.4375,
"epoch": 0.8147013782542113,
"grad_norm": 9.915935858659468,
"kl": 0.06689453125,
"learning_rate": 8.235403547822062e-08,
"loss": 0.0001,
"reward": 3.0187501907348633,
"reward_std": 0.5618056058883667,
"rewards/accuracy_reward": 1.7937500476837158,
"rewards/format_reward": 1.0,
"step": 532,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 312.84375,
"epoch": 0.8162327718223583,
"grad_norm": 5.261205124298314,
"kl": 0.07421875,
"learning_rate": 8.103630990901827e-08,
"loss": 0.0001,
"reward": 3.0374999046325684,
"reward_std": 0.3254516124725342,
"rewards/accuracy_reward": 1.7375001907348633,
"rewards/format_reward": 1.0,
"step": 533,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 324.5625,
"epoch": 0.8177641653905053,
"grad_norm": 8.964171987309859,
"kl": 0.07421875,
"learning_rate": 7.972828159738765e-08,
"loss": 0.0001,
"reward": 3.4124999046325684,
"reward_std": 0.4482683539390564,
"rewards/accuracy_reward": 2.1125001907348633,
"rewards/format_reward": 1.0,
"step": 534,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 367.5625,
"epoch": 0.8192955589586524,
"grad_norm": 9.20349897028433,
"kl": 0.06982421875,
"learning_rate": 7.842998081871493e-08,
"loss": 0.0001,
"reward": 3.0625,
"reward_std": 0.35382628440856934,
"rewards/accuracy_reward": 1.7625000476837158,
"rewards/format_reward": 1.0,
"step": 535,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 368.09375,
"epoch": 0.8208269525267994,
"grad_norm": 7.456768649523639,
"kl": 0.06201171875,
"learning_rate": 7.714143762323433e-08,
"loss": 0.0001,
"reward": 4.1875,
"reward_std": 0.39741051197052,
"rewards/accuracy_reward": 2.8874998092651367,
"rewards/format_reward": 1.0,
"step": 536,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 340.03125,
"epoch": 0.8223583460949464,
"grad_norm": 5.3780765218567845,
"kl": 0.064453125,
"learning_rate": 7.58626818353329e-08,
"loss": 0.0001,
"reward": 3.8187499046325684,
"reward_std": 0.39980944991111755,
"rewards/accuracy_reward": 2.518749952316284,
"rewards/format_reward": 1.0,
"step": 537,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 382.625,
"epoch": 0.8238897396630934,
"grad_norm": 4.210494443133476,
"kl": 0.064453125,
"learning_rate": 7.459374305286009e-08,
"loss": 0.0001,
"reward": 4.081250190734863,
"reward_std": 0.30467599630355835,
"rewards/accuracy_reward": 2.78125,
"rewards/format_reward": 1.0,
"step": 538,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 382.3125,
"epoch": 0.8254211332312404,
"grad_norm": 6.8239106890424885,
"kl": 0.0634765625,
"learning_rate": 7.333465064644301e-08,
"loss": 0.0001,
"reward": 3.237499952316284,
"reward_std": 0.43901118636131287,
"rewards/accuracy_reward": 1.9375,
"rewards/format_reward": 1.0,
"step": 539,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 300.5625,
"epoch": 0.8269525267993875,
"grad_norm": 8.104808281105164,
"kl": 0.07763671875,
"learning_rate": 7.208543375880594e-08,
"loss": 0.0001,
"reward": 3.2437500953674316,
"reward_std": 0.44948697090148926,
"rewards/accuracy_reward": 1.9437499046325684,
"rewards/format_reward": 1.0,
"step": 540,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 361.0625,
"epoch": 0.8284839203675345,
"grad_norm": 6.257706588887124,
"kl": 0.06591796875,
"learning_rate": 7.084612130409634e-08,
"loss": 0.0001,
"reward": 3.9125001430511475,
"reward_std": 0.39685332775115967,
"rewards/accuracy_reward": 2.612499952316284,
"rewards/format_reward": 1.0,
"step": 541,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 333.9375,
"epoch": 0.8300153139356815,
"grad_norm": 7.447523308777689,
"kl": 0.0654296875,
"learning_rate": 6.961674196721556e-08,
"loss": 0.0001,
"reward": 3.674999713897705,
"reward_std": 0.4210602045059204,
"rewards/accuracy_reward": 2.375,
"rewards/format_reward": 1.0,
"step": 542,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 349.21875,
"epoch": 0.8315467075038285,
"grad_norm": 5.213659250225261,
"kl": 0.06982421875,
"learning_rate": 6.839732420315458e-08,
"loss": 0.0001,
"reward": 3.387500047683716,
"reward_std": 0.20724307000637054,
"rewards/accuracy_reward": 2.0875000953674316,
"rewards/format_reward": 1.0,
"step": 543,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 311.53125,
"epoch": 0.8330781010719756,
"grad_norm": 6.187298433070561,
"kl": 0.07373046875,
"learning_rate": 6.718789623633597e-08,
"loss": 0.0001,
"reward": 4.181249618530273,
"reward_std": 0.3611743152141571,
"rewards/accuracy_reward": 2.8812499046325684,
"rewards/format_reward": 1.0,
"step": 544,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 350.90625,
"epoch": 0.8346094946401225,
"grad_norm": 6.411269307309986,
"kl": 0.06884765625,
"learning_rate": 6.598848605996004e-08,
"loss": 0.0001,
"reward": 3.737499952316284,
"reward_std": 0.5504343509674072,
"rewards/accuracy_reward": 2.512500047683716,
"rewards/format_reward": 1.0,
"step": 545,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 388.3125,
"epoch": 0.8361408882082695,
"grad_norm": 4.73243763405586,
"kl": 0.0654296875,
"learning_rate": 6.479912143535699e-08,
"loss": 0.0001,
"reward": 3.0999999046325684,
"reward_std": 0.25826239585876465,
"rewards/accuracy_reward": 1.875,
"rewards/format_reward": 1.0,
"step": 546,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 280.78125,
"epoch": 0.8376722817764165,
"grad_norm": 5.847861898680038,
"kl": 0.08203125,
"learning_rate": 6.361982989134468e-08,
"loss": 0.0001,
"reward": 3.8187501430511475,
"reward_std": 0.48482370376586914,
"rewards/accuracy_reward": 2.5187501907348633,
"rewards/format_reward": 1.0,
"step": 547,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 339.15625,
"epoch": 0.8392036753445635,
"grad_norm": 8.739221622928392,
"kl": 0.0732421875,
"learning_rate": 6.245063872359141e-08,
"loss": 0.0001,
"reward": 4.131249904632568,
"reward_std": 0.402154803276062,
"rewards/accuracy_reward": 2.831249952316284,
"rewards/format_reward": 1.0,
"step": 548,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 361.96875,
"epoch": 0.8407350689127105,
"grad_norm": 6.410308165004863,
"kl": 0.06591796875,
"learning_rate": 6.129157499398385e-08,
"loss": 0.0001,
"reward": 3.6624999046325684,
"reward_std": 0.28865599632263184,
"rewards/accuracy_reward": 2.3625001907348633,
"rewards/format_reward": 1.0,
"step": 549,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 312.21875,
"epoch": 0.8422664624808576,
"grad_norm": 6.139748962368488,
"kl": 0.0751953125,
"learning_rate": 6.014266553000074e-08,
"loss": 0.0001,
"reward": 3.9125001430511475,
"reward_std": 0.3542941212654114,
"rewards/accuracy_reward": 2.6875,
"rewards/format_reward": 1.0,
"step": 550,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 341.375,
"epoch": 0.8437978560490046,
"grad_norm": 14.756876501209561,
"kl": 0.06884765625,
"learning_rate": 5.900393692409222e-08,
"loss": 0.0001,
"reward": 3.206249952316284,
"reward_std": 0.3955453038215637,
"rewards/accuracy_reward": 1.9812500476837158,
"rewards/format_reward": 1.0,
"step": 551,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 304.34375,
"epoch": 0.8453292496171516,
"grad_norm": 8.973776709804563,
"kl": 0.07861328125,
"learning_rate": 5.787541553306385e-08,
"loss": 0.0001,
"reward": 3.862499952316284,
"reward_std": 0.4274066686630249,
"rewards/accuracy_reward": 2.5625,
"rewards/format_reward": 1.0,
"step": 552,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 365.25,
"epoch": 0.8468606431852986,
"grad_norm": 8.468511967683463,
"kl": 0.07080078125,
"learning_rate": 5.6757127477467305e-08,
"loss": 0.0001,
"reward": 3.7624998092651367,
"reward_std": 0.37143707275390625,
"rewards/accuracy_reward": 2.4625000953674316,
"rewards/format_reward": 1.0,
"step": 553,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 296.625,
"epoch": 0.8483920367534457,
"grad_norm": 6.893679444223453,
"kl": 0.07177734375,
"learning_rate": 5.564909864099493e-08,
"loss": 0.0001,
"reward": 3.15625,
"reward_std": 0.6041836738586426,
"rewards/accuracy_reward": 1.8562499284744263,
"rewards/format_reward": 1.0,
"step": 554,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 304.4375,
"epoch": 0.8499234303215927,
"grad_norm": 6.661123517820795,
"kl": 0.072265625,
"learning_rate": 5.4551354669881145e-08,
"loss": 0.0001,
"reward": 4.0625,
"reward_std": 0.24634216725826263,
"rewards/accuracy_reward": 2.7624998092651367,
"rewards/format_reward": 1.0,
"step": 555,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 323.4375,
"epoch": 0.8514548238897397,
"grad_norm": 8.99928636834237,
"kl": 0.07470703125,
"learning_rate": 5.34639209723089e-08,
"loss": 0.0001,
"reward": 3.6999998092651367,
"reward_std": 0.35582679510116577,
"rewards/accuracy_reward": 2.4000000953674316,
"rewards/format_reward": 1.0,
"step": 556,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 315.84375,
"epoch": 0.8529862174578867,
"grad_norm": 15.728098381965413,
"kl": 0.0751953125,
"learning_rate": 5.238682271782102e-08,
"loss": 0.0001,
"reward": 3.8812499046325684,
"reward_std": 0.5641553401947021,
"rewards/accuracy_reward": 2.5812501907348633,
"rewards/format_reward": 1.0,
"step": 557,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 317.5625,
"epoch": 0.8545176110260337,
"grad_norm": 9.747780740390152,
"kl": 0.0673828125,
"learning_rate": 5.132008483673872e-08,
"loss": 0.0001,
"reward": 4.112500190734863,
"reward_std": 0.25696486234664917,
"rewards/accuracy_reward": 2.8125,
"rewards/format_reward": 1.0,
"step": 558,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 330.03125,
"epoch": 0.8560490045941807,
"grad_norm": 58.7445907158911,
"kl": 0.068359375,
"learning_rate": 5.0263732019583335e-08,
"loss": 0.0001,
"reward": 3.3499999046325684,
"reward_std": 0.2300891876220703,
"rewards/accuracy_reward": 2.049999952316284,
"rewards/format_reward": 1.0,
"step": 559,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 341.1875,
"epoch": 0.8575803981623277,
"grad_norm": 15.018191525656299,
"kl": 0.068359375,
"learning_rate": 4.921778871650539e-08,
"loss": 0.0001,
"reward": 3.581249952316284,
"reward_std": 0.33813318610191345,
"rewards/accuracy_reward": 2.28125,
"rewards/format_reward": 1.0,
"step": 560,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 339.40625,
"epoch": 0.8591117917304747,
"grad_norm": 10.084138368008286,
"kl": 0.0654296875,
"learning_rate": 4.818227913671891e-08,
"loss": 0.0001,
"reward": 3.268749952316284,
"reward_std": 0.536249041557312,
"rewards/accuracy_reward": 2.0437498092651367,
"rewards/format_reward": 1.0,
"step": 561,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 310.3125,
"epoch": 0.8606431852986217,
"grad_norm": 6.380660496138349,
"kl": 0.0693359375,
"learning_rate": 4.715722724794091e-08,
"loss": 0.0001,
"reward": 3.9312500953674316,
"reward_std": 0.5714499950408936,
"rewards/accuracy_reward": 2.6312499046325684,
"rewards/format_reward": 1.0,
"step": 562,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 325.5,
"epoch": 0.8621745788667687,
"grad_norm": 4.148426286122439,
"kl": 0.0712890625,
"learning_rate": 4.6142656775836395e-08,
"loss": 0.0001,
"reward": 3.7437496185302734,
"reward_std": 0.416331022977829,
"rewards/accuracy_reward": 2.4437499046325684,
"rewards/format_reward": 1.0,
"step": 563,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 370.3125,
"epoch": 0.8637059724349158,
"grad_norm": 8.631894184924093,
"kl": 0.06201171875,
"learning_rate": 4.513859120346947e-08,
"loss": 0.0001,
"reward": 3.549999952316284,
"reward_std": 0.337804913520813,
"rewards/accuracy_reward": 2.25,
"rewards/format_reward": 1.0,
"step": 564,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 342.1875,
"epoch": 0.8652373660030628,
"grad_norm": 4.359189319403748,
"kl": 0.06103515625,
"learning_rate": 4.414505377075978e-08,
"loss": 0.0001,
"reward": 4.15625,
"reward_std": 0.41671764850616455,
"rewards/accuracy_reward": 2.8562498092651367,
"rewards/format_reward": 1.0,
"step": 565,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 349.25,
"epoch": 0.8667687595712098,
"grad_norm": 7.361451262732712,
"kl": 0.0673828125,
"learning_rate": 4.316206747394435e-08,
"loss": 0.0001,
"reward": 3.799999952316284,
"reward_std": 0.4484034776687622,
"rewards/accuracy_reward": 2.5,
"rewards/format_reward": 1.0,
"step": 566,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 276.25,
"epoch": 0.8683001531393568,
"grad_norm": 3.973315355124972,
"kl": 0.072265625,
"learning_rate": 4.218965506504596e-08,
"loss": 0.0001,
"reward": 4.175000190734863,
"reward_std": 0.519250750541687,
"rewards/accuracy_reward": 2.950000047683716,
"rewards/format_reward": 1.0,
"step": 567,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 295.03125,
"epoch": 0.8698315467075038,
"grad_norm": 5.998362168235618,
"kl": 0.0791015625,
"learning_rate": 4.122783905134564e-08,
"loss": 0.0001,
"reward": 4.337499618530273,
"reward_std": 0.4173644781112671,
"rewards/accuracy_reward": 3.0375001430511475,
"rewards/format_reward": 1.0,
"step": 568,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 315.0,
"epoch": 0.8713629402756509,
"grad_norm": 8.798421112404203,
"kl": 0.07177734375,
"learning_rate": 4.0276641694862504e-08,
"loss": 0.0001,
"reward": 4.037499904632568,
"reward_std": 0.34528836607933044,
"rewards/accuracy_reward": 2.737499713897705,
"rewards/format_reward": 1.0,
"step": 569,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 265.4375,
"epoch": 0.8728943338437979,
"grad_norm": 3.2524813095600424,
"kl": 0.072265625,
"learning_rate": 3.933608501183788e-08,
"loss": 0.0001,
"reward": 4.018749713897705,
"reward_std": 0.3298349678516388,
"rewards/accuracy_reward": 2.71875,
"rewards/format_reward": 1.0,
"step": 570,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 356.5,
"epoch": 0.8744257274119449,
"grad_norm": 5.812136800776212,
"kl": 0.061279296875,
"learning_rate": 3.840619077222612e-08,
"loss": 0.0001,
"reward": 3.7937498092651367,
"reward_std": 0.2996395230293274,
"rewards/accuracy_reward": 2.4937498569488525,
"rewards/format_reward": 1.0,
"step": 571,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 305.5625,
"epoch": 0.8759571209800919,
"grad_norm": 15.570383714813701,
"kl": 0.078125,
"learning_rate": 3.7486980499190804e-08,
"loss": 0.0001,
"reward": 3.3625001907348633,
"reward_std": 0.2761770784854889,
"rewards/accuracy_reward": 2.137500047683716,
"rewards/format_reward": 1.0,
"step": 572,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 281.65625,
"epoch": 0.877488514548239,
"grad_norm": 7.2942510199839194,
"kl": 0.07421875,
"learning_rate": 3.6578475468606096e-08,
"loss": 0.0001,
"reward": 4.09375,
"reward_std": 0.38450920581817627,
"rewards/accuracy_reward": 2.7937498092651367,
"rewards/format_reward": 1.0,
"step": 573,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 305.09375,
"epoch": 0.8790199081163859,
"grad_norm": 15.251141316249088,
"kl": 0.0791015625,
"learning_rate": 3.568069670856466e-08,
"loss": 0.0001,
"reward": 3.7562499046325684,
"reward_std": 0.36518940329551697,
"rewards/accuracy_reward": 2.53125,
"rewards/format_reward": 1.0,
"step": 574,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 330.59375,
"epoch": 0.8805513016845329,
"grad_norm": 9.60945967657897,
"kl": 0.0712890625,
"learning_rate": 3.479366499889058e-08,
"loss": 0.0001,
"reward": 4.612500190734863,
"reward_std": 0.26537150144577026,
"rewards/accuracy_reward": 3.3125,
"rewards/format_reward": 1.0,
"step": 575,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 410.78125,
"epoch": 0.8820826952526799,
"grad_norm": 4.801673821010206,
"kl": 0.06201171875,
"learning_rate": 3.391740087065914e-08,
"loss": 0.0001,
"reward": 3.1500000953674316,
"reward_std": 0.13879363238811493,
"rewards/accuracy_reward": 1.9249999523162842,
"rewards/format_reward": 1.0,
"step": 576,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 387.40625,
"epoch": 0.8836140888208269,
"grad_norm": 38.338735179774616,
"kl": 0.0693359375,
"learning_rate": 3.305192460572087e-08,
"loss": 0.0001,
"reward": 3.674999952316284,
"reward_std": 0.4538288116455078,
"rewards/accuracy_reward": 2.375,
"rewards/format_reward": 1.0,
"step": 577,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 356.46875,
"epoch": 0.885145482388974,
"grad_norm": 18.580046936316474,
"kl": 0.06298828125,
"learning_rate": 3.219725623623243e-08,
"loss": 0.0001,
"reward": 3.1187500953674316,
"reward_std": 0.3937293291091919,
"rewards/accuracy_reward": 1.8937499523162842,
"rewards/format_reward": 1.0,
"step": 578,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 389.0625,
"epoch": 0.886676875957121,
"grad_norm": 6.789398671780015,
"kl": 0.0625,
"learning_rate": 3.135341554419274e-08,
"loss": 0.0001,
"reward": 3.5999999046325684,
"reward_std": 0.4359382390975952,
"rewards/accuracy_reward": 2.3000001907348633,
"rewards/format_reward": 1.0,
"step": 579,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 323.9375,
"epoch": 0.888208269525268,
"grad_norm": 5.950185827212625,
"kl": 0.0673828125,
"learning_rate": 3.052042206098537e-08,
"loss": 0.0001,
"reward": 3.75,
"reward_std": 0.4836667478084564,
"rewards/accuracy_reward": 2.450000047683716,
"rewards/format_reward": 1.0,
"step": 580,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 299.34375,
"epoch": 0.889739663093415,
"grad_norm": 4.44586827014121,
"kl": 0.07275390625,
"learning_rate": 2.9698295066926615e-08,
"loss": 0.0001,
"reward": 3.4250001907348633,
"reward_std": 0.4025263786315918,
"rewards/accuracy_reward": 2.125,
"rewards/format_reward": 1.0,
"step": 581,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 333.3125,
"epoch": 0.891271056661562,
"grad_norm": 9.428133854879954,
"kl": 0.07177734375,
"learning_rate": 2.8887053590818556e-08,
"loss": 0.0001,
"reward": 4.275000095367432,
"reward_std": 0.28471794724464417,
"rewards/accuracy_reward": 2.9750001430511475,
"rewards/format_reward": 1.0,
"step": 582,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 337.3125,
"epoch": 0.892802450229709,
"grad_norm": 6.226688787620919,
"kl": 0.056884765625,
"learning_rate": 2.808671640950927e-08,
"loss": 0.0001,
"reward": 3.7249999046325684,
"reward_std": 0.4804357588291168,
"rewards/accuracy_reward": 2.424999952316284,
"rewards/format_reward": 1.0,
"step": 583,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 372.3125,
"epoch": 0.8943338437978561,
"grad_norm": 7.851692063224346,
"kl": 0.0693359375,
"learning_rate": 2.7297302047458058e-08,
"loss": 0.0001,
"reward": 3.987499952316284,
"reward_std": 0.42765170335769653,
"rewards/accuracy_reward": 2.6875,
"rewards/format_reward": 1.0,
"step": 584,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 331.3125,
"epoch": 0.8958652373660031,
"grad_norm": 5.870969665593799,
"kl": 0.060302734375,
"learning_rate": 2.6518828776306347e-08,
"loss": 0.0001,
"reward": 4.237500190734863,
"reward_std": 0.3059806525707245,
"rewards/accuracy_reward": 2.9375,
"rewards/format_reward": 1.0,
"step": 585,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 297.40625,
"epoch": 0.8973966309341501,
"grad_norm": 4.616698971444567,
"kl": 0.07958984375,
"learning_rate": 2.5751314614455455e-08,
"loss": 0.0001,
"reward": 4.0625,
"reward_std": 0.3738013207912445,
"rewards/accuracy_reward": 2.762500047683716,
"rewards/format_reward": 1.0,
"step": 586,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 307.0,
"epoch": 0.8989280245022971,
"grad_norm": 4.757707203191272,
"kl": 0.072265625,
"learning_rate": 2.4994777326648954e-08,
"loss": 0.0001,
"reward": 4.106249809265137,
"reward_std": 0.3416978120803833,
"rewards/accuracy_reward": 2.8062500953674316,
"rewards/format_reward": 1.0,
"step": 587,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 309.8125,
"epoch": 0.900459418070444,
"grad_norm": 4.265384443593157,
"kl": 0.0693359375,
"learning_rate": 2.424923442356158e-08,
"loss": 0.0001,
"reward": 4.674999713897705,
"reward_std": 0.3187902867794037,
"rewards/accuracy_reward": 3.375000238418579,
"rewards/format_reward": 1.0,
"step": 588,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 351.15625,
"epoch": 0.9019908116385911,
"grad_norm": 3.849769706047688,
"kl": 0.06494140625,
"learning_rate": 2.3514703161394088e-08,
"loss": 0.0001,
"reward": 3.3562498092651367,
"reward_std": 0.5659125447273254,
"rewards/accuracy_reward": 2.0562500953674316,
"rewards/format_reward": 1.0,
"step": 589,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 346.75,
"epoch": 0.9035222052067381,
"grad_norm": 3.9256866651208986,
"kl": 0.06787109375,
"learning_rate": 2.279120054147393e-08,
"loss": 0.0001,
"reward": 3.706249952316284,
"reward_std": 0.5209039449691772,
"rewards/accuracy_reward": 2.40625,
"rewards/format_reward": 1.0,
"step": 590,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 371.53125,
"epoch": 0.9050535987748851,
"grad_norm": 5.095664340886334,
"kl": 0.06591796875,
"learning_rate": 2.207874330986148e-08,
"loss": 0.0001,
"reward": 3.4499998092651367,
"reward_std": 0.2862982749938965,
"rewards/accuracy_reward": 2.1500000953674316,
"rewards/format_reward": 1.0,
"step": 591,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 327.03125,
"epoch": 0.9065849923430321,
"grad_norm": 7.617297771827687,
"kl": 0.07666015625,
"learning_rate": 2.1377347956962556e-08,
"loss": 0.0001,
"reward": 3.706249952316284,
"reward_std": 0.3007799983024597,
"rewards/accuracy_reward": 2.406249761581421,
"rewards/format_reward": 1.0,
"step": 592,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 353.71875,
"epoch": 0.9081163859111792,
"grad_norm": 5.52079112647666,
"kl": 0.06640625,
"learning_rate": 2.068703071714678e-08,
"loss": 0.0001,
"reward": 3.34375,
"reward_std": 0.3307170867919922,
"rewards/accuracy_reward": 2.043750047683716,
"rewards/format_reward": 1.0,
"step": 593,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 353.40625,
"epoch": 0.9096477794793262,
"grad_norm": 8.716888115770072,
"kl": 0.05810546875,
"learning_rate": 2.0007807568371725e-08,
"loss": 0.0001,
"reward": 4.125,
"reward_std": 0.3458287715911865,
"rewards/accuracy_reward": 2.8249998092651367,
"rewards/format_reward": 1.0,
"step": 594,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 312.78125,
"epoch": 0.9111791730474732,
"grad_norm": 5.199539573733889,
"kl": 0.06884765625,
"learning_rate": 1.9339694231813252e-08,
"loss": 0.0001,
"reward": 3.3687500953674316,
"reward_std": 0.4148343503475189,
"rewards/accuracy_reward": 2.143749952316284,
"rewards/format_reward": 1.0,
"step": 595,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 344.125,
"epoch": 0.9127105666156202,
"grad_norm": 7.552839202353648,
"kl": 0.0625,
"learning_rate": 1.8682706171501416e-08,
"loss": 0.0001,
"reward": 3.799999952316284,
"reward_std": 0.40122342109680176,
"rewards/accuracy_reward": 2.5,
"rewards/format_reward": 1.0,
"step": 596,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 294.84375,
"epoch": 0.9142419601837672,
"grad_norm": 9.787298112854378,
"kl": 0.06982421875,
"learning_rate": 1.80368585939627e-08,
"loss": 0.0001,
"reward": 3.4250001907348633,
"reward_std": 0.4172694683074951,
"rewards/accuracy_reward": 2.200000047683716,
"rewards/format_reward": 1.0,
"step": 597,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 344.125,
"epoch": 0.9157733537519143,
"grad_norm": 28.605894274862234,
"kl": 0.0703125,
"learning_rate": 1.7402166447867962e-08,
"loss": 0.0001,
"reward": 3.5062499046325684,
"reward_std": 0.4138874411582947,
"rewards/accuracy_reward": 2.206249952316284,
"rewards/format_reward": 1.0,
"step": 598,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 359.40625,
"epoch": 0.9173047473200613,
"grad_norm": 4.883960139516868,
"kl": 0.06298828125,
"learning_rate": 1.6778644423686482e-08,
"loss": 0.0001,
"reward": 3.4124999046325684,
"reward_std": 0.4562169313430786,
"rewards/accuracy_reward": 2.112499952316284,
"rewards/format_reward": 1.0,
"step": 599,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 337.1875,
"epoch": 0.9188361408882083,
"grad_norm": 10.33792433059585,
"kl": 0.0625,
"learning_rate": 1.616630695334592e-08,
"loss": 0.0001,
"reward": 4.493749618530273,
"reward_std": 0.5656530857086182,
"rewards/accuracy_reward": 3.1937499046325684,
"rewards/format_reward": 1.0,
"step": 600,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 307.28125,
"epoch": 0.9203675344563553,
"grad_norm": 13.576771230474968,
"kl": 0.0751953125,
"learning_rate": 1.5565168209898395e-08,
"loss": 0.0001,
"reward": 3.5250000953674316,
"reward_std": 0.2717602252960205,
"rewards/accuracy_reward": 2.299999952316284,
"rewards/format_reward": 1.0,
"step": 601,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 364.8125,
"epoch": 0.9218989280245024,
"grad_norm": 10.322156519733802,
"kl": 0.06640625,
"learning_rate": 1.497524210719203e-08,
"loss": 0.0001,
"reward": 3.8999998569488525,
"reward_std": 0.35277166962623596,
"rewards/accuracy_reward": 2.6000001430511475,
"rewards/format_reward": 1.0,
"step": 602,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 343.125,
"epoch": 0.9234303215926493,
"grad_norm": 4.515624838691467,
"kl": 0.06884765625,
"learning_rate": 1.4396542299549563e-08,
"loss": 0.0001,
"reward": 3.6500000953674316,
"reward_std": 0.3364337384700775,
"rewards/accuracy_reward": 2.3499999046325684,
"rewards/format_reward": 1.0,
"step": 603,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 328.6875,
"epoch": 0.9249617151607963,
"grad_norm": 23.351233380824027,
"kl": 0.0654296875,
"learning_rate": 1.3829082181451624e-08,
"loss": 0.0001,
"reward": 4.300000190734863,
"reward_std": 0.29585695266723633,
"rewards/accuracy_reward": 3.0,
"rewards/format_reward": 1.0,
"step": 604,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 333.4375,
"epoch": 0.9264931087289433,
"grad_norm": 10.101075291421978,
"kl": 0.068359375,
"learning_rate": 1.3272874887227281e-08,
"loss": 0.0001,
"reward": 3.2125000953674316,
"reward_std": 0.44710811972618103,
"rewards/accuracy_reward": 1.912500023841858,
"rewards/format_reward": 1.0,
"step": 605,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 350.1875,
"epoch": 0.9280245022970903,
"grad_norm": 5.202766658176163,
"kl": 0.06494140625,
"learning_rate": 1.2727933290749615e-08,
"loss": 0.0001,
"reward": 4.256249904632568,
"reward_std": 0.2566061019897461,
"rewards/accuracy_reward": 3.03125,
"rewards/format_reward": 1.0,
"step": 606,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 319.78125,
"epoch": 0.9295558958652373,
"grad_norm": 6.1359910508190385,
"kl": 0.0703125,
"learning_rate": 1.2194270005137953e-08,
"loss": 0.0001,
"reward": 3.3999998569488525,
"reward_std": 0.26921939849853516,
"rewards/accuracy_reward": 2.174999713897705,
"rewards/format_reward": 1.0,
"step": 607,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 316.46875,
"epoch": 0.9310872894333844,
"grad_norm": 6.319370371879856,
"kl": 0.06640625,
"learning_rate": 1.1671897382465878e-08,
"loss": 0.0001,
"reward": 3.8249998092651367,
"reward_std": 0.4702260196208954,
"rewards/accuracy_reward": 2.5250000953674316,
"rewards/format_reward": 1.0,
"step": 608,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 301.4375,
"epoch": 0.9326186830015314,
"grad_norm": 5.090879708247038,
"kl": 0.06884765625,
"learning_rate": 1.1160827513475468e-08,
"loss": 0.0001,
"reward": 3.90625,
"reward_std": 0.4447071850299835,
"rewards/accuracy_reward": 2.606250047683716,
"rewards/format_reward": 1.0,
"step": 609,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 306.21875,
"epoch": 0.9341500765696784,
"grad_norm": 5.687037019839013,
"kl": 0.0712890625,
"learning_rate": 1.066107222729712e-08,
"loss": 0.0001,
"reward": 3.625,
"reward_std": 0.3328478932380676,
"rewards/accuracy_reward": 2.3249998092651367,
"rewards/format_reward": 1.0,
"step": 610,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 303.90625,
"epoch": 0.9356814701378254,
"grad_norm": 6.2524293242086815,
"kl": 0.07421875,
"learning_rate": 1.0172643091176104e-08,
"loss": 0.0001,
"reward": 3.4124999046325684,
"reward_std": 0.4472746253013611,
"rewards/accuracy_reward": 2.112499952316284,
"rewards/format_reward": 1.0,
"step": 611,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 319.21875,
"epoch": 0.9372128637059725,
"grad_norm": 16.203473126784306,
"kl": 0.07080078125,
"learning_rate": 9.695551410204506e-09,
"loss": 0.0001,
"reward": 3.8125,
"reward_std": 0.5833103060722351,
"rewards/accuracy_reward": 2.512500047683716,
"rewards/format_reward": 1.0,
"step": 612,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 332.09375,
"epoch": 0.9387442572741195,
"grad_norm": 4.888634913199968,
"kl": 0.0751953125,
"learning_rate": 9.229808227059876e-09,
"loss": 0.0001,
"reward": 3.4124999046325684,
"reward_std": 0.289096474647522,
"rewards/accuracy_reward": 2.1125001907348633,
"rewards/format_reward": 1.0,
"step": 613,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 395.34375,
"epoch": 0.9402756508422665,
"grad_norm": 7.009140647685338,
"kl": 0.076171875,
"learning_rate": 8.775424321749381e-09,
"loss": 0.0001,
"reward": 2.9437499046325684,
"reward_std": 0.25198203325271606,
"rewards/accuracy_reward": 1.6437499523162842,
"rewards/format_reward": 1.0,
"step": 614,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 361.59375,
"epoch": 0.9418070444104135,
"grad_norm": 5.095209828668755,
"kl": 0.06689453125,
"learning_rate": 8.332410211360608e-09,
"loss": 0.0001,
"reward": 3.53125,
"reward_std": 0.2518140375614166,
"rewards/accuracy_reward": 2.2312498092651367,
"rewards/format_reward": 1.0,
"step": 615,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 337.09375,
"epoch": 0.9433384379785605,
"grad_norm": 5.274059040607986,
"kl": 0.0703125,
"learning_rate": 7.900776149817712e-09,
"loss": 0.0001,
"reward": 4.0,
"reward_std": 0.5040473937988281,
"rewards/accuracy_reward": 2.6999998092651367,
"rewards/format_reward": 1.0,
"step": 616,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 327.59375,
"epoch": 0.9448698315467075,
"grad_norm": 6.827469118684358,
"kl": 0.06640625,
"learning_rate": 7.480532127644435e-09,
"loss": 0.0001,
"reward": 4.274999618530273,
"reward_std": 0.5195462703704834,
"rewards/accuracy_reward": 3.049999952316284,
"rewards/format_reward": 1.0,
"step": 617,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 333.40625,
"epoch": 0.9464012251148545,
"grad_norm": 8.68679701533234,
"kl": 0.0673828125,
"learning_rate": 7.071687871732512e-09,
"loss": 0.0001,
"reward": 3.424999713897705,
"reward_std": 0.2530859112739563,
"rewards/accuracy_reward": 2.125,
"rewards/format_reward": 1.0,
"step": 618,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 354.8125,
"epoch": 0.9479326186830015,
"grad_norm": 6.14507875298798,
"kl": 0.0673828125,
"learning_rate": 6.6742528451171895e-09,
"loss": 0.0001,
"reward": 2.8312501907348633,
"reward_std": 0.34376227855682373,
"rewards/accuracy_reward": 1.6062500476837158,
"rewards/format_reward": 1.0,
"step": 619,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 320.25,
"epoch": 0.9494640122511485,
"grad_norm": 5.882825033336854,
"kl": 0.068359375,
"learning_rate": 6.288236246757284e-09,
"loss": 0.0001,
"reward": 4.118749618530273,
"reward_std": 0.3603181540966034,
"rewards/accuracy_reward": 2.8187499046325684,
"rewards/format_reward": 1.0,
"step": 620,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 345.71875,
"epoch": 0.9509954058192955,
"grad_norm": 15.156974160837702,
"kl": 0.064453125,
"learning_rate": 5.913647011323075e-09,
"loss": 0.0001,
"reward": 3.174999713897705,
"reward_std": 0.2598288655281067,
"rewards/accuracy_reward": 1.875,
"rewards/format_reward": 1.0,
"step": 621,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 297.40625,
"epoch": 0.9525267993874426,
"grad_norm": 3.840998936877307,
"kl": 0.07275390625,
"learning_rate": 5.5504938089890316e-09,
"loss": 0.0001,
"reward": 3.799999952316284,
"reward_std": 0.34373825788497925,
"rewards/accuracy_reward": 2.499999761581421,
"rewards/format_reward": 1.0,
"step": 622,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 347.78125,
"epoch": 0.9540581929555896,
"grad_norm": 7.652011405321351,
"kl": 0.07177734375,
"learning_rate": 5.198785045233245e-09,
"loss": 0.0001,
"reward": 3.4124999046325684,
"reward_std": 0.43421119451522827,
"rewards/accuracy_reward": 2.112499952316284,
"rewards/format_reward": 1.0,
"step": 623,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 310.53125,
"epoch": 0.9555895865237366,
"grad_norm": 4.331709203711526,
"kl": 0.0751953125,
"learning_rate": 4.85852886064303e-09,
"loss": 0.0001,
"reward": 3.6875,
"reward_std": 0.3671942353248596,
"rewards/accuracy_reward": 2.4625000953674316,
"rewards/format_reward": 1.0,
"step": 624,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 355.71875,
"epoch": 0.9571209800918836,
"grad_norm": 4.783970405387704,
"kl": 0.07080078125,
"learning_rate": 4.529733130726299e-09,
"loss": 0.0001,
"reward": 3.2562499046325684,
"reward_std": 0.37546029686927795,
"rewards/accuracy_reward": 1.9562499523162842,
"rewards/format_reward": 1.0,
"step": 625,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 361.0,
"epoch": 0.9586523736600306,
"grad_norm": 11.829435512309791,
"kl": 0.0791015625,
"learning_rate": 4.2124054657293184e-09,
"loss": 0.0001,
"reward": 3.4749999046325684,
"reward_std": 0.5535058975219727,
"rewards/accuracy_reward": 2.25,
"rewards/format_reward": 1.0,
"step": 626,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 306.9375,
"epoch": 0.9601837672281777,
"grad_norm": 10.714822722401369,
"kl": 0.07861328125,
"learning_rate": 3.9065532104607946e-09,
"loss": 0.0001,
"reward": 4.012499809265137,
"reward_std": 0.4494459629058838,
"rewards/accuracy_reward": 2.7125000953674316,
"rewards/format_reward": 1.0,
"step": 627,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 384.875,
"epoch": 0.9617151607963247,
"grad_norm": 7.244466933322755,
"kl": 0.064453125,
"learning_rate": 3.6121834441213416e-09,
"loss": 0.0001,
"reward": 3.8125,
"reward_std": 0.40354323387145996,
"rewards/accuracy_reward": 2.512500047683716,
"rewards/format_reward": 1.0,
"step": 628,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 311.96875,
"epoch": 0.9632465543644717,
"grad_norm": 4.438103763095202,
"kl": 0.07177734375,
"learning_rate": 3.3293029801403917e-09,
"loss": 0.0001,
"reward": 3.2249999046325684,
"reward_std": 0.3224777579307556,
"rewards/accuracy_reward": 1.9249999523162842,
"rewards/format_reward": 1.0,
"step": 629,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 323.96875,
"epoch": 0.9647779479326187,
"grad_norm": 4.66747144008146,
"kl": 0.06494140625,
"learning_rate": 3.0579183660177086e-09,
"loss": 0.0001,
"reward": 3.6624999046325684,
"reward_std": 0.554044246673584,
"rewards/accuracy_reward": 2.4375,
"rewards/format_reward": 1.0,
"step": 630,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 365.90625,
"epoch": 0.9663093415007658,
"grad_norm": 7.001370987648456,
"kl": 0.06298828125,
"learning_rate": 2.7980358831724004e-09,
"loss": 0.0001,
"reward": 4.1875,
"reward_std": 0.44796818494796753,
"rewards/accuracy_reward": 2.887500286102295,
"rewards/format_reward": 1.0,
"step": 631,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 383.46875,
"epoch": 0.9678407350689127,
"grad_norm": 5.139795499775845,
"kl": 0.0673828125,
"learning_rate": 2.549661546797255e-09,
"loss": 0.0001,
"reward": 3.6374998092651367,
"reward_std": 0.29774677753448486,
"rewards/accuracy_reward": 2.3374998569488525,
"rewards/format_reward": 1.0,
"step": 632,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 337.34375,
"epoch": 0.9693721286370597,
"grad_norm": 13.267348596865721,
"kl": 0.0830078125,
"learning_rate": 2.312801105719575e-09,
"loss": 0.0001,
"reward": 3.9124999046325684,
"reward_std": 0.34188583493232727,
"rewards/accuracy_reward": 2.612499952316284,
"rewards/format_reward": 1.0,
"step": 633,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 367.21875,
"epoch": 0.9709035222052067,
"grad_norm": 6.861770207450841,
"kl": 0.05908203125,
"learning_rate": 2.0874600422682297e-09,
"loss": 0.0001,
"reward": 2.9812498092651367,
"reward_std": 0.36701488494873047,
"rewards/accuracy_reward": 1.681249976158142,
"rewards/format_reward": 1.0,
"step": 634,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 322.34375,
"epoch": 0.9724349157733537,
"grad_norm": 3.549048561492334,
"kl": 0.078125,
"learning_rate": 1.8736435721465326e-09,
"loss": 0.0001,
"reward": 3.456249713897705,
"reward_std": 0.2611098885536194,
"rewards/accuracy_reward": 2.15625,
"rewards/format_reward": 1.0,
"step": 635,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 318.78125,
"epoch": 0.9739663093415007,
"grad_norm": 11.650883201241767,
"kl": 0.07568359375,
"learning_rate": 1.6713566443117832e-09,
"loss": 0.0001,
"reward": 3.9749999046325684,
"reward_std": 0.493512898683548,
"rewards/accuracy_reward": 2.674999952316284,
"rewards/format_reward": 1.0,
"step": 636,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 334.25,
"epoch": 0.9754977029096478,
"grad_norm": 6.177634684168555,
"kl": 0.06884765625,
"learning_rate": 1.4806039408604699e-09,
"loss": 0.0001,
"reward": 3.075000047683716,
"reward_std": 0.2664228677749634,
"rewards/accuracy_reward": 1.8499999046325684,
"rewards/format_reward": 1.0,
"step": 637,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 335.34375,
"epoch": 0.9770290964777948,
"grad_norm": 13.392275647815515,
"kl": 0.0712890625,
"learning_rate": 1.3013898769200783e-09,
"loss": 0.0001,
"reward": 3.5687499046325684,
"reward_std": 0.3499985337257385,
"rewards/accuracy_reward": 2.2687501907348633,
"rewards/format_reward": 1.0,
"step": 638,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 308.75,
"epoch": 0.9785604900459418,
"grad_norm": 6.08506315401085,
"kl": 0.06982421875,
"learning_rate": 1.1337186005467846e-09,
"loss": 0.0001,
"reward": 3.7249999046325684,
"reward_std": 0.30260762572288513,
"rewards/accuracy_reward": 2.424999713897705,
"rewards/format_reward": 1.0,
"step": 639,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 395.34375,
"epoch": 0.9800918836140888,
"grad_norm": 6.627925742618377,
"kl": 0.057373046875,
"learning_rate": 9.775939926296439e-10,
"loss": 0.0001,
"reward": 3.34375,
"reward_std": 0.4554150104522705,
"rewards/accuracy_reward": 2.043750047683716,
"rewards/format_reward": 1.0,
"step": 640,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 287.5625,
"epoch": 0.9816232771822359,
"grad_norm": 4.4909176339040195,
"kl": 0.07666015625,
"learning_rate": 8.33019666800383e-10,
"loss": 0.0001,
"reward": 4.512500286102295,
"reward_std": 0.5600020885467529,
"rewards/accuracy_reward": 3.2124998569488525,
"rewards/format_reward": 1.0,
"step": 641,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 325.09375,
"epoch": 0.9831546707503829,
"grad_norm": 5.439886025150774,
"kl": 0.06884765625,
"learning_rate": 6.999989693501906e-10,
"loss": 0.0001,
"reward": 4.0625,
"reward_std": 0.43934160470962524,
"rewards/accuracy_reward": 2.7624998092651367,
"rewards/format_reward": 1.0,
"step": 642,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 351.0625,
"epoch": 0.9846860643185299,
"grad_norm": 3.729869183229773,
"kl": 0.07080078125,
"learning_rate": 5.785349791520011e-10,
"loss": 0.0001,
"reward": 3.34375,
"reward_std": 0.64092618227005,
"rewards/accuracy_reward": 2.1187500953674316,
"rewards/format_reward": 1.0,
"step": 643,
"temporal_rewards": 0.75
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 343.8125,
"epoch": 0.9862174578866769,
"grad_norm": 7.5174674414090035,
"kl": 0.0693359375,
"learning_rate": 4.686305075892738e-10,
"loss": 0.0001,
"reward": 4.206249713897705,
"reward_std": 0.3077024221420288,
"rewards/accuracy_reward": 2.90625,
"rewards/format_reward": 1.0,
"step": 644,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 296.71875,
"epoch": 0.9877488514548239,
"grad_norm": 10.685241592241573,
"kl": 0.0791015625,
"learning_rate": 3.7028809849098954e-10,
"loss": 0.0001,
"reward": 3.262500047683716,
"reward_std": 0.3571416139602661,
"rewards/accuracy_reward": 1.962499976158142,
"rewards/format_reward": 1.0,
"step": 645,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 326.71875,
"epoch": 0.9892802450229708,
"grad_norm": 8.356539394023253,
"kl": 0.0712890625,
"learning_rate": 2.835100280726976e-10,
"loss": 0.0001,
"reward": 3.5625,
"reward_std": 0.4336293935775757,
"rewards/accuracy_reward": 2.2624998092651367,
"rewards/format_reward": 1.0,
"step": 646,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 300.8125,
"epoch": 0.9908116385911179,
"grad_norm": 20.49471176250662,
"kl": 0.06787109375,
"learning_rate": 2.0829830488389154e-10,
"loss": 0.0001,
"reward": 3.5250000953674316,
"reward_std": 0.5362038612365723,
"rewards/accuracy_reward": 2.375,
"rewards/format_reward": 1.0,
"step": 647,
"temporal_rewards": 0.5
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 305.8125,
"epoch": 0.9923430321592649,
"grad_norm": 5.603911872872956,
"kl": 0.080078125,
"learning_rate": 1.446546697614903e-10,
"loss": 0.0001,
"reward": 4.118750095367432,
"reward_std": 0.46842336654663086,
"rewards/accuracy_reward": 2.8187499046325684,
"rewards/format_reward": 1.0,
"step": 648,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 287.78125,
"epoch": 0.9938744257274119,
"grad_norm": 4.003878671046886,
"kl": 0.07763671875,
"learning_rate": 9.258059578948207e-11,
"loss": 0.0001,
"reward": 3.356250047683716,
"reward_std": 0.25471654534339905,
"rewards/accuracy_reward": 2.0562500953674316,
"rewards/format_reward": 1.0,
"step": 649,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 340.4375,
"epoch": 0.9954058192955589,
"grad_norm": 4.5992490785580795,
"kl": 0.072265625,
"learning_rate": 5.2077288264951166e-11,
"loss": 0.0001,
"reward": 3.0875000953674316,
"reward_std": 0.4648074507713318,
"rewards/accuracy_reward": 1.787500023841858,
"rewards/format_reward": 1.0,
"step": 650,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 375.40625,
"epoch": 0.996937212863706,
"grad_norm": 6.9796469597999735,
"kl": 0.0732421875,
"learning_rate": 2.3145684670100583e-11,
"loss": 0.0001,
"reward": 3.637500047683716,
"reward_std": 0.42184919118881226,
"rewards/accuracy_reward": 2.3375000953674316,
"rewards/format_reward": 1.0,
"step": 651,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 379.875,
"epoch": 0.998468606431853,
"grad_norm": 4.574612761450238,
"kl": 0.06494140625,
"learning_rate": 5.786454650602568e-12,
"loss": 0.0001,
"reward": 3.2749998569488525,
"reward_std": 0.20265839993953705,
"rewards/accuracy_reward": 1.975000023841858,
"rewards/format_reward": 1.0,
"step": 652,
"temporal_rewards": 1.0
},
{
"all_correct": 1.0,
"all_wrong": 0.0,
"completion_length": 319.75,
"epoch": 1.0,
"grad_norm": 6.773732480754798,
"kl": 0.07080078125,
"learning_rate": 0.0,
"loss": 0.0001,
"reward": 3.28125,
"reward_std": 0.394453227519989,
"rewards/accuracy_reward": 1.9812499284744263,
"rewards/format_reward": 1.0,
"step": 653,
"temporal_rewards": 1.0
}
],
"logging_steps": 1.0,
"max_steps": 653,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}