{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.25938735177865613,
"eval_steps": 500,
"global_step": 1050,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 176.71875,
"epoch": 0.00024703557312252963,
"grad_norm": 3.9802175845327943,
"kl": 0.0,
"learning_rate": 1e-06,
"loss": -0.0,
"reward": 1.6392865180969238,
"reward_std": 0.3521379828453064,
"rewards/accuracy_reward_stage2": 0.3434532582759857,
"rewards/format_reward_all_stage": 1.2958333492279053,
"scores/refine_times": 1.84375,
"step": 1
},
{
"completion_length": 105.453125,
"epoch": 0.0004940711462450593,
"grad_norm": 8.551880775801044,
"kl": 0.00066375732421875,
"learning_rate": 9.997529644268775e-07,
"loss": 0.0,
"reward": 1.9555529356002808,
"reward_std": 0.5750938653945923,
"rewards/accuracy_reward_stage2": 0.5399278998374939,
"rewards/format_reward_all_stage": 1.4156250953674316,
"scores/refine_times": 1.484375,
"step": 2
},
{
"completion_length": 147.21875,
"epoch": 0.0007411067193675889,
"grad_norm": 7.326960101035143,
"kl": 0.0022125244140625,
"learning_rate": 9.995059288537548e-07,
"loss": 0.0001,
"reward": 1.696824312210083,
"reward_std": 0.7795395851135254,
"rewards/accuracy_reward_stage2": 0.5530742406845093,
"rewards/format_reward_all_stage": 1.1437499523162842,
"scores/refine_times": 1.703125,
"step": 3
},
{
"completion_length": 208.84375,
"epoch": 0.0009881422924901185,
"grad_norm": 4.54458264965591,
"kl": 0.00144195556640625,
"learning_rate": 9.992588932806324e-07,
"loss": 0.0001,
"reward": 1.765295386314392,
"reward_std": 0.8180840611457825,
"rewards/accuracy_reward_stage2": 0.4215453863143921,
"rewards/format_reward_all_stage": 1.34375,
"scores/refine_times": 2.09375,
"step": 4
},
{
"completion_length": 187.09375,
"epoch": 0.0012351778656126482,
"grad_norm": 4.889281322875834,
"kl": 0.00213623046875,
"learning_rate": 9.9901185770751e-07,
"loss": 0.0001,
"reward": 1.936046838760376,
"reward_std": 0.6565989255905151,
"rewards/accuracy_reward_stage2": 0.44438010454177856,
"rewards/format_reward_all_stage": 1.4916666746139526,
"scores/refine_times": 1.875,
"step": 5
},
{
"completion_length": 144.9375,
"epoch": 0.0014822134387351778,
"grad_norm": 3.650312537503493,
"kl": 0.005859375,
"learning_rate": 9.987648221343872e-07,
"loss": 0.0002,
"reward": 2.015610694885254,
"reward_std": 0.5393483638763428,
"rewards/accuracy_reward_stage2": 0.648944079875946,
"rewards/format_reward_all_stage": 1.366666555404663,
"scores/refine_times": 1.6875,
"step": 6
},
{
"completion_length": 206.09375,
"epoch": 0.0017292490118577075,
"grad_norm": 3.88597528891217,
"kl": 0.006591796875,
"learning_rate": 9.985177865612648e-07,
"loss": 0.0003,
"reward": 1.4110958576202393,
"reward_std": 0.4156912863254547,
"rewards/accuracy_reward_stage2": 0.3126583695411682,
"rewards/format_reward_all_stage": 1.0984375476837158,
"scores/refine_times": 2.09375,
"step": 7
},
{
"completion_length": 190.78125,
"epoch": 0.001976284584980237,
"grad_norm": 3.7825448304674785,
"kl": 0.0047607421875,
"learning_rate": 9.982707509881423e-07,
"loss": 0.0002,
"reward": 1.8658943176269531,
"reward_std": 0.33927106857299805,
"rewards/accuracy_reward_stage2": 0.32214438915252686,
"rewards/format_reward_all_stage": 1.5437500476837158,
"scores/refine_times": 2.0625,
"step": 8
},
{
"completion_length": 245.578125,
"epoch": 0.002223320158102767,
"grad_norm": 3.272740004919561,
"kl": 0.00537109375,
"learning_rate": 9.980237154150196e-07,
"loss": 0.0002,
"reward": 1.6504933834075928,
"reward_std": 0.617614209651947,
"rewards/accuracy_reward_stage2": 0.34736835956573486,
"rewards/format_reward_all_stage": 1.303125023841858,
"scores/refine_times": 2.390625,
"step": 9
},
{
"completion_length": 190.5625,
"epoch": 0.0024703557312252965,
"grad_norm": 3.01022489508433,
"kl": 0.01092529296875,
"learning_rate": 9.977766798418972e-07,
"loss": 0.0004,
"reward": 2.478713035583496,
"reward_std": 0.38603711128234863,
"rewards/accuracy_reward_stage2": 0.663608968257904,
"rewards/format_reward_all_stage": 1.8151041269302368,
"scores/refine_times": 2.171875,
"step": 10
},
{
"completion_length": 155.546875,
"epoch": 0.002717391304347826,
"grad_norm": 3.9877146575128064,
"kl": 0.0174560546875,
"learning_rate": 9.975296442687747e-07,
"loss": 0.0007,
"reward": 2.293698787689209,
"reward_std": 0.36475634574890137,
"rewards/accuracy_reward_stage2": 0.5129696726799011,
"rewards/format_reward_all_stage": 1.7807291746139526,
"scores/refine_times": 1.5625,
"step": 11
},
{
"completion_length": 183.53125,
"epoch": 0.0029644268774703555,
"grad_norm": 3.396300273793604,
"kl": 0.014404296875,
"learning_rate": 9.972826086956523e-07,
"loss": 0.0006,
"reward": 2.3064093589782715,
"reward_std": 0.37103700637817383,
"rewards/accuracy_reward_stage2": 0.4647427201271057,
"rewards/format_reward_all_stage": 1.841666579246521,
"scores/refine_times": 1.859375,
"step": 12
},
{
"completion_length": 123.828125,
"epoch": 0.0032114624505928855,
"grad_norm": 6.558836134964146,
"kl": 0.12060546875,
"learning_rate": 9.970355731225296e-07,
"loss": 0.0048,
"reward": 2.1412835121154785,
"reward_std": 0.298098087310791,
"rewards/accuracy_reward_stage2": 0.4058668315410614,
"rewards/format_reward_all_stage": 1.7354166507720947,
"scores/refine_times": 1.265625,
"step": 13
},
{
"completion_length": 160.40625,
"epoch": 0.003458498023715415,
"grad_norm": 3.333202019037145,
"kl": 0.017333984375,
"learning_rate": 9.967885375494071e-07,
"loss": 0.0007,
"reward": 2.3971457481384277,
"reward_std": 0.4326985478401184,
"rewards/accuracy_reward_stage2": 0.5752710103988647,
"rewards/format_reward_all_stage": 1.821874976158142,
"scores/refine_times": 1.578125,
"step": 14
},
{
"completion_length": 138.5625,
"epoch": 0.0037055335968379445,
"grad_norm": 3.981989824534079,
"kl": 0.031982421875,
"learning_rate": 9.965415019762845e-07,
"loss": 0.0013,
"reward": 2.39411997795105,
"reward_std": 0.3097790479660034,
"rewards/accuracy_reward_stage2": 0.5972450375556946,
"rewards/format_reward_all_stage": 1.796875,
"scores/refine_times": 1.65625,
"step": 15
},
{
"completion_length": 126.546875,
"epoch": 0.003952569169960474,
"grad_norm": 4.685808632727751,
"kl": 0.024169921875,
"learning_rate": 9.96294466403162e-07,
"loss": 0.001,
"reward": 2.398407220840454,
"reward_std": 0.23343093693256378,
"rewards/accuracy_reward_stage2": 0.5484069585800171,
"rewards/format_reward_all_stage": 1.850000023841858,
"scores/refine_times": 1.25,
"step": 16
},
{
"completion_length": 121.65625,
"epoch": 0.004199604743083004,
"grad_norm": 4.846282709405759,
"kl": 0.0252685546875,
"learning_rate": 9.960474308300395e-07,
"loss": 0.001,
"reward": 2.297874689102173,
"reward_std": 0.37787097692489624,
"rewards/accuracy_reward_stage2": 0.4603745937347412,
"rewards/format_reward_all_stage": 1.837499976158142,
"scores/refine_times": 1.328125,
"step": 17
},
{
"completion_length": 130.1875,
"epoch": 0.004446640316205534,
"grad_norm": 3.377534489669681,
"kl": 0.021728515625,
"learning_rate": 9.958003952569169e-07,
"loss": 0.0009,
"reward": 2.5723018646240234,
"reward_std": 0.24451476335525513,
"rewards/accuracy_reward_stage2": 0.6816768050193787,
"rewards/format_reward_all_stage": 1.890625,
"scores/refine_times": 1.546875,
"step": 18
},
{
"completion_length": 124.125,
"epoch": 0.004693675889328063,
"grad_norm": 4.888604917487123,
"kl": 0.0260009765625,
"learning_rate": 9.955533596837944e-07,
"loss": 0.001,
"reward": 2.2013630867004395,
"reward_std": 0.43003690242767334,
"rewards/accuracy_reward_stage2": 0.4992799162864685,
"rewards/format_reward_all_stage": 1.7020832300186157,
"scores/refine_times": 1.265625,
"step": 19
},
{
"completion_length": 112.984375,
"epoch": 0.004940711462450593,
"grad_norm": 4.247112761669188,
"kl": 0.029296875,
"learning_rate": 9.95306324110672e-07,
"loss": 0.0012,
"reward": 2.458132028579712,
"reward_std": 0.45524460077285767,
"rewards/accuracy_reward_stage2": 0.6154236197471619,
"rewards/format_reward_all_stage": 1.8427083492279053,
"scores/refine_times": 1.421875,
"step": 20
},
{
"completion_length": 95.1875,
"epoch": 0.005187747035573123,
"grad_norm": 5.691563277271698,
"kl": 0.031494140625,
"learning_rate": 9.950592885375495e-07,
"loss": 0.0013,
"reward": 2.3042306900024414,
"reward_std": 0.20654383301734924,
"rewards/accuracy_reward_stage2": 0.30423077940940857,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 21
},
{
"completion_length": 77.0,
"epoch": 0.005434782608695652,
"grad_norm": 4.170049319339651,
"kl": 0.0517578125,
"learning_rate": 9.948122529644268e-07,
"loss": 0.0021,
"reward": 2.779543876647949,
"reward_std": 0.049611423164606094,
"rewards/accuracy_reward_stage2": 0.7795437574386597,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 22
},
{
"completion_length": 115.03125,
"epoch": 0.005681818181818182,
"grad_norm": 4.183045012317983,
"kl": 0.03662109375,
"learning_rate": 9.945652173913043e-07,
"loss": 0.0015,
"reward": 2.506112813949585,
"reward_std": 0.26072418689727783,
"rewards/accuracy_reward_stage2": 0.5842377543449402,
"rewards/format_reward_all_stage": 1.921875,
"scores/refine_times": 1.34375,
"step": 23
},
{
"completion_length": 128.40625,
"epoch": 0.005928853754940711,
"grad_norm": 5.330982257283442,
"kl": 0.04345703125,
"learning_rate": 9.943181818181817e-07,
"loss": 0.0017,
"reward": 2.2350950241088867,
"reward_std": 0.4230126738548279,
"rewards/accuracy_reward_stage2": 0.494990736246109,
"rewards/format_reward_all_stage": 1.7401041984558105,
"scores/refine_times": 1.53125,
"step": 24
},
{
"completion_length": 124.375,
"epoch": 0.006175889328063241,
"grad_norm": 4.250442014523113,
"kl": 0.03564453125,
"learning_rate": 9.940711462450592e-07,
"loss": 0.0014,
"reward": 2.6479578018188477,
"reward_std": 0.10312794148921967,
"rewards/accuracy_reward_stage2": 0.6667079925537109,
"rewards/format_reward_all_stage": 1.9812500476837158,
"scores/refine_times": 1.390625,
"step": 25
},
{
"completion_length": 98.3125,
"epoch": 0.006422924901185771,
"grad_norm": 5.3796088493504195,
"kl": 0.0732421875,
"learning_rate": 9.938241106719368e-07,
"loss": 0.0029,
"reward": 2.4852514266967773,
"reward_std": 0.17395630478858948,
"rewards/accuracy_reward_stage2": 0.5102513432502747,
"rewards/format_reward_all_stage": 1.975000023841858,
"scores/refine_times": 1.125,
"step": 26
},
{
"completion_length": 95.625,
"epoch": 0.0066699604743083,
"grad_norm": 4.531359745756485,
"kl": 0.0556640625,
"learning_rate": 9.93577075098814e-07,
"loss": 0.0022,
"reward": 2.668905019760132,
"reward_std": 0.0784706324338913,
"rewards/accuracy_reward_stage2": 0.6689050197601318,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.15625,
"step": 27
},
{
"completion_length": 97.859375,
"epoch": 0.00691699604743083,
"grad_norm": 6.538585320360923,
"kl": 0.037841796875,
"learning_rate": 9.933300395256916e-07,
"loss": 0.0015,
"reward": 2.3841779232025146,
"reward_std": 0.26399004459381104,
"rewards/accuracy_reward_stage2": 0.40292802453041077,
"rewards/format_reward_all_stage": 1.9812500476837158,
"scores/refine_times": 1.21875,
"step": 28
},
{
"completion_length": 73.625,
"epoch": 0.00716403162055336,
"grad_norm": 3.595423200658827,
"kl": 0.06103515625,
"learning_rate": 9.930830039525692e-07,
"loss": 0.0025,
"reward": 2.5502519607543945,
"reward_std": 0.053210172802209854,
"rewards/accuracy_reward_stage2": 0.5502521395683289,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 29
},
{
"completion_length": 106.421875,
"epoch": 0.007411067193675889,
"grad_norm": 5.2183549887056975,
"kl": 0.05029296875,
"learning_rate": 9.928359683794467e-07,
"loss": 0.002,
"reward": 2.453716516494751,
"reward_std": 0.18897229433059692,
"rewards/accuracy_reward_stage2": 0.484966516494751,
"rewards/format_reward_all_stage": 1.96875,
"scores/refine_times": 1.328125,
"step": 30
},
{
"completion_length": 63.25,
"epoch": 0.007658102766798419,
"grad_norm": 4.605415382726092,
"kl": 0.061279296875,
"learning_rate": 9.92588932806324e-07,
"loss": 0.0025,
"reward": 2.706653594970703,
"reward_std": 0.16327600181102753,
"rewards/accuracy_reward_stage2": 0.8316534757614136,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0,
"step": 31
},
{
"completion_length": 71.1875,
"epoch": 0.007905138339920948,
"grad_norm": 5.107542561053739,
"kl": 0.04345703125,
"learning_rate": 9.923418972332016e-07,
"loss": 0.0017,
"reward": 2.5002684593200684,
"reward_std": 0.07764653861522675,
"rewards/accuracy_reward_stage2": 0.5002684593200684,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 32
},
{
"completion_length": 87.984375,
"epoch": 0.008152173913043478,
"grad_norm": 5.276629249410866,
"kl": 0.05908203125,
"learning_rate": 9.920948616600791e-07,
"loss": 0.0024,
"reward": 2.567147970199585,
"reward_std": 0.20031458139419556,
"rewards/accuracy_reward_stage2": 0.582772970199585,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.171875,
"step": 33
},
{
"completion_length": 81.375,
"epoch": 0.008399209486166008,
"grad_norm": 6.680603340606892,
"kl": 0.05078125,
"learning_rate": 9.918478260869564e-07,
"loss": 0.002,
"reward": 2.356149435043335,
"reward_std": 0.3229230046272278,
"rewards/accuracy_reward_stage2": 0.4811493754386902,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0,
"step": 34
},
{
"completion_length": 66.25,
"epoch": 0.008646245059288538,
"grad_norm": 4.761831971688755,
"kl": 0.06787109375,
"learning_rate": 9.91600790513834e-07,
"loss": 0.0027,
"reward": 2.679305076599121,
"reward_std": 0.2129913717508316,
"rewards/accuracy_reward_stage2": 0.8043051362037659,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0,
"step": 35
},
{
"completion_length": 64.5625,
"epoch": 0.008893280632411068,
"grad_norm": 4.318337980431364,
"kl": 0.057861328125,
"learning_rate": 9.913537549407113e-07,
"loss": 0.0023,
"reward": 2.641396999359131,
"reward_std": 0.06798752397298813,
"rewards/accuracy_reward_stage2": 0.6413968801498413,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 36
},
{
"completion_length": 81.828125,
"epoch": 0.009140316205533596,
"grad_norm": 6.067477705808883,
"kl": 0.08447265625,
"learning_rate": 9.911067193675888e-07,
"loss": 0.0034,
"reward": 2.5635056495666504,
"reward_std": 0.1310747265815735,
"rewards/accuracy_reward_stage2": 0.5635056495666504,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 37
},
{
"completion_length": 68.625,
"epoch": 0.009387351778656126,
"grad_norm": 5.097067697325507,
"kl": 0.0498046875,
"learning_rate": 9.908596837944664e-07,
"loss": 0.002,
"reward": 2.657144784927368,
"reward_std": 0.14659970998764038,
"rewards/accuracy_reward_stage2": 0.7821449041366577,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0,
"step": 38
},
{
"completion_length": 82.484375,
"epoch": 0.009634387351778656,
"grad_norm": 5.382691839406121,
"kl": 0.05810546875,
"learning_rate": 9.90612648221344e-07,
"loss": 0.0023,
"reward": 2.6429476737976074,
"reward_std": 0.056348949670791626,
"rewards/accuracy_reward_stage2": 0.642947793006897,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 39
},
{
"completion_length": 69.4375,
"epoch": 0.009881422924901186,
"grad_norm": 5.662499218407379,
"kl": 0.05908203125,
"learning_rate": 9.903656126482212e-07,
"loss": 0.0024,
"reward": 2.6908767223358154,
"reward_std": 0.20418381690979004,
"rewards/accuracy_reward_stage2": 0.8158766627311707,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0,
"step": 40
},
{
"completion_length": 67.3125,
"epoch": 0.010128458498023716,
"grad_norm": 4.335602071251039,
"kl": 0.1328125,
"learning_rate": 9.901185770750988e-07,
"loss": 0.0053,
"reward": 2.38730525970459,
"reward_std": 0.0236817616969347,
"rewards/accuracy_reward_stage2": 0.38730525970458984,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 41
},
{
"completion_length": 73.0625,
"epoch": 0.010375494071146246,
"grad_norm": 4.859918176788424,
"kl": 0.06201171875,
"learning_rate": 9.898715415019763e-07,
"loss": 0.0025,
"reward": 2.676914691925049,
"reward_std": 0.2246585190296173,
"rewards/accuracy_reward_stage2": 0.801914632320404,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0,
"step": 42
},
{
"completion_length": 55.1875,
"epoch": 0.010622529644268774,
"grad_norm": 5.676790714973813,
"kl": 0.08740234375,
"learning_rate": 9.896245059288537e-07,
"loss": 0.0035,
"reward": 2.4531073570251465,
"reward_std": 0.14041699469089508,
"rewards/accuracy_reward_stage2": 0.45310738682746887,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 43
},
{
"completion_length": 64.1875,
"epoch": 0.010869565217391304,
"grad_norm": 4.692443081447903,
"kl": 0.07080078125,
"learning_rate": 9.893774703557312e-07,
"loss": 0.0028,
"reward": 2.5724921226501465,
"reward_std": 0.03363148868083954,
"rewards/accuracy_reward_stage2": 0.5724921226501465,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 44
},
{
"completion_length": 68.640625,
"epoch": 0.011116600790513834,
"grad_norm": 4.558828578106905,
"kl": 0.11376953125,
"learning_rate": 9.891304347826085e-07,
"loss": 0.0045,
"reward": 2.6344082355499268,
"reward_std": 0.07256568968296051,
"rewards/accuracy_reward_stage2": 0.6344083547592163,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.09375,
"step": 45
},
{
"completion_length": 61.3125,
"epoch": 0.011363636363636364,
"grad_norm": 2.223285366639568,
"kl": 0.09716796875,
"learning_rate": 9.88883399209486e-07,
"loss": 0.0039,
"reward": 2.777026414871216,
"reward_std": 0.004207036457955837,
"rewards/accuracy_reward_stage2": 0.7770264148712158,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 46
},
{
"completion_length": 56.9375,
"epoch": 0.011610671936758894,
"grad_norm": 4.425636565779575,
"kl": 0.111328125,
"learning_rate": 9.886363636363636e-07,
"loss": 0.0044,
"reward": 2.7204360961914062,
"reward_std": 0.07424846291542053,
"rewards/accuracy_reward_stage2": 0.7204362154006958,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 47
},
{
"completion_length": 57.3125,
"epoch": 0.011857707509881422,
"grad_norm": 3.8051294688279405,
"kl": 0.138671875,
"learning_rate": 9.883893280632411e-07,
"loss": 0.0055,
"reward": 2.5655875205993652,
"reward_std": 0.00956629030406475,
"rewards/accuracy_reward_stage2": 0.5655874013900757,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 48
},
{
"completion_length": 73.625,
"epoch": 0.012104743083003952,
"grad_norm": 4.088750742873491,
"kl": 0.10986328125,
"learning_rate": 9.881422924901185e-07,
"loss": 0.0044,
"reward": 2.7474145889282227,
"reward_std": 0.12706872820854187,
"rewards/accuracy_reward_stage2": 0.8099147081375122,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.125,
"step": 49
},
{
"completion_length": 82.171875,
"epoch": 0.012351778656126482,
"grad_norm": 4.328291131941699,
"kl": 0.12890625,
"learning_rate": 9.87895256916996e-07,
"loss": 0.0052,
"reward": 2.3505635261535645,
"reward_std": 0.19469095766544342,
"rewards/accuracy_reward_stage2": 0.538063645362854,
"rewards/format_reward_all_stage": 1.8125,
"scores/refine_times": 1.25,
"step": 50
},
{
"completion_length": 60.578125,
"epoch": 0.012598814229249012,
"grad_norm": 5.181949589681223,
"kl": 0.12255859375,
"learning_rate": 9.876482213438736e-07,
"loss": 0.0049,
"reward": 2.6888465881347656,
"reward_std": 0.09551382809877396,
"rewards/accuracy_reward_stage2": 0.6888466477394104,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 51
},
{
"completion_length": 62.3125,
"epoch": 0.012845849802371542,
"grad_norm": 5.337786602273433,
"kl": 0.1201171875,
"learning_rate": 9.874011857707509e-07,
"loss": 0.0048,
"reward": 2.4908738136291504,
"reward_std": 0.19815078377723694,
"rewards/accuracy_reward_stage2": 0.6158738136291504,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0,
"step": 52
},
{
"completion_length": 55.40625,
"epoch": 0.013092885375494072,
"grad_norm": 3.890124070261549,
"kl": 0.1103515625,
"learning_rate": 9.871541501976284e-07,
"loss": 0.0044,
"reward": 2.40610933303833,
"reward_std": 0.18748030066490173,
"rewards/accuracy_reward_stage2": 0.5311094522476196,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.15625,
"step": 53
},
{
"completion_length": 74.390625,
"epoch": 0.0133399209486166,
"grad_norm": 4.895191389307928,
"kl": 0.130859375,
"learning_rate": 9.86907114624506e-07,
"loss": 0.0053,
"reward": 2.4812636375427246,
"reward_std": 0.05580512434244156,
"rewards/accuracy_reward_stage2": 0.48126381635665894,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.28125,
"step": 54
},
{
"completion_length": 71.96875,
"epoch": 0.01358695652173913,
"grad_norm": 4.883363116264003,
"kl": 0.1279296875,
"learning_rate": 9.866600790513833e-07,
"loss": 0.0051,
"reward": 2.406097650527954,
"reward_std": 0.22400271892547607,
"rewards/accuracy_reward_stage2": 0.5935976505279541,
"rewards/format_reward_all_stage": 1.8125,
"scores/refine_times": 1.140625,
"step": 55
},
{
"completion_length": 85.546875,
"epoch": 0.01383399209486166,
"grad_norm": 3.868475183095971,
"kl": 0.11181640625,
"learning_rate": 9.864130434782608e-07,
"loss": 0.0045,
"reward": 2.4206976890563965,
"reward_std": 0.07123995572328568,
"rewards/accuracy_reward_stage2": 0.42694777250289917,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.40625,
"step": 56
},
{
"completion_length": 66.890625,
"epoch": 0.01408102766798419,
"grad_norm": 5.0953233667661095,
"kl": 0.0966796875,
"learning_rate": 9.861660079051384e-07,
"loss": 0.0039,
"reward": 2.7497503757476807,
"reward_std": 0.1626994013786316,
"rewards/accuracy_reward_stage2": 0.8747504949569702,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.078125,
"step": 57
},
{
"completion_length": 63.90625,
"epoch": 0.01432806324110672,
"grad_norm": 4.880185819937392,
"kl": 0.064453125,
"learning_rate": 9.85918972332016e-07,
"loss": 0.0026,
"reward": 2.616400957107544,
"reward_std": 0.10538823902606964,
"rewards/accuracy_reward_stage2": 0.6851509213447571,
"rewards/format_reward_all_stage": 1.931249976158142,
"scores/refine_times": 1.28125,
"step": 58
},
{
"completion_length": 57.1875,
"epoch": 0.01457509881422925,
"grad_norm": 1.809541273776412,
"kl": 0.0869140625,
"learning_rate": 9.856719367588932e-07,
"loss": 0.0035,
"reward": 2.748608112335205,
"reward_std": 0.009172855876386166,
"rewards/accuracy_reward_stage2": 0.7486082315444946,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 59
},
{
"completion_length": 73.890625,
"epoch": 0.014822134387351778,
"grad_norm": 4.654926557564929,
"kl": 0.111328125,
"learning_rate": 9.854249011857708e-07,
"loss": 0.0045,
"reward": 2.5640292167663574,
"reward_std": 0.03297269344329834,
"rewards/accuracy_reward_stage2": 0.5640289783477783,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.265625,
"step": 60
},
{
"completion_length": 86.78125,
"epoch": 0.015069169960474308,
"grad_norm": 4.5725168834876975,
"kl": 0.0712890625,
"learning_rate": 9.85177865612648e-07,
"loss": 0.0029,
"reward": 2.499513626098633,
"reward_std": 0.2176150381565094,
"rewards/accuracy_reward_stage2": 0.5849303603172302,
"rewards/format_reward_all_stage": 1.9145833253860474,
"scores/refine_times": 1.375,
"step": 61
},
{
"completion_length": 70.90625,
"epoch": 0.015316205533596838,
"grad_norm": 2.585985758788352,
"kl": 0.076171875,
"learning_rate": 9.849308300395256e-07,
"loss": 0.003,
"reward": 2.589341640472412,
"reward_std": 0.06622620671987534,
"rewards/accuracy_reward_stage2": 0.6424667239189148,
"rewards/format_reward_all_stage": 1.946874976158142,
"scores/refine_times": 1.171875,
"step": 62
},
{
"completion_length": 68.046875,
"epoch": 0.015563241106719368,
"grad_norm": 6.250671138658764,
"kl": 0.1904296875,
"learning_rate": 9.846837944664032e-07,
"loss": 0.0076,
"reward": 2.6439754962921143,
"reward_std": 0.31126198172569275,
"rewards/accuracy_reward_stage2": 0.8262671232223511,
"rewards/format_reward_all_stage": 1.8177083730697632,
"scores/refine_times": 1.078125,
"step": 63
},
{
"completion_length": 101.96875,
"epoch": 0.015810276679841896,
"grad_norm": 4.14719666980477,
"kl": 0.064453125,
"learning_rate": 9.844367588932805e-07,
"loss": 0.0026,
"reward": 2.47231388092041,
"reward_std": 0.32240432500839233,
"rewards/accuracy_reward_stage2": 0.7478347420692444,
"rewards/format_reward_all_stage": 1.7244791984558105,
"scores/refine_times": 1.390625,
"step": 64
},
{
"completion_length": 73.71875,
"epoch": 0.016057312252964428,
"grad_norm": 5.183498665871656,
"kl": 0.07861328125,
"learning_rate": 9.84189723320158e-07,
"loss": 0.0032,
"reward": 2.67384672164917,
"reward_std": 0.08753049373626709,
"rewards/accuracy_reward_stage2": 0.6738468408584595,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 65
},
{
"completion_length": 67.578125,
"epoch": 0.016304347826086956,
"grad_norm": 4.8680399656731845,
"kl": 0.05322265625,
"learning_rate": 9.839426877470356e-07,
"loss": 0.0021,
"reward": 2.583557605743408,
"reward_std": 0.12607493996620178,
"rewards/accuracy_reward_stage2": 0.5835577845573425,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 66
},
{
"completion_length": 66.875,
"epoch": 0.016551383399209488,
"grad_norm": 6.343983489288195,
"kl": 0.06982421875,
"learning_rate": 9.836956521739131e-07,
"loss": 0.0028,
"reward": 2.603722095489502,
"reward_std": 0.11146017163991928,
"rewards/accuracy_reward_stage2": 0.6037219762802124,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 67
},
{
"completion_length": 85.46875,
"epoch": 0.016798418972332016,
"grad_norm": 5.404842087971315,
"kl": 0.1083984375,
"learning_rate": 9.834486166007905e-07,
"loss": 0.0043,
"reward": 2.205897808074951,
"reward_std": 0.2572137117385864,
"rewards/accuracy_reward_stage2": 0.41527265310287476,
"rewards/format_reward_all_stage": 1.790624976158142,
"scores/refine_times": 1.25,
"step": 68
},
{
"completion_length": 88.125,
"epoch": 0.017045454545454544,
"grad_norm": 4.340005446967431,
"kl": 0.052734375,
"learning_rate": 9.83201581027668e-07,
"loss": 0.0021,
"reward": 2.445634126663208,
"reward_std": 0.36941787600517273,
"rewards/accuracy_reward_stage2": 0.6956342458724976,
"rewards/format_reward_all_stage": 1.75,
"scores/refine_times": 1.0,
"step": 69
},
{
"completion_length": 86.671875,
"epoch": 0.017292490118577076,
"grad_norm": 4.28977539978035,
"kl": 0.043701171875,
"learning_rate": 9.829545454545453e-07,
"loss": 0.0018,
"reward": 2.5991945266723633,
"reward_std": 0.10131914913654327,
"rewards/accuracy_reward_stage2": 0.5991945862770081,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 70
},
{
"completion_length": 70.0,
"epoch": 0.017539525691699604,
"grad_norm": 4.632147156699228,
"kl": 0.054443359375,
"learning_rate": 9.827075098814229e-07,
"loss": 0.0022,
"reward": 2.4850528240203857,
"reward_std": 0.15690843760967255,
"rewards/accuracy_reward_stage2": 0.6100528240203857,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0625,
"step": 71
},
{
"completion_length": 74.0625,
"epoch": 0.017786561264822136,
"grad_norm": 7.003739986456345,
"kl": 0.06396484375,
"learning_rate": 9.824604743083004e-07,
"loss": 0.0026,
"reward": 2.482394218444824,
"reward_std": 0.38058096170425415,
"rewards/accuracy_reward_stage2": 0.6073942184448242,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0,
"step": 72
},
{
"completion_length": 65.375,
"epoch": 0.018033596837944664,
"grad_norm": 3.770333175418537,
"kl": 0.0634765625,
"learning_rate": 9.822134387351777e-07,
"loss": 0.0025,
"reward": 2.595181703567505,
"reward_std": 0.00595674104988575,
"rewards/accuracy_reward_stage2": 0.5951815247535706,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 73
},
{
"completion_length": 79.703125,
"epoch": 0.018280632411067192,
"grad_norm": 4.384564430068585,
"kl": 0.0634765625,
"learning_rate": 9.819664031620553e-07,
"loss": 0.0025,
"reward": 2.560758590698242,
"reward_std": 0.11315355449914932,
"rewards/accuracy_reward_stage2": 0.5795084834098816,
"rewards/format_reward_all_stage": 1.9812500476837158,
"scores/refine_times": 1.15625,
"step": 74
},
{
"completion_length": 81.9375,
"epoch": 0.018527667984189724,
"grad_norm": 6.555813557025346,
"kl": 0.051025390625,
"learning_rate": 9.817193675889328e-07,
"loss": 0.002,
"reward": 2.257451295852661,
"reward_std": 0.10627569258213043,
"rewards/accuracy_reward_stage2": 0.25745123624801636,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.15625,
"step": 75
},
{
"completion_length": 71.609375,
"epoch": 0.018774703557312252,
"grad_norm": 2.549306492315234,
"kl": 0.0634765625,
"learning_rate": 9.814723320158103e-07,
"loss": 0.0025,
"reward": 2.7328977584838867,
"reward_std": 0.06272567808628082,
"rewards/accuracy_reward_stage2": 0.7328977584838867,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 76
},
{
"completion_length": 78.375,
"epoch": 0.019021739130434784,
"grad_norm": 2.5658656287329453,
"kl": 0.07373046875,
"learning_rate": 9.812252964426877e-07,
"loss": 0.003,
"reward": 2.7386789321899414,
"reward_std": 0.021051663905382156,
"rewards/accuracy_reward_stage2": 0.7386791706085205,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.15625,
"step": 77
},
{
"completion_length": 99.03125,
"epoch": 0.019268774703557312,
"grad_norm": 4.112277800837744,
"kl": 0.06787109375,
"learning_rate": 9.809782608695652e-07,
"loss": 0.0027,
"reward": 2.516645908355713,
"reward_std": 0.11490845680236816,
"rewards/accuracy_reward_stage2": 0.5166457891464233,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.34375,
"step": 78
},
{
"completion_length": 68.75,
"epoch": 0.01951581027667984,
"grad_norm": 1.7167798311826008,
"kl": 0.07080078125,
"learning_rate": 9.807312252964425e-07,
"loss": 0.0028,
"reward": 2.8159339427948,
"reward_std": 0.0637812465429306,
"rewards/accuracy_reward_stage2": 0.8159340620040894,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 79
},
{
"completion_length": 75.578125,
"epoch": 0.019762845849802372,
"grad_norm": 5.315603208425231,
"kl": 0.06298828125,
"learning_rate": 9.8048418972332e-07,
"loss": 0.0025,
"reward": 2.4659554958343506,
"reward_std": 0.020968768745660782,
"rewards/accuracy_reward_stage2": 0.46595555543899536,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 80
},
{
"completion_length": 82.765625,
"epoch": 0.0200098814229249,
"grad_norm": 5.529728305997517,
"kl": 0.061767578125,
"learning_rate": 9.802371541501976e-07,
"loss": 0.0025,
"reward": 2.5143938064575195,
"reward_std": 0.24701416492462158,
"rewards/accuracy_reward_stage2": 0.5143939256668091,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 81
},
{
"completion_length": 72.25,
"epoch": 0.020256916996047432,
"grad_norm": 5.475938413894696,
"kl": 0.0634765625,
"learning_rate": 9.79990118577075e-07,
"loss": 0.0025,
"reward": 2.6759824752807617,
"reward_std": 0.2955029606819153,
"rewards/accuracy_reward_stage2": 0.8009825944900513,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0625,
"step": 82
},
{
"completion_length": 70.84375,
"epoch": 0.02050395256916996,
"grad_norm": 6.444654069519576,
"kl": 0.08740234375,
"learning_rate": 9.797430830039525e-07,
"loss": 0.0035,
"reward": 2.538287878036499,
"reward_std": 0.12264619767665863,
"rewards/accuracy_reward_stage2": 0.5382877588272095,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.09375,
"step": 83
},
{
"completion_length": 71.3125,
"epoch": 0.020750988142292492,
"grad_norm": 3.595656141306728,
"kl": 0.09033203125,
"learning_rate": 9.7949604743083e-07,
"loss": 0.0036,
"reward": 2.7061915397644043,
"reward_std": 0.011706423945724964,
"rewards/accuracy_reward_stage2": 0.7061916589736938,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 84
},
{
"completion_length": 102.09375,
"epoch": 0.02099802371541502,
"grad_norm": 5.812639601627871,
"kl": 0.095703125,
"learning_rate": 9.792490118577076e-07,
"loss": 0.0038,
"reward": 2.2463674545288086,
"reward_std": 0.37936723232269287,
"rewards/accuracy_reward_stage2": 0.4276173710823059,
"rewards/format_reward_all_stage": 1.8187499046325684,
"scores/refine_times": 1.265625,
"step": 85
},
{
"completion_length": 82.578125,
"epoch": 0.021245059288537548,
"grad_norm": 5.621200937633171,
"kl": 0.07373046875,
"learning_rate": 9.79001976284585e-07,
"loss": 0.003,
"reward": 2.2440762519836426,
"reward_std": 0.26927846670150757,
"rewards/accuracy_reward_stage2": 0.3690761625766754,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0625,
"step": 86
},
{
"completion_length": 92.609375,
"epoch": 0.02149209486166008,
"grad_norm": 5.197685430679634,
"kl": 0.07666015625,
"learning_rate": 9.787549407114624e-07,
"loss": 0.0031,
"reward": 2.726613759994507,
"reward_std": 0.14432454109191895,
"rewards/accuracy_reward_stage2": 0.7266137599945068,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.234375,
"step": 87
},
{
"completion_length": 91.125,
"epoch": 0.021739130434782608,
"grad_norm": 2.9714055097674095,
"kl": 0.0908203125,
"learning_rate": 9.7850790513834e-07,
"loss": 0.0036,
"reward": 2.7476940155029297,
"reward_std": 0.22233270108699799,
"rewards/accuracy_reward_stage2": 0.8883191347122192,
"rewards/format_reward_all_stage": 1.859375,
"scores/refine_times": 1.3125,
"step": 88
},
{
"completion_length": 95.90625,
"epoch": 0.02198616600790514,
"grad_norm": 4.966352277090883,
"kl": 0.083984375,
"learning_rate": 9.782608695652173e-07,
"loss": 0.0034,
"reward": 2.6650359630584717,
"reward_std": 0.04115064814686775,
"rewards/accuracy_reward_stage2": 0.6650359034538269,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.21875,
"step": 89
},
{
"completion_length": 91.6875,
"epoch": 0.022233201581027668,
"grad_norm": 3.475945047080351,
"kl": 0.061279296875,
"learning_rate": 9.780138339920948e-07,
"loss": 0.0024,
"reward": 2.6070141792297363,
"reward_std": 0.1261502206325531,
"rewards/accuracy_reward_stage2": 0.6070142388343811,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.21875,
"step": 90
},
{
"completion_length": 60.6875,
"epoch": 0.022480237154150196,
"grad_norm": 4.508462809688603,
"kl": 0.07373046875,
"learning_rate": 9.777667984189722e-07,
"loss": 0.0029,
"reward": 2.6952967643737793,
"reward_std": 0.08862090110778809,
"rewards/accuracy_reward_stage2": 0.6952967643737793,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 91
},
{
"completion_length": 85.203125,
"epoch": 0.022727272727272728,
"grad_norm": 6.403954189418028,
"kl": 0.068359375,
"learning_rate": 9.775197628458497e-07,
"loss": 0.0027,
"reward": 2.4433703422546387,
"reward_std": 0.20903919637203217,
"rewards/accuracy_reward_stage2": 0.5058705806732178,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.125,
"step": 92
},
{
"completion_length": 77.125,
"epoch": 0.022974308300395256,
"grad_norm": 3.6283748627126804,
"kl": 0.068359375,
"learning_rate": 9.772727272727273e-07,
"loss": 0.0027,
"reward": 2.6447415351867676,
"reward_std": 0.08912975341081619,
"rewards/accuracy_reward_stage2": 0.6447416543960571,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 93
},
{
"completion_length": 95.125,
"epoch": 0.023221343873517788,
"grad_norm": 5.182312100272035,
"kl": 0.0712890625,
"learning_rate": 9.770256916996048e-07,
"loss": 0.0029,
"reward": 2.412106990814209,
"reward_std": 0.03851859271526337,
"rewards/accuracy_reward_stage2": 0.4121071696281433,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.15625,
"step": 94
},
{
"completion_length": 82.875,
"epoch": 0.023468379446640316,
"grad_norm": 4.024559027790135,
"kl": 0.055419921875,
"learning_rate": 9.767786561264821e-07,
"loss": 0.0022,
"reward": 2.6596741676330566,
"reward_std": 0.08960773050785065,
"rewards/accuracy_reward_stage2": 0.6596741676330566,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 95
},
{
"completion_length": 97.015625,
"epoch": 0.023715415019762844,
"grad_norm": 3.8218241857549216,
"kl": 0.06640625,
"learning_rate": 9.765316205533597e-07,
"loss": 0.0027,
"reward": 2.511441707611084,
"reward_std": 0.1077704057097435,
"rewards/accuracy_reward_stage2": 0.511441707611084,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.234375,
"step": 96
},
{
"completion_length": 84.578125,
"epoch": 0.023962450592885376,
"grad_norm": 5.000582952776722,
"kl": 0.07568359375,
"learning_rate": 9.762845849802372e-07,
"loss": 0.003,
"reward": 2.4915847778320312,
"reward_std": 0.1341104507446289,
"rewards/accuracy_reward_stage2": 0.4915849566459656,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.265625,
"step": 97
},
{
"completion_length": 63.15625,
"epoch": 0.024209486166007904,
"grad_norm": 5.040940426798209,
"kl": 0.09912109375,
"learning_rate": 9.760375494071145e-07,
"loss": 0.004,
"reward": 2.692129611968994,
"reward_std": 0.09403635561466217,
"rewards/accuracy_reward_stage2": 0.6921296119689941,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 98
},
{
"completion_length": 89.546875,
"epoch": 0.024456521739130436,
"grad_norm": 4.932061600457188,
"kl": 0.060302734375,
"learning_rate": 9.75790513833992e-07,
"loss": 0.0024,
"reward": 2.525324821472168,
"reward_std": 0.15960124135017395,
"rewards/accuracy_reward_stage2": 0.5878250598907471,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.125,
"step": 99
},
{
"completion_length": 72.640625,
"epoch": 0.024703557312252964,
"grad_norm": 4.56806382193382,
"kl": 0.064453125,
"learning_rate": 9.755434782608694e-07,
"loss": 0.0026,
"reward": 2.7026538848876953,
"reward_std": 0.1532786637544632,
"rewards/accuracy_reward_stage2": 0.7729662656784058,
"rewards/format_reward_all_stage": 1.9296875,
"scores/refine_times": 1.125,
"step": 100
},
{
"completion_length": 84.03125,
"epoch": 0.024950592885375496,
"grad_norm": 3.7622549256413014,
"kl": 0.07861328125,
"learning_rate": 9.75296442687747e-07,
"loss": 0.0031,
"reward": 2.604541778564453,
"reward_std": 0.1364867091178894,
"rewards/accuracy_reward_stage2": 0.6107918620109558,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.1875,
"step": 101
},
{
"completion_length": 93.9375,
"epoch": 0.025197628458498024,
"grad_norm": 4.0252707546161774,
"kl": 0.08251953125,
"learning_rate": 9.750494071146245e-07,
"loss": 0.0033,
"reward": 2.581124782562256,
"reward_std": 0.07142534106969833,
"rewards/accuracy_reward_stage2": 0.5811247229576111,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.203125,
"step": 102
},
{
"completion_length": 84.109375,
"epoch": 0.025444664031620552,
"grad_norm": 2.979249409154134,
"kl": 0.06640625,
"learning_rate": 9.74802371541502e-07,
"loss": 0.0027,
"reward": 2.6580286026000977,
"reward_std": 0.07319141179323196,
"rewards/accuracy_reward_stage2": 0.6580287218093872,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.171875,
"step": 103
},
{
"completion_length": 61.125,
"epoch": 0.025691699604743084,
"grad_norm": 0.28783476698060495,
"kl": 0.061279296875,
"learning_rate": 9.745553359683793e-07,
"loss": 0.0024,
"reward": 2.686764717102051,
"reward_std": 0.0,
"rewards/accuracy_reward_stage2": 0.6867647767066956,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 104
},
{
"completion_length": 113.265625,
"epoch": 0.025938735177865612,
"grad_norm": 4.594788362626608,
"kl": 0.05810546875,
"learning_rate": 9.743083003952569e-07,
"loss": 0.0023,
"reward": 2.4998667240142822,
"reward_std": 0.04374603182077408,
"rewards/accuracy_reward_stage2": 0.4998666048049927,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 105
},
{
"completion_length": 72.0625,
"epoch": 0.026185770750988144,
"grad_norm": 4.674866102803609,
"kl": 0.048583984375,
"learning_rate": 9.740612648221344e-07,
"loss": 0.002,
"reward": 2.6107826232910156,
"reward_std": 0.05181875079870224,
"rewards/accuracy_reward_stage2": 0.6107826828956604,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 106
},
{
"completion_length": 76.8125,
"epoch": 0.026432806324110672,
"grad_norm": 4.443840034303371,
"kl": 0.05224609375,
"learning_rate": 9.738142292490117e-07,
"loss": 0.0021,
"reward": 2.552403450012207,
"reward_std": 0.14651192724704742,
"rewards/accuracy_reward_stage2": 0.552403450012207,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 107
},
{
"completion_length": 81.765625,
"epoch": 0.0266798418972332,
"grad_norm": 2.0441554829485145,
"kl": 0.06298828125,
"learning_rate": 9.735671936758893e-07,
"loss": 0.0025,
"reward": 2.7436699867248535,
"reward_std": 0.037195369601249695,
"rewards/accuracy_reward_stage2": 0.7436702251434326,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 108
},
{
"completion_length": 128.015625,
"epoch": 0.026926877470355732,
"grad_norm": 3.968644058753578,
"kl": 0.08984375,
"learning_rate": 9.733201581027668e-07,
"loss": 0.0036,
"reward": 2.570418357849121,
"reward_std": 0.11060214787721634,
"rewards/accuracy_reward_stage2": 0.6329183578491211,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.40625,
"step": 109
},
{
"completion_length": 136.390625,
"epoch": 0.02717391304347826,
"grad_norm": 1.5281023437512695,
"kl": 0.050537109375,
"learning_rate": 9.730731225296442e-07,
"loss": 0.002,
"reward": 2.631211519241333,
"reward_std": 0.07405112683773041,
"rewards/accuracy_reward_stage2": 0.6312115788459778,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.390625,
"step": 110
},
{
"completion_length": 103.125,
"epoch": 0.027420948616600792,
"grad_norm": 4.3848043520379605,
"kl": 0.07275390625,
"learning_rate": 9.728260869565217e-07,
"loss": 0.0029,
"reward": 2.532392978668213,
"reward_std": 0.2085559368133545,
"rewards/accuracy_reward_stage2": 0.5948929190635681,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.125,
"step": 111
},
{
"completion_length": 101.4375,
"epoch": 0.02766798418972332,
"grad_norm": 3.9823453743402126,
"kl": 0.05810546875,
"learning_rate": 9.72579051383399e-07,
"loss": 0.0023,
"reward": 2.5878045558929443,
"reward_std": 0.12208257615566254,
"rewards/accuracy_reward_stage2": 0.5940545797348022,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.203125,
"step": 112
},
{
"completion_length": 108.640625,
"epoch": 0.027915019762845848,
"grad_norm": 2.487090870664229,
"kl": 0.0341796875,
"learning_rate": 9.723320158102768e-07,
"loss": 0.0014,
"reward": 2.7555534839630127,
"reward_std": 0.015020077116787434,
"rewards/accuracy_reward_stage2": 0.7555533647537231,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 113
},
{
"completion_length": 149.234375,
"epoch": 0.02816205533596838,
"grad_norm": 1.5545707236666886,
"kl": 0.036376953125,
"learning_rate": 9.72084980237154e-07,
"loss": 0.0015,
"reward": 2.5128674507141113,
"reward_std": 0.08764077723026276,
"rewards/accuracy_reward_stage2": 0.5347423553466797,
"rewards/format_reward_all_stage": 1.978124976158142,
"scores/refine_times": 1.53125,
"step": 114
},
{
"completion_length": 111.0625,
"epoch": 0.028409090909090908,
"grad_norm": 3.9159999459644754,
"kl": 0.047607421875,
"learning_rate": 9.718379446640316e-07,
"loss": 0.0019,
"reward": 2.7607569694519043,
"reward_std": 0.22762958705425262,
"rewards/accuracy_reward_stage2": 0.7670071125030518,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.21875,
"step": 115
},
{
"completion_length": 91.890625,
"epoch": 0.02865612648221344,
"grad_norm": 2.314903676922173,
"kl": 0.037841796875,
"learning_rate": 9.71590909090909e-07,
"loss": 0.0015,
"reward": 2.563828945159912,
"reward_std": 0.0933101698756218,
"rewards/accuracy_reward_stage2": 0.6263290047645569,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.0625,
"step": 116
},
{
"completion_length": 119.109375,
"epoch": 0.028903162055335968,
"grad_norm": 4.047428002907611,
"kl": 0.04296875,
"learning_rate": 9.713438735177865e-07,
"loss": 0.0017,
"reward": 2.6038811206817627,
"reward_std": 0.20530042052268982,
"rewards/accuracy_reward_stage2": 0.6038811206817627,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.3125,
"step": 117
},
{
"completion_length": 107.71875,
"epoch": 0.0291501976284585,
"grad_norm": 3.3403630742196513,
"kl": 0.0400390625,
"learning_rate": 9.71096837944664e-07,
"loss": 0.0016,
"reward": 2.4648709297180176,
"reward_std": 0.13606658577919006,
"rewards/accuracy_reward_stage2": 0.46487098932266235,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.171875,
"step": 118
},
{
"completion_length": 104.828125,
"epoch": 0.029397233201581028,
"grad_norm": 3.9940498248823206,
"kl": 0.039306640625,
"learning_rate": 9.708498023715414e-07,
"loss": 0.0016,
"reward": 2.6417078971862793,
"reward_std": 0.0913887768983841,
"rewards/accuracy_reward_stage2": 0.6417078971862793,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 119
},
{
"completion_length": 97.1875,
"epoch": 0.029644268774703556,
"grad_norm": 3.73814994296708,
"kl": 0.042724609375,
"learning_rate": 9.70602766798419e-07,
"loss": 0.0017,
"reward": 2.562872886657715,
"reward_std": 0.07472334057092667,
"rewards/accuracy_reward_stage2": 0.5628727078437805,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 120
},
{
"completion_length": 132.296875,
"epoch": 0.029891304347826088,
"grad_norm": 2.9384630818108612,
"kl": 0.058349609375,
"learning_rate": 9.703557312252962e-07,
"loss": 0.0023,
"reward": 2.676942825317383,
"reward_std": 0.12600594758987427,
"rewards/accuracy_reward_stage2": 0.7378804087638855,
"rewards/format_reward_all_stage": 1.939062476158142,
"scores/refine_times": 1.546875,
"step": 121
},
{
"completion_length": 127.84375,
"epoch": 0.030138339920948616,
"grad_norm": 3.0365235817191767,
"kl": 0.05029296875,
"learning_rate": 9.70108695652174e-07,
"loss": 0.002,
"reward": 2.704944133758545,
"reward_std": 0.07274624705314636,
"rewards/accuracy_reward_stage2": 0.7257775664329529,
"rewards/format_reward_all_stage": 1.9791667461395264,
"scores/refine_times": 1.390625,
"step": 122
},
{
"completion_length": 100.734375,
"epoch": 0.030385375494071148,
"grad_norm": 3.5805778693833235,
"kl": 0.042236328125,
"learning_rate": 9.698616600790513e-07,
"loss": 0.0017,
"reward": 2.7929563522338867,
"reward_std": 0.12518826127052307,
"rewards/accuracy_reward_stage2": 0.803372859954834,
"rewards/format_reward_all_stage": 1.9895832538604736,
"scores/refine_times": 1.140625,
"step": 123
},
{
"completion_length": 79.609375,
"epoch": 0.030632411067193676,
"grad_norm": 1.9142569203178765,
"kl": 0.049560546875,
"learning_rate": 9.696146245059289e-07,
"loss": 0.002,
"reward": 2.949510335922241,
"reward_std": 0.01902618445456028,
"rewards/accuracy_reward_stage2": 0.9495103359222412,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 124
},
{
"completion_length": 115.65625,
"epoch": 0.030879446640316204,
"grad_norm": 3.192939893717315,
"kl": 0.049560546875,
"learning_rate": 9.693675889328062e-07,
"loss": 0.002,
"reward": 2.6586837768554688,
"reward_std": 0.07593096792697906,
"rewards/accuracy_reward_stage2": 0.6586835980415344,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.34375,
"step": 125
},
{
"completion_length": 115.65625,
"epoch": 0.031126482213438736,
"grad_norm": 4.448628078761415,
"kl": 0.06005859375,
"learning_rate": 9.691205533596837e-07,
"loss": 0.0024,
"reward": 2.4910385608673096,
"reward_std": 0.19780105352401733,
"rewards/accuracy_reward_stage2": 0.5014550685882568,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.28125,
"step": 126
},
{
"completion_length": 102.5625,
"epoch": 0.031373517786561264,
"grad_norm": 4.306073074264426,
"kl": 0.04736328125,
"learning_rate": 9.688735177865613e-07,
"loss": 0.0019,
"reward": 2.328153133392334,
"reward_std": 0.06073104217648506,
"rewards/accuracy_reward_stage2": 0.32815316319465637,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.09375,
"step": 127
},
{
"completion_length": 120.96875,
"epoch": 0.03162055335968379,
"grad_norm": 2.5626638879865524,
"kl": 0.05078125,
"learning_rate": 9.686264822134386e-07,
"loss": 0.002,
"reward": 2.7157416343688965,
"reward_std": 0.09564615786075592,
"rewards/accuracy_reward_stage2": 0.7157415151596069,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.328125,
"step": 128
},
{
"completion_length": 120.0,
"epoch": 0.03186758893280633,
"grad_norm": 2.872358125881173,
"kl": 0.061279296875,
"learning_rate": 9.683794466403161e-07,
"loss": 0.0025,
"reward": 2.68588924407959,
"reward_std": 0.03977838158607483,
"rewards/accuracy_reward_stage2": 0.6858893632888794,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.234375,
"step": 129
},
{
"completion_length": 122.8125,
"epoch": 0.032114624505928856,
"grad_norm": 4.061774924569719,
"kl": 0.064453125,
"learning_rate": 9.681324110671937e-07,
"loss": 0.0026,
"reward": 2.5638954639434814,
"reward_std": 0.1816643923521042,
"rewards/accuracy_reward_stage2": 0.5638953447341919,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.484375,
"step": 130
},
{
"completion_length": 84.875,
"epoch": 0.032361660079051384,
"grad_norm": 4.115640976165146,
"kl": 0.047119140625,
"learning_rate": 9.678853754940712e-07,
"loss": 0.0019,
"reward": 2.612516403198242,
"reward_std": 0.19447211921215057,
"rewards/accuracy_reward_stage2": 0.6125162839889526,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 131
},
{
"completion_length": 138.390625,
"epoch": 0.03260869565217391,
"grad_norm": 3.44122277510103,
"kl": 0.044189453125,
"learning_rate": 9.676383399209485e-07,
"loss": 0.0018,
"reward": 2.6405930519104004,
"reward_std": 0.10844551026821136,
"rewards/accuracy_reward_stage2": 0.6405929327011108,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.609375,
"step": 132
},
{
"completion_length": 114.34375,
"epoch": 0.03285573122529644,
"grad_norm": 3.3056082707747776,
"kl": 0.04638671875,
"learning_rate": 9.67391304347826e-07,
"loss": 0.0019,
"reward": 2.246976375579834,
"reward_std": 0.13054856657981873,
"rewards/accuracy_reward_stage2": 0.25322651863098145,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.25,
"step": 133
},
{
"completion_length": 91.25,
"epoch": 0.033102766798418976,
"grad_norm": 1.7128079673479029,
"kl": 0.048583984375,
"learning_rate": 9.671442687747036e-07,
"loss": 0.0019,
"reward": 2.833665370941162,
"reward_std": 0.010462751612067223,
"rewards/accuracy_reward_stage2": 0.8336653709411621,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 134
},
{
"completion_length": 113.765625,
"epoch": 0.033349802371541504,
"grad_norm": 3.3425383925164325,
"kl": 0.04052734375,
"learning_rate": 9.66897233201581e-07,
"loss": 0.0016,
"reward": 2.676530122756958,
"reward_std": 0.14146235585212708,
"rewards/accuracy_reward_stage2": 0.6827802062034607,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.359375,
"step": 135
},
{
"completion_length": 130.125,
"epoch": 0.03359683794466403,
"grad_norm": 1.1676928781318863,
"kl": 0.048583984375,
"learning_rate": 9.666501976284585e-07,
"loss": 0.0019,
"reward": 2.4324049949645996,
"reward_std": 0.10420753061771393,
"rewards/accuracy_reward_stage2": 0.44490504264831543,
"rewards/format_reward_all_stage": 1.9874999523162842,
"scores/refine_times": 1.484375,
"step": 136
},
{
"completion_length": 95.421875,
"epoch": 0.03384387351778656,
"grad_norm": 3.1237238960935723,
"kl": 0.047607421875,
"learning_rate": 9.664031620553358e-07,
"loss": 0.0019,
"reward": 2.6851418018341064,
"reward_std": 0.14544668793678284,
"rewards/accuracy_reward_stage2": 0.7163918614387512,
"rewards/format_reward_all_stage": 1.96875,
"scores/refine_times": 1.328125,
"step": 137
},
{
"completion_length": 96.8125,
"epoch": 0.03409090909090909,
"grad_norm": 1.7079196696529952,
"kl": 0.041015625,
"learning_rate": 9.661561264822134e-07,
"loss": 0.0016,
"reward": 2.799861431121826,
"reward_std": 0.09301671385765076,
"rewards/accuracy_reward_stage2": 0.8061115145683289,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.3125,
"step": 138
},
{
"completion_length": 87.8125,
"epoch": 0.034337944664031624,
"grad_norm": 2.4549781302932483,
"kl": 0.040771484375,
"learning_rate": 9.65909090909091e-07,
"loss": 0.0016,
"reward": 2.8591620922088623,
"reward_std": 0.0772564709186554,
"rewards/accuracy_reward_stage2": 0.8591620922088623,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.28125,
"step": 139
},
{
"completion_length": 105.484375,
"epoch": 0.03458498023715415,
"grad_norm": 4.243346429014275,
"kl": 0.05615234375,
"learning_rate": 9.656620553359684e-07,
"loss": 0.0022,
"reward": 2.712839365005493,
"reward_std": 0.12247423827648163,
"rewards/accuracy_reward_stage2": 0.7232558727264404,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.328125,
"step": 140
},
{
"completion_length": 73.890625,
"epoch": 0.03483201581027668,
"grad_norm": 3.8646588914693787,
"kl": 0.05126953125,
"learning_rate": 9.654150197628458e-07,
"loss": 0.002,
"reward": 2.6789019107818604,
"reward_std": 0.10791897773742676,
"rewards/accuracy_reward_stage2": 0.6789019107818604,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 141
},
{
"completion_length": 102.359375,
"epoch": 0.03507905138339921,
"grad_norm": 4.4491213672022285,
"kl": 0.06787109375,
"learning_rate": 9.651679841897233e-07,
"loss": 0.0027,
"reward": 2.4613142013549805,
"reward_std": 0.12597447633743286,
"rewards/accuracy_reward_stage2": 0.47693929076194763,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.3125,
"step": 142
},
{
"completion_length": 110.609375,
"epoch": 0.035326086956521736,
"grad_norm": 2.987716235712274,
"kl": 0.045166015625,
"learning_rate": 9.649209486166008e-07,
"loss": 0.0018,
"reward": 2.57698392868042,
"reward_std": 0.08374475687742233,
"rewards/accuracy_reward_stage2": 0.5769840478897095,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.5625,
"step": 143
},
{
"completion_length": 103.328125,
"epoch": 0.03557312252964427,
"grad_norm": 3.350336752038281,
"kl": 0.04150390625,
"learning_rate": 9.646739130434782e-07,
"loss": 0.0017,
"reward": 2.5754241943359375,
"reward_std": 0.12323600053787231,
"rewards/accuracy_reward_stage2": 0.5754240155220032,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.328125,
"step": 144
},
{
"completion_length": 112.171875,
"epoch": 0.0358201581027668,
"grad_norm": 3.3004897234036346,
"kl": 0.07177734375,
"learning_rate": 9.644268774703557e-07,
"loss": 0.0029,
"reward": 2.7073974609375,
"reward_std": 0.1341802179813385,
"rewards/accuracy_reward_stage2": 0.7198973894119263,
"rewards/format_reward_all_stage": 1.9874999523162842,
"scores/refine_times": 1.5625,
"step": 145
},
{
"completion_length": 70.109375,
"epoch": 0.03606719367588933,
"grad_norm": 3.746695836459905,
"kl": 0.08251953125,
"learning_rate": 9.64179841897233e-07,
"loss": 0.0033,
"reward": 2.7705631256103516,
"reward_std": 0.07979334890842438,
"rewards/accuracy_reward_stage2": 0.7705631256103516,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 146
},
{
"completion_length": 68.84375,
"epoch": 0.036314229249011856,
"grad_norm": 4.993912255293353,
"kl": 0.064453125,
"learning_rate": 9.639328063241106e-07,
"loss": 0.0026,
"reward": 2.462231397628784,
"reward_std": 0.2417951375246048,
"rewards/accuracy_reward_stage2": 0.587231457233429,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.125,
"step": 147
},
{
"completion_length": 89.359375,
"epoch": 0.036561264822134384,
"grad_norm": 1.2055002867444848,
"kl": 0.0751953125,
"learning_rate": 9.636857707509881e-07,
"loss": 0.003,
"reward": 2.6344380378723145,
"reward_std": 0.05059962347149849,
"rewards/accuracy_reward_stage2": 0.6344380974769592,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.296875,
"step": 148
},
{
"completion_length": 56.4375,
"epoch": 0.03680830039525692,
"grad_norm": 0.27507548410419386,
"kl": 0.062255859375,
"learning_rate": 9.634387351778657e-07,
"loss": 0.0025,
"reward": 2.75,
"reward_std": 0.0,
"rewards/accuracy_reward_stage2": 0.75,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 149
},
{
"completion_length": 85.234375,
"epoch": 0.03705533596837945,
"grad_norm": 3.7285465705439877,
"kl": 0.06396484375,
"learning_rate": 9.63191699604743e-07,
"loss": 0.0026,
"reward": 2.714507579803467,
"reward_std": 0.07993803918361664,
"rewards/accuracy_reward_stage2": 0.7145076990127563,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.3125,
"step": 150
},
{
"completion_length": 69.53125,
"epoch": 0.037302371541501976,
"grad_norm": 2.7754541338531924,
"kl": 0.0625,
"learning_rate": 9.629446640316205e-07,
"loss": 0.0025,
"reward": 2.5262837409973145,
"reward_std": 0.015376383438706398,
"rewards/accuracy_reward_stage2": 0.5262836217880249,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 151
},
{
"completion_length": 59.703125,
"epoch": 0.037549407114624504,
"grad_norm": 3.6132525092697256,
"kl": 0.0849609375,
"learning_rate": 9.62697628458498e-07,
"loss": 0.0034,
"reward": 2.8007984161376953,
"reward_std": 0.05922838672995567,
"rewards/accuracy_reward_stage2": 0.8007984757423401,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 152
},
{
"completion_length": 73.09375,
"epoch": 0.03779644268774703,
"grad_norm": 4.7072408718848155,
"kl": 0.08447265625,
"learning_rate": 9.624505928853754e-07,
"loss": 0.0034,
"reward": 2.7883553504943848,
"reward_std": 0.10380949825048447,
"rewards/accuracy_reward_stage2": 0.78835529088974,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.234375,
"step": 153
},
{
"completion_length": 70.625,
"epoch": 0.03804347826086957,
"grad_norm": 4.015698810880644,
"kl": 0.064453125,
"learning_rate": 9.62203557312253e-07,
"loss": 0.0026,
"reward": 2.7374773025512695,
"reward_std": 0.07273007929325104,
"rewards/accuracy_reward_stage2": 0.7374772429466248,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 154
},
{
"completion_length": 80.09375,
"epoch": 0.038290513833992096,
"grad_norm": 4.619917963588498,
"kl": 0.06201171875,
"learning_rate": 9.619565217391305e-07,
"loss": 0.0025,
"reward": 2.6052746772766113,
"reward_std": 0.12084851413965225,
"rewards/accuracy_reward_stage2": 0.6901706457138062,
"rewards/format_reward_all_stage": 1.9151042699813843,
"scores/refine_times": 1.296875,
"step": 155
},
{
"completion_length": 68.359375,
"epoch": 0.038537549407114624,
"grad_norm": 4.5603936373643466,
"kl": 0.0859375,
"learning_rate": 9.617094861660078e-07,
"loss": 0.0034,
"reward": 2.6238441467285156,
"reward_std": 0.18369004130363464,
"rewards/accuracy_reward_stage2": 0.6238440275192261,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.15625,
"step": 156
},
{
"completion_length": 59.4375,
"epoch": 0.03878458498023715,
"grad_norm": 2.5329780180971126,
"kl": 0.07763671875,
"learning_rate": 9.614624505928853e-07,
"loss": 0.0031,
"reward": 2.679924964904785,
"reward_std": 0.07290495187044144,
"rewards/accuracy_reward_stage2": 0.6861748099327087,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.171875,
"step": 157
},
{
"completion_length": 68.046875,
"epoch": 0.03903162055335968,
"grad_norm": 5.443269730676339,
"kl": 0.0810546875,
"learning_rate": 9.612154150197627e-07,
"loss": 0.0033,
"reward": 2.567551612854004,
"reward_std": 0.1361953318119049,
"rewards/accuracy_reward_stage2": 0.5675517916679382,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 158
},
{
"completion_length": 57.9375,
"epoch": 0.039278656126482216,
"grad_norm": 9.030402072146797,
"kl": 0.0986328125,
"learning_rate": 9.609683794466402e-07,
"loss": 0.0039,
"reward": 2.3710129261016846,
"reward_std": 0.1857261210680008,
"rewards/accuracy_reward_stage2": 0.37101292610168457,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 159
},
{
"completion_length": 74.1875,
"epoch": 0.039525691699604744,
"grad_norm": 4.182121286336387,
"kl": 0.07861328125,
"learning_rate": 9.607213438735178e-07,
"loss": 0.0031,
"reward": 2.624070405960083,
"reward_std": 0.053435277193784714,
"rewards/accuracy_reward_stage2": 0.6240705251693726,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.15625,
"step": 160
},
{
"completion_length": 58.5625,
"epoch": 0.03977272727272727,
"grad_norm": 5.883898696205534,
"kl": 0.09228515625,
"learning_rate": 9.604743083003953e-07,
"loss": 0.0037,
"reward": 2.614293098449707,
"reward_std": 0.12666520476341248,
"rewards/accuracy_reward_stage2": 0.6142929792404175,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 161
},
{
"completion_length": 101.296875,
"epoch": 0.0400197628458498,
"grad_norm": 3.180252896906488,
"kl": 0.08642578125,
"learning_rate": 9.602272727272726e-07,
"loss": 0.0035,
"reward": 2.741650104522705,
"reward_std": 0.10522251576185226,
"rewards/accuracy_reward_stage2": 0.7479000091552734,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.484375,
"step": 162
},
{
"completion_length": 56.4375,
"epoch": 0.040266798418972335,
"grad_norm": 7.1537952621412355,
"kl": 0.0703125,
"learning_rate": 9.599802371541502e-07,
"loss": 0.0028,
"reward": 2.624744176864624,
"reward_std": 0.21947458386421204,
"rewards/accuracy_reward_stage2": 0.624744176864624,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 163
},
{
"completion_length": 68.203125,
"epoch": 0.040513833992094864,
"grad_norm": 5.366994523712258,
"kl": 0.08056640625,
"learning_rate": 9.597332015810277e-07,
"loss": 0.0032,
"reward": 2.524510622024536,
"reward_std": 0.12654046714305878,
"rewards/accuracy_reward_stage2": 0.5245106220245361,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.09375,
"step": 164
},
{
"completion_length": 68.6875,
"epoch": 0.04076086956521739,
"grad_norm": 3.7291692337053273,
"kl": 0.0625,
"learning_rate": 9.59486166007905e-07,
"loss": 0.0025,
"reward": 2.5265703201293945,
"reward_std": 0.16580721735954285,
"rewards/accuracy_reward_stage2": 0.5265705585479736,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.171875,
"step": 165
},
{
"completion_length": 73.15625,
"epoch": 0.04100790513833992,
"grad_norm": 2.735126529174995,
"kl": 0.0654296875,
"learning_rate": 9.592391304347826e-07,
"loss": 0.0026,
"reward": 2.75,
"reward_std": 0.06681530922651291,
"rewards/accuracy_reward_stage2": 0.75,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 166
},
{
"completion_length": 78.84375,
"epoch": 0.04125494071146245,
"grad_norm": 2.8700463474317703,
"kl": 0.07470703125,
"learning_rate": 9.5899209486166e-07,
"loss": 0.003,
"reward": 2.7168657779693604,
"reward_std": 0.035781100392341614,
"rewards/accuracy_reward_stage2": 0.7168656587600708,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 167
},
{
"completion_length": 78.0625,
"epoch": 0.041501976284584984,
"grad_norm": 4.890025569430008,
"kl": 0.0615234375,
"learning_rate": 9.587450592885376e-07,
"loss": 0.0025,
"reward": 2.6501522064208984,
"reward_std": 0.1081872433423996,
"rewards/accuracy_reward_stage2": 0.6501523852348328,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.109375,
"step": 168
},
{
"completion_length": 87.875,
"epoch": 0.04174901185770751,
"grad_norm": 4.04344669587095,
"kl": 0.058837890625,
"learning_rate": 9.58498023715415e-07,
"loss": 0.0023,
"reward": 2.6246044635772705,
"reward_std": 0.06840028613805771,
"rewards/accuracy_reward_stage2": 0.6246042847633362,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 169
},
{
"completion_length": 84.203125,
"epoch": 0.04199604743083004,
"grad_norm": 3.7630262620951216,
"kl": 0.0498046875,
"learning_rate": 9.582509881422925e-07,
"loss": 0.002,
"reward": 2.6967902183532715,
"reward_std": 0.03876494616270065,
"rewards/accuracy_reward_stage2": 0.6967902183532715,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 170
},
{
"completion_length": 65.3125,
"epoch": 0.04224308300395257,
"grad_norm": 3.921070473475162,
"kl": 0.060546875,
"learning_rate": 9.580039525691698e-07,
"loss": 0.0024,
"reward": 2.5751843452453613,
"reward_std": 0.04537220671772957,
"rewards/accuracy_reward_stage2": 0.5751842260360718,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 171
},
{
"completion_length": 86.046875,
"epoch": 0.042490118577075096,
"grad_norm": 2.6069307191323023,
"kl": 0.083984375,
"learning_rate": 9.577569169960474e-07,
"loss": 0.0034,
"reward": 2.6333060264587402,
"reward_std": 0.16853483021259308,
"rewards/accuracy_reward_stage2": 0.6489310264587402,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.21875,
"step": 172
},
{
"completion_length": 70.0,
"epoch": 0.04273715415019763,
"grad_norm": 4.409216803473999,
"kl": 0.046630859375,
"learning_rate": 9.57509881422925e-07,
"loss": 0.0019,
"reward": 2.7234153747558594,
"reward_std": 0.20152220129966736,
"rewards/accuracy_reward_stage2": 0.7234152555465698,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 173
},
{
"completion_length": 77.09375,
"epoch": 0.04298418972332016,
"grad_norm": 3.94030379937351,
"kl": 0.0751953125,
"learning_rate": 9.572628458498022e-07,
"loss": 0.003,
"reward": 2.7134218215942383,
"reward_std": 0.16080144047737122,
"rewards/accuracy_reward_stage2": 0.7134219408035278,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 174
},
{
"completion_length": 82.515625,
"epoch": 0.04323122529644269,
"grad_norm": 1.861547490586632,
"kl": 0.04541015625,
"learning_rate": 9.570158102766798e-07,
"loss": 0.0018,
"reward": 2.659064292907715,
"reward_std": 0.04930752515792847,
"rewards/accuracy_reward_stage2": 0.6590641140937805,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.15625,
"step": 175
},
{
"completion_length": 79.765625,
"epoch": 0.043478260869565216,
"grad_norm": 4.0254532203837305,
"kl": 0.054931640625,
"learning_rate": 9.567687747035573e-07,
"loss": 0.0022,
"reward": 2.5477709770202637,
"reward_std": 0.09522654861211777,
"rewards/accuracy_reward_stage2": 0.547771155834198,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 176
},
{
"completion_length": 80.625,
"epoch": 0.043725296442687744,
"grad_norm": 4.304267511221348,
"kl": 0.087890625,
"learning_rate": 9.565217391304349e-07,
"loss": 0.0035,
"reward": 2.5806241035461426,
"reward_std": 0.12464433908462524,
"rewards/accuracy_reward_stage2": 0.5868740081787109,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.125,
"step": 177
},
{
"completion_length": 97.328125,
"epoch": 0.04397233201581028,
"grad_norm": 4.304249554872454,
"kl": 0.0576171875,
"learning_rate": 9.562747035573122e-07,
"loss": 0.0023,
"reward": 2.5728487968444824,
"reward_std": 0.06610282510519028,
"rewards/accuracy_reward_stage2": 0.572848916053772,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 178
},
{
"completion_length": 73.125,
"epoch": 0.04421936758893281,
"grad_norm": 4.062110192757534,
"kl": 0.05712890625,
"learning_rate": 9.560276679841897e-07,
"loss": 0.0023,
"reward": 2.630199909210205,
"reward_std": 0.03811332583427429,
"rewards/accuracy_reward_stage2": 0.6302000284194946,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 179
},
{
"completion_length": 67.6875,
"epoch": 0.044466403162055336,
"grad_norm": 3.0927557325941843,
"kl": 0.054443359375,
"learning_rate": 9.55780632411067e-07,
"loss": 0.0022,
"reward": 2.8124020099639893,
"reward_std": 0.02681785449385643,
"rewards/accuracy_reward_stage2": 0.8124019503593445,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 180
},
{
"completion_length": 92.65625,
"epoch": 0.044713438735177864,
"grad_norm": 4.035197870543932,
"kl": 0.08642578125,
"learning_rate": 9.555335968379446e-07,
"loss": 0.0035,
"reward": 2.6102566719055176,
"reward_std": 0.24840806424617767,
"rewards/accuracy_reward_stage2": 0.6727566719055176,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.296875,
"step": 181
},
{
"completion_length": 77.75,
"epoch": 0.04496047430830039,
"grad_norm": 4.755395802023161,
"kl": 0.0654296875,
"learning_rate": 9.552865612648221e-07,
"loss": 0.0026,
"reward": 2.6376709938049316,
"reward_std": 0.142973855137825,
"rewards/accuracy_reward_stage2": 0.6376707553863525,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 182
},
{
"completion_length": 86.65625,
"epoch": 0.04520750988142293,
"grad_norm": 3.041899055085519,
"kl": 0.0419921875,
"learning_rate": 9.550395256916995e-07,
"loss": 0.0017,
"reward": 2.713804244995117,
"reward_std": 0.018256813287734985,
"rewards/accuracy_reward_stage2": 0.7138041257858276,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 183
},
{
"completion_length": 72.6875,
"epoch": 0.045454545454545456,
"grad_norm": 5.254526969633487,
"kl": 0.03369140625,
"learning_rate": 9.54792490118577e-07,
"loss": 0.0013,
"reward": 2.582467555999756,
"reward_std": 0.07811328768730164,
"rewards/accuracy_reward_stage2": 0.5824676752090454,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 184
},
{
"completion_length": 95.265625,
"epoch": 0.045701581027667984,
"grad_norm": 3.58376936857761,
"kl": 0.044921875,
"learning_rate": 9.545454545454546e-07,
"loss": 0.0018,
"reward": 2.671761989593506,
"reward_std": 0.09435681998729706,
"rewards/accuracy_reward_stage2": 0.6717619299888611,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.21875,
"step": 185
},
{
"completion_length": 79.25,
"epoch": 0.04594861660079051,
"grad_norm": 2.79408117557805,
"kl": 0.057861328125,
"learning_rate": 9.54298418972332e-07,
"loss": 0.0023,
"reward": 2.7370827198028564,
"reward_std": 0.011032424867153168,
"rewards/accuracy_reward_stage2": 0.7370827198028564,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 186
},
{
"completion_length": 81.1875,
"epoch": 0.04619565217391304,
"grad_norm": 3.070893637345864,
"kl": 0.0634765625,
"learning_rate": 9.540513833992094e-07,
"loss": 0.0025,
"reward": 2.6037180423736572,
"reward_std": 0.0603860504925251,
"rewards/accuracy_reward_stage2": 0.603718101978302,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 187
},
{
"completion_length": 83.90625,
"epoch": 0.046442687747035576,
"grad_norm": 3.9266085023435253,
"kl": 0.07470703125,
"learning_rate": 9.53804347826087e-07,
"loss": 0.003,
"reward": 2.3806803226470947,
"reward_std": 0.08710526674985886,
"rewards/accuracy_reward_stage2": 0.4431803524494171,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.125,
"step": 188
},
{
"completion_length": 83.484375,
"epoch": 0.046689723320158104,
"grad_norm": 4.685942150283861,
"kl": 0.054443359375,
"learning_rate": 9.535573122529644e-07,
"loss": 0.0022,
"reward": 2.4631409645080566,
"reward_std": 0.21757805347442627,
"rewards/accuracy_reward_stage2": 0.46314099431037903,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 189
},
{
"completion_length": 72.4375,
"epoch": 0.04693675889328063,
"grad_norm": 1.8005226063530777,
"kl": 0.0634765625,
"learning_rate": 9.533102766798418e-07,
"loss": 0.0025,
"reward": 2.794851779937744,
"reward_std": 0.009321765042841434,
"rewards/accuracy_reward_stage2": 0.7948517203330994,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 190
},
{
"completion_length": 75.125,
"epoch": 0.04718379446640316,
"grad_norm": 1.9302369432589355,
"kl": 0.06396484375,
"learning_rate": 9.530632411067194e-07,
"loss": 0.0026,
"reward": 2.8036766052246094,
"reward_std": 0.018778154626488686,
"rewards/accuracy_reward_stage2": 0.8036764860153198,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 191
},
{
"completion_length": 74.671875,
"epoch": 0.04743083003952569,
"grad_norm": 0.6450107155642202,
"kl": 0.05224609375,
"learning_rate": 9.528162055335968e-07,
"loss": 0.0021,
"reward": 2.761014223098755,
"reward_std": 0.022562123835086823,
"rewards/accuracy_reward_stage2": 0.767264187335968,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.125,
"step": 192
},
{
"completion_length": 82.265625,
"epoch": 0.047677865612648224,
"grad_norm": 2.7725448937542465,
"kl": 0.06494140625,
"learning_rate": 9.525691699604743e-07,
"loss": 0.0026,
"reward": 2.734375,
"reward_std": 0.15981829166412354,
"rewards/accuracy_reward_stage2": 0.796875,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.125,
"step": 193
},
{
"completion_length": 109.21875,
"epoch": 0.04792490118577075,
"grad_norm": 3.3462144334568045,
"kl": 0.083984375,
"learning_rate": 9.523221343873518e-07,
"loss": 0.0034,
"reward": 2.294902801513672,
"reward_std": 0.17553241550922394,
"rewards/accuracy_reward_stage2": 0.3730279803276062,
"rewards/format_reward_all_stage": 1.921875,
"scores/refine_times": 1.3125,
"step": 194
},
{
"completion_length": 88.421875,
"epoch": 0.04817193675889328,
"grad_norm": 2.6171281677758076,
"kl": 0.043212890625,
"learning_rate": 9.520750988142292e-07,
"loss": 0.0017,
"reward": 2.856783628463745,
"reward_std": 0.11217740178108215,
"rewards/accuracy_reward_stage2": 0.8672002553939819,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.078125,
"step": 195
},
{
"completion_length": 95.296875,
"epoch": 0.04841897233201581,
"grad_norm": 4.039571176888931,
"kl": 0.0439453125,
"learning_rate": 9.518280632411066e-07,
"loss": 0.0018,
"reward": 2.6629042625427246,
"reward_std": 0.12791165709495544,
"rewards/accuracy_reward_stage2": 0.6629043817520142,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 196
},
{
"completion_length": 73.0,
"epoch": 0.048666007905138337,
"grad_norm": 5.175846675590277,
"kl": 0.07080078125,
"learning_rate": 9.515810276679841e-07,
"loss": 0.0028,
"reward": 2.671309232711792,
"reward_std": 0.13797426223754883,
"rewards/accuracy_reward_stage2": 0.671309232711792,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 197
},
{
"completion_length": 61.75,
"epoch": 0.04891304347826087,
"grad_norm": 4.218652136398897,
"kl": 0.05126953125,
"learning_rate": 9.513339920948616e-07,
"loss": 0.002,
"reward": 2.655543088912964,
"reward_std": 0.08075182139873505,
"rewards/accuracy_reward_stage2": 0.6555430293083191,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 198
},
{
"completion_length": 71.8125,
"epoch": 0.0491600790513834,
"grad_norm": 3.247173481906321,
"kl": 0.0517578125,
"learning_rate": 9.51086956521739e-07,
"loss": 0.0021,
"reward": 2.669358253479004,
"reward_std": 0.13766998052597046,
"rewards/accuracy_reward_stage2": 0.7943581342697144,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0,
"step": 199
},
{
"completion_length": 93.953125,
"epoch": 0.04940711462450593,
"grad_norm": 3.734532512883597,
"kl": 0.05615234375,
"learning_rate": 9.508399209486166e-07,
"loss": 0.0022,
"reward": 2.4314818382263184,
"reward_std": 0.09143616259098053,
"rewards/accuracy_reward_stage2": 0.49398165941238403,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.0625,
"step": 200
},
{
"completion_length": 96.40625,
"epoch": 0.049654150197628456,
"grad_norm": 2.5177431696993207,
"kl": 0.0576171875,
"learning_rate": 9.50592885375494e-07,
"loss": 0.0023,
"reward": 2.620957136154175,
"reward_std": 0.09463383257389069,
"rewards/accuracy_reward_stage2": 0.6209571361541748,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 201
},
{
"completion_length": 84.5625,
"epoch": 0.04990118577075099,
"grad_norm": 4.086803835650893,
"kl": 0.0625,
"learning_rate": 9.503458498023716e-07,
"loss": 0.0025,
"reward": 2.77097225189209,
"reward_std": 0.15935085713863373,
"rewards/accuracy_reward_stage2": 0.7709720730781555,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.21875,
"step": 202
},
{
"completion_length": 87.765625,
"epoch": 0.05014822134387352,
"grad_norm": 2.694826701394137,
"kl": 0.056884765625,
"learning_rate": 9.50098814229249e-07,
"loss": 0.0023,
"reward": 2.6979928016662598,
"reward_std": 0.054207898676395416,
"rewards/accuracy_reward_stage2": 0.6979928016662598,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 203
},
{
"completion_length": 74.0,
"epoch": 0.05039525691699605,
"grad_norm": 2.722697467089469,
"kl": 0.054931640625,
"learning_rate": 9.498517786561264e-07,
"loss": 0.0022,
"reward": 2.6437833309173584,
"reward_std": 0.042557310312986374,
"rewards/accuracy_reward_stage2": 0.6437833309173584,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 204
},
{
"completion_length": 81.5625,
"epoch": 0.050642292490118576,
"grad_norm": 2.0886379248475095,
"kl": 0.06591796875,
"learning_rate": 9.496047430830039e-07,
"loss": 0.0026,
"reward": 2.9196255207061768,
"reward_std": 0.005610581487417221,
"rewards/accuracy_reward_stage2": 0.9196255207061768,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 205
},
{
"completion_length": 91.390625,
"epoch": 0.050889328063241104,
"grad_norm": 3.6807652962283637,
"kl": 0.0703125,
"learning_rate": 9.493577075098814e-07,
"loss": 0.0028,
"reward": 2.5606374740600586,
"reward_std": 0.09435243904590607,
"rewards/accuracy_reward_stage2": 0.5606374740600586,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 206
},
{
"completion_length": 100.796875,
"epoch": 0.05113636363636364,
"grad_norm": 3.222119591626723,
"kl": 0.06103515625,
"learning_rate": 9.491106719367588e-07,
"loss": 0.0024,
"reward": 2.5677084922790527,
"reward_std": 0.12332375347614288,
"rewards/accuracy_reward_stage2": 0.578125,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.171875,
"step": 207
},
{
"completion_length": 96.5625,
"epoch": 0.05138339920948617,
"grad_norm": 4.428189055231432,
"kl": 0.080078125,
"learning_rate": 9.488636363636363e-07,
"loss": 0.0032,
"reward": 2.6954102516174316,
"reward_std": 0.10087625682353973,
"rewards/accuracy_reward_stage2": 0.6954102516174316,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 208
},
{
"completion_length": 85.1875,
"epoch": 0.051630434782608696,
"grad_norm": 5.422275433434783,
"kl": 0.060546875,
"learning_rate": 9.486166007905137e-07,
"loss": 0.0024,
"reward": 2.676901340484619,
"reward_std": 0.19727489352226257,
"rewards/accuracy_reward_stage2": 0.6769014596939087,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 209
},
{
"completion_length": 100.09375,
"epoch": 0.051877470355731224,
"grad_norm": 3.968883032154412,
"kl": 0.07421875,
"learning_rate": 9.483695652173913e-07,
"loss": 0.003,
"reward": 2.7428462505340576,
"reward_std": 0.1038169115781784,
"rewards/accuracy_reward_stage2": 0.7490963935852051,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.109375,
"step": 210
},
{
"completion_length": 88.46875,
"epoch": 0.05212450592885375,
"grad_norm": 3.4686533432761637,
"kl": 0.05908203125,
"learning_rate": 9.481225296442688e-07,
"loss": 0.0024,
"reward": 2.782975196838379,
"reward_std": 0.07370894402265549,
"rewards/accuracy_reward_stage2": 0.7829753160476685,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 211
},
{
"completion_length": 116.34375,
"epoch": 0.05237154150197629,
"grad_norm": 4.12293075856129,
"kl": 0.08544921875,
"learning_rate": 9.478754940711462e-07,
"loss": 0.0034,
"reward": 2.6525073051452637,
"reward_std": 0.06538750976324081,
"rewards/accuracy_reward_stage2": 0.6525071859359741,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.171875,
"step": 212
},
{
"completion_length": 90.5625,
"epoch": 0.052618577075098816,
"grad_norm": 5.499570840391799,
"kl": 0.158203125,
"learning_rate": 9.476284584980236e-07,
"loss": 0.0063,
"reward": 2.620041847229004,
"reward_std": 0.158025860786438,
"rewards/accuracy_reward_stage2": 0.6200418472290039,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 213
},
{
"completion_length": 101.046875,
"epoch": 0.052865612648221344,
"grad_norm": 2.8991649860690845,
"kl": 0.0517578125,
"learning_rate": 9.473814229249012e-07,
"loss": 0.0021,
"reward": 2.827629804611206,
"reward_std": 0.026947414502501488,
"rewards/accuracy_reward_stage2": 0.8276296854019165,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 214
},
{
"completion_length": 73.875,
"epoch": 0.05311264822134387,
"grad_norm": 2.3688254496479493,
"kl": 0.05029296875,
"learning_rate": 9.471343873517786e-07,
"loss": 0.002,
"reward": 2.40625,
"reward_std": 0.033407654613256454,
"rewards/accuracy_reward_stage2": 0.40625,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 215
},
{
"completion_length": 98.703125,
"epoch": 0.0533596837944664,
"grad_norm": 4.150945077897221,
"kl": 0.06201171875,
"learning_rate": 9.468873517786561e-07,
"loss": 0.0025,
"reward": 2.7674636840820312,
"reward_std": 0.1500820368528366,
"rewards/accuracy_reward_stage2": 0.7674636840820312,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 216
},
{
"completion_length": 80.0,
"epoch": 0.053606719367588936,
"grad_norm": 3.428179312141899,
"kl": 0.07421875,
"learning_rate": 9.466403162055335e-07,
"loss": 0.003,
"reward": 2.686192274093628,
"reward_std": 0.023959007114171982,
"rewards/accuracy_reward_stage2": 0.6861922740936279,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 217
},
{
"completion_length": 113.328125,
"epoch": 0.053853754940711464,
"grad_norm": 3.9383187627551157,
"kl": 0.048583984375,
"learning_rate": 9.463932806324109e-07,
"loss": 0.0019,
"reward": 2.5574302673339844,
"reward_std": 0.25754785537719727,
"rewards/accuracy_reward_stage2": 0.750138521194458,
"rewards/format_reward_all_stage": 1.8072917461395264,
"scores/refine_times": 1.25,
"step": 218
},
{
"completion_length": 131.21875,
"epoch": 0.05410079051383399,
"grad_norm": 3.309098889669061,
"kl": 0.049072265625,
"learning_rate": 9.461462450592886e-07,
"loss": 0.002,
"reward": 2.5466530323028564,
"reward_std": 0.20897339284420013,
"rewards/accuracy_reward_stage2": 0.6716530323028564,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.1875,
"step": 219
},
{
"completion_length": 86.078125,
"epoch": 0.05434782608695652,
"grad_norm": 4.946057628757396,
"kl": 0.0478515625,
"learning_rate": 9.45899209486166e-07,
"loss": 0.0019,
"reward": 2.5970003604888916,
"reward_std": 0.164507195353508,
"rewards/accuracy_reward_stage2": 0.597000241279602,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 220
},
{
"completion_length": 95.546875,
"epoch": 0.05459486166007905,
"grad_norm": 2.4617835123837835,
"kl": 0.052978515625,
"learning_rate": 9.456521739130434e-07,
"loss": 0.0021,
"reward": 2.8430728912353516,
"reward_std": 0.050867728888988495,
"rewards/accuracy_reward_stage2": 0.843072772026062,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.15625,
"step": 221
},
{
"completion_length": 85.625,
"epoch": 0.054841897233201584,
"grad_norm": 12.694258343723256,
"kl": 0.271484375,
"learning_rate": 9.454051383399209e-07,
"loss": 0.0109,
"reward": 2.516552448272705,
"reward_std": 0.07157387584447861,
"rewards/accuracy_reward_stage2": 0.5790524482727051,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.15625,
"step": 222
},
{
"completion_length": 121.484375,
"epoch": 0.05508893280632411,
"grad_norm": 3.2893095767759926,
"kl": 0.04833984375,
"learning_rate": 9.451581027667984e-07,
"loss": 0.0019,
"reward": 2.6593329906463623,
"reward_std": 0.21269533038139343,
"rewards/accuracy_reward_stage2": 0.7197496294975281,
"rewards/format_reward_all_stage": 1.9395833015441895,
"scores/refine_times": 1.46875,
"step": 223
},
{
"completion_length": 95.046875,
"epoch": 0.05533596837944664,
"grad_norm": 2.2737834746774084,
"kl": 0.0615234375,
"learning_rate": 9.449110671936758e-07,
"loss": 0.0025,
"reward": 2.78125,
"reward_std": 0.1246790662407875,
"rewards/accuracy_reward_stage2": 0.78125,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.234375,
"step": 224
},
{
"completion_length": 107.421875,
"epoch": 0.05558300395256917,
"grad_norm": 2.662768928485288,
"kl": 0.05615234375,
"learning_rate": 9.446640316205533e-07,
"loss": 0.0022,
"reward": 2.66047739982605,
"reward_std": 0.020462632179260254,
"rewards/accuracy_reward_stage2": 0.6604773998260498,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.328125,
"step": 225
},
{
"completion_length": 76.390625,
"epoch": 0.055830039525691696,
"grad_norm": 5.91370963368309,
"kl": 0.06591796875,
"learning_rate": 9.444169960474307e-07,
"loss": 0.0026,
"reward": 2.7473721504211426,
"reward_std": 0.16072911024093628,
"rewards/accuracy_reward_stage2": 0.7473721504211426,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 226
},
{
"completion_length": 94.015625,
"epoch": 0.05607707509881423,
"grad_norm": 4.076250713612817,
"kl": 0.04150390625,
"learning_rate": 9.441699604743083e-07,
"loss": 0.0017,
"reward": 2.6946887969970703,
"reward_std": 0.07815377414226532,
"rewards/accuracy_reward_stage2": 0.6946887969970703,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.171875,
"step": 227
},
{
"completion_length": 110.984375,
"epoch": 0.05632411067193676,
"grad_norm": 5.016186365433335,
"kl": 0.05224609375,
"learning_rate": 9.439229249011858e-07,
"loss": 0.0021,
"reward": 2.7810983657836914,
"reward_std": 0.1505921632051468,
"rewards/accuracy_reward_stage2": 0.7810983657836914,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.296875,
"step": 228
},
{
"completion_length": 88.296875,
"epoch": 0.05657114624505929,
"grad_norm": 4.303632516413331,
"kl": 0.05810546875,
"learning_rate": 9.436758893280632e-07,
"loss": 0.0023,
"reward": 2.471911907196045,
"reward_std": 0.03463466465473175,
"rewards/accuracy_reward_stage2": 0.4719120264053345,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 229
},
{
"completion_length": 89.75,
"epoch": 0.056818181818181816,
"grad_norm": 1.0465741722158528,
"kl": 0.05810546875,
"learning_rate": 9.434288537549407e-07,
"loss": 0.0023,
"reward": 2.6418185234069824,
"reward_std": 0.03043236769735813,
"rewards/accuracy_reward_stage2": 0.6418185234069824,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.21875,
"step": 230
},
{
"completion_length": 116.359375,
"epoch": 0.057065217391304345,
"grad_norm": 4.845711684305512,
"kl": 0.0634765625,
"learning_rate": 9.431818181818182e-07,
"loss": 0.0025,
"reward": 2.639404296875,
"reward_std": 0.13679620623588562,
"rewards/accuracy_reward_stage2": 0.6394043564796448,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.25,
"step": 231
},
{
"completion_length": 98.125,
"epoch": 0.05731225296442688,
"grad_norm": 4.2105692164904465,
"kl": 0.07763671875,
"learning_rate": 9.429347826086956e-07,
"loss": 0.0031,
"reward": 2.609795570373535,
"reward_std": 0.08919590711593628,
"rewards/accuracy_reward_stage2": 0.6097957491874695,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.171875,
"step": 232
},
{
"completion_length": 92.203125,
"epoch": 0.05755928853754941,
"grad_norm": 1.250927159243389,
"kl": 0.059326171875,
"learning_rate": 9.426877470355731e-07,
"loss": 0.0024,
"reward": 2.8027873039245605,
"reward_std": 0.014970174990594387,
"rewards/accuracy_reward_stage2": 0.8027871251106262,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 233
},
{
"completion_length": 95.65625,
"epoch": 0.057806324110671936,
"grad_norm": 1.7812890613719796,
"kl": 0.0634765625,
"learning_rate": 9.424407114624505e-07,
"loss": 0.0025,
"reward": 2.804018020629883,
"reward_std": 0.04018682241439819,
"rewards/accuracy_reward_stage2": 0.8040179014205933,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.171875,
"step": 234
},
{
"completion_length": 108.96875,
"epoch": 0.058053359683794464,
"grad_norm": 3.4975869844535055,
"kl": 0.06103515625,
"learning_rate": 9.421936758893279e-07,
"loss": 0.0024,
"reward": 2.629866123199463,
"reward_std": 0.11631540954113007,
"rewards/accuracy_reward_stage2": 0.6559078693389893,
"rewards/format_reward_all_stage": 1.9739583730697632,
"scores/refine_times": 1.265625,
"step": 235
},
{
"completion_length": 147.921875,
"epoch": 0.058300395256917,
"grad_norm": 4.268984542856603,
"kl": 0.0556640625,
"learning_rate": 9.419466403162055e-07,
"loss": 0.0022,
"reward": 2.572082042694092,
"reward_std": 0.15964874625205994,
"rewards/accuracy_reward_stage2": 0.6017696857452393,
"rewards/format_reward_all_stage": 1.970312476158142,
"scores/refine_times": 1.53125,
"step": 236
},
{
"completion_length": 115.21875,
"epoch": 0.05854743083003953,
"grad_norm": 4.65588209600243,
"kl": 0.0546875,
"learning_rate": 9.41699604743083e-07,
"loss": 0.0022,
"reward": 2.402869701385498,
"reward_std": 0.23863497376441956,
"rewards/accuracy_reward_stage2": 0.5341198444366455,
"rewards/format_reward_all_stage": 1.868749976158142,
"scores/refine_times": 1.25,
"step": 237
},
{
"completion_length": 106.515625,
"epoch": 0.058794466403162056,
"grad_norm": 3.984260162520113,
"kl": 0.0703125,
"learning_rate": 9.414525691699604e-07,
"loss": 0.0028,
"reward": 2.614504098892212,
"reward_std": 0.06337256729602814,
"rewards/accuracy_reward_stage2": 0.6207541823387146,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.28125,
"step": 238
},
{
"completion_length": 86.296875,
"epoch": 0.059041501976284584,
"grad_norm": 3.916424716324547,
"kl": 0.06884765625,
"learning_rate": 9.412055335968379e-07,
"loss": 0.0028,
"reward": 2.776068687438965,
"reward_std": 0.0822492316365242,
"rewards/accuracy_reward_stage2": 0.7760688066482544,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.09375,
"step": 239
},
{
"completion_length": 100.875,
"epoch": 0.05928853754940711,
"grad_norm": 4.688640502699326,
"kl": 0.0556640625,
"learning_rate": 9.409584980237154e-07,
"loss": 0.0022,
"reward": 2.622417449951172,
"reward_std": 0.17500078678131104,
"rewards/accuracy_reward_stage2": 0.6286673545837402,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.171875,
"step": 240
},
{
"completion_length": 95.921875,
"epoch": 0.05953557312252965,
"grad_norm": 4.039431146320975,
"kl": 0.057373046875,
"learning_rate": 9.407114624505929e-07,
"loss": 0.0023,
"reward": 2.611854314804077,
"reward_std": 0.13111275434494019,
"rewards/accuracy_reward_stage2": 0.6118543148040771,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 241
},
{
"completion_length": 89.5625,
"epoch": 0.059782608695652176,
"grad_norm": 3.2997919352845786,
"kl": 0.050048828125,
"learning_rate": 9.404644268774703e-07,
"loss": 0.002,
"reward": 2.652923822402954,
"reward_std": 0.11734248697757721,
"rewards/accuracy_reward_stage2": 0.7102153897285461,
"rewards/format_reward_all_stage": 1.9427083730697632,
"scores/refine_times": 1.078125,
"step": 242
},
{
"completion_length": 104.640625,
"epoch": 0.060029644268774704,
"grad_norm": 5.076461913642015,
"kl": 0.056396484375,
"learning_rate": 9.402173913043477e-07,
"loss": 0.0023,
"reward": 2.7080979347229004,
"reward_std": 0.04040906950831413,
"rewards/accuracy_reward_stage2": 0.7080979347229004,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 243
},
{
"completion_length": 77.375,
"epoch": 0.06027667984189723,
"grad_norm": 1.995536991642715,
"kl": 0.057373046875,
"learning_rate": 9.399703557312253e-07,
"loss": 0.0023,
"reward": 2.7579073905944824,
"reward_std": 0.0003560198238119483,
"rewards/accuracy_reward_stage2": 0.7579072713851929,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 244
},
{
"completion_length": 123.328125,
"epoch": 0.06052371541501976,
"grad_norm": 3.4556227655695357,
"kl": 0.046875,
"learning_rate": 9.397233201581027e-07,
"loss": 0.0019,
"reward": 2.779745578765869,
"reward_std": 0.10972259938716888,
"rewards/accuracy_reward_stage2": 0.7953706383705139,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.328125,
"step": 245
},
{
"completion_length": 98.265625,
"epoch": 0.060770750988142296,
"grad_norm": 3.8073316504950308,
"kl": 0.05419921875,
"learning_rate": 9.394762845849802e-07,
"loss": 0.0022,
"reward": 2.611069679260254,
"reward_std": 0.15946999192237854,
"rewards/accuracy_reward_stage2": 0.7360695600509644,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.125,
"step": 246
},
{
"completion_length": 107.1875,
"epoch": 0.061017786561264824,
"grad_norm": 2.2530508679929206,
"kl": 0.05712890625,
"learning_rate": 9.392292490118577e-07,
"loss": 0.0023,
"reward": 2.6858139038085938,
"reward_std": 0.03068363480269909,
"rewards/accuracy_reward_stage2": 0.6858140230178833,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.171875,
"step": 247
},
{
"completion_length": 81.421875,
"epoch": 0.06126482213438735,
"grad_norm": 4.610246667877454,
"kl": 0.06689453125,
"learning_rate": 9.389822134387352e-07,
"loss": 0.0027,
"reward": 2.4801125526428223,
"reward_std": 0.12876607477664948,
"rewards/accuracy_reward_stage2": 0.49052929878234863,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.078125,
"step": 248
},
{
"completion_length": 133.15625,
"epoch": 0.06151185770750988,
"grad_norm": 3.656412135533561,
"kl": 0.05615234375,
"learning_rate": 9.387351778656126e-07,
"loss": 0.0022,
"reward": 2.7299704551696777,
"reward_std": 0.15535692870616913,
"rewards/accuracy_reward_stage2": 0.7612204551696777,
"rewards/format_reward_all_stage": 1.96875,
"scores/refine_times": 1.46875,
"step": 249
},
{
"completion_length": 111.03125,
"epoch": 0.06175889328063241,
"grad_norm": 2.9745455486264785,
"kl": 0.068359375,
"learning_rate": 9.384881422924901e-07,
"loss": 0.0027,
"reward": 2.878418445587158,
"reward_std": 0.08700025826692581,
"rewards/accuracy_reward_stage2": 0.8784183263778687,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.3125,
"step": 250
},
{
"completion_length": 108.640625,
"epoch": 0.062005928853754944,
"grad_norm": 4.592700429840661,
"kl": 0.076171875,
"learning_rate": 9.382411067193675e-07,
"loss": 0.0031,
"reward": 2.640062093734741,
"reward_std": 0.13130618631839752,
"rewards/accuracy_reward_stage2": 0.6400620341300964,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.21875,
"step": 251
},
{
"completion_length": 93.375,
"epoch": 0.06225296442687747,
"grad_norm": 4.080049204812087,
"kl": 0.06201171875,
"learning_rate": 9.37994071146245e-07,
"loss": 0.0025,
"reward": 2.484543561935425,
"reward_std": 0.03272121399641037,
"rewards/accuracy_reward_stage2": 0.48454350233078003,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.09375,
"step": 252
},
{
"completion_length": 133.703125,
"epoch": 0.0625,
"grad_norm": 3.8766421314465203,
"kl": 0.06884765625,
"learning_rate": 9.377470355731225e-07,
"loss": 0.0028,
"reward": 2.6636319160461426,
"reward_std": 0.12454654276371002,
"rewards/accuracy_reward_stage2": 0.6792569160461426,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.453125,
"step": 253
},
{
"completion_length": 102.640625,
"epoch": 0.06274703557312253,
"grad_norm": 3.125502153944731,
"kl": 0.064453125,
"learning_rate": 9.374999999999999e-07,
"loss": 0.0026,
"reward": 2.661580801010132,
"reward_std": 0.0767604261636734,
"rewards/accuracy_reward_stage2": 0.6615808606147766,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.234375,
"step": 254
},
{
"completion_length": 100.875,
"epoch": 0.06299407114624506,
"grad_norm": 2.4047815253285703,
"kl": 0.08154296875,
"learning_rate": 9.372529644268774e-07,
"loss": 0.0033,
"reward": 2.870173692703247,
"reward_std": 0.04427599906921387,
"rewards/accuracy_reward_stage2": 0.8701735734939575,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.234375,
"step": 255
},
{
"completion_length": 102.046875,
"epoch": 0.06324110671936758,
"grad_norm": 3.9729431573321565,
"kl": 0.06982421875,
"learning_rate": 9.370059288537549e-07,
"loss": 0.0028,
"reward": 2.5646939277648926,
"reward_std": 0.03415513038635254,
"rewards/accuracy_reward_stage2": 0.5646939873695374,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 256
},
{
"completion_length": 87.359375,
"epoch": 0.06348814229249011,
"grad_norm": 2.8984209672527923,
"kl": 0.05712890625,
"learning_rate": 9.367588932806324e-07,
"loss": 0.0023,
"reward": 2.9039530754089355,
"reward_std": 0.048997893929481506,
"rewards/accuracy_reward_stage2": 0.9039530158042908,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 257
},
{
"completion_length": 81.203125,
"epoch": 0.06373517786561265,
"grad_norm": 3.197191357621096,
"kl": 0.0693359375,
"learning_rate": 9.365118577075099e-07,
"loss": 0.0028,
"reward": 2.474766254425049,
"reward_std": 0.017342764884233475,
"rewards/accuracy_reward_stage2": 0.4747660756111145,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 258
},
{
"completion_length": 96.390625,
"epoch": 0.06398221343873518,
"grad_norm": 3.5385421006445625,
"kl": 0.0654296875,
"learning_rate": 9.362648221343873e-07,
"loss": 0.0026,
"reward": 2.3430323600769043,
"reward_std": 0.12197308242321014,
"rewards/accuracy_reward_stage2": 0.3430321514606476,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 259
},
{
"completion_length": 76.875,
"epoch": 0.06422924901185771,
"grad_norm": 3.6584345558191624,
"kl": 0.068359375,
"learning_rate": 9.360177865612647e-07,
"loss": 0.0027,
"reward": 2.3733391761779785,
"reward_std": 0.041200902312994,
"rewards/accuracy_reward_stage2": 0.3733389973640442,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 260
},
{
"completion_length": 119.6875,
"epoch": 0.06447628458498024,
"grad_norm": 3.183150779739053,
"kl": 0.061767578125,
"learning_rate": 9.357707509881423e-07,
"loss": 0.0025,
"reward": 2.5673301219940186,
"reward_std": 0.09498921036720276,
"rewards/accuracy_reward_stage2": 0.5751426219940186,
"rewards/format_reward_all_stage": 1.9921875,
"scores/refine_times": 1.359375,
"step": 261
},
{
"completion_length": 144.78125,
"epoch": 0.06472332015810277,
"grad_norm": 3.550233651334434,
"kl": 0.06396484375,
"learning_rate": 9.355237154150197e-07,
"loss": 0.0026,
"reward": 2.5922813415527344,
"reward_std": 0.25461798906326294,
"rewards/accuracy_reward_stage2": 0.7172813415527344,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.609375,
"step": 262
},
{
"completion_length": 117.859375,
"epoch": 0.0649703557312253,
"grad_norm": 3.1484490414114945,
"kl": 0.060302734375,
"learning_rate": 9.352766798418971e-07,
"loss": 0.0024,
"reward": 2.6294641494750977,
"reward_std": 0.16589348018169403,
"rewards/accuracy_reward_stage2": 0.6294642686843872,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.171875,
"step": 263
},
{
"completion_length": 121.28125,
"epoch": 0.06521739130434782,
"grad_norm": 2.30350469441656,
"kl": 0.06982421875,
"learning_rate": 9.350296442687746e-07,
"loss": 0.0028,
"reward": 2.70902681350708,
"reward_std": 0.09053189307451248,
"rewards/accuracy_reward_stage2": 0.716839075088501,
"rewards/format_reward_all_stage": 1.9921875,
"scores/refine_times": 1.40625,
"step": 264
},
{
"completion_length": 93.109375,
"epoch": 0.06546442687747035,
"grad_norm": 5.426621994742126,
"kl": 0.0654296875,
"learning_rate": 9.347826086956522e-07,
"loss": 0.0026,
"reward": 2.4706788063049316,
"reward_std": 0.0817948505282402,
"rewards/accuracy_reward_stage2": 0.4706789255142212,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 265
},
{
"completion_length": 122.046875,
"epoch": 0.06571146245059288,
"grad_norm": 4.159572161738214,
"kl": 0.1298828125,
"learning_rate": 9.345355731225297e-07,
"loss": 0.0052,
"reward": 2.4963176250457764,
"reward_std": 0.1702859252691269,
"rewards/accuracy_reward_stage2": 0.5770467519760132,
"rewards/format_reward_all_stage": 1.9192708730697632,
"scores/refine_times": 1.4375,
"step": 266
},
{
"completion_length": 94.671875,
"epoch": 0.06595849802371541,
"grad_norm": 3.003493248206179,
"kl": 0.068359375,
"learning_rate": 9.342885375494071e-07,
"loss": 0.0027,
"reward": 2.7748560905456543,
"reward_std": 0.06840323656797409,
"rewards/accuracy_reward_stage2": 0.79048091173172,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.0625,
"step": 267
},
{
"completion_length": 111.65625,
"epoch": 0.06620553359683795,
"grad_norm": 3.0695130053036466,
"kl": 0.0703125,
"learning_rate": 9.340415019762845e-07,
"loss": 0.0028,
"reward": 2.609114170074463,
"reward_std": 0.11976105719804764,
"rewards/accuracy_reward_stage2": 0.6091140508651733,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.21875,
"step": 268
},
{
"completion_length": 128.21875,
"epoch": 0.06645256916996048,
"grad_norm": 3.7341130409363754,
"kl": 0.08349609375,
"learning_rate": 9.337944664031621e-07,
"loss": 0.0033,
"reward": 2.717738151550293,
"reward_std": 0.19703274965286255,
"rewards/accuracy_reward_stage2": 0.8271132707595825,
"rewards/format_reward_all_stage": 1.890625,
"scores/refine_times": 1.390625,
"step": 269
},
{
"completion_length": 122.96875,
"epoch": 0.06669960474308301,
"grad_norm": 3.5044994368102524,
"kl": 0.07958984375,
"learning_rate": 9.335474308300395e-07,
"loss": 0.0032,
"reward": 2.6777563095092773,
"reward_std": 0.08365817368030548,
"rewards/accuracy_reward_stage2": 0.7402562499046326,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.359375,
"step": 270
},
{
"completion_length": 145.765625,
"epoch": 0.06694664031620554,
"grad_norm": 1.1667471179803168,
"kl": 0.068359375,
"learning_rate": 9.333003952569169e-07,
"loss": 0.0027,
"reward": 2.629354953765869,
"reward_std": 0.06412048637866974,
"rewards/accuracy_reward_stage2": 0.6606047749519348,
"rewards/format_reward_all_stage": 1.96875,
"scores/refine_times": 1.4375,
"step": 271
},
{
"completion_length": 103.90625,
"epoch": 0.06719367588932806,
"grad_norm": 4.510053925417986,
"kl": 0.072265625,
"learning_rate": 9.330533596837944e-07,
"loss": 0.0029,
"reward": 2.509962797164917,
"reward_std": 0.11727539449930191,
"rewards/accuracy_reward_stage2": 0.5099626779556274,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.203125,
"step": 272
},
{
"completion_length": 104.140625,
"epoch": 0.06744071146245059,
"grad_norm": 4.329434972997867,
"kl": 0.07861328125,
"learning_rate": 9.328063241106719e-07,
"loss": 0.0032,
"reward": 2.725374221801758,
"reward_std": 0.0988876223564148,
"rewards/accuracy_reward_stage2": 0.7253742218017578,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.171875,
"step": 273
},
{
"completion_length": 103.125,
"epoch": 0.06768774703557312,
"grad_norm": 1.6312530479379586,
"kl": 0.0771484375,
"learning_rate": 9.325592885375494e-07,
"loss": 0.0031,
"reward": 2.735292673110962,
"reward_std": 0.05476506054401398,
"rewards/accuracy_reward_stage2": 0.7509176731109619,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.234375,
"step": 274
},
{
"completion_length": 122.015625,
"epoch": 0.06793478260869565,
"grad_norm": 3.165395542950571,
"kl": 0.07470703125,
"learning_rate": 9.323122529644269e-07,
"loss": 0.003,
"reward": 2.4879045486450195,
"reward_std": 0.10839006304740906,
"rewards/accuracy_reward_stage2": 0.498320996761322,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.265625,
"step": 275
},
{
"completion_length": 104.796875,
"epoch": 0.06818181818181818,
"grad_norm": 2.9714488899285985,
"kl": 0.076171875,
"learning_rate": 9.320652173913043e-07,
"loss": 0.003,
"reward": 2.40559983253479,
"reward_std": 0.019545651972293854,
"rewards/accuracy_reward_stage2": 0.40559983253479004,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 276
},
{
"completion_length": 110.53125,
"epoch": 0.0684288537549407,
"grad_norm": 3.719685696298156,
"kl": 0.06298828125,
"learning_rate": 9.318181818181817e-07,
"loss": 0.0025,
"reward": 2.709977149963379,
"reward_std": 0.09086070209741592,
"rewards/accuracy_reward_stage2": 0.7672686576843262,
"rewards/format_reward_all_stage": 1.9427083730697632,
"scores/refine_times": 1.28125,
"step": 277
},
{
"completion_length": 127.390625,
"epoch": 0.06867588932806325,
"grad_norm": 4.0171584662369675,
"kl": 0.07666015625,
"learning_rate": 9.315711462450593e-07,
"loss": 0.0031,
"reward": 2.358762264251709,
"reward_std": 0.09046010673046112,
"rewards/accuracy_reward_stage2": 0.3712621331214905,
"rewards/format_reward_all_stage": 1.9874999523162842,
"scores/refine_times": 1.375,
"step": 278
},
{
"completion_length": 96.125,
"epoch": 0.06892292490118578,
"grad_norm": 3.549293268141929,
"kl": 0.0859375,
"learning_rate": 9.313241106719367e-07,
"loss": 0.0034,
"reward": 2.6358108520507812,
"reward_std": 0.05308837443590164,
"rewards/accuracy_reward_stage2": 0.6514356136322021,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.25,
"step": 279
},
{
"completion_length": 111.5625,
"epoch": 0.0691699604743083,
"grad_norm": 3.426826676856291,
"kl": 0.0859375,
"learning_rate": 9.310770750988141e-07,
"loss": 0.0034,
"reward": 2.629148006439209,
"reward_std": 0.22529111802577972,
"rewards/accuracy_reward_stage2": 0.7541481256484985,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.34375,
"step": 280
},
{
"completion_length": 69.171875,
"epoch": 0.06941699604743083,
"grad_norm": 4.897619808608827,
"kl": 0.0888671875,
"learning_rate": 9.308300395256916e-07,
"loss": 0.0036,
"reward": 2.506143093109131,
"reward_std": 0.1796988546848297,
"rewards/accuracy_reward_stage2": 0.5061431527137756,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 281
},
{
"completion_length": 121.375,
"epoch": 0.06966403162055336,
"grad_norm": 3.280396954123298,
"kl": 0.0732421875,
"learning_rate": 9.305830039525691e-07,
"loss": 0.0029,
"reward": 2.4993786811828613,
"reward_std": 0.15204885601997375,
"rewards/accuracy_reward_stage2": 0.5566701889038086,
"rewards/format_reward_all_stage": 1.9427083730697632,
"scores/refine_times": 1.34375,
"step": 282
},
{
"completion_length": 100.046875,
"epoch": 0.06991106719367589,
"grad_norm": 3.1470297174282957,
"kl": 0.05859375,
"learning_rate": 9.303359683794467e-07,
"loss": 0.0023,
"reward": 2.753185749053955,
"reward_std": 0.04245923087000847,
"rewards/accuracy_reward_stage2": 0.7531858086585999,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.28125,
"step": 283
},
{
"completion_length": 107.484375,
"epoch": 0.07015810276679842,
"grad_norm": 4.1904751636477195,
"kl": 0.06494140625,
"learning_rate": 9.300889328063241e-07,
"loss": 0.0026,
"reward": 2.5296671390533447,
"reward_std": 0.1889445036649704,
"rewards/accuracy_reward_stage2": 0.5973755717277527,
"rewards/format_reward_all_stage": 1.9322917461395264,
"scores/refine_times": 1.328125,
"step": 284
},
{
"completion_length": 91.015625,
"epoch": 0.07040513833992094,
"grad_norm": 3.489184308124264,
"kl": 0.05517578125,
"learning_rate": 9.298418972332015e-07,
"loss": 0.0022,
"reward": 2.7620816230773926,
"reward_std": 0.030113043263554573,
"rewards/accuracy_reward_stage2": 0.7620817422866821,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 285
},
{
"completion_length": 100.1875,
"epoch": 0.07065217391304347,
"grad_norm": 3.275397968060478,
"kl": 0.06396484375,
"learning_rate": 9.295948616600791e-07,
"loss": 0.0026,
"reward": 2.6670258045196533,
"reward_std": 0.13048681616783142,
"rewards/accuracy_reward_stage2": 0.6670258641242981,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.53125,
"step": 286
},
{
"completion_length": 126.015625,
"epoch": 0.070899209486166,
"grad_norm": 2.6549645591790405,
"kl": 0.06103515625,
"learning_rate": 9.293478260869565e-07,
"loss": 0.0024,
"reward": 2.641623020172119,
"reward_std": 0.22553008794784546,
"rewards/accuracy_reward_stage2": 0.774956226348877,
"rewards/format_reward_all_stage": 1.8666666746139526,
"scores/refine_times": 1.546875,
"step": 287
},
{
"completion_length": 104.671875,
"epoch": 0.07114624505928854,
"grad_norm": 3.636209601421511,
"kl": 0.06640625,
"learning_rate": 9.291007905138339e-07,
"loss": 0.0027,
"reward": 2.666322946548462,
"reward_std": 0.06766237318515778,
"rewards/accuracy_reward_stage2": 0.6741354465484619,
"rewards/format_reward_all_stage": 1.9921875,
"scores/refine_times": 1.34375,
"step": 288
},
{
"completion_length": 106.640625,
"epoch": 0.07139328063241107,
"grad_norm": 4.503204838778139,
"kl": 0.0595703125,
"learning_rate": 9.288537549407114e-07,
"loss": 0.0024,
"reward": 2.3697216510772705,
"reward_std": 0.09033536165952682,
"rewards/accuracy_reward_stage2": 0.3697216212749481,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.3125,
"step": 289
},
{
"completion_length": 104.0,
"epoch": 0.0716403162055336,
"grad_norm": 3.3246057454714024,
"kl": 0.06591796875,
"learning_rate": 9.286067193675889e-07,
"loss": 0.0026,
"reward": 2.5884175300598145,
"reward_std": 0.11718533933162689,
"rewards/accuracy_reward_stage2": 0.594667375087738,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.296875,
"step": 290
},
{
"completion_length": 103.296875,
"epoch": 0.07188735177865613,
"grad_norm": 3.7899360784183123,
"kl": 0.057861328125,
"learning_rate": 9.283596837944663e-07,
"loss": 0.0023,
"reward": 2.4471476078033447,
"reward_std": 0.12509356439113617,
"rewards/accuracy_reward_stage2": 0.5096475481987,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.3125,
"step": 291
},
{
"completion_length": 131.578125,
"epoch": 0.07213438735177866,
"grad_norm": 3.623459949472801,
"kl": 0.06591796875,
"learning_rate": 9.281126482213438e-07,
"loss": 0.0026,
"reward": 2.4355835914611816,
"reward_std": 0.17318613827228546,
"rewards/accuracy_reward_stage2": 0.5147501230239868,
"rewards/format_reward_all_stage": 1.9208333492279053,
"scores/refine_times": 1.4375,
"step": 292
},
{
"completion_length": 83.125,
"epoch": 0.07238142292490118,
"grad_norm": 4.8813524232727366,
"kl": 0.072265625,
"learning_rate": 9.278656126482213e-07,
"loss": 0.0029,
"reward": 2.58034610748291,
"reward_std": 0.18158404529094696,
"rewards/accuracy_reward_stage2": 0.5907625555992126,
"rewards/format_reward_all_stage": 1.9895832538604736,
"scores/refine_times": 1.15625,
"step": 293
},
{
"completion_length": 90.375,
"epoch": 0.07262845849802371,
"grad_norm": 3.4214252970122123,
"kl": 0.068359375,
"learning_rate": 9.276185770750988e-07,
"loss": 0.0027,
"reward": 2.790419578552246,
"reward_std": 0.07727587223052979,
"rewards/accuracy_reward_stage2": 0.8154194355010986,
"rewards/format_reward_all_stage": 1.975000023841858,
"scores/refine_times": 1.171875,
"step": 294
},
{
"completion_length": 97.0625,
"epoch": 0.07287549407114624,
"grad_norm": 2.9700993517489906,
"kl": 0.08349609375,
"learning_rate": 9.273715415019763e-07,
"loss": 0.0033,
"reward": 2.4873194694519043,
"reward_std": 0.01997227966785431,
"rewards/accuracy_reward_stage2": 0.4873194694519043,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 295
},
{
"completion_length": 72.28125,
"epoch": 0.07312252964426877,
"grad_norm": 3.329369020437154,
"kl": 0.0654296875,
"learning_rate": 9.271245059288537e-07,
"loss": 0.0026,
"reward": 2.7289249897003174,
"reward_std": 0.08622775226831436,
"rewards/accuracy_reward_stage2": 0.7289249300956726,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 296
},
{
"completion_length": 89.90625,
"epoch": 0.07336956521739131,
"grad_norm": 2.664881556264298,
"kl": 0.06689453125,
"learning_rate": 9.268774703557312e-07,
"loss": 0.0027,
"reward": 2.809821605682373,
"reward_std": 0.042036011815071106,
"rewards/accuracy_reward_stage2": 0.8202384114265442,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.234375,
"step": 297
},
{
"completion_length": 96.640625,
"epoch": 0.07361660079051384,
"grad_norm": 5.559187183825259,
"kl": 0.078125,
"learning_rate": 9.266304347826086e-07,
"loss": 0.0031,
"reward": 2.4564993381500244,
"reward_std": 0.17395678162574768,
"rewards/accuracy_reward_stage2": 0.5398328304290771,
"rewards/format_reward_all_stage": 1.9166667461395264,
"scores/refine_times": 1.34375,
"step": 298
},
{
"completion_length": 87.515625,
"epoch": 0.07386363636363637,
"grad_norm": 4.704220127005476,
"kl": 0.0869140625,
"learning_rate": 9.263833992094861e-07,
"loss": 0.0035,
"reward": 2.6666579246520996,
"reward_std": 0.06254095584154129,
"rewards/accuracy_reward_stage2": 0.6666580438613892,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.203125,
"step": 299
},
{
"completion_length": 84.578125,
"epoch": 0.0741106719367589,
"grad_norm": 4.262104472530315,
"kl": 0.07666015625,
"learning_rate": 9.261363636363636e-07,
"loss": 0.0031,
"reward": 2.7805142402648926,
"reward_std": 0.08827356994152069,
"rewards/accuracy_reward_stage2": 0.7961392998695374,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.1875,
"step": 300
},
{
"completion_length": 77.40625,
"epoch": 0.07435770750988142,
"grad_norm": 4.096353557974333,
"kl": 0.10693359375,
"learning_rate": 9.25889328063241e-07,
"loss": 0.0043,
"reward": 2.542158603668213,
"reward_std": 0.06200522556900978,
"rewards/accuracy_reward_stage2": 0.5421587228775024,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.15625,
"step": 301
},
{
"completion_length": 78.78125,
"epoch": 0.07460474308300395,
"grad_norm": 4.001605680444497,
"kl": 0.0830078125,
"learning_rate": 9.256422924901185e-07,
"loss": 0.0033,
"reward": 2.708712339401245,
"reward_std": 0.006489424966275692,
"rewards/accuracy_reward_stage2": 0.7087122797966003,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 302
},
{
"completion_length": 81.09375,
"epoch": 0.07485177865612648,
"grad_norm": 3.3709487646989804,
"kl": 0.10546875,
"learning_rate": 9.253952569169961e-07,
"loss": 0.0042,
"reward": 2.5341782569885254,
"reward_std": 0.15571407973766327,
"rewards/accuracy_reward_stage2": 0.6591783761978149,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0625,
"step": 303
},
{
"completion_length": 65.484375,
"epoch": 0.07509881422924901,
"grad_norm": 3.3434150332690695,
"kl": 0.072265625,
"learning_rate": 9.251482213438735e-07,
"loss": 0.0029,
"reward": 2.7836451530456543,
"reward_std": 0.04097224026918411,
"rewards/accuracy_reward_stage2": 0.7836451530456543,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 304
},
{
"completion_length": 75.140625,
"epoch": 0.07534584980237154,
"grad_norm": 4.073041421040428,
"kl": 0.06884765625,
"learning_rate": 9.24901185770751e-07,
"loss": 0.0028,
"reward": 2.7703256607055664,
"reward_std": 0.06903526932001114,
"rewards/accuracy_reward_stage2": 0.7703255414962769,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 305
},
{
"completion_length": 86.875,
"epoch": 0.07559288537549406,
"grad_norm": 4.9587612243478905,
"kl": 0.0986328125,
"learning_rate": 9.246541501976284e-07,
"loss": 0.0039,
"reward": 2.566884994506836,
"reward_std": 0.11869431287050247,
"rewards/accuracy_reward_stage2": 0.5825099945068359,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.25,
"step": 306
},
{
"completion_length": 78.734375,
"epoch": 0.07583992094861661,
"grad_norm": 5.010353844664205,
"kl": 0.09423828125,
"learning_rate": 9.244071146245059e-07,
"loss": 0.0038,
"reward": 2.725599527359009,
"reward_std": 0.09302526712417603,
"rewards/accuracy_reward_stage2": 0.7255995273590088,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 307
},
{
"completion_length": 69.375,
"epoch": 0.07608695652173914,
"grad_norm": 2.452183271447089,
"kl": 0.09228515625,
"learning_rate": 9.241600790513834e-07,
"loss": 0.0037,
"reward": 2.8343749046325684,
"reward_std": 0.07092030346393585,
"rewards/accuracy_reward_stage2": 0.8343750238418579,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 308
},
{
"completion_length": 72.84375,
"epoch": 0.07633399209486166,
"grad_norm": 6.377563585569854,
"kl": 0.1240234375,
"learning_rate": 9.239130434782608e-07,
"loss": 0.005,
"reward": 2.4583377838134766,
"reward_std": 0.15416155755519867,
"rewards/accuracy_reward_stage2": 0.5260462760925293,
"rewards/format_reward_all_stage": 1.9322917461395264,
"scores/refine_times": 1.28125,
"step": 309
},
{
"completion_length": 63.6875,
"epoch": 0.07658102766798419,
"grad_norm": 3.40096591722913,
"kl": 0.1025390625,
"learning_rate": 9.236660079051382e-07,
"loss": 0.0041,
"reward": 2.791654348373413,
"reward_std": 0.04698639735579491,
"rewards/accuracy_reward_stage2": 0.7916543483734131,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 310
},
{
"completion_length": 69.328125,
"epoch": 0.07682806324110672,
"grad_norm": 2.5157220101442923,
"kl": 0.1376953125,
"learning_rate": 9.234189723320159e-07,
"loss": 0.0055,
"reward": 2.621166706085205,
"reward_std": 0.01371072232723236,
"rewards/accuracy_reward_stage2": 0.6211665868759155,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 311
},
{
"completion_length": 73.40625,
"epoch": 0.07707509881422925,
"grad_norm": 4.6095300665340195,
"kl": 0.1044921875,
"learning_rate": 9.231719367588933e-07,
"loss": 0.0042,
"reward": 2.7523388862609863,
"reward_std": 0.14407417178153992,
"rewards/accuracy_reward_stage2": 0.7679637670516968,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.140625,
"step": 312
},
{
"completion_length": 66.40625,
"epoch": 0.07732213438735178,
"grad_norm": 1.9319584273510186,
"kl": 0.11181640625,
"learning_rate": 9.229249011857707e-07,
"loss": 0.0045,
"reward": 2.7965087890625,
"reward_std": 0.010140997357666492,
"rewards/accuracy_reward_stage2": 0.7965086698532104,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 313
},
{
"completion_length": 63.0,
"epoch": 0.0775691699604743,
"grad_norm": 4.348923290801355,
"kl": 0.12158203125,
"learning_rate": 9.226778656126482e-07,
"loss": 0.0049,
"reward": 2.4945788383483887,
"reward_std": 0.027087727561593056,
"rewards/accuracy_reward_stage2": 0.49457883834838867,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 314
},
{
"completion_length": 73.53125,
"epoch": 0.07781620553359683,
"grad_norm": 4.488593163355316,
"kl": 0.1298828125,
"learning_rate": 9.224308300395256e-07,
"loss": 0.0052,
"reward": 2.7206943035125732,
"reward_std": 0.11720685660839081,
"rewards/accuracy_reward_stage2": 0.7206943035125732,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 315
},
{
"completion_length": 70.625,
"epoch": 0.07806324110671936,
"grad_norm": 6.557602468987205,
"kl": 0.119140625,
"learning_rate": 9.221837944664031e-07,
"loss": 0.0048,
"reward": 2.5555434226989746,
"reward_std": 0.14352944493293762,
"rewards/accuracy_reward_stage2": 0.5555435419082642,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 316
},
{
"completion_length": 91.765625,
"epoch": 0.0783102766798419,
"grad_norm": 3.0896623958939915,
"kl": 0.07666015625,
"learning_rate": 9.219367588932806e-07,
"loss": 0.0031,
"reward": 2.527045726776123,
"reward_std": 0.10674090683460236,
"rewards/accuracy_reward_stage2": 0.5473582744598389,
"rewards/format_reward_all_stage": 1.9796874523162842,
"scores/refine_times": 1.28125,
"step": 317
},
{
"completion_length": 65.8125,
"epoch": 0.07855731225296443,
"grad_norm": 4.73674356473737,
"kl": 0.125,
"learning_rate": 9.21689723320158e-07,
"loss": 0.005,
"reward": 2.770782947540283,
"reward_std": 0.07048628479242325,
"rewards/accuracy_reward_stage2": 0.7707828879356384,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 318
},
{
"completion_length": 78.328125,
"epoch": 0.07880434782608696,
"grad_norm": 4.422531982012408,
"kl": 0.12158203125,
"learning_rate": 9.214426877470354e-07,
"loss": 0.0049,
"reward": 2.5821871757507324,
"reward_std": 0.0276879221200943,
"rewards/accuracy_reward_stage2": 0.5821871161460876,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 319
},
{
"completion_length": 59.6875,
"epoch": 0.07905138339920949,
"grad_norm": 6.29720713319933,
"kl": 0.111328125,
"learning_rate": 9.211956521739131e-07,
"loss": 0.0045,
"reward": 2.796858310699463,
"reward_std": 0.11641418933868408,
"rewards/accuracy_reward_stage2": 0.7968584299087524,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 320
},
{
"completion_length": 62.125,
"epoch": 0.07929841897233202,
"grad_norm": 6.247716420446892,
"kl": 0.1015625,
"learning_rate": 9.209486166007905e-07,
"loss": 0.0041,
"reward": 2.609973430633545,
"reward_std": 0.14433087408542633,
"rewards/accuracy_reward_stage2": 0.6099736094474792,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 321
},
{
"completion_length": 44.1875,
"epoch": 0.07954545454545454,
"grad_norm": 2.1669285375452203,
"kl": 0.1279296875,
"learning_rate": 9.20701581027668e-07,
"loss": 0.0051,
"reward": 2.913513422012329,
"reward_std": 0.0015782499685883522,
"rewards/accuracy_reward_stage2": 0.9135133624076843,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 322
},
{
"completion_length": 54.5625,
"epoch": 0.07979249011857707,
"grad_norm": 5.141224866590071,
"kl": 0.12158203125,
"learning_rate": 9.204545454545454e-07,
"loss": 0.0049,
"reward": 2.738158702850342,
"reward_std": 0.031038541346788406,
"rewards/accuracy_reward_stage2": 0.7381587028503418,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 323
},
{
"completion_length": 60.625,
"epoch": 0.0800395256916996,
"grad_norm": 7.40413453314316,
"kl": 0.1298828125,
"learning_rate": 9.202075098814229e-07,
"loss": 0.0052,
"reward": 2.398099660873413,
"reward_std": 0.13175323605537415,
"rewards/accuracy_reward_stage2": 0.39809975028038025,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 324
},
{
"completion_length": 54.5625,
"epoch": 0.08028656126482213,
"grad_norm": 4.312175786875359,
"kl": 0.1376953125,
"learning_rate": 9.199604743083004e-07,
"loss": 0.0055,
"reward": 2.7443251609802246,
"reward_std": 0.012218557298183441,
"rewards/accuracy_reward_stage2": 0.7443252801895142,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 325
},
{
"completion_length": 39.75,
"epoch": 0.08053359683794467,
"grad_norm": 2.5230177648811254,
"kl": 0.22265625,
"learning_rate": 9.197134387351778e-07,
"loss": 0.0089,
"reward": 2.817213773727417,
"reward_std": 0.0013947999104857445,
"rewards/accuracy_reward_stage2": 0.817213773727417,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 326
},
{
"completion_length": 43.5625,
"epoch": 0.0807806324110672,
"grad_norm": 0.6751401066284123,
"kl": 0.19140625,
"learning_rate": 9.194664031620552e-07,
"loss": 0.0077,
"reward": 2.948148250579834,
"reward_std": 0.0,
"rewards/accuracy_reward_stage2": 0.9481481313705444,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 327
},
{
"completion_length": 45.9375,
"epoch": 0.08102766798418973,
"grad_norm": 5.102879547791845,
"kl": 0.2001953125,
"learning_rate": 9.192193675889328e-07,
"loss": 0.008,
"reward": 2.7917566299438477,
"reward_std": 0.06986243277788162,
"rewards/accuracy_reward_stage2": 0.7917565107345581,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 328
},
{
"completion_length": 55.828125,
"epoch": 0.08127470355731226,
"grad_norm": 5.398203863071762,
"kl": 0.251953125,
"learning_rate": 9.189723320158103e-07,
"loss": 0.0101,
"reward": 2.423312187194824,
"reward_std": 0.2542467713356018,
"rewards/accuracy_reward_stage2": 0.5483123064041138,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.328125,
"step": 329
},
{
"completion_length": 68.328125,
"epoch": 0.08152173913043478,
"grad_norm": 3.744099222638442,
"kl": 0.1376953125,
"learning_rate": 9.187252964426877e-07,
"loss": 0.0055,
"reward": 2.623687744140625,
"reward_std": 0.061280906200408936,
"rewards/accuracy_reward_stage2": 0.6236876249313354,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.09375,
"step": 330
},
{
"completion_length": 61.140625,
"epoch": 0.08176877470355731,
"grad_norm": 4.174389727633853,
"kl": 0.1484375,
"learning_rate": 9.184782608695652e-07,
"loss": 0.0059,
"reward": 2.789412021636963,
"reward_std": 0.018945466727018356,
"rewards/accuracy_reward_stage2": 0.7894119620323181,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 331
},
{
"completion_length": 60.0,
"epoch": 0.08201581027667984,
"grad_norm": 5.617047129327734,
"kl": 0.130859375,
"learning_rate": 9.182312252964426e-07,
"loss": 0.0052,
"reward": 2.493664503097534,
"reward_std": 0.08623480051755905,
"rewards/accuracy_reward_stage2": 0.5092895030975342,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.140625,
"step": 332
},
{
"completion_length": 61.5,
"epoch": 0.08226284584980237,
"grad_norm": 4.192545514357886,
"kl": 0.15625,
"learning_rate": 9.179841897233202e-07,
"loss": 0.0063,
"reward": 2.833590030670166,
"reward_std": 0.03723323345184326,
"rewards/accuracy_reward_stage2": 0.8335901498794556,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 333
},
{
"completion_length": 70.625,
"epoch": 0.0825098814229249,
"grad_norm": 4.124911374118259,
"kl": 0.134765625,
"learning_rate": 9.177371541501976e-07,
"loss": 0.0054,
"reward": 2.8245418071746826,
"reward_std": 0.07523417472839355,
"rewards/accuracy_reward_stage2": 0.8245418071746826,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 334
},
{
"completion_length": 57.75,
"epoch": 0.08275691699604742,
"grad_norm": 4.044678481099241,
"kl": 0.150390625,
"learning_rate": 9.17490118577075e-07,
"loss": 0.006,
"reward": 2.784482955932617,
"reward_std": 0.0903862714767456,
"rewards/accuracy_reward_stage2": 0.7844830751419067,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 335
},
{
"completion_length": 64.4375,
"epoch": 0.08300395256916997,
"grad_norm": 4.32490069299913,
"kl": 0.12451171875,
"learning_rate": 9.172430830039525e-07,
"loss": 0.005,
"reward": 2.792992115020752,
"reward_std": 0.07582361996173859,
"rewards/accuracy_reward_stage2": 0.792992115020752,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 336
},
{
"completion_length": 68.875,
"epoch": 0.0832509881422925,
"grad_norm": 3.926256955205469,
"kl": 0.1064453125,
"learning_rate": 9.1699604743083e-07,
"loss": 0.0043,
"reward": 2.8083438873291016,
"reward_std": 0.11306163668632507,
"rewards/accuracy_reward_stage2": 0.8083438873291016,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 337
},
{
"completion_length": 70.75,
"epoch": 0.08349802371541502,
"grad_norm": 3.9512075265777455,
"kl": 0.1181640625,
"learning_rate": 9.167490118577074e-07,
"loss": 0.0047,
"reward": 2.7581210136413574,
"reward_std": 0.07616296410560608,
"rewards/accuracy_reward_stage2": 0.7581211924552917,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 338
},
{
"completion_length": 75.375,
"epoch": 0.08374505928853755,
"grad_norm": 4.658203187374032,
"kl": 0.115234375,
"learning_rate": 9.16501976284585e-07,
"loss": 0.0046,
"reward": 2.5720980167388916,
"reward_std": 0.10662204772233963,
"rewards/accuracy_reward_stage2": 0.5720980167388916,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 339
},
{
"completion_length": 80.34375,
"epoch": 0.08399209486166008,
"grad_norm": 4.101756140968318,
"kl": 0.11328125,
"learning_rate": 9.162549407114624e-07,
"loss": 0.0045,
"reward": 2.7045812606811523,
"reward_std": 0.0755821019411087,
"rewards/accuracy_reward_stage2": 0.7045812606811523,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 340
},
{
"completion_length": 66.0,
"epoch": 0.08423913043478261,
"grad_norm": 3.7647913238998947,
"kl": 0.123046875,
"learning_rate": 9.160079051383399e-07,
"loss": 0.0049,
"reward": 2.548119068145752,
"reward_std": 0.07794924080371857,
"rewards/accuracy_reward_stage2": 0.5481189489364624,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 341
},
{
"completion_length": 87.84375,
"epoch": 0.08448616600790514,
"grad_norm": 3.5158554366688306,
"kl": 0.10107421875,
"learning_rate": 9.157608695652174e-07,
"loss": 0.004,
"reward": 2.307880163192749,
"reward_std": 0.0058220368809998035,
"rewards/accuracy_reward_stage2": 0.37038010358810425,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.0625,
"step": 342
},
{
"completion_length": 68.9375,
"epoch": 0.08473320158102766,
"grad_norm": 4.574586844445791,
"kl": 0.1171875,
"learning_rate": 9.155138339920948e-07,
"loss": 0.0047,
"reward": 2.369117259979248,
"reward_std": 0.0667356476187706,
"rewards/accuracy_reward_stage2": 0.36911740899086,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 343
},
{
"completion_length": 72.625,
"epoch": 0.08498023715415019,
"grad_norm": 2.595510068327524,
"kl": 0.0947265625,
"learning_rate": 9.152667984189722e-07,
"loss": 0.0038,
"reward": 2.610255718231201,
"reward_std": 0.04115435481071472,
"rewards/accuracy_reward_stage2": 0.6102556586265564,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 344
},
{
"completion_length": 86.5,
"epoch": 0.08522727272727272,
"grad_norm": 3.6823970485562,
"kl": 0.08740234375,
"learning_rate": 9.150197628458498e-07,
"loss": 0.0035,
"reward": 2.6106173992156982,
"reward_std": 0.13690951466560364,
"rewards/accuracy_reward_stage2": 0.7356172800064087,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.078125,
"step": 345
},
{
"completion_length": 83.0625,
"epoch": 0.08547430830039526,
"grad_norm": 3.550714248285621,
"kl": 0.1015625,
"learning_rate": 9.147727272727272e-07,
"loss": 0.0041,
"reward": 2.5705089569091797,
"reward_std": 0.020351896062493324,
"rewards/accuracy_reward_stage2": 0.5705088973045349,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 346
},
{
"completion_length": 80.9375,
"epoch": 0.08572134387351779,
"grad_norm": 6.048964304026029,
"kl": 0.10107421875,
"learning_rate": 9.145256916996046e-07,
"loss": 0.004,
"reward": 2.619798183441162,
"reward_std": 0.1754463016986847,
"rewards/accuracy_reward_stage2": 0.6197980642318726,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 347
},
{
"completion_length": 97.828125,
"epoch": 0.08596837944664032,
"grad_norm": 3.753761799406494,
"kl": 0.07763671875,
"learning_rate": 9.142786561264822e-07,
"loss": 0.0031,
"reward": 2.7262320518493652,
"reward_std": 0.10837259888648987,
"rewards/accuracy_reward_stage2": 0.7324820160865784,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.140625,
"step": 348
},
{
"completion_length": 72.71875,
"epoch": 0.08621541501976285,
"grad_norm": 4.3265197363111385,
"kl": 0.109375,
"learning_rate": 9.140316205533597e-07,
"loss": 0.0044,
"reward": 2.7120370864868164,
"reward_std": 0.12224727869033813,
"rewards/accuracy_reward_stage2": 0.7120370864868164,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 349
},
{
"completion_length": 74.1875,
"epoch": 0.08646245059288538,
"grad_norm": 2.1002722035451935,
"kl": 0.09912109375,
"learning_rate": 9.137845849802372e-07,
"loss": 0.004,
"reward": 2.612084150314331,
"reward_std": 0.00017338224279228598,
"rewards/accuracy_reward_stage2": 0.6120842099189758,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 350
},
{
"completion_length": 90.875,
"epoch": 0.0867094861660079,
"grad_norm": 2.865518683426728,
"kl": 0.06591796875,
"learning_rate": 9.135375494071146e-07,
"loss": 0.0026,
"reward": 2.7866175174713135,
"reward_std": 0.0579838864505291,
"rewards/accuracy_reward_stage2": 0.7866175174713135,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 351
},
{
"completion_length": 98.328125,
"epoch": 0.08695652173913043,
"grad_norm": 4.334486790649066,
"kl": 0.0830078125,
"learning_rate": 9.13290513833992e-07,
"loss": 0.0033,
"reward": 2.560479164123535,
"reward_std": 0.028959453105926514,
"rewards/accuracy_reward_stage2": 0.5604792833328247,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 352
},
{
"completion_length": 90.0,
"epoch": 0.08720355731225296,
"grad_norm": 2.8099941143702236,
"kl": 0.0654296875,
"learning_rate": 9.130434782608695e-07,
"loss": 0.0026,
"reward": 2.7252159118652344,
"reward_std": 0.03723974525928497,
"rewards/accuracy_reward_stage2": 0.7252160310745239,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 353
},
{
"completion_length": 95.09375,
"epoch": 0.08745059288537549,
"grad_norm": 3.8428516026086887,
"kl": 0.08544921875,
"learning_rate": 9.12796442687747e-07,
"loss": 0.0034,
"reward": 2.444223642349243,
"reward_std": 0.01821870729327202,
"rewards/accuracy_reward_stage2": 0.4442237615585327,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 354
},
{
"completion_length": 101.8125,
"epoch": 0.08769762845849802,
"grad_norm": 4.237657395100437,
"kl": 0.08740234375,
"learning_rate": 9.125494071146244e-07,
"loss": 0.0035,
"reward": 2.5968141555786133,
"reward_std": 0.09012407064437866,
"rewards/accuracy_reward_stage2": 0.5968142151832581,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 355
},
{
"completion_length": 92.4375,
"epoch": 0.08794466403162056,
"grad_norm": 3.600620223298913,
"kl": 0.07421875,
"learning_rate": 9.123023715415019e-07,
"loss": 0.003,
"reward": 2.542025327682495,
"reward_std": 0.05679365620017052,
"rewards/accuracy_reward_stage2": 0.5420252680778503,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 356
},
{
"completion_length": 93.0,
"epoch": 0.08819169960474309,
"grad_norm": 4.863920897927563,
"kl": 0.08740234375,
"learning_rate": 9.120553359683793e-07,
"loss": 0.0035,
"reward": 2.5952115058898926,
"reward_std": 0.0393090695142746,
"rewards/accuracy_reward_stage2": 0.5952116250991821,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 357
},
{
"completion_length": 96.109375,
"epoch": 0.08843873517786562,
"grad_norm": 1.7790679985594122,
"kl": 0.10888671875,
"learning_rate": 9.11808300395257e-07,
"loss": 0.0044,
"reward": 2.6337594985961914,
"reward_std": 0.085614413022995,
"rewards/accuracy_reward_stage2": 0.6962594985961914,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.125,
"step": 358
},
{
"completion_length": 83.8125,
"epoch": 0.08868577075098814,
"grad_norm": 3.551887755670501,
"kl": 0.0615234375,
"learning_rate": 9.115612648221344e-07,
"loss": 0.0025,
"reward": 2.706885814666748,
"reward_std": 0.08412647247314453,
"rewards/accuracy_reward_stage2": 0.7068856954574585,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 359
},
{
"completion_length": 85.875,
"epoch": 0.08893280632411067,
"grad_norm": 3.9494113031278744,
"kl": 0.1015625,
"learning_rate": 9.113142292490118e-07,
"loss": 0.0041,
"reward": 2.6664719581604004,
"reward_std": 0.028600279241800308,
"rewards/accuracy_reward_stage2": 0.6664718389511108,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 360
},
{
"completion_length": 82.625,
"epoch": 0.0891798418972332,
"grad_norm": 3.650417991791406,
"kl": 0.12890625,
"learning_rate": 9.110671936758893e-07,
"loss": 0.0052,
"reward": 2.6980035305023193,
"reward_std": 0.020932497456669807,
"rewards/accuracy_reward_stage2": 0.6980035305023193,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 361
},
{
"completion_length": 90.6875,
"epoch": 0.08942687747035573,
"grad_norm": 1.8849199154834935,
"kl": 0.06298828125,
"learning_rate": 9.108201581027668e-07,
"loss": 0.0025,
"reward": 2.5484163761138916,
"reward_std": 0.0038587902672588825,
"rewards/accuracy_reward_stage2": 0.5484163165092468,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 362
},
{
"completion_length": 72.0625,
"epoch": 0.08967391304347826,
"grad_norm": 3.494331373222489,
"kl": 0.08740234375,
"learning_rate": 9.105731225296442e-07,
"loss": 0.0035,
"reward": 2.810746431350708,
"reward_std": 0.07636210322380066,
"rewards/accuracy_reward_stage2": 0.810746431350708,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 363
},
{
"completion_length": 94.203125,
"epoch": 0.08992094861660078,
"grad_norm": 1.7491121800723743,
"kl": 0.07958984375,
"learning_rate": 9.103260869565217e-07,
"loss": 0.0032,
"reward": 2.7079660892486572,
"reward_std": 0.0014026534045115113,
"rewards/accuracy_reward_stage2": 0.707966148853302,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 364
},
{
"completion_length": 94.5625,
"epoch": 0.09016798418972333,
"grad_norm": 5.039313710437697,
"kl": 0.07275390625,
"learning_rate": 9.100790513833991e-07,
"loss": 0.0029,
"reward": 2.6045689582824707,
"reward_std": 0.06207848712801933,
"rewards/accuracy_reward_stage2": 0.6045687794685364,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 365
},
{
"completion_length": 78.5625,
"epoch": 0.09041501976284586,
"grad_norm": 5.0930488398379685,
"kl": 0.0927734375,
"learning_rate": 9.098320158102767e-07,
"loss": 0.0037,
"reward": 2.645688533782959,
"reward_std": 0.06729687005281448,
"rewards/accuracy_reward_stage2": 0.645688533782959,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 366
},
{
"completion_length": 80.125,
"epoch": 0.09066205533596838,
"grad_norm": 3.9786634024319487,
"kl": 0.0615234375,
"learning_rate": 9.095849802371542e-07,
"loss": 0.0025,
"reward": 2.5161356925964355,
"reward_std": 0.027481183409690857,
"rewards/accuracy_reward_stage2": 0.5161359310150146,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 367
},
{
"completion_length": 75.0,
"epoch": 0.09090909090909091,
"grad_norm": 3.523685489400653,
"kl": 0.052734375,
"learning_rate": 9.093379446640316e-07,
"loss": 0.0021,
"reward": 2.816563129425049,
"reward_std": 0.009983880445361137,
"rewards/accuracy_reward_stage2": 0.8165630102157593,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 368
},
{
"completion_length": 92.3125,
"epoch": 0.09115612648221344,
"grad_norm": 4.234417498920322,
"kl": 0.06640625,
"learning_rate": 9.09090909090909e-07,
"loss": 0.0026,
"reward": 2.7531919479370117,
"reward_std": 0.15805090963840485,
"rewards/accuracy_reward_stage2": 0.7531920671463013,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 369
},
{
"completion_length": 83.0625,
"epoch": 0.09140316205533597,
"grad_norm": 3.9114706673580932,
"kl": 0.09033203125,
"learning_rate": 9.088438735177866e-07,
"loss": 0.0036,
"reward": 2.5998778343200684,
"reward_std": 0.025657862424850464,
"rewards/accuracy_reward_stage2": 0.5998777747154236,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 370
},
{
"completion_length": 89.359375,
"epoch": 0.0916501976284585,
"grad_norm": 4.838298835178836,
"kl": 0.08203125,
"learning_rate": 9.08596837944664e-07,
"loss": 0.0033,
"reward": 2.607950448989868,
"reward_std": 0.05628318339586258,
"rewards/accuracy_reward_stage2": 0.6079504489898682,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 371
},
{
"completion_length": 82.28125,
"epoch": 0.09189723320158102,
"grad_norm": 3.7446689639370048,
"kl": 0.06982421875,
"learning_rate": 9.083498023715414e-07,
"loss": 0.0028,
"reward": 2.763612747192383,
"reward_std": 0.06537837535142899,
"rewards/accuracy_reward_stage2": 0.7636126279830933,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 372
},
{
"completion_length": 75.6875,
"epoch": 0.09214426877470355,
"grad_norm": 5.0039114879396855,
"kl": 0.07763671875,
"learning_rate": 9.081027667984189e-07,
"loss": 0.0031,
"reward": 2.7143735885620117,
"reward_std": 0.11156806349754333,
"rewards/accuracy_reward_stage2": 0.7143735885620117,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 373
},
{
"completion_length": 62.0,
"epoch": 0.09239130434782608,
"grad_norm": 2.1401690709477665,
"kl": 0.07421875,
"learning_rate": 9.078557312252963e-07,
"loss": 0.003,
"reward": 2.7579450607299805,
"reward_std": 0.0014047721633687615,
"rewards/accuracy_reward_stage2": 0.75794517993927,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 374
},
{
"completion_length": 74.296875,
"epoch": 0.09263833992094862,
"grad_norm": 3.17288826560754,
"kl": 0.07861328125,
"learning_rate": 9.076086956521739e-07,
"loss": 0.0031,
"reward": 2.858553171157837,
"reward_std": 0.07798619568347931,
"rewards/accuracy_reward_stage2": 0.8585531711578369,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 375
},
{
"completion_length": 83.046875,
"epoch": 0.09288537549407115,
"grad_norm": 4.880063981090028,
"kl": 0.0859375,
"learning_rate": 9.073616600790514e-07,
"loss": 0.0034,
"reward": 2.54172420501709,
"reward_std": 0.07892563194036484,
"rewards/accuracy_reward_stage2": 0.541724443435669,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 376
},
{
"completion_length": 76.625,
"epoch": 0.09313241106719368,
"grad_norm": 3.6385142711792717,
"kl": 0.0732421875,
"learning_rate": 9.071146245059288e-07,
"loss": 0.0029,
"reward": 2.5843570232391357,
"reward_std": 0.09440311044454575,
"rewards/accuracy_reward_stage2": 0.5843569040298462,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 377
},
{
"completion_length": 82.3125,
"epoch": 0.09337944664031621,
"grad_norm": 4.763527255160224,
"kl": 0.06787109375,
"learning_rate": 9.068675889328063e-07,
"loss": 0.0027,
"reward": 2.674133062362671,
"reward_std": 0.14093773066997528,
"rewards/accuracy_reward_stage2": 0.7366331219673157,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.0625,
"step": 378
},
{
"completion_length": 78.140625,
"epoch": 0.09362648221343874,
"grad_norm": 2.188818094769199,
"kl": 0.07763671875,
"learning_rate": 9.066205533596838e-07,
"loss": 0.0031,
"reward": 2.685183525085449,
"reward_std": 0.11984831094741821,
"rewards/accuracy_reward_stage2": 0.7039335370063782,
"rewards/format_reward_all_stage": 1.9812500476837158,
"scores/refine_times": 1.109375,
"step": 379
},
{
"completion_length": 89.515625,
"epoch": 0.09387351778656126,
"grad_norm": 2.065491538033126,
"kl": 0.1748046875,
"learning_rate": 9.063735177865612e-07,
"loss": 0.007,
"reward": 2.7357075214385986,
"reward_std": 0.016317401081323624,
"rewards/accuracy_reward_stage2": 0.7357075214385986,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.25,
"step": 380
},
{
"completion_length": 71.890625,
"epoch": 0.09412055335968379,
"grad_norm": 5.304636175879363,
"kl": 0.08203125,
"learning_rate": 9.061264822134387e-07,
"loss": 0.0033,
"reward": 2.747291088104248,
"reward_std": 0.08369327336549759,
"rewards/accuracy_reward_stage2": 0.7629162073135376,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.0625,
"step": 381
},
{
"completion_length": 81.0,
"epoch": 0.09436758893280632,
"grad_norm": 3.2109084867310376,
"kl": 0.10205078125,
"learning_rate": 9.058794466403161e-07,
"loss": 0.0041,
"reward": 2.59745454788208,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward_stage2": 0.5974544286727905,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.09375,
"step": 382
},
{
"completion_length": 80.265625,
"epoch": 0.09461462450592885,
"grad_norm": 2.77987908921126,
"kl": 0.07421875,
"learning_rate": 9.056324110671936e-07,
"loss": 0.003,
"reward": 2.638756513595581,
"reward_std": 0.03335772827267647,
"rewards/accuracy_reward_stage2": 0.638756513595581,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 383
},
{
"completion_length": 91.359375,
"epoch": 0.09486166007905138,
"grad_norm": 4.517273905721912,
"kl": 0.10205078125,
"learning_rate": 9.053853754940711e-07,
"loss": 0.0041,
"reward": 2.5370707511901855,
"reward_std": 0.04199734330177307,
"rewards/accuracy_reward_stage2": 0.537070631980896,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 384
},
{
"completion_length": 92.0625,
"epoch": 0.09510869565217392,
"grad_norm": 4.582087540119265,
"kl": 0.058349609375,
"learning_rate": 9.051383399209486e-07,
"loss": 0.0023,
"reward": 2.4340434074401855,
"reward_std": 0.17110881209373474,
"rewards/accuracy_reward_stage2": 0.4340435266494751,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 385
},
{
"completion_length": 87.546875,
"epoch": 0.09535573122529645,
"grad_norm": 3.8514679024728227,
"kl": 0.07861328125,
"learning_rate": 9.04891304347826e-07,
"loss": 0.0032,
"reward": 2.805450677871704,
"reward_std": 0.08290667831897736,
"rewards/accuracy_reward_stage2": 0.8054506778717041,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 386
},
{
"completion_length": 111.59375,
"epoch": 0.09560276679841898,
"grad_norm": 2.995041280410516,
"kl": 0.076171875,
"learning_rate": 9.046442687747036e-07,
"loss": 0.003,
"reward": 2.571000099182129,
"reward_std": 0.1235700324177742,
"rewards/accuracy_reward_stage2": 0.6475626230239868,
"rewards/format_reward_all_stage": 1.923437476158142,
"scores/refine_times": 1.484375,
"step": 387
},
{
"completion_length": 93.390625,
"epoch": 0.0958498023715415,
"grad_norm": 3.1143672609105697,
"kl": 0.060791015625,
"learning_rate": 9.04397233201581e-07,
"loss": 0.0024,
"reward": 2.7106356620788574,
"reward_std": 0.10501326620578766,
"rewards/accuracy_reward_stage2": 0.7168859243392944,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.328125,
"step": 388
},
{
"completion_length": 115.40625,
"epoch": 0.09609683794466403,
"grad_norm": 3.2606535165707085,
"kl": 0.08056640625,
"learning_rate": 9.041501976284585e-07,
"loss": 0.0032,
"reward": 2.5893375873565674,
"reward_std": 0.07137158513069153,
"rewards/accuracy_reward_stage2": 0.5955876111984253,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.375,
"step": 389
},
{
"completion_length": 81.25,
"epoch": 0.09634387351778656,
"grad_norm": 0.7663992960647029,
"kl": 0.07421875,
"learning_rate": 9.039031620553359e-07,
"loss": 0.003,
"reward": 2.861607074737549,
"reward_std": 0.03788072615861893,
"rewards/accuracy_reward_stage2": 0.8616071343421936,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.234375,
"step": 390
},
{
"completion_length": 93.8125,
"epoch": 0.09659090909090909,
"grad_norm": 3.072101647281855,
"kl": 0.060546875,
"learning_rate": 9.036561264822133e-07,
"loss": 0.0024,
"reward": 2.6479721069335938,
"reward_std": 0.13277268409729004,
"rewards/accuracy_reward_stage2": 0.6479719877243042,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.25,
"step": 391
},
{
"completion_length": 133.53125,
"epoch": 0.09683794466403162,
"grad_norm": 1.1412164642463887,
"kl": 0.060791015625,
"learning_rate": 9.034090909090909e-07,
"loss": 0.0024,
"reward": 2.759420871734619,
"reward_std": 0.021660229191184044,
"rewards/accuracy_reward_stage2": 0.7594207525253296,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.6875,
"step": 392
},
{
"completion_length": 109.796875,
"epoch": 0.09708498023715414,
"grad_norm": 3.7260651927810224,
"kl": 0.0849609375,
"learning_rate": 9.031620553359683e-07,
"loss": 0.0034,
"reward": 2.6653695106506348,
"reward_std": 0.1488296538591385,
"rewards/accuracy_reward_stage2": 0.6653696298599243,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.4375,
"step": 393
},
{
"completion_length": 97.5,
"epoch": 0.09733201581027667,
"grad_norm": 1.9771971561886401,
"kl": 0.07666015625,
"learning_rate": 9.029150197628458e-07,
"loss": 0.0031,
"reward": 2.64612078666687,
"reward_std": 0.06681530922651291,
"rewards/accuracy_reward_stage2": 0.6461206674575806,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.328125,
"step": 394
},
{
"completion_length": 136.671875,
"epoch": 0.09757905138339922,
"grad_norm": 2.9936463926600254,
"kl": 0.09716796875,
"learning_rate": 9.026679841897233e-07,
"loss": 0.0039,
"reward": 2.3593060970306396,
"reward_std": 0.10838481783866882,
"rewards/accuracy_reward_stage2": 0.42805612087249756,
"rewards/format_reward_all_stage": 1.931249976158142,
"scores/refine_times": 1.5,
"step": 395
},
{
"completion_length": 100.890625,
"epoch": 0.09782608695652174,
"grad_norm": 3.25385470248015,
"kl": 0.055908203125,
"learning_rate": 9.024209486166008e-07,
"loss": 0.0022,
"reward": 2.2150940895080566,
"reward_std": 0.3374716639518738,
"rewards/accuracy_reward_stage2": 0.3400941491127014,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.34375,
"step": 396
},
{
"completion_length": 100.21875,
"epoch": 0.09807312252964427,
"grad_norm": 2.687250333827823,
"kl": 0.0537109375,
"learning_rate": 9.021739130434782e-07,
"loss": 0.0022,
"reward": 2.6029052734375,
"reward_std": 0.09498636424541473,
"rewards/accuracy_reward_stage2": 0.6107178926467896,
"rewards/format_reward_all_stage": 1.9921875,
"scores/refine_times": 1.359375,
"step": 397
},
{
"completion_length": 118.1875,
"epoch": 0.0983201581027668,
"grad_norm": 3.207780448405167,
"kl": 0.0791015625,
"learning_rate": 9.019268774703557e-07,
"loss": 0.0032,
"reward": 2.7733564376831055,
"reward_std": 0.1665625274181366,
"rewards/accuracy_reward_stage2": 0.7733563184738159,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.390625,
"step": 398
},
{
"completion_length": 166.609375,
"epoch": 0.09856719367588933,
"grad_norm": 3.1116135663801625,
"kl": 0.0625,
"learning_rate": 9.016798418972331e-07,
"loss": 0.0025,
"reward": 2.6186110973358154,
"reward_std": 0.23827369511127472,
"rewards/accuracy_reward_stage2": 0.6482987403869629,
"rewards/format_reward_all_stage": 1.970312476158142,
"scores/refine_times": 1.953125,
"step": 399
},
{
"completion_length": 126.015625,
"epoch": 0.09881422924901186,
"grad_norm": 3.09906490210566,
"kl": 0.05322265625,
"learning_rate": 9.014328063241107e-07,
"loss": 0.0021,
"reward": 2.795719861984253,
"reward_std": 0.09762432426214218,
"rewards/accuracy_reward_stage2": 0.7957199215888977,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.53125,
"step": 400
},
{
"completion_length": 92.90625,
"epoch": 0.09906126482213438,
"grad_norm": 2.2514082728709557,
"kl": 0.1162109375,
"learning_rate": 9.011857707509881e-07,
"loss": 0.0046,
"reward": 2.7314138412475586,
"reward_std": 0.0348123237490654,
"rewards/accuracy_reward_stage2": 0.7418302893638611,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.34375,
"step": 401
},
{
"completion_length": 129.75,
"epoch": 0.09930830039525691,
"grad_norm": 3.6954546874529988,
"kl": 0.05615234375,
"learning_rate": 9.009387351778655e-07,
"loss": 0.0022,
"reward": 2.509495496749878,
"reward_std": 0.15696083009243011,
"rewards/accuracy_reward_stage2": 0.5157454013824463,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.671875,
"step": 402
},
{
"completion_length": 146.5625,
"epoch": 0.09955533596837944,
"grad_norm": 2.579997674658776,
"kl": 0.0673828125,
"learning_rate": 9.00691699604743e-07,
"loss": 0.0027,
"reward": 2.930208206176758,
"reward_std": 0.08919259905815125,
"rewards/accuracy_reward_stage2": 0.9302083253860474,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.640625,
"step": 403
},
{
"completion_length": 121.75,
"epoch": 0.09980237154150198,
"grad_norm": 2.1237078180503586,
"kl": 0.0556640625,
"learning_rate": 9.004446640316206e-07,
"loss": 0.0022,
"reward": 2.6469950675964355,
"reward_std": 0.07097595930099487,
"rewards/accuracy_reward_stage2": 0.6469952464103699,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.5625,
"step": 404
},
{
"completion_length": 130.65625,
"epoch": 0.10004940711462451,
"grad_norm": 2.444254134638145,
"kl": 0.0576171875,
"learning_rate": 9.00197628458498e-07,
"loss": 0.0023,
"reward": 2.7591514587402344,
"reward_std": 0.07375451177358627,
"rewards/accuracy_reward_stage2": 0.7591514587402344,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.625,
"step": 405
},
{
"completion_length": 120.765625,
"epoch": 0.10029644268774704,
"grad_norm": 2.1204759761599066,
"kl": 0.059814453125,
"learning_rate": 8.999505928853755e-07,
"loss": 0.0024,
"reward": 2.799839973449707,
"reward_std": 0.004150245804339647,
"rewards/accuracy_reward_stage2": 0.7998400926589966,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.5625,
"step": 406
},
{
"completion_length": 162.5625,
"epoch": 0.10054347826086957,
"grad_norm": 7.631897735555443,
"kl": 0.32421875,
"learning_rate": 8.997035573122529e-07,
"loss": 0.013,
"reward": 2.5346357822418213,
"reward_std": 0.32676130533218384,
"rewards/accuracy_reward_stage2": 0.6940107941627502,
"rewards/format_reward_all_stage": 1.8406249284744263,
"scores/refine_times": 1.921875,
"step": 407
},
{
"completion_length": 146.625,
"epoch": 0.1007905138339921,
"grad_norm": 3.855569269918276,
"kl": 0.048828125,
"learning_rate": 8.994565217391304e-07,
"loss": 0.002,
"reward": 2.6476688385009766,
"reward_std": 0.12627330422401428,
"rewards/accuracy_reward_stage2": 0.6476688981056213,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.75,
"step": 408
},
{
"completion_length": 128.796875,
"epoch": 0.10103754940711462,
"grad_norm": 2.917741896038562,
"kl": 0.0595703125,
"learning_rate": 8.992094861660079e-07,
"loss": 0.0024,
"reward": 2.773995876312256,
"reward_std": 0.10757410526275635,
"rewards/accuracy_reward_stage2": 0.7818082571029663,
"rewards/format_reward_all_stage": 1.9921875,
"scores/refine_times": 1.546875,
"step": 409
},
{
"completion_length": 155.734375,
"epoch": 0.10128458498023715,
"grad_norm": 2.9672829734154478,
"kl": 0.0673828125,
"learning_rate": 8.989624505928853e-07,
"loss": 0.0027,
"reward": 2.5923919677734375,
"reward_std": 0.29955703020095825,
"rewards/accuracy_reward_stage2": 0.7741629481315613,
"rewards/format_reward_all_stage": 1.8182291984558105,
"scores/refine_times": 2.0,
"step": 410
},
{
"completion_length": 193.625,
"epoch": 0.10153162055335968,
"grad_norm": 3.7254627268683578,
"kl": 0.05615234375,
"learning_rate": 8.987154150197627e-07,
"loss": 0.0022,
"reward": 2.3365869522094727,
"reward_std": 0.3466811776161194,
"rewards/accuracy_reward_stage2": 0.49075353145599365,
"rewards/format_reward_all_stage": 1.8458333015441895,
"scores/refine_times": 2.046875,
"step": 411
},
{
"completion_length": 149.046875,
"epoch": 0.10177865612648221,
"grad_norm": 2.8805192836507953,
"kl": 0.05419921875,
"learning_rate": 8.984683794466402e-07,
"loss": 0.0022,
"reward": 2.570408821105957,
"reward_std": 0.3919922709465027,
"rewards/accuracy_reward_stage2": 0.8464504480361938,
"rewards/format_reward_all_stage": 1.7239582538604736,
"scores/refine_times": 1.796875,
"step": 412
},
{
"completion_length": 138.328125,
"epoch": 0.10202569169960474,
"grad_norm": 2.4678069629785138,
"kl": 0.06298828125,
"learning_rate": 8.982213438735178e-07,
"loss": 0.0025,
"reward": 2.5044920444488525,
"reward_std": 0.28305938839912415,
"rewards/accuracy_reward_stage2": 0.7544921636581421,
"rewards/format_reward_all_stage": 1.75,
"scores/refine_times": 1.609375,
"step": 413
},
{
"completion_length": 92.421875,
"epoch": 0.10227272727272728,
"grad_norm": 3.564931456124642,
"kl": 0.06884765625,
"learning_rate": 8.979743083003953e-07,
"loss": 0.0027,
"reward": 2.6201610565185547,
"reward_std": 0.19829556345939636,
"rewards/accuracy_reward_stage2": 0.7659943103790283,
"rewards/format_reward_all_stage": 1.8541667461395264,
"scores/refine_times": 1.40625,
"step": 414
},
{
"completion_length": 140.8125,
"epoch": 0.10251976284584981,
"grad_norm": 4.105881747857916,
"kl": 0.06787109375,
"learning_rate": 8.977272727272727e-07,
"loss": 0.0027,
"reward": 2.1747279167175293,
"reward_std": 0.6268551349639893,
"rewards/accuracy_reward_stage2": 0.6966027021408081,
"rewards/format_reward_all_stage": 1.478124976158142,
"scores/refine_times": 1.546875,
"step": 415
},
{
"completion_length": 130.28125,
"epoch": 0.10276679841897234,
"grad_norm": 3.5739830819011034,
"kl": 0.0703125,
"learning_rate": 8.974802371541501e-07,
"loss": 0.0028,
"reward": 2.330864191055298,
"reward_std": 0.39451754093170166,
"rewards/accuracy_reward_stage2": 0.7058640718460083,
"rewards/format_reward_all_stage": 1.625,
"scores/refine_times": 1.421875,
"step": 416
},
{
"completion_length": 113.203125,
"epoch": 0.10301383399209486,
"grad_norm": 5.371473470225853,
"kl": 0.0927734375,
"learning_rate": 8.972332015810277e-07,
"loss": 0.0037,
"reward": 2.0138349533081055,
"reward_std": 0.6088952422142029,
"rewards/accuracy_reward_stage2": 0.6700849533081055,
"rewards/format_reward_all_stage": 1.34375,
"scores/refine_times": 1.296875,
"step": 417
},
{
"completion_length": 105.09375,
"epoch": 0.10326086956521739,
"grad_norm": 3.7462892766024387,
"kl": 0.06396484375,
"learning_rate": 8.969861660079051e-07,
"loss": 0.0026,
"reward": 2.5230908393859863,
"reward_std": 0.21397997438907623,
"rewards/accuracy_reward_stage2": 0.6480907797813416,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.375,
"step": 418
},
{
"completion_length": 91.484375,
"epoch": 0.10350790513833992,
"grad_norm": 2.5938381710499856,
"kl": 0.06640625,
"learning_rate": 8.967391304347825e-07,
"loss": 0.0026,
"reward": 2.7374203205108643,
"reward_std": 0.06906712800264359,
"rewards/accuracy_reward_stage2": 0.7374203205108643,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.265625,
"step": 419
},
{
"completion_length": 99.859375,
"epoch": 0.10375494071146245,
"grad_norm": 3.7475747032872726,
"kl": 0.0654296875,
"learning_rate": 8.9649209486166e-07,
"loss": 0.0026,
"reward": 2.4589004516601562,
"reward_std": 0.19246460497379303,
"rewards/accuracy_reward_stage2": 0.5839004516601562,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.234375,
"step": 420
},
{
"completion_length": 87.734375,
"epoch": 0.10400197628458498,
"grad_norm": 2.4100627297787027,
"kl": 0.0732421875,
"learning_rate": 8.962450592885375e-07,
"loss": 0.0029,
"reward": 2.7743234634399414,
"reward_std": 0.08512721955776215,
"rewards/accuracy_reward_stage2": 0.7743235230445862,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.296875,
"step": 421
},
{
"completion_length": 94.828125,
"epoch": 0.1042490118577075,
"grad_norm": 5.060240016474509,
"kl": 0.07861328125,
"learning_rate": 8.95998023715415e-07,
"loss": 0.0031,
"reward": 2.6438651084899902,
"reward_std": 0.12257831543684006,
"rewards/accuracy_reward_stage2": 0.6438649296760559,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.21875,
"step": 422
},
{
"completion_length": 103.109375,
"epoch": 0.10449604743083003,
"grad_norm": 4.702666849391409,
"kl": 0.053466796875,
"learning_rate": 8.957509881422925e-07,
"loss": 0.0021,
"reward": 2.2406249046325684,
"reward_std": 0.4183598756790161,
"rewards/accuracy_reward_stage2": 0.4906250238418579,
"rewards/format_reward_all_stage": 1.75,
"scores/refine_times": 1.28125,
"step": 423
},
{
"completion_length": 106.6875,
"epoch": 0.10474308300395258,
"grad_norm": 4.171493335376991,
"kl": 0.05517578125,
"learning_rate": 8.955039525691699e-07,
"loss": 0.0022,
"reward": 2.5777876377105713,
"reward_std": 0.1656649112701416,
"rewards/accuracy_reward_stage2": 0.7027875781059265,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.25,
"step": 424
},
{
"completion_length": 84.828125,
"epoch": 0.1049901185770751,
"grad_norm": 3.8218395659454005,
"kl": 0.09423828125,
"learning_rate": 8.952569169960475e-07,
"loss": 0.0038,
"reward": 2.5052084922790527,
"reward_std": 0.2631934583187103,
"rewards/accuracy_reward_stage2": 0.6875,
"rewards/format_reward_all_stage": 1.8177083730697632,
"scores/refine_times": 1.21875,
"step": 425
},
{
"completion_length": 105.5625,
"epoch": 0.10523715415019763,
"grad_norm": 3.4660828623228928,
"kl": 0.08935546875,
"learning_rate": 8.950098814229249e-07,
"loss": 0.0036,
"reward": 2.3972997665405273,
"reward_std": 0.22224318981170654,
"rewards/accuracy_reward_stage2": 0.584799587726593,
"rewards/format_reward_all_stage": 1.8125,
"scores/refine_times": 1.265625,
"step": 426
},
{
"completion_length": 98.421875,
"epoch": 0.10548418972332016,
"grad_norm": 4.869263400762264,
"kl": 0.0732421875,
"learning_rate": 8.947628458498023e-07,
"loss": 0.0029,
"reward": 2.5426318645477295,
"reward_std": 0.12406840175390244,
"rewards/accuracy_reward_stage2": 0.5426318645477295,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.234375,
"step": 427
},
{
"completion_length": 91.90625,
"epoch": 0.10573122529644269,
"grad_norm": 1.8180351543204958,
"kl": 0.07177734375,
"learning_rate": 8.945158102766798e-07,
"loss": 0.0029,
"reward": 2.8208327293395996,
"reward_std": 0.10960796475410461,
"rewards/accuracy_reward_stage2": 0.8208328485488892,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.34375,
"step": 428
},
{
"completion_length": 86.1875,
"epoch": 0.10597826086956522,
"grad_norm": 3.3436567770125656,
"kl": 0.07421875,
"learning_rate": 8.942687747035573e-07,
"loss": 0.003,
"reward": 2.865917682647705,
"reward_std": 0.02855822630226612,
"rewards/accuracy_reward_stage2": 0.8659177422523499,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 429
},
{
"completion_length": 72.46875,
"epoch": 0.10622529644268774,
"grad_norm": 4.139290116741471,
"kl": 0.169921875,
"learning_rate": 8.940217391304347e-07,
"loss": 0.0068,
"reward": 2.6120975017547607,
"reward_std": 0.2286979854106903,
"rewards/accuracy_reward_stage2": 0.7370975017547607,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0625,
"step": 430
},
{
"completion_length": 71.3125,
"epoch": 0.10647233201581027,
"grad_norm": 3.7005291669713354,
"kl": 0.08203125,
"learning_rate": 8.937747035573123e-07,
"loss": 0.0033,
"reward": 2.71065092086792,
"reward_std": 0.08734364062547684,
"rewards/accuracy_reward_stage2": 0.7106510400772095,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 431
},
{
"completion_length": 85.984375,
"epoch": 0.1067193675889328,
"grad_norm": 5.202110977588656,
"kl": 0.1015625,
"learning_rate": 8.935276679841897e-07,
"loss": 0.0041,
"reward": 2.617884635925293,
"reward_std": 0.10500668734312057,
"rewards/accuracy_reward_stage2": 0.617884635925293,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 432
},
{
"completion_length": 72.53125,
"epoch": 0.10696640316205533,
"grad_norm": 5.720314610842664,
"kl": 0.103515625,
"learning_rate": 8.932806324110671e-07,
"loss": 0.0041,
"reward": 2.567166805267334,
"reward_std": 0.21528260409832,
"rewards/accuracy_reward_stage2": 0.5671666860580444,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 433
},
{
"completion_length": 84.546875,
"epoch": 0.10721343873517787,
"grad_norm": 3.2175328418513924,
"kl": 0.068359375,
"learning_rate": 8.930335968379447e-07,
"loss": 0.0027,
"reward": 2.581601142883301,
"reward_std": 0.04433634132146835,
"rewards/accuracy_reward_stage2": 0.581601083278656,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 434
},
{
"completion_length": 78.984375,
"epoch": 0.1074604743083004,
"grad_norm": 1.2926706812925475,
"kl": 0.12353515625,
"learning_rate": 8.927865612648221e-07,
"loss": 0.0049,
"reward": 2.9375431537628174,
"reward_std": 0.01452865544706583,
"rewards/accuracy_reward_stage2": 0.9375432133674622,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 435
},
{
"completion_length": 77.5625,
"epoch": 0.10770750988142293,
"grad_norm": 2.2617787059512224,
"kl": 0.0791015625,
"learning_rate": 8.925395256916995e-07,
"loss": 0.0032,
"reward": 2.767601251602173,
"reward_std": 0.04931104555726051,
"rewards/accuracy_reward_stage2": 0.7676013112068176,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 436
},
{
"completion_length": 101.75,
"epoch": 0.10795454545454546,
"grad_norm": 2.4619041669921216,
"kl": 0.078125,
"learning_rate": 8.92292490118577e-07,
"loss": 0.0031,
"reward": 2.6876630783081055,
"reward_std": 0.09527213126420975,
"rewards/accuracy_reward_stage2": 0.757975697517395,
"rewards/format_reward_all_stage": 1.9296875,
"scores/refine_times": 1.21875,
"step": 437
},
{
"completion_length": 74.546875,
"epoch": 0.10820158102766798,
"grad_norm": 3.6665021628241283,
"kl": 0.08349609375,
"learning_rate": 8.920454545454545e-07,
"loss": 0.0033,
"reward": 2.5134830474853516,
"reward_std": 0.11075843870639801,
"rewards/accuracy_reward_stage2": 0.5134831070899963,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 438
},
{
"completion_length": 83.828125,
"epoch": 0.10844861660079051,
"grad_norm": 0.2742410987570204,
"kl": 0.06787109375,
"learning_rate": 8.91798418972332e-07,
"loss": 0.0027,
"reward": 2.606250047683716,
"reward_std": 0.0,
"rewards/accuracy_reward_stage2": 0.606249988079071,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 439
},
{
"completion_length": 70.625,
"epoch": 0.10869565217391304,
"grad_norm": 4.1049578784327085,
"kl": 0.08349609375,
"learning_rate": 8.915513833992094e-07,
"loss": 0.0033,
"reward": 2.747023820877075,
"reward_std": 0.15908406674861908,
"rewards/accuracy_reward_stage2": 0.7470238208770752,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 440
},
{
"completion_length": 83.796875,
"epoch": 0.10894268774703557,
"grad_norm": 3.890400064216742,
"kl": 0.072265625,
"learning_rate": 8.913043478260869e-07,
"loss": 0.0029,
"reward": 2.6413779258728027,
"reward_std": 0.15115022659301758,
"rewards/accuracy_reward_stage2": 0.6413780450820923,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.171875,
"step": 441
},
{
"completion_length": 82.5,
"epoch": 0.1091897233201581,
"grad_norm": 3.364268393065154,
"kl": 0.0869140625,
"learning_rate": 8.910573122529645e-07,
"loss": 0.0035,
"reward": 2.5104165077209473,
"reward_std": 0.23385357856750488,
"rewards/accuracy_reward_stage2": 0.6354166865348816,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.078125,
"step": 442
},
{
"completion_length": 93.9375,
"epoch": 0.10943675889328064,
"grad_norm": 3.999124238125438,
"kl": 0.0908203125,
"learning_rate": 8.908102766798419e-07,
"loss": 0.0036,
"reward": 2.6210455894470215,
"reward_std": 0.09140154719352722,
"rewards/accuracy_reward_stage2": 0.6210454106330872,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 443
},
{
"completion_length": 94.34375,
"epoch": 0.10968379446640317,
"grad_norm": 4.9639104904967235,
"kl": 0.1005859375,
"learning_rate": 8.905632411067193e-07,
"loss": 0.004,
"reward": 2.784119129180908,
"reward_std": 0.237242192029953,
"rewards/accuracy_reward_stage2": 0.7841191291809082,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 444
},
{
"completion_length": 92.8125,
"epoch": 0.1099308300395257,
"grad_norm": 5.1034757317692145,
"kl": 0.06396484375,
"learning_rate": 8.903162055335968e-07,
"loss": 0.0026,
"reward": 2.5643460750579834,
"reward_std": 0.06753169000148773,
"rewards/accuracy_reward_stage2": 0.564346194267273,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 445
},
{
"completion_length": 78.71875,
"epoch": 0.11017786561264822,
"grad_norm": 3.133436785678946,
"kl": 0.0908203125,
"learning_rate": 8.900691699604743e-07,
"loss": 0.0036,
"reward": 2.6766560077667236,
"reward_std": 0.004866618663072586,
"rewards/accuracy_reward_stage2": 0.6766558885574341,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 446
},
{
"completion_length": 88.4375,
"epoch": 0.11042490118577075,
"grad_norm": 5.09866753511643,
"kl": 0.078125,
"learning_rate": 8.898221343873517e-07,
"loss": 0.0031,
"reward": 2.419560194015503,
"reward_std": 0.09583514928817749,
"rewards/accuracy_reward_stage2": 0.41956019401550293,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 447
},
{
"completion_length": 88.25,
"epoch": 0.11067193675889328,
"grad_norm": 2.3473140239947465,
"kl": 0.0732421875,
"learning_rate": 8.895750988142292e-07,
"loss": 0.0029,
"reward": 2.59273099899292,
"reward_std": 0.1246790662407875,
"rewards/accuracy_reward_stage2": 0.5927308797836304,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 448
},
{
"completion_length": 104.453125,
"epoch": 0.11091897233201581,
"grad_norm": 3.1125557355492424,
"kl": 0.0771484375,
"learning_rate": 8.893280632411066e-07,
"loss": 0.0031,
"reward": 2.4798202514648438,
"reward_std": 0.06186756119132042,
"rewards/accuracy_reward_stage2": 0.47982022166252136,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.15625,
"step": 449
},
{
"completion_length": 88.078125,
"epoch": 0.11116600790513834,
"grad_norm": 2.9837836911221354,
"kl": 0.0810546875,
"learning_rate": 8.890810276679841e-07,
"loss": 0.0033,
"reward": 2.817561149597168,
"reward_std": 0.05040454864501953,
"rewards/accuracy_reward_stage2": 0.817561149597168,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 450
},
{
"completion_length": 98.609375,
"epoch": 0.11141304347826086,
"grad_norm": 4.441414350052775,
"kl": 0.0810546875,
"learning_rate": 8.888339920948617e-07,
"loss": 0.0032,
"reward": 2.681997299194336,
"reward_std": 0.05065108835697174,
"rewards/accuracy_reward_stage2": 0.6819972395896912,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 451
},
{
"completion_length": 101.296875,
"epoch": 0.11166007905138339,
"grad_norm": 3.7005571414160867,
"kl": 0.0771484375,
"learning_rate": 8.885869565217391e-07,
"loss": 0.0031,
"reward": 2.5746400356292725,
"reward_std": 0.08936276286840439,
"rewards/accuracy_reward_stage2": 0.5746400952339172,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 452
},
{
"completion_length": 78.4375,
"epoch": 0.11190711462450594,
"grad_norm": 4.167745774001736,
"kl": 0.08935546875,
"learning_rate": 8.883399209486165e-07,
"loss": 0.0036,
"reward": 2.710275173187256,
"reward_std": 0.0839356780052185,
"rewards/accuracy_reward_stage2": 0.7102750539779663,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 453
},
{
"completion_length": 108.59375,
"epoch": 0.11215415019762846,
"grad_norm": 2.4441985384576106,
"kl": 0.08251953125,
"learning_rate": 8.88092885375494e-07,
"loss": 0.0033,
"reward": 2.7707533836364746,
"reward_std": 0.10964667797088623,
"rewards/accuracy_reward_stage2": 0.8332535028457642,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.1875,
"step": 454
},
{
"completion_length": 89.09375,
"epoch": 0.11240118577075099,
"grad_norm": 3.4290524406940177,
"kl": 0.11572265625,
"learning_rate": 8.878458498023715e-07,
"loss": 0.0046,
"reward": 2.8360495567321777,
"reward_std": 0.06881996244192123,
"rewards/accuracy_reward_stage2": 0.8360496759414673,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 455
},
{
"completion_length": 93.546875,
"epoch": 0.11264822134387352,
"grad_norm": 4.2633390639176,
"kl": 0.091796875,
"learning_rate": 8.87598814229249e-07,
"loss": 0.0037,
"reward": 2.737834930419922,
"reward_std": 0.08329826593399048,
"rewards/accuracy_reward_stage2": 0.7378349304199219,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 456
},
{
"completion_length": 88.125,
"epoch": 0.11289525691699605,
"grad_norm": 4.185983875071428,
"kl": 0.07177734375,
"learning_rate": 8.873517786561264e-07,
"loss": 0.0029,
"reward": 2.7210230827331543,
"reward_std": 0.1462172567844391,
"rewards/accuracy_reward_stage2": 0.7210230231285095,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 457
},
{
"completion_length": 78.3125,
"epoch": 0.11314229249011858,
"grad_norm": 2.9568956363715557,
"kl": 0.0859375,
"learning_rate": 8.871047430830038e-07,
"loss": 0.0034,
"reward": 2.846480131149292,
"reward_std": 0.058452486991882324,
"rewards/accuracy_reward_stage2": 0.846480131149292,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 458
},
{
"completion_length": 93.96875,
"epoch": 0.1133893280632411,
"grad_norm": 3.702464502871565,
"kl": 0.11474609375,
"learning_rate": 8.868577075098815e-07,
"loss": 0.0046,
"reward": 2.6403086185455322,
"reward_std": 0.07300623506307602,
"rewards/accuracy_reward_stage2": 0.640308678150177,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 459
},
{
"completion_length": 82.015625,
"epoch": 0.11363636363636363,
"grad_norm": 3.5096468558603027,
"kl": 0.11767578125,
"learning_rate": 8.866106719367589e-07,
"loss": 0.0047,
"reward": 2.7332301139831543,
"reward_std": 0.1381341964006424,
"rewards/accuracy_reward_stage2": 0.7957301735877991,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.125,
"step": 460
},
{
"completion_length": 78.125,
"epoch": 0.11388339920948616,
"grad_norm": 3.559169978335934,
"kl": 0.1015625,
"learning_rate": 8.863636363636363e-07,
"loss": 0.0041,
"reward": 2.8119444847106934,
"reward_std": 0.08849316835403442,
"rewards/accuracy_reward_stage2": 0.8119444847106934,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 461
},
{
"completion_length": 93.234375,
"epoch": 0.11413043478260869,
"grad_norm": 0.5282625223145849,
"kl": 0.091796875,
"learning_rate": 8.861166007905138e-07,
"loss": 0.0037,
"reward": 2.764010190963745,
"reward_std": 0.008434552699327469,
"rewards/accuracy_reward_stage2": 0.7640101909637451,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 462
},
{
"completion_length": 85.25,
"epoch": 0.11437747035573123,
"grad_norm": 2.9976744755473064,
"kl": 0.09814453125,
"learning_rate": 8.858695652173913e-07,
"loss": 0.0039,
"reward": 2.7464168071746826,
"reward_std": 0.07833977788686752,
"rewards/accuracy_reward_stage2": 0.7464168071746826,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 463
},
{
"completion_length": 83.9375,
"epoch": 0.11462450592885376,
"grad_norm": 3.6875932623475802,
"kl": 0.08642578125,
"learning_rate": 8.856225296442687e-07,
"loss": 0.0035,
"reward": 2.6572089195251465,
"reward_std": 0.027874791994690895,
"rewards/accuracy_reward_stage2": 0.6572088003158569,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 464
},
{
"completion_length": 93.09375,
"epoch": 0.11487154150197629,
"grad_norm": 3.818896478911207,
"kl": 0.09814453125,
"learning_rate": 8.853754940711462e-07,
"loss": 0.0039,
"reward": 2.522165298461914,
"reward_std": 0.0887691006064415,
"rewards/accuracy_reward_stage2": 0.522165060043335,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 465
},
{
"completion_length": 86.40625,
"epoch": 0.11511857707509882,
"grad_norm": 5.068588696355706,
"kl": 0.080078125,
"learning_rate": 8.851284584980236e-07,
"loss": 0.0032,
"reward": 2.7015743255615234,
"reward_std": 0.16825415194034576,
"rewards/accuracy_reward_stage2": 0.7640743255615234,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.125,
"step": 466
},
{
"completion_length": 118.046875,
"epoch": 0.11536561264822134,
"grad_norm": 3.1937404370635414,
"kl": 0.0712890625,
"learning_rate": 8.848814229249012e-07,
"loss": 0.0028,
"reward": 2.629642963409424,
"reward_std": 0.1885630190372467,
"rewards/accuracy_reward_stage2": 0.6921432018280029,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.3125,
"step": 467
},
{
"completion_length": 78.03125,
"epoch": 0.11561264822134387,
"grad_norm": 3.2947592139517377,
"kl": 0.0732421875,
"learning_rate": 8.846343873517787e-07,
"loss": 0.0029,
"reward": 2.854365110397339,
"reward_std": 0.04581620916724205,
"rewards/accuracy_reward_stage2": 0.8543651103973389,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 468
},
{
"completion_length": 71.5625,
"epoch": 0.1158596837944664,
"grad_norm": 4.152892444324077,
"kl": 0.08837890625,
"learning_rate": 8.843873517786561e-07,
"loss": 0.0035,
"reward": 2.510572671890259,
"reward_std": 0.01925911381840706,
"rewards/accuracy_reward_stage2": 0.5105725526809692,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 469
},
{
"completion_length": 82.0625,
"epoch": 0.11610671936758893,
"grad_norm": 2.1146453684599917,
"kl": 0.06884765625,
"learning_rate": 8.841403162055336e-07,
"loss": 0.0028,
"reward": 2.793236494064331,
"reward_std": 0.0080789215862751,
"rewards/accuracy_reward_stage2": 0.793236494064331,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 470
},
{
"completion_length": 77.34375,
"epoch": 0.11635375494071146,
"grad_norm": 2.209537382540903,
"kl": 0.07568359375,
"learning_rate": 8.83893280632411e-07,
"loss": 0.003,
"reward": 2.7740843296051025,
"reward_std": 0.023258034139871597,
"rewards/accuracy_reward_stage2": 0.7740844488143921,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 471
},
{
"completion_length": 76.640625,
"epoch": 0.116600790513834,
"grad_norm": 0.6746428954052648,
"kl": 0.06201171875,
"learning_rate": 8.836462450592885e-07,
"loss": 0.0025,
"reward": 2.7364659309387207,
"reward_std": 0.01504778116941452,
"rewards/accuracy_reward_stage2": 0.7364660501480103,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 472
},
{
"completion_length": 80.078125,
"epoch": 0.11684782608695653,
"grad_norm": 5.151503301895097,
"kl": 0.0732421875,
"learning_rate": 8.83399209486166e-07,
"loss": 0.0029,
"reward": 2.6196067333221436,
"reward_std": 0.10613559931516647,
"rewards/accuracy_reward_stage2": 0.6196067333221436,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 473
},
{
"completion_length": 76.203125,
"epoch": 0.11709486166007906,
"grad_norm": 3.6271936982955113,
"kl": 0.10205078125,
"learning_rate": 8.831521739130434e-07,
"loss": 0.0041,
"reward": 2.644803524017334,
"reward_std": 0.07510168105363846,
"rewards/accuracy_reward_stage2": 0.6552203893661499,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.078125,
"step": 474
},
{
"completion_length": 75.65625,
"epoch": 0.11734189723320158,
"grad_norm": 4.565040146312246,
"kl": 0.10302734375,
"learning_rate": 8.829051383399208e-07,
"loss": 0.0041,
"reward": 2.692810535430908,
"reward_std": 0.02393944561481476,
"rewards/accuracy_reward_stage2": 0.6928104162216187,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 475
},
{
"completion_length": 79.171875,
"epoch": 0.11758893280632411,
"grad_norm": 2.197838778708124,
"kl": 0.0849609375,
"learning_rate": 8.826581027667984e-07,
"loss": 0.0034,
"reward": 2.7445812225341797,
"reward_std": 0.10567092895507812,
"rewards/accuracy_reward_stage2": 0.8174980282783508,
"rewards/format_reward_all_stage": 1.9270833730697632,
"scores/refine_times": 1.140625,
"step": 476
},
{
"completion_length": 77.296875,
"epoch": 0.11783596837944664,
"grad_norm": 2.8788455034822484,
"kl": 0.08642578125,
"learning_rate": 8.824110671936759e-07,
"loss": 0.0035,
"reward": 2.6922109127044678,
"reward_std": 0.057185299694538116,
"rewards/accuracy_reward_stage2": 0.692210853099823,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 477
},
{
"completion_length": 80.078125,
"epoch": 0.11808300395256917,
"grad_norm": 2.881982205739493,
"kl": 0.08642578125,
"learning_rate": 8.821640316205533e-07,
"loss": 0.0035,
"reward": 2.9145209789276123,
"reward_std": 0.07034643739461899,
"rewards/accuracy_reward_stage2": 0.9145209789276123,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 478
},
{
"completion_length": 81.3125,
"epoch": 0.1183300395256917,
"grad_norm": 3.9207615453647953,
"kl": 0.0927734375,
"learning_rate": 8.819169960474308e-07,
"loss": 0.0037,
"reward": 2.5646958351135254,
"reward_std": 0.1530625820159912,
"rewards/accuracy_reward_stage2": 0.6896957159042358,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.125,
"step": 479
},
{
"completion_length": 84.453125,
"epoch": 0.11857707509881422,
"grad_norm": 3.6748089134366064,
"kl": 0.0771484375,
"learning_rate": 8.816699604743083e-07,
"loss": 0.0031,
"reward": 2.8285746574401855,
"reward_std": 0.028202539309859276,
"rewards/accuracy_reward_stage2": 0.8285747766494751,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 480
},
{
"completion_length": 107.296875,
"epoch": 0.11882411067193675,
"grad_norm": 3.0552260659106762,
"kl": 0.0771484375,
"learning_rate": 8.814229249011858e-07,
"loss": 0.0031,
"reward": 2.7689619064331055,
"reward_std": 0.07823709398508072,
"rewards/accuracy_reward_stage2": 0.768962025642395,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.296875,
"step": 481
},
{
"completion_length": 69.515625,
"epoch": 0.1190711462450593,
"grad_norm": 3.1056533714213588,
"kl": 0.11083984375,
"learning_rate": 8.811758893280632e-07,
"loss": 0.0044,
"reward": 2.6699254512786865,
"reward_std": 0.10760709643363953,
"rewards/accuracy_reward_stage2": 0.6803420782089233,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.078125,
"step": 482
},
{
"completion_length": 95.75,
"epoch": 0.11931818181818182,
"grad_norm": 3.673929415547851,
"kl": 0.0771484375,
"learning_rate": 8.809288537549406e-07,
"loss": 0.0031,
"reward": 2.495757579803467,
"reward_std": 0.02522164396941662,
"rewards/accuracy_reward_stage2": 0.4957575500011444,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 483
},
{
"completion_length": 87.59375,
"epoch": 0.11956521739130435,
"grad_norm": 2.2653734549347004,
"kl": 0.06689453125,
"learning_rate": 8.806818181818182e-07,
"loss": 0.0027,
"reward": 2.5830860137939453,
"reward_std": 0.06735340505838394,
"rewards/accuracy_reward_stage2": 0.5830860137939453,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.21875,
"step": 484
},
{
"completion_length": 71.53125,
"epoch": 0.11981225296442688,
"grad_norm": 2.692817884354693,
"kl": 0.07470703125,
"learning_rate": 8.804347826086956e-07,
"loss": 0.003,
"reward": 2.6110339164733887,
"reward_std": 0.07797226309776306,
"rewards/accuracy_reward_stage2": 0.6110339164733887,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 485
},
{
"completion_length": 83.796875,
"epoch": 0.12005928853754941,
"grad_norm": 3.867866268455527,
"kl": 0.0888671875,
"learning_rate": 8.80187747035573e-07,
"loss": 0.0036,
"reward": 2.82053279876709,
"reward_std": 0.04790631681680679,
"rewards/accuracy_reward_stage2": 0.8205327987670898,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 486
},
{
"completion_length": 85.015625,
"epoch": 0.12030632411067194,
"grad_norm": 3.7006027639703243,
"kl": 0.0849609375,
"learning_rate": 8.799407114624506e-07,
"loss": 0.0034,
"reward": 2.630404472351074,
"reward_std": 0.037833116948604584,
"rewards/accuracy_reward_stage2": 0.6304043531417847,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 487
},
{
"completion_length": 89.59375,
"epoch": 0.12055335968379446,
"grad_norm": 3.3659415525573,
"kl": 0.0654296875,
"learning_rate": 8.79693675889328e-07,
"loss": 0.0026,
"reward": 2.5924062728881836,
"reward_std": 0.040243446826934814,
"rewards/accuracy_reward_stage2": 0.592406153678894,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 488
},
{
"completion_length": 84.96875,
"epoch": 0.12080039525691699,
"grad_norm": 2.345461810437249,
"kl": 0.0869140625,
"learning_rate": 8.794466403162055e-07,
"loss": 0.0035,
"reward": 2.8451998233795166,
"reward_std": 0.011534404940903187,
"rewards/accuracy_reward_stage2": 0.8451998233795166,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 489
},
{
"completion_length": 80.078125,
"epoch": 0.12104743083003952,
"grad_norm": 3.3056877683572123,
"kl": 0.07421875,
"learning_rate": 8.79199604743083e-07,
"loss": 0.003,
"reward": 2.674712657928467,
"reward_std": 0.0697299912571907,
"rewards/accuracy_reward_stage2": 0.6747127771377563,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 490
},
{
"completion_length": 100.78125,
"epoch": 0.12129446640316205,
"grad_norm": 4.62714518835741,
"kl": 0.07568359375,
"learning_rate": 8.789525691699604e-07,
"loss": 0.003,
"reward": 2.2546346187591553,
"reward_std": 0.18582692742347717,
"rewards/accuracy_reward_stage2": 0.3796347379684448,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.078125,
"step": 491
},
{
"completion_length": 87.46875,
"epoch": 0.12154150197628459,
"grad_norm": 4.0547374474409565,
"kl": 0.1064453125,
"learning_rate": 8.787055335968378e-07,
"loss": 0.0043,
"reward": 2.582868814468384,
"reward_std": 0.2643705904483795,
"rewards/accuracy_reward_stage2": 0.6609938144683838,
"rewards/format_reward_all_stage": 1.921875,
"scores/refine_times": 1.328125,
"step": 492
},
{
"completion_length": 79.984375,
"epoch": 0.12178853754940712,
"grad_norm": 4.974773902072042,
"kl": 0.08447265625,
"learning_rate": 8.784584980237154e-07,
"loss": 0.0034,
"reward": 2.6923179626464844,
"reward_std": 0.11359754204750061,
"rewards/accuracy_reward_stage2": 0.744401216506958,
"rewards/format_reward_all_stage": 1.9479167461395264,
"scores/refine_times": 1.09375,
"step": 493
},
{
"completion_length": 99.75,
"epoch": 0.12203557312252965,
"grad_norm": 3.0636928116844575,
"kl": 0.0654296875,
"learning_rate": 8.782114624505928e-07,
"loss": 0.0026,
"reward": 2.845734119415283,
"reward_std": 0.01307186484336853,
"rewards/accuracy_reward_stage2": 0.8457342982292175,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.25,
"step": 494
},
{
"completion_length": 80.859375,
"epoch": 0.12228260869565218,
"grad_norm": 3.434547157616726,
"kl": 0.11962890625,
"learning_rate": 8.779644268774703e-07,
"loss": 0.0048,
"reward": 2.5350117683410645,
"reward_std": 0.018023155629634857,
"rewards/accuracy_reward_stage2": 0.5350118279457092,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 495
},
{
"completion_length": 70.4375,
"epoch": 0.1225296442687747,
"grad_norm": 4.465055705056343,
"kl": 0.080078125,
"learning_rate": 8.777173913043478e-07,
"loss": 0.0032,
"reward": 2.4872050285339355,
"reward_std": 0.07076370716094971,
"rewards/accuracy_reward_stage2": 0.48720502853393555,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 496
},
{
"completion_length": 81.53125,
"epoch": 0.12277667984189723,
"grad_norm": 5.0334176088721545,
"kl": 0.11865234375,
"learning_rate": 8.774703557312253e-07,
"loss": 0.0047,
"reward": 2.3961634635925293,
"reward_std": 0.30635514855384827,
"rewards/accuracy_reward_stage2": 0.5784550905227661,
"rewards/format_reward_all_stage": 1.8177083730697632,
"scores/refine_times": 1.078125,
"step": 497
},
{
"completion_length": 65.046875,
"epoch": 0.12302371541501976,
"grad_norm": 0.8054176825918075,
"kl": 0.0869140625,
"learning_rate": 8.772233201581028e-07,
"loss": 0.0035,
"reward": 2.841981887817383,
"reward_std": 0.0124855637550354,
"rewards/accuracy_reward_stage2": 0.8419820070266724,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 498
},
{
"completion_length": 94.765625,
"epoch": 0.12327075098814229,
"grad_norm": 3.630597761876916,
"kl": 0.09130859375,
"learning_rate": 8.769762845849802e-07,
"loss": 0.0036,
"reward": 2.5977272987365723,
"reward_std": 0.2044953554868698,
"rewards/accuracy_reward_stage2": 0.8477272987365723,
"rewards/format_reward_all_stage": 1.75,
"scores/refine_times": 1.1875,
"step": 499
},
{
"completion_length": 79.140625,
"epoch": 0.12351778656126482,
"grad_norm": 3.823309287561965,
"kl": 0.08544921875,
"learning_rate": 8.767292490118576e-07,
"loss": 0.0034,
"reward": 2.621757984161377,
"reward_std": 0.1109161525964737,
"rewards/accuracy_reward_stage2": 0.6217580437660217,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 500
},
{
"completion_length": 79.546875,
"epoch": 0.12376482213438735,
"grad_norm": 0.2598621354632467,
"kl": 0.07763671875,
"learning_rate": 8.764822134387352e-07,
"loss": 0.0031,
"reward": 2.809523820877075,
"reward_std": 0.0,
"rewards/accuracy_reward_stage2": 0.8095238208770752,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 501
},
{
"completion_length": 65.6875,
"epoch": 0.12401185770750989,
"grad_norm": 3.741876860167781,
"kl": 0.09130859375,
"learning_rate": 8.762351778656126e-07,
"loss": 0.0037,
"reward": 2.740185022354126,
"reward_std": 0.044610656797885895,
"rewards/accuracy_reward_stage2": 0.740185022354126,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 502
},
{
"completion_length": 64.9375,
"epoch": 0.12425889328063242,
"grad_norm": 2.1936854561420267,
"kl": 0.09912109375,
"learning_rate": 8.7598814229249e-07,
"loss": 0.004,
"reward": 2.96875,
"reward_std": 0.033407654613256454,
"rewards/accuracy_reward_stage2": 0.96875,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 503
},
{
"completion_length": 65.234375,
"epoch": 0.12450592885375494,
"grad_norm": 4.050893925110765,
"kl": 0.0869140625,
"learning_rate": 8.757411067193675e-07,
"loss": 0.0035,
"reward": 2.7537124156951904,
"reward_std": 0.11977555602788925,
"rewards/accuracy_reward_stage2": 0.7537122964859009,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 504
},
{
"completion_length": 84.359375,
"epoch": 0.12475296442687747,
"grad_norm": 3.072366621168365,
"kl": 0.107421875,
"learning_rate": 8.754940711462451e-07,
"loss": 0.0043,
"reward": 2.5886545181274414,
"reward_std": 0.1502598226070404,
"rewards/accuracy_reward_stage2": 0.713654637336731,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.15625,
"step": 505
},
{
"completion_length": 69.828125,
"epoch": 0.125,
"grad_norm": 5.845882236669313,
"kl": 0.1533203125,
"learning_rate": 8.752470355731226e-07,
"loss": 0.0061,
"reward": 2.244267463684082,
"reward_std": 0.35389357805252075,
"rewards/accuracy_reward_stage2": 0.49426767230033875,
"rewards/format_reward_all_stage": 1.75,
"scores/refine_times": 1.125,
"step": 506
},
{
"completion_length": 76.109375,
"epoch": 0.12524703557312253,
"grad_norm": 3.476292970725002,
"kl": 0.08642578125,
"learning_rate": 8.75e-07,
"loss": 0.0034,
"reward": 2.8092727661132812,
"reward_std": 0.034412235021591187,
"rewards/accuracy_reward_stage2": 0.809272825717926,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 507
},
{
"completion_length": 75.875,
"epoch": 0.12549407114624506,
"grad_norm": 3.331603296120503,
"kl": 0.08984375,
"learning_rate": 8.747529644268774e-07,
"loss": 0.0036,
"reward": 2.759597063064575,
"reward_std": 0.020664650946855545,
"rewards/accuracy_reward_stage2": 0.7595971822738647,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 508
},
{
"completion_length": 65.5625,
"epoch": 0.12574110671936758,
"grad_norm": 2.4995462264429062,
"kl": 0.08984375,
"learning_rate": 8.745059288537549e-07,
"loss": 0.0036,
"reward": 2.830458879470825,
"reward_std": 0.005241929553449154,
"rewards/accuracy_reward_stage2": 0.8304589986801147,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 509
},
{
"completion_length": 85.90625,
"epoch": 0.1259881422924901,
"grad_norm": 5.179493941114563,
"kl": 0.10400390625,
"learning_rate": 8.742588932806324e-07,
"loss": 0.0042,
"reward": 2.6034417152404785,
"reward_std": 0.12308812886476517,
"rewards/accuracy_reward_stage2": 0.6607334613800049,
"rewards/format_reward_all_stage": 1.9427082538604736,
"scores/refine_times": 1.21875,
"step": 510
},
{
"completion_length": 69.625,
"epoch": 0.12623517786561264,
"grad_norm": 4.072436652749926,
"kl": 0.10107421875,
"learning_rate": 8.740118577075098e-07,
"loss": 0.004,
"reward": 2.5392637252807617,
"reward_std": 0.061263859272003174,
"rewards/accuracy_reward_stage2": 0.5392636060714722,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 511
},
{
"completion_length": 60.015625,
"epoch": 0.12648221343873517,
"grad_norm": 4.246135496046692,
"kl": 0.1357421875,
"learning_rate": 8.737648221343873e-07,
"loss": 0.0054,
"reward": 2.6761271953582764,
"reward_std": 0.06654070317745209,
"rewards/accuracy_reward_stage2": 0.6761271953582764,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 512
},
{
"completion_length": 59.875,
"epoch": 0.1267292490118577,
"grad_norm": 4.924272097887162,
"kl": 0.11083984375,
"learning_rate": 8.735177865612647e-07,
"loss": 0.0044,
"reward": 2.692314624786377,
"reward_std": 0.0717814713716507,
"rewards/accuracy_reward_stage2": 0.6923147439956665,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 513
},
{
"completion_length": 78.4375,
"epoch": 0.12697628458498023,
"grad_norm": 4.1067478974761515,
"kl": 0.11572265625,
"learning_rate": 8.732707509881423e-07,
"loss": 0.0046,
"reward": 2.7923502922058105,
"reward_std": 0.03013404831290245,
"rewards/accuracy_reward_stage2": 0.7923504114151001,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 514
},
{
"completion_length": 80.125,
"epoch": 0.12722332015810275,
"grad_norm": 3.9493774213422403,
"kl": 0.11083984375,
"learning_rate": 8.730237154150198e-07,
"loss": 0.0044,
"reward": 2.1602487564086914,
"reward_std": 0.28394466638565063,
"rewards/accuracy_reward_stage2": 0.5352487564086914,
"rewards/format_reward_all_stage": 1.625,
"scores/refine_times": 1.140625,
"step": 515
},
{
"completion_length": 74.4375,
"epoch": 0.1274703557312253,
"grad_norm": 3.6966243378996513,
"kl": 0.1044921875,
"learning_rate": 8.727766798418972e-07,
"loss": 0.0042,
"reward": 2.8750760555267334,
"reward_std": 0.06863968074321747,
"rewards/accuracy_reward_stage2": 0.937576174736023,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.203125,
"step": 516
},
{
"completion_length": 89.984375,
"epoch": 0.12771739130434784,
"grad_norm": 3.8890101683067577,
"kl": 0.1181640625,
"learning_rate": 8.725296442687746e-07,
"loss": 0.0047,
"reward": 2.467437744140625,
"reward_std": 0.10733962059020996,
"rewards/accuracy_reward_stage2": 0.4830629527568817,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.21875,
"step": 517
},
{
"completion_length": 73.1875,
"epoch": 0.12796442687747037,
"grad_norm": 4.990132015423476,
"kl": 0.10009765625,
"learning_rate": 8.722826086956522e-07,
"loss": 0.004,
"reward": 2.5844149589538574,
"reward_std": 0.20980337262153625,
"rewards/accuracy_reward_stage2": 0.7094148397445679,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0,
"step": 518
},
{
"completion_length": 77.828125,
"epoch": 0.1282114624505929,
"grad_norm": 4.7214654461831485,
"kl": 0.12353515625,
"learning_rate": 8.720355731225296e-07,
"loss": 0.0049,
"reward": 2.7041687965393066,
"reward_std": 0.07556068897247314,
"rewards/accuracy_reward_stage2": 0.7041686177253723,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.15625,
"step": 519
},
{
"completion_length": 77.78125,
"epoch": 0.12845849802371542,
"grad_norm": 0.3211094077379723,
"kl": 0.111328125,
"learning_rate": 8.71788537549407e-07,
"loss": 0.0045,
"reward": 2.943162441253662,
"reward_std": 0.0,
"rewards/accuracy_reward_stage2": 0.9431624412536621,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 520
},
{
"completion_length": 75.296875,
"epoch": 0.12870553359683795,
"grad_norm": 3.364555080905937,
"kl": 0.11083984375,
"learning_rate": 8.715415019762845e-07,
"loss": 0.0044,
"reward": 2.7595181465148926,
"reward_std": 0.03243761509656906,
"rewards/accuracy_reward_stage2": 0.7595181465148926,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.109375,
"step": 521
},
{
"completion_length": 75.125,
"epoch": 0.12895256916996048,
"grad_norm": 4.197238919282646,
"kl": 0.08935546875,
"learning_rate": 8.71294466403162e-07,
"loss": 0.0036,
"reward": 2.7527740001678467,
"reward_std": 0.15860606729984283,
"rewards/accuracy_reward_stage2": 0.8777740001678467,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0,
"step": 522
},
{
"completion_length": 83.046875,
"epoch": 0.129199604743083,
"grad_norm": 3.684564519838231,
"kl": 0.09912109375,
"learning_rate": 8.710474308300395e-07,
"loss": 0.004,
"reward": 2.7169008255004883,
"reward_std": 0.028284341096878052,
"rewards/accuracy_reward_stage2": 0.7169008851051331,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 523
},
{
"completion_length": 89.65625,
"epoch": 0.12944664031620554,
"grad_norm": 3.7050980630735038,
"kl": 0.0966796875,
"learning_rate": 8.70800395256917e-07,
"loss": 0.0039,
"reward": 2.706432342529297,
"reward_std": 0.03294907137751579,
"rewards/accuracy_reward_stage2": 0.7064325213432312,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 524
},
{
"completion_length": 87.4375,
"epoch": 0.12969367588932806,
"grad_norm": 4.395244733430323,
"kl": 0.0908203125,
"learning_rate": 8.705533596837944e-07,
"loss": 0.0036,
"reward": 2.6969168186187744,
"reward_std": 0.2856762111186981,
"rewards/accuracy_reward_stage2": 0.8219167590141296,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0625,
"step": 525
},
{
"completion_length": 75.15625,
"epoch": 0.1299407114624506,
"grad_norm": 3.7899906109560693,
"kl": 0.09814453125,
"learning_rate": 8.70306324110672e-07,
"loss": 0.0039,
"reward": 2.7718465328216553,
"reward_std": 0.026267768815159798,
"rewards/accuracy_reward_stage2": 0.7718464136123657,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 526
},
{
"completion_length": 75.5,
"epoch": 0.13018774703557312,
"grad_norm": 4.216329302877091,
"kl": 0.07666015625,
"learning_rate": 8.700592885375494e-07,
"loss": 0.0031,
"reward": 2.68217134475708,
"reward_std": 0.045934125781059265,
"rewards/accuracy_reward_stage2": 0.6821711659431458,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 527
},
{
"completion_length": 68.9375,
"epoch": 0.13043478260869565,
"grad_norm": 0.2760461680822888,
"kl": 0.0830078125,
"learning_rate": 8.698122529644268e-07,
"loss": 0.0033,
"reward": 2.7838234901428223,
"reward_std": 0.0,
"rewards/accuracy_reward_stage2": 0.7838236093521118,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 528
},
{
"completion_length": 80.09375,
"epoch": 0.13068181818181818,
"grad_norm": 4.353620248510361,
"kl": 0.09130859375,
"learning_rate": 8.695652173913043e-07,
"loss": 0.0036,
"reward": 2.865518093109131,
"reward_std": 0.09008646011352539,
"rewards/accuracy_reward_stage2": 0.8655180931091309,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 529
},
{
"completion_length": 79.921875,
"epoch": 0.1309288537549407,
"grad_norm": 4.4730643003742125,
"kl": 0.12451171875,
"learning_rate": 8.693181818181817e-07,
"loss": 0.005,
"reward": 2.5940661430358887,
"reward_std": 0.12470673769712448,
"rewards/accuracy_reward_stage2": 0.6565661430358887,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.203125,
"step": 530
},
{
"completion_length": 76.28125,
"epoch": 0.13117588932806323,
"grad_norm": 3.879298926548609,
"kl": 0.11376953125,
"learning_rate": 8.690711462450592e-07,
"loss": 0.0046,
"reward": 2.544191598892212,
"reward_std": 0.09751159697771072,
"rewards/accuracy_reward_stage2": 0.6212749481201172,
"rewards/format_reward_all_stage": 1.9229166507720947,
"scores/refine_times": 1.1875,
"step": 531
},
{
"completion_length": 78.6875,
"epoch": 0.13142292490118576,
"grad_norm": 2.89769607245115,
"kl": 0.09033203125,
"learning_rate": 8.688241106719367e-07,
"loss": 0.0036,
"reward": 2.672381639480591,
"reward_std": 0.004669596441090107,
"rewards/accuracy_reward_stage2": 0.6723816394805908,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 532
},
{
"completion_length": 70.1875,
"epoch": 0.1316699604743083,
"grad_norm": 4.060304413882462,
"kl": 0.125,
"learning_rate": 8.685770750988142e-07,
"loss": 0.005,
"reward": 2.491544246673584,
"reward_std": 0.013606157153844833,
"rewards/accuracy_reward_stage2": 0.4915444254875183,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 533
},
{
"completion_length": 87.75,
"epoch": 0.13191699604743082,
"grad_norm": 3.6628188266219768,
"kl": 0.10791015625,
"learning_rate": 8.683300395256917e-07,
"loss": 0.0043,
"reward": 2.5697736740112305,
"reward_std": 0.015459168702363968,
"rewards/accuracy_reward_stage2": 0.5697736740112305,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 534
},
{
"completion_length": 92.28125,
"epoch": 0.13216403162055335,
"grad_norm": 4.200607367033454,
"kl": 0.1044921875,
"learning_rate": 8.680830039525692e-07,
"loss": 0.0042,
"reward": 2.7578911781311035,
"reward_std": 0.1420280486345291,
"rewards/accuracy_reward_stage2": 0.836016058921814,
"rewards/format_reward_all_stage": 1.921875,
"scores/refine_times": 1.1875,
"step": 535
},
{
"completion_length": 89.640625,
"epoch": 0.1324110671936759,
"grad_norm": 2.626610558071225,
"kl": 0.08349609375,
"learning_rate": 8.678359683794466e-07,
"loss": 0.0033,
"reward": 2.7216603755950928,
"reward_std": 0.013819479383528233,
"rewards/accuracy_reward_stage2": 0.7216602563858032,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 536
},
{
"completion_length": 93.765625,
"epoch": 0.13265810276679843,
"grad_norm": 4.500544096757364,
"kl": 0.107421875,
"learning_rate": 8.675889328063241e-07,
"loss": 0.0043,
"reward": 2.755054473876953,
"reward_std": 0.08167126029729843,
"rewards/accuracy_reward_stage2": 0.7550546526908875,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 537
},
{
"completion_length": 86.21875,
"epoch": 0.13290513833992096,
"grad_norm": 2.865246814533839,
"kl": 0.09619140625,
"learning_rate": 8.673418972332015e-07,
"loss": 0.0039,
"reward": 2.8893496990203857,
"reward_std": 0.024965433403849602,
"rewards/accuracy_reward_stage2": 0.8893496990203857,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 538
},
{
"completion_length": 76.4375,
"epoch": 0.1331521739130435,
"grad_norm": 1.67135798397581,
"kl": 0.0810546875,
"learning_rate": 8.67094861660079e-07,
"loss": 0.0033,
"reward": 2.7071239948272705,
"reward_std": 0.005690534599125385,
"rewards/accuracy_reward_stage2": 0.7071239948272705,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 539
},
{
"completion_length": 82.78125,
"epoch": 0.13339920948616601,
"grad_norm": 4.1245154427845465,
"kl": 0.0830078125,
"learning_rate": 8.668478260869565e-07,
"loss": 0.0033,
"reward": 2.630288600921631,
"reward_std": 0.06467457115650177,
"rewards/accuracy_reward_stage2": 0.6302886009216309,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 540
},
{
"completion_length": 83.0,
"epoch": 0.13364624505928854,
"grad_norm": 1.7077519811767492,
"kl": 0.07861328125,
"learning_rate": 8.666007905138339e-07,
"loss": 0.0031,
"reward": 2.87119722366333,
"reward_std": 0.012019687332212925,
"rewards/accuracy_reward_stage2": 0.8711971044540405,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 541
},
{
"completion_length": 75.5,
"epoch": 0.13389328063241107,
"grad_norm": 3.6652902376339846,
"kl": 0.0771484375,
"learning_rate": 8.663537549407114e-07,
"loss": 0.0031,
"reward": 2.528963565826416,
"reward_std": 0.03654494509100914,
"rewards/accuracy_reward_stage2": 0.5289634466171265,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 542
},
{
"completion_length": 82.125,
"epoch": 0.1341403162055336,
"grad_norm": 2.915704097691835,
"kl": 0.087890625,
"learning_rate": 8.66106719367589e-07,
"loss": 0.0035,
"reward": 2.725383996963501,
"reward_std": 0.05014697462320328,
"rewards/accuracy_reward_stage2": 0.725383996963501,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 543
},
{
"completion_length": 85.328125,
"epoch": 0.13438735177865613,
"grad_norm": 3.3523091039346906,
"kl": 0.07177734375,
"learning_rate": 8.658596837944664e-07,
"loss": 0.0029,
"reward": 2.68579363822937,
"reward_std": 0.02735856920480728,
"rewards/accuracy_reward_stage2": 0.6857935786247253,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 544
},
{
"completion_length": 66.1875,
"epoch": 0.13463438735177866,
"grad_norm": 5.104979767459419,
"kl": 0.09814453125,
"learning_rate": 8.656126482213438e-07,
"loss": 0.0039,
"reward": 2.6130058765411377,
"reward_std": 0.050167717039585114,
"rewards/accuracy_reward_stage2": 0.6130058169364929,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 545
},
{
"completion_length": 71.25,
"epoch": 0.13488142292490118,
"grad_norm": 3.450494920523269,
"kl": 0.08251953125,
"learning_rate": 8.653656126482213e-07,
"loss": 0.0033,
"reward": 2.6630539894104004,
"reward_std": 0.03502904251217842,
"rewards/accuracy_reward_stage2": 0.6630541086196899,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 546
},
{
"completion_length": 79.125,
"epoch": 0.1351284584980237,
"grad_norm": 2.859476004597891,
"kl": 0.068359375,
"learning_rate": 8.651185770750987e-07,
"loss": 0.0027,
"reward": 2.6337075233459473,
"reward_std": 0.03160533308982849,
"rewards/accuracy_reward_stage2": 0.6337075233459473,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 547
},
{
"completion_length": 73.0625,
"epoch": 0.13537549407114624,
"grad_norm": 4.327752965233732,
"kl": 0.07666015625,
"learning_rate": 8.648715415019763e-07,
"loss": 0.0031,
"reward": 2.5985307693481445,
"reward_std": 0.036348432302474976,
"rewards/accuracy_reward_stage2": 0.5985307097434998,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 548
},
{
"completion_length": 83.9375,
"epoch": 0.13562252964426877,
"grad_norm": 3.5893750391044117,
"kl": 0.080078125,
"learning_rate": 8.646245059288537e-07,
"loss": 0.0032,
"reward": 2.7797060012817383,
"reward_std": 0.03607073798775673,
"rewards/accuracy_reward_stage2": 0.7797058820724487,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 549
},
{
"completion_length": 101.328125,
"epoch": 0.1358695652173913,
"grad_norm": 4.38009132567829,
"kl": 0.07861328125,
"learning_rate": 8.643774703557311e-07,
"loss": 0.0031,
"reward": 2.667013645172119,
"reward_std": 0.07868792116641998,
"rewards/accuracy_reward_stage2": 0.6826385259628296,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.0625,
"step": 550
},
{
"completion_length": 94.125,
"epoch": 0.13611660079051383,
"grad_norm": 3.4410934285028305,
"kl": 0.049072265625,
"learning_rate": 8.641304347826086e-07,
"loss": 0.002,
"reward": 2.637599229812622,
"reward_std": 0.08176921308040619,
"rewards/accuracy_reward_stage2": 0.6375992298126221,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 551
},
{
"completion_length": 84.6875,
"epoch": 0.13636363636363635,
"grad_norm": 4.056780679390888,
"kl": 0.1005859375,
"learning_rate": 8.638833992094862e-07,
"loss": 0.004,
"reward": 2.6697425842285156,
"reward_std": 0.037977829575538635,
"rewards/accuracy_reward_stage2": 0.6697424650192261,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 552
},
{
"completion_length": 97.140625,
"epoch": 0.13661067193675888,
"grad_norm": 3.1912103956728206,
"kl": 0.0634765625,
"learning_rate": 8.636363636363636e-07,
"loss": 0.0025,
"reward": 2.673372745513916,
"reward_std": 0.17441077530384064,
"rewards/accuracy_reward_stage2": 0.813997745513916,
"rewards/format_reward_all_stage": 1.859375,
"scores/refine_times": 1.125,
"step": 553
},
{
"completion_length": 94.6875,
"epoch": 0.1368577075098814,
"grad_norm": 3.9139148065427922,
"kl": 0.08447265625,
"learning_rate": 8.633893280632411e-07,
"loss": 0.0034,
"reward": 2.507345676422119,
"reward_std": 0.1150667741894722,
"rewards/accuracy_reward_stage2": 0.5177624225616455,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.140625,
"step": 554
},
{
"completion_length": 97.640625,
"epoch": 0.13710474308300397,
"grad_norm": 2.778226584347817,
"kl": 0.09521484375,
"learning_rate": 8.631422924901185e-07,
"loss": 0.0038,
"reward": 2.8255248069763184,
"reward_std": 0.026099219918251038,
"rewards/accuracy_reward_stage2": 0.8255246877670288,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.15625,
"step": 555
},
{
"completion_length": 93.796875,
"epoch": 0.1373517786561265,
"grad_norm": 3.714775703693918,
"kl": 0.06982421875,
"learning_rate": 8.62895256916996e-07,
"loss": 0.0028,
"reward": 2.700383186340332,
"reward_std": 0.11565177142620087,
"rewards/accuracy_reward_stage2": 0.7003831267356873,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.109375,
"step": 556
},
{
"completion_length": 95.96875,
"epoch": 0.13759881422924902,
"grad_norm": 4.424621871770595,
"kl": 0.08203125,
"learning_rate": 8.626482213438735e-07,
"loss": 0.0033,
"reward": 2.531020164489746,
"reward_std": 0.1613694131374359,
"rewards/accuracy_reward_stage2": 0.5310203433036804,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 557
},
{
"completion_length": 74.40625,
"epoch": 0.13784584980237155,
"grad_norm": 0.2547649072682389,
"kl": 0.07421875,
"learning_rate": 8.624011857707509e-07,
"loss": 0.003,
"reward": 2.75,
"reward_std": 0.0,
"rewards/accuracy_reward_stage2": 0.75,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 558
},
{
"completion_length": 91.046875,
"epoch": 0.13809288537549408,
"grad_norm": 4.493659007924147,
"kl": 0.08740234375,
"learning_rate": 8.621541501976283e-07,
"loss": 0.0035,
"reward": 2.4549224376678467,
"reward_std": 0.0722210705280304,
"rewards/accuracy_reward_stage2": 0.4705474376678467,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.078125,
"step": 559
},
{
"completion_length": 80.9375,
"epoch": 0.1383399209486166,
"grad_norm": 3.4555316703357213,
"kl": 0.072265625,
"learning_rate": 8.61907114624506e-07,
"loss": 0.0029,
"reward": 2.689997434616089,
"reward_std": 0.15605773031711578,
"rewards/accuracy_reward_stage2": 0.8149974346160889,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0,
"step": 560
},
{
"completion_length": 82.921875,
"epoch": 0.13858695652173914,
"grad_norm": 4.399711011954836,
"kl": 0.07861328125,
"learning_rate": 8.616600790513834e-07,
"loss": 0.0031,
"reward": 2.6843276023864746,
"reward_std": 0.048114050179719925,
"rewards/accuracy_reward_stage2": 0.6843275427818298,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 561
},
{
"completion_length": 69.4375,
"epoch": 0.13883399209486166,
"grad_norm": 2.544848259860156,
"kl": 0.060791015625,
"learning_rate": 8.614130434782609e-07,
"loss": 0.0024,
"reward": 2.7242729663848877,
"reward_std": 0.008474176749587059,
"rewards/accuracy_reward_stage2": 0.7242730259895325,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 562
},
{
"completion_length": 93.015625,
"epoch": 0.1390810276679842,
"grad_norm": 1.9603121727015835,
"kl": 0.08203125,
"learning_rate": 8.611660079051383e-07,
"loss": 0.0033,
"reward": 2.7193644046783447,
"reward_std": 0.0018559737363830209,
"rewards/accuracy_reward_stage2": 0.7193642854690552,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 563
},
{
"completion_length": 83.328125,
"epoch": 0.13932806324110672,
"grad_norm": 4.238916634053795,
"kl": 0.0908203125,
"learning_rate": 8.609189723320158e-07,
"loss": 0.0036,
"reward": 2.60569167137146,
"reward_std": 0.21955642104148865,
"rewards/accuracy_reward_stage2": 0.73069167137146,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0625,
"step": 564
},
{
"completion_length": 87.390625,
"epoch": 0.13957509881422925,
"grad_norm": 4.382576919750248,
"kl": 0.1015625,
"learning_rate": 8.606719367588933e-07,
"loss": 0.0041,
"reward": 2.5854461193084717,
"reward_std": 0.15955139696598053,
"rewards/accuracy_reward_stage2": 0.7104461789131165,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0625,
"step": 565
},
{
"completion_length": 80.25,
"epoch": 0.13982213438735178,
"grad_norm": 2.9867715867203857,
"kl": 0.0654296875,
"learning_rate": 8.604249011857707e-07,
"loss": 0.0026,
"reward": 2.7301557064056396,
"reward_std": 0.0774800181388855,
"rewards/accuracy_reward_stage2": 0.7301557064056396,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 566
},
{
"completion_length": 81.0625,
"epoch": 0.1400691699604743,
"grad_norm": 3.224273094621829,
"kl": 0.0771484375,
"learning_rate": 8.601778656126481e-07,
"loss": 0.0031,
"reward": 2.7714409828186035,
"reward_std": 0.010768895037472248,
"rewards/accuracy_reward_stage2": 0.7714409232139587,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 567
},
{
"completion_length": 87.375,
"epoch": 0.14031620553359683,
"grad_norm": 4.003108387284898,
"kl": 0.08544921875,
"learning_rate": 8.599308300395256e-07,
"loss": 0.0034,
"reward": 2.571406841278076,
"reward_std": 0.021448295563459396,
"rewards/accuracy_reward_stage2": 0.5714069604873657,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 568
},
{
"completion_length": 91.734375,
"epoch": 0.14056324110671936,
"grad_norm": 1.763343386987535,
"kl": 0.078125,
"learning_rate": 8.596837944664031e-07,
"loss": 0.0031,
"reward": 2.537046432495117,
"reward_std": 0.005295008420944214,
"rewards/accuracy_reward_stage2": 0.5370461940765381,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 569
},
{
"completion_length": 94.234375,
"epoch": 0.1408102766798419,
"grad_norm": 2.1101705072844963,
"kl": 0.080078125,
"learning_rate": 8.594367588932806e-07,
"loss": 0.0032,
"reward": 2.644087076187134,
"reward_std": 0.006828606594353914,
"rewards/accuracy_reward_stage2": 0.6440869569778442,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 570
},
{
"completion_length": 91.125,
"epoch": 0.14105731225296442,
"grad_norm": 2.4996061886316165,
"kl": 0.06689453125,
"learning_rate": 8.591897233201581e-07,
"loss": 0.0027,
"reward": 2.6300556659698486,
"reward_std": 0.06752649694681168,
"rewards/accuracy_reward_stage2": 0.6300556659698486,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 571
},
{
"completion_length": 83.796875,
"epoch": 0.14130434782608695,
"grad_norm": 2.4902164979539956,
"kl": 0.08642578125,
"learning_rate": 8.589426877470355e-07,
"loss": 0.0035,
"reward": 2.728515148162842,
"reward_std": 0.020385991781949997,
"rewards/accuracy_reward_stage2": 0.7285150289535522,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 572
},
{
"completion_length": 82.25,
"epoch": 0.14155138339920947,
"grad_norm": 4.393798659706149,
"kl": 0.0908203125,
"learning_rate": 8.586956521739131e-07,
"loss": 0.0036,
"reward": 2.5845212936401367,
"reward_std": 0.14551466703414917,
"rewards/accuracy_reward_stage2": 0.5845211744308472,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 573
},
{
"completion_length": 89.5625,
"epoch": 0.141798418972332,
"grad_norm": 3.4708980017734623,
"kl": 0.0625,
"learning_rate": 8.584486166007905e-07,
"loss": 0.0025,
"reward": 2.4852051734924316,
"reward_std": 0.04739289730787277,
"rewards/accuracy_reward_stage2": 0.4852050542831421,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 574
},
{
"completion_length": 75.3125,
"epoch": 0.14204545454545456,
"grad_norm": 1.9218103907041477,
"kl": 0.09130859375,
"learning_rate": 8.582015810276679e-07,
"loss": 0.0037,
"reward": 2.776808738708496,
"reward_std": 0.003206153865903616,
"rewards/accuracy_reward_stage2": 0.7768086194992065,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 575
},
{
"completion_length": 99.109375,
"epoch": 0.1422924901185771,
"grad_norm": 4.45653496204149,
"kl": 0.0791015625,
"learning_rate": 8.579545454545454e-07,
"loss": 0.0032,
"reward": 2.578134059906006,
"reward_std": 0.09177695214748383,
"rewards/accuracy_reward_stage2": 0.5781341791152954,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 576
},
{
"completion_length": 105.296875,
"epoch": 0.14253952569169961,
"grad_norm": 4.1541183942964235,
"kl": 0.0634765625,
"learning_rate": 8.577075098814229e-07,
"loss": 0.0025,
"reward": 2.668818950653076,
"reward_std": 0.14301809668540955,
"rewards/accuracy_reward_stage2": 0.6688190698623657,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 577
},
{
"completion_length": 86.34375,
"epoch": 0.14278656126482214,
"grad_norm": 3.6726618866240326,
"kl": 0.0810546875,
"learning_rate": 8.574604743083003e-07,
"loss": 0.0032,
"reward": 2.8135416507720947,
"reward_std": 0.09910938143730164,
"rewards/accuracy_reward_stage2": 0.8135416507720947,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 578
},
{
"completion_length": 96.5,
"epoch": 0.14303359683794467,
"grad_norm": 3.2088315728349333,
"kl": 0.09326171875,
"learning_rate": 8.572134387351779e-07,
"loss": 0.0037,
"reward": 2.813441514968872,
"reward_std": 0.01283353567123413,
"rewards/accuracy_reward_stage2": 0.8134413957595825,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 579
},
{
"completion_length": 89.125,
"epoch": 0.1432806324110672,
"grad_norm": 4.787753647506243,
"kl": 0.076171875,
"learning_rate": 8.569664031620553e-07,
"loss": 0.003,
"reward": 2.428907871246338,
"reward_std": 0.27077752351760864,
"rewards/accuracy_reward_stage2": 0.5539077520370483,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0,
"step": 580
},
{
"completion_length": 92.53125,
"epoch": 0.14352766798418973,
"grad_norm": 1.964050185140295,
"kl": 0.0693359375,
"learning_rate": 8.567193675889328e-07,
"loss": 0.0028,
"reward": 2.7752304077148438,
"reward_std": 0.017977114766836166,
"rewards/accuracy_reward_stage2": 0.7752305269241333,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 581
},
{
"completion_length": 98.734375,
"epoch": 0.14377470355731226,
"grad_norm": 4.5476829359160895,
"kl": 0.087890625,
"learning_rate": 8.564723320158103e-07,
"loss": 0.0035,
"reward": 2.6758265495300293,
"reward_std": 0.057866424322128296,
"rewards/accuracy_reward_stage2": 0.6758266687393188,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 582
},
{
"completion_length": 97.140625,
"epoch": 0.14402173913043478,
"grad_norm": 3.3204660955158856,
"kl": 0.08349609375,
"learning_rate": 8.562252964426877e-07,
"loss": 0.0033,
"reward": 2.78486967086792,
"reward_std": 0.08574660122394562,
"rewards/accuracy_reward_stage2": 0.7848696708679199,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.203125,
"step": 583
},
{
"completion_length": 100.046875,
"epoch": 0.1442687747035573,
"grad_norm": 3.826957325314654,
"kl": 0.07470703125,
"learning_rate": 8.559782608695651e-07,
"loss": 0.003,
"reward": 2.706620454788208,
"reward_std": 0.07693397253751755,
"rewards/accuracy_reward_stage2": 0.7066203355789185,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 584
},
{
"completion_length": 103.125,
"epoch": 0.14451581027667984,
"grad_norm": 3.8246431494565387,
"kl": 0.09375,
"learning_rate": 8.557312252964426e-07,
"loss": 0.0038,
"reward": 2.5936570167541504,
"reward_std": 0.09550125896930695,
"rewards/accuracy_reward_stage2": 0.5936569571495056,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.09375,
"step": 585
},
{
"completion_length": 103.671875,
"epoch": 0.14476284584980237,
"grad_norm": 2.914164966990848,
"kl": 0.07568359375,
"learning_rate": 8.554841897233201e-07,
"loss": 0.003,
"reward": 2.3438425064086914,
"reward_std": 0.029713183641433716,
"rewards/accuracy_reward_stage2": 0.3438425660133362,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.25,
"step": 586
},
{
"completion_length": 92.34375,
"epoch": 0.1450098814229249,
"grad_norm": 4.580016718298798,
"kl": 0.0810546875,
"learning_rate": 8.552371541501975e-07,
"loss": 0.0032,
"reward": 2.644289016723633,
"reward_std": 0.057920269668102264,
"rewards/accuracy_reward_stage2": 0.6442890167236328,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 587
},
{
"completion_length": 91.328125,
"epoch": 0.14525691699604742,
"grad_norm": 2.3584839471859866,
"kl": 0.099609375,
"learning_rate": 8.549901185770751e-07,
"loss": 0.004,
"reward": 2.853529930114746,
"reward_std": 0.018497945740818977,
"rewards/accuracy_reward_stage2": 0.8535300493240356,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.328125,
"step": 588
},
{
"completion_length": 99.671875,
"epoch": 0.14550395256916995,
"grad_norm": 1.5376393488573932,
"kl": 0.0986328125,
"learning_rate": 8.547430830039525e-07,
"loss": 0.0039,
"reward": 2.908482074737549,
"reward_std": 0.0646936446428299,
"rewards/accuracy_reward_stage2": 0.9084821939468384,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.28125,
"step": 589
},
{
"completion_length": 108.59375,
"epoch": 0.14575098814229248,
"grad_norm": 2.616044570647881,
"kl": 0.07373046875,
"learning_rate": 8.544960474308301e-07,
"loss": 0.003,
"reward": 2.4426791667938232,
"reward_std": 0.09798327833414078,
"rewards/accuracy_reward_stage2": 0.4426790773868561,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.3125,
"step": 590
},
{
"completion_length": 75.9375,
"epoch": 0.145998023715415,
"grad_norm": 2.3565237541211075,
"kl": 0.087890625,
"learning_rate": 8.542490118577075e-07,
"loss": 0.0035,
"reward": 2.5625,
"reward_std": 0.06681530922651291,
"rewards/accuracy_reward_stage2": 0.5625,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 591
},
{
"completion_length": 98.359375,
"epoch": 0.14624505928853754,
"grad_norm": 5.707482842285747,
"kl": 0.09033203125,
"learning_rate": 8.540019762845849e-07,
"loss": 0.0036,
"reward": 2.3818490505218506,
"reward_std": 0.3349462151527405,
"rewards/accuracy_reward_stage2": 0.5693491101264954,
"rewards/format_reward_all_stage": 1.8125,
"scores/refine_times": 1.125,
"step": 592
},
{
"completion_length": 74.875,
"epoch": 0.14649209486166007,
"grad_norm": 2.5953914638869096,
"kl": 0.08056640625,
"learning_rate": 8.537549407114624e-07,
"loss": 0.0032,
"reward": 2.9376792907714844,
"reward_std": 0.004189035389572382,
"rewards/accuracy_reward_stage2": 0.9376791715621948,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 593
},
{
"completion_length": 96.953125,
"epoch": 0.14673913043478262,
"grad_norm": 5.554929821649331,
"kl": 0.09326171875,
"learning_rate": 8.535079051383399e-07,
"loss": 0.0037,
"reward": 2.597329616546631,
"reward_std": 0.27840977907180786,
"rewards/accuracy_reward_stage2": 0.6598294973373413,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.125,
"step": 594
},
{
"completion_length": 89.15625,
"epoch": 0.14698616600790515,
"grad_norm": 4.019536724109511,
"kl": 0.059326171875,
"learning_rate": 8.532608695652173e-07,
"loss": 0.0024,
"reward": 2.6870036125183105,
"reward_std": 0.1373123675584793,
"rewards/accuracy_reward_stage2": 0.8120037317276001,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.125,
"step": 595
},
{
"completion_length": 77.8125,
"epoch": 0.14723320158102768,
"grad_norm": 4.8955452689702685,
"kl": 0.0927734375,
"learning_rate": 8.530138339920948e-07,
"loss": 0.0037,
"reward": 2.7033681869506836,
"reward_std": 0.09230202436447144,
"rewards/accuracy_reward_stage2": 0.7033681869506836,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 596
},
{
"completion_length": 79.6875,
"epoch": 0.1474802371541502,
"grad_norm": 3.7796317517540103,
"kl": 0.08447265625,
"learning_rate": 8.527667984189722e-07,
"loss": 0.0034,
"reward": 2.491044521331787,
"reward_std": 0.14756934344768524,
"rewards/accuracy_reward_stage2": 0.4910443425178528,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 597
},
{
"completion_length": 78.09375,
"epoch": 0.14772727272727273,
"grad_norm": 3.8334181192499153,
"kl": 0.091796875,
"learning_rate": 8.525197628458499e-07,
"loss": 0.0037,
"reward": 2.8837780952453613,
"reward_std": 0.07180596143007278,
"rewards/accuracy_reward_stage2": 0.8837779760360718,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.15625,
"step": 598
},
{
"completion_length": 92.859375,
"epoch": 0.14797430830039526,
"grad_norm": 4.288955963549508,
"kl": 0.125,
"learning_rate": 8.522727272727273e-07,
"loss": 0.005,
"reward": 2.762850761413574,
"reward_std": 0.07867846637964249,
"rewards/accuracy_reward_stage2": 0.7628507018089294,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.203125,
"step": 599
},
{
"completion_length": 82.71875,
"epoch": 0.1482213438735178,
"grad_norm": 3.5528631311780385,
"kl": 0.10498046875,
"learning_rate": 8.520256916996047e-07,
"loss": 0.0042,
"reward": 2.6389195919036865,
"reward_std": 0.06157643720507622,
"rewards/accuracy_reward_stage2": 0.6389195919036865,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 600
},
{
"completion_length": 85.328125,
"epoch": 0.14846837944664032,
"grad_norm": 4.689305434315942,
"kl": 0.08984375,
"learning_rate": 8.517786561264822e-07,
"loss": 0.0036,
"reward": 2.7248332500457764,
"reward_std": 0.1498197615146637,
"rewards/accuracy_reward_stage2": 0.724833071231842,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 601
},
{
"completion_length": 63.34375,
"epoch": 0.14871541501976285,
"grad_norm": 3.314732002614642,
"kl": 0.13671875,
"learning_rate": 8.515316205533597e-07,
"loss": 0.0055,
"reward": 2.7788662910461426,
"reward_std": 0.035718683153390884,
"rewards/accuracy_reward_stage2": 0.7788662910461426,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 602
},
{
"completion_length": 91.640625,
"epoch": 0.14896245059288538,
"grad_norm": 3.0711151595440245,
"kl": 0.078125,
"learning_rate": 8.512845849802371e-07,
"loss": 0.0031,
"reward": 2.600371837615967,
"reward_std": 0.17952686548233032,
"rewards/accuracy_reward_stage2": 0.7253717184066772,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.21875,
"step": 603
},
{
"completion_length": 78.796875,
"epoch": 0.1492094861660079,
"grad_norm": 3.5027395607429717,
"kl": 0.09765625,
"learning_rate": 8.510375494071146e-07,
"loss": 0.0039,
"reward": 2.674034833908081,
"reward_std": 0.04458358883857727,
"rewards/accuracy_reward_stage2": 0.6740349531173706,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 604
},
{
"completion_length": 86.609375,
"epoch": 0.14945652173913043,
"grad_norm": 2.2600159130295654,
"kl": 0.0849609375,
"learning_rate": 8.50790513833992e-07,
"loss": 0.0034,
"reward": 2.6875,
"reward_std": 0.06681530922651291,
"rewards/accuracy_reward_stage2": 0.6875,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 605
},
{
"completion_length": 80.640625,
"epoch": 0.14970355731225296,
"grad_norm": 4.805520751259063,
"kl": 0.07861328125,
"learning_rate": 8.505434782608694e-07,
"loss": 0.0032,
"reward": 2.63192081451416,
"reward_std": 0.1455264687538147,
"rewards/accuracy_reward_stage2": 0.6475456357002258,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.15625,
"step": 606
},
{
"completion_length": 75.71875,
"epoch": 0.1499505928853755,
"grad_norm": 4.573576044918896,
"kl": 0.1044921875,
"learning_rate": 8.502964426877471e-07,
"loss": 0.0042,
"reward": 2.67458438873291,
"reward_std": 0.04098530113697052,
"rewards/accuracy_reward_stage2": 0.6745842695236206,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 607
},
{
"completion_length": 75.234375,
"epoch": 0.15019762845849802,
"grad_norm": 4.335859801113833,
"kl": 0.07470703125,
"learning_rate": 8.500494071146245e-07,
"loss": 0.003,
"reward": 2.4506468772888184,
"reward_std": 0.05327065661549568,
"rewards/accuracy_reward_stage2": 0.4506469666957855,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 608
},
{
"completion_length": 58.0,
"epoch": 0.15044466403162055,
"grad_norm": 4.152937219223513,
"kl": 0.09521484375,
"learning_rate": 8.498023715415019e-07,
"loss": 0.0038,
"reward": 2.6415534019470215,
"reward_std": 0.0792679637670517,
"rewards/accuracy_reward_stage2": 0.6415532827377319,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 609
},
{
"completion_length": 75.671875,
"epoch": 0.15069169960474307,
"grad_norm": 3.5202934389955143,
"kl": 0.0986328125,
"learning_rate": 8.495553359683794e-07,
"loss": 0.0039,
"reward": 2.627929210662842,
"reward_std": 0.08055642247200012,
"rewards/accuracy_reward_stage2": 0.6279292106628418,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 610
},
{
"completion_length": 68.421875,
"epoch": 0.1509387351778656,
"grad_norm": 3.9270342458590073,
"kl": 0.09326171875,
"learning_rate": 8.493083003952569e-07,
"loss": 0.0037,
"reward": 2.6131200790405273,
"reward_std": 0.07064563035964966,
"rewards/accuracy_reward_stage2": 0.6131199598312378,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 611
},
{
"completion_length": 94.046875,
"epoch": 0.15118577075098813,
"grad_norm": 2.797781979194763,
"kl": 0.138671875,
"learning_rate": 8.490612648221343e-07,
"loss": 0.0055,
"reward": 2.668482542037964,
"reward_std": 0.09077267348766327,
"rewards/accuracy_reward_stage2": 0.7257742285728455,
"rewards/format_reward_all_stage": 1.9427082538604736,
"scores/refine_times": 1.421875,
"step": 612
},
{
"completion_length": 76.25,
"epoch": 0.15143280632411066,
"grad_norm": 3.5823382600072833,
"kl": 0.1123046875,
"learning_rate": 8.488142292490118e-07,
"loss": 0.0045,
"reward": 2.8506646156311035,
"reward_std": 0.02524959295988083,
"rewards/accuracy_reward_stage2": 0.8506646156311035,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 613
},
{
"completion_length": 54.375,
"epoch": 0.15167984189723321,
"grad_norm": 5.43957715062048,
"kl": 0.1513671875,
"learning_rate": 8.485671936758892e-07,
"loss": 0.006,
"reward": 2.212973117828369,
"reward_std": 0.054874010384082794,
"rewards/accuracy_reward_stage2": 0.2129732221364975,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 614
},
{
"completion_length": 58.265625,
"epoch": 0.15192687747035574,
"grad_norm": 5.126288403045913,
"kl": 0.265625,
"learning_rate": 8.483201581027668e-07,
"loss": 0.0106,
"reward": 2.6679821014404297,
"reward_std": 0.11545281857252121,
"rewards/accuracy_reward_stage2": 0.6836073398590088,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.25,
"step": 615
},
{
"completion_length": 37.515625,
"epoch": 0.15217391304347827,
"grad_norm": 4.80980966637606,
"kl": 0.2392578125,
"learning_rate": 8.480731225296443e-07,
"loss": 0.0096,
"reward": 2.513650417327881,
"reward_std": 0.11596601456403732,
"rewards/accuracy_reward_stage2": 0.5136504769325256,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 616
},
{
"completion_length": 50.390625,
"epoch": 0.1524209486166008,
"grad_norm": 2.9052895258079197,
"kl": 0.1875,
"learning_rate": 8.478260869565217e-07,
"loss": 0.0075,
"reward": 2.665754795074463,
"reward_std": 0.09999995678663254,
"rewards/accuracy_reward_stage2": 0.6813797950744629,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.140625,
"step": 617
},
{
"completion_length": 42.734375,
"epoch": 0.15266798418972333,
"grad_norm": 2.9639210413309627,
"kl": 0.2138671875,
"learning_rate": 8.475790513833992e-07,
"loss": 0.0085,
"reward": 2.741377353668213,
"reward_std": 0.07839522510766983,
"rewards/accuracy_reward_stage2": 0.7413773536682129,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 618
},
{
"completion_length": 32.59375,
"epoch": 0.15291501976284586,
"grad_norm": 3.7719906987384793,
"kl": 0.322265625,
"learning_rate": 8.473320158102767e-07,
"loss": 0.0129,
"reward": 2.813457489013672,
"reward_std": 0.034740254282951355,
"rewards/accuracy_reward_stage2": 0.8134576082229614,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 619
},
{
"completion_length": 32.0625,
"epoch": 0.15316205533596838,
"grad_norm": 3.886802620815163,
"kl": 0.251953125,
"learning_rate": 8.470849802371541e-07,
"loss": 0.01,
"reward": 2.7390785217285156,
"reward_std": 0.08644495904445648,
"rewards/accuracy_reward_stage2": 0.7390785813331604,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 620
},
{
"completion_length": 28.53125,
"epoch": 0.1534090909090909,
"grad_norm": 4.3419292681332085,
"kl": 0.283203125,
"learning_rate": 8.468379446640316e-07,
"loss": 0.0113,
"reward": 2.72926664352417,
"reward_std": 0.02556372992694378,
"rewards/accuracy_reward_stage2": 0.7292666435241699,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 621
},
{
"completion_length": 31.265625,
"epoch": 0.15365612648221344,
"grad_norm": 4.426681437677741,
"kl": 0.330078125,
"learning_rate": 8.46590909090909e-07,
"loss": 0.0132,
"reward": 2.675302505493164,
"reward_std": 0.1827945113182068,
"rewards/accuracy_reward_stage2": 0.6753023862838745,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 622
},
{
"completion_length": 40.5,
"epoch": 0.15390316205533597,
"grad_norm": 4.901917034966575,
"kl": 0.2265625,
"learning_rate": 8.463438735177865e-07,
"loss": 0.0091,
"reward": 2.6904773712158203,
"reward_std": 0.09335070103406906,
"rewards/accuracy_reward_stage2": 0.7061026096343994,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.078125,
"step": 623
},
{
"completion_length": 25.71875,
"epoch": 0.1541501976284585,
"grad_norm": 2.0307140823798524,
"kl": 0.25390625,
"learning_rate": 8.46096837944664e-07,
"loss": 0.0102,
"reward": 2.9270834922790527,
"reward_std": 0.09627808630466461,
"rewards/accuracy_reward_stage2": 0.9375,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.203125,
"step": 624
},
{
"completion_length": 34.59375,
"epoch": 0.15439723320158102,
"grad_norm": 4.471368602534083,
"kl": 0.28515625,
"learning_rate": 8.458498023715415e-07,
"loss": 0.0114,
"reward": 2.7720367908477783,
"reward_std": 0.06439623236656189,
"rewards/accuracy_reward_stage2": 0.7876617908477783,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.25,
"step": 625
},
{
"completion_length": 36.0625,
"epoch": 0.15464426877470355,
"grad_norm": 4.686637990114619,
"kl": 0.25,
"learning_rate": 8.45602766798419e-07,
"loss": 0.01,
"reward": 2.6198482513427734,
"reward_std": 0.0595381073653698,
"rewards/accuracy_reward_stage2": 0.6198481917381287,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 626
},
{
"completion_length": 28.421875,
"epoch": 0.15489130434782608,
"grad_norm": 3.816332577374878,
"kl": 0.2578125,
"learning_rate": 8.453557312252964e-07,
"loss": 0.0103,
"reward": 2.7242279052734375,
"reward_std": 0.10282538086175919,
"rewards/accuracy_reward_stage2": 0.7242279648780823,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 627
},
{
"completion_length": 26.875,
"epoch": 0.1551383399209486,
"grad_norm": 5.68706231004042,
"kl": 0.275390625,
"learning_rate": 8.451086956521739e-07,
"loss": 0.011,
"reward": 2.6736927032470703,
"reward_std": 0.12340083718299866,
"rewards/accuracy_reward_stage2": 0.673692524433136,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 628
},
{
"completion_length": 29.078125,
"epoch": 0.15538537549407114,
"grad_norm": 2.328207752491545,
"kl": 0.283203125,
"learning_rate": 8.448616600790514e-07,
"loss": 0.0113,
"reward": 2.7866618633270264,
"reward_std": 0.03922741115093231,
"rewards/accuracy_reward_stage2": 0.7866617441177368,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.234375,
"step": 629
},
{
"completion_length": 35.9375,
"epoch": 0.15563241106719367,
"grad_norm": 0.5035644859778624,
"kl": 0.240234375,
"learning_rate": 8.446146245059288e-07,
"loss": 0.0096,
"reward": 2.75,
"reward_std": 0.0,
"rewards/accuracy_reward_stage2": 0.75,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 630
},
{
"completion_length": 44.375,
"epoch": 0.1558794466403162,
"grad_norm": 3.7496541284685887,
"kl": 0.244140625,
"learning_rate": 8.443675889328062e-07,
"loss": 0.0098,
"reward": 2.7512311935424805,
"reward_std": 0.09940579533576965,
"rewards/accuracy_reward_stage2": 0.8085229396820068,
"rewards/format_reward_all_stage": 1.9427083730697632,
"scores/refine_times": 1.296875,
"step": 631
},
{
"completion_length": 35.5,
"epoch": 0.15612648221343872,
"grad_norm": 3.8633097627706148,
"kl": 0.193359375,
"learning_rate": 8.441205533596838e-07,
"loss": 0.0077,
"reward": 2.653985023498535,
"reward_std": 0.035471752285957336,
"rewards/accuracy_reward_stage2": 0.6539848446846008,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 632
},
{
"completion_length": 35.65625,
"epoch": 0.15637351778656128,
"grad_norm": 2.102058612850822,
"kl": 0.2158203125,
"learning_rate": 8.438735177865612e-07,
"loss": 0.0086,
"reward": 2.83585524559021,
"reward_std": 0.07515110820531845,
"rewards/accuracy_reward_stage2": 0.8358553051948547,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.28125,
"step": 633
},
{
"completion_length": 27.09375,
"epoch": 0.1566205533596838,
"grad_norm": 5.37908292624943,
"kl": 0.2451171875,
"learning_rate": 8.436264822134386e-07,
"loss": 0.0098,
"reward": 2.67559814453125,
"reward_std": 0.1606387048959732,
"rewards/accuracy_reward_stage2": 0.6912230849266052,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.0625,
"step": 634
},
{
"completion_length": 57.015625,
"epoch": 0.15686758893280633,
"grad_norm": 1.930963468171265,
"kl": 0.203125,
"learning_rate": 8.433794466403162e-07,
"loss": 0.0081,
"reward": 2.9319870471954346,
"reward_std": 0.00449987780302763,
"rewards/accuracy_reward_stage2": 0.9319870471954346,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.28125,
"step": 635
},
{
"completion_length": 42.6875,
"epoch": 0.15711462450592886,
"grad_norm": 4.015840085817725,
"kl": 0.26953125,
"learning_rate": 8.431324110671937e-07,
"loss": 0.0109,
"reward": 2.7423253059387207,
"reward_std": 0.20044684410095215,
"rewards/accuracy_reward_stage2": 0.8048254251480103,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.140625,
"step": 636
},
{
"completion_length": 46.046875,
"epoch": 0.1573616600790514,
"grad_norm": 1.2951785973348273,
"kl": 0.2177734375,
"learning_rate": 8.428853754940711e-07,
"loss": 0.0087,
"reward": 2.8515625,
"reward_std": 0.022097086533904076,
"rewards/accuracy_reward_stage2": 0.8515625,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.203125,
"step": 637
},
{
"completion_length": 43.65625,
"epoch": 0.15760869565217392,
"grad_norm": 3.7465112575188666,
"kl": 0.23046875,
"learning_rate": 8.426383399209486e-07,
"loss": 0.0092,
"reward": 2.5327157974243164,
"reward_std": 0.17657220363616943,
"rewards/accuracy_reward_stage2": 0.6577157974243164,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.203125,
"step": 638
},
{
"completion_length": 36.625,
"epoch": 0.15785573122529645,
"grad_norm": 5.842474547361892,
"kl": 0.20703125,
"learning_rate": 8.42391304347826e-07,
"loss": 0.0083,
"reward": 2.6493334770202637,
"reward_std": 0.21465185284614563,
"rewards/accuracy_reward_stage2": 0.7743334770202637,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.140625,
"step": 639
},
{
"completion_length": 45.6875,
"epoch": 0.15810276679841898,
"grad_norm": 2.4835559209692075,
"kl": 0.24609375,
"learning_rate": 8.421442687747036e-07,
"loss": 0.0098,
"reward": 2.8289783000946045,
"reward_std": 0.016266150400042534,
"rewards/accuracy_reward_stage2": 0.8289782404899597,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.328125,
"step": 640
},
{
"completion_length": 59.5625,
"epoch": 0.1583498023715415,
"grad_norm": 5.194147021313203,
"kl": 0.294921875,
"learning_rate": 8.41897233201581e-07,
"loss": 0.0118,
"reward": 2.5701632499694824,
"reward_std": 0.032328180968761444,
"rewards/accuracy_reward_stage2": 0.5701633095741272,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.3125,
"step": 641
},
{
"completion_length": 41.265625,
"epoch": 0.15859683794466403,
"grad_norm": 1.9162461047086887,
"kl": 0.1650390625,
"learning_rate": 8.416501976284584e-07,
"loss": 0.0066,
"reward": 2.867898941040039,
"reward_std": 0.06962129473686218,
"rewards/accuracy_reward_stage2": 0.8678989410400391,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.390625,
"step": 642
},
{
"completion_length": 51.0,
"epoch": 0.15884387351778656,
"grad_norm": 2.573662013627622,
"kl": 0.193359375,
"learning_rate": 8.414031620553359e-07,
"loss": 0.0077,
"reward": 2.8405961990356445,
"reward_std": 0.0634043961763382,
"rewards/accuracy_reward_stage2": 0.8562212586402893,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.3125,
"step": 643
},
{
"completion_length": 71.03125,
"epoch": 0.1590909090909091,
"grad_norm": 1.7426308070279573,
"kl": 0.1787109375,
"learning_rate": 8.411561264822134e-07,
"loss": 0.0071,
"reward": 2.577765464782715,
"reward_std": 0.14586614072322845,
"rewards/accuracy_reward_stage2": 0.6663070917129517,
"rewards/format_reward_all_stage": 1.9114582538604736,
"scores/refine_times": 1.453125,
"step": 644
},
{
"completion_length": 46.453125,
"epoch": 0.15933794466403162,
"grad_norm": 4.271793905157046,
"kl": 0.28515625,
"learning_rate": 8.409090909090909e-07,
"loss": 0.0114,
"reward": 2.730409860610962,
"reward_std": 0.15720345079898834,
"rewards/accuracy_reward_stage2": 0.7929098606109619,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.25,
"step": 645
},
{
"completion_length": 67.453125,
"epoch": 0.15958498023715414,
"grad_norm": 5.033031730842746,
"kl": 0.2421875,
"learning_rate": 8.406620553359684e-07,
"loss": 0.0097,
"reward": 2.7433104515075684,
"reward_std": 0.05172478407621384,
"rewards/accuracy_reward_stage2": 0.7537272572517395,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.296875,
"step": 646
},
{
"completion_length": 76.640625,
"epoch": 0.15983201581027667,
"grad_norm": 3.2006449227471725,
"kl": 0.1611328125,
"learning_rate": 8.404150197628458e-07,
"loss": 0.0065,
"reward": 2.792520761489868,
"reward_std": 0.013646906241774559,
"rewards/accuracy_reward_stage2": 0.7925208210945129,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.265625,
"step": 647
},
{
"completion_length": 68.640625,
"epoch": 0.1600790513833992,
"grad_norm": 6.943408346730125,
"kl": 0.302734375,
"learning_rate": 8.401679841897232e-07,
"loss": 0.0121,
"reward": 2.5047967433929443,
"reward_std": 0.1528734713792801,
"rewards/accuracy_reward_stage2": 0.5829217433929443,
"rewards/format_reward_all_stage": 1.921875,
"scores/refine_times": 1.25,
"step": 648
},
{
"completion_length": 77.828125,
"epoch": 0.16032608695652173,
"grad_norm": 4.0567635031379705,
"kl": 0.1923828125,
"learning_rate": 8.399209486166008e-07,
"loss": 0.0077,
"reward": 2.656846046447754,
"reward_std": 0.04590492323040962,
"rewards/accuracy_reward_stage2": 0.6568462252616882,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.234375,
"step": 649
},
{
"completion_length": 67.625,
"epoch": 0.16057312252964426,
"grad_norm": 3.370288275593572,
"kl": 0.1533203125,
"learning_rate": 8.396739130434782e-07,
"loss": 0.0061,
"reward": 2.65291690826416,
"reward_std": 0.07776036858558655,
"rewards/accuracy_reward_stage2": 0.6529167890548706,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.25,
"step": 650
},
{
"completion_length": 77.75,
"epoch": 0.16082015810276679,
"grad_norm": 0.9094780307831853,
"kl": 0.10546875,
"learning_rate": 8.394268774703556e-07,
"loss": 0.0042,
"reward": 2.8998451232910156,
"reward_std": 0.051297787576913834,
"rewards/accuracy_reward_stage2": 0.9060951471328735,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.3125,
"step": 651
},
{
"completion_length": 77.46875,
"epoch": 0.16106719367588934,
"grad_norm": 4.930984465220881,
"kl": 0.1171875,
"learning_rate": 8.391798418972331e-07,
"loss": 0.0047,
"reward": 2.6480984687805176,
"reward_std": 0.18127982318401337,
"rewards/accuracy_reward_stage2": 0.6793487071990967,
"rewards/format_reward_all_stage": 1.96875,
"scores/refine_times": 1.265625,
"step": 652
},
{
"completion_length": 60.984375,
"epoch": 0.16131422924901187,
"grad_norm": 5.128137555423027,
"kl": 0.134765625,
"learning_rate": 8.389328063241107e-07,
"loss": 0.0054,
"reward": 2.765324115753174,
"reward_std": 0.08034379780292511,
"rewards/accuracy_reward_stage2": 0.7809491157531738,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.140625,
"step": 653
},
{
"completion_length": 74.890625,
"epoch": 0.1615612648221344,
"grad_norm": 4.648661118886044,
"kl": 0.134765625,
"learning_rate": 8.386857707509882e-07,
"loss": 0.0054,
"reward": 2.691357135772705,
"reward_std": 0.13039422035217285,
"rewards/accuracy_reward_stage2": 0.6913573145866394,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 654
},
{
"completion_length": 98.640625,
"epoch": 0.16180830039525693,
"grad_norm": 4.064729740373123,
"kl": 0.12353515625,
"learning_rate": 8.384387351778656e-07,
"loss": 0.0049,
"reward": 2.5702481269836426,
"reward_std": 0.13462196290493011,
"rewards/accuracy_reward_stage2": 0.5702481269836426,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.46875,
"step": 655
},
{
"completion_length": 113.84375,
"epoch": 0.16205533596837945,
"grad_norm": 3.2862871577391393,
"kl": 0.10595703125,
"learning_rate": 8.38191699604743e-07,
"loss": 0.0042,
"reward": 2.511234760284424,
"reward_std": 0.2531359791755676,
"rewards/accuracy_reward_stage2": 0.5841513872146606,
"rewards/format_reward_all_stage": 1.9270833730697632,
"scores/refine_times": 1.671875,
"step": 656
},
{
"completion_length": 92.34375,
"epoch": 0.16230237154150198,
"grad_norm": 4.422132694926118,
"kl": 0.1474609375,
"learning_rate": 8.379446640316206e-07,
"loss": 0.0059,
"reward": 2.3517141342163086,
"reward_std": 0.16402184963226318,
"rewards/accuracy_reward_stage2": 0.476714164018631,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.46875,
"step": 657
},
{
"completion_length": 70.734375,
"epoch": 0.1625494071146245,
"grad_norm": 1.1685505706287254,
"kl": 0.1162109375,
"learning_rate": 8.37697628458498e-07,
"loss": 0.0046,
"reward": 2.7522194385528564,
"reward_std": 0.05886061489582062,
"rewards/accuracy_reward_stage2": 0.7678444981575012,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.375,
"step": 658
},
{
"completion_length": 91.140625,
"epoch": 0.16279644268774704,
"grad_norm": 3.257325899603518,
"kl": 0.14453125,
"learning_rate": 8.374505928853754e-07,
"loss": 0.0058,
"reward": 2.632218837738037,
"reward_std": 0.09933055937290192,
"rewards/accuracy_reward_stage2": 0.658260703086853,
"rewards/format_reward_all_stage": 1.9739583730697632,
"scores/refine_times": 1.53125,
"step": 659
},
{
"completion_length": 95.90625,
"epoch": 0.16304347826086957,
"grad_norm": 1.9690432413920704,
"kl": 0.134765625,
"learning_rate": 8.372035573122529e-07,
"loss": 0.0054,
"reward": 2.771512508392334,
"reward_std": 0.06438760459423065,
"rewards/accuracy_reward_stage2": 0.7715123891830444,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.59375,
"step": 660
},
{
"completion_length": 78.234375,
"epoch": 0.1632905138339921,
"grad_norm": 4.936444107949188,
"kl": 0.18359375,
"learning_rate": 8.369565217391304e-07,
"loss": 0.0073,
"reward": 2.2439303398132324,
"reward_std": 0.28851431608200073,
"rewards/accuracy_reward_stage2": 0.4574721157550812,
"rewards/format_reward_all_stage": 1.7864583730697632,
"scores/refine_times": 1.484375,
"step": 661
},
{
"completion_length": 82.96875,
"epoch": 0.16353754940711462,
"grad_norm": 4.166445669222274,
"kl": 0.1474609375,
"learning_rate": 8.367094861660079e-07,
"loss": 0.0059,
"reward": 2.672656297683716,
"reward_std": 0.28908365964889526,
"rewards/accuracy_reward_stage2": 0.9226562976837158,
"rewards/format_reward_all_stage": 1.75,
"scores/refine_times": 1.421875,
"step": 662
},
{
"completion_length": 99.703125,
"epoch": 0.16378458498023715,
"grad_norm": 2.9695692781319765,
"kl": 0.1494140625,
"learning_rate": 8.364624505928854e-07,
"loss": 0.006,
"reward": 2.593446969985962,
"reward_std": 0.07469912618398666,
"rewards/accuracy_reward_stage2": 0.5934468507766724,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.5625,
"step": 663
},
{
"completion_length": 62.421875,
"epoch": 0.16403162055335968,
"grad_norm": 3.313859207150031,
"kl": 0.19921875,
"learning_rate": 8.362154150197628e-07,
"loss": 0.008,
"reward": 2.719926357269287,
"reward_std": 0.06940320134162903,
"rewards/accuracy_reward_stage2": 0.7199262380599976,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.265625,
"step": 664
},
{
"completion_length": 96.4375,
"epoch": 0.1642786561264822,
"grad_norm": 2.5612629248896943,
"kl": 0.1484375,
"learning_rate": 8.359683794466402e-07,
"loss": 0.006,
"reward": 2.539092540740967,
"reward_std": 0.16509577631950378,
"rewards/accuracy_reward_stage2": 0.5609675049781799,
"rewards/format_reward_all_stage": 1.978124976158142,
"scores/refine_times": 1.5,
"step": 665
},
{
"completion_length": 92.9375,
"epoch": 0.16452569169960474,
"grad_norm": 4.211580308464546,
"kl": 0.2255859375,
"learning_rate": 8.357213438735178e-07,
"loss": 0.009,
"reward": 2.471635580062866,
"reward_std": 0.37825992703437805,
"rewards/accuracy_reward_stage2": 0.700802206993103,
"rewards/format_reward_all_stage": 1.7708333730697632,
"scores/refine_times": 1.390625,
"step": 666
},
{
"completion_length": 82.265625,
"epoch": 0.16477272727272727,
"grad_norm": 3.6219249232882227,
"kl": 0.2490234375,
"learning_rate": 8.354743083003952e-07,
"loss": 0.01,
"reward": 2.845177173614502,
"reward_std": 0.11333870142698288,
"rewards/accuracy_reward_stage2": 0.8451772332191467,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.515625,
"step": 667
},
{
"completion_length": 75.5,
"epoch": 0.1650197628458498,
"grad_norm": 2.3153256870816588,
"kl": 0.2080078125,
"learning_rate": 8.352272727272727e-07,
"loss": 0.0083,
"reward": 2.7491612434387207,
"reward_std": 0.14487169682979584,
"rewards/accuracy_reward_stage2": 0.7804111838340759,
"rewards/format_reward_all_stage": 1.96875,
"scores/refine_times": 1.46875,
"step": 668
},
{
"completion_length": 92.0625,
"epoch": 0.16526679841897232,
"grad_norm": 2.3136812707049246,
"kl": 0.169921875,
"learning_rate": 8.349802371541501e-07,
"loss": 0.0068,
"reward": 2.8566527366638184,
"reward_std": 0.01673370786011219,
"rewards/accuracy_reward_stage2": 0.856652557849884,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.640625,
"step": 669
},
{
"completion_length": 81.703125,
"epoch": 0.16551383399209485,
"grad_norm": 2.4637521602454946,
"kl": 0.1640625,
"learning_rate": 8.347332015810276e-07,
"loss": 0.0066,
"reward": 2.7337234020233154,
"reward_std": 0.08656609803438187,
"rewards/accuracy_reward_stage2": 0.749348521232605,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.296875,
"step": 670
},
{
"completion_length": 72.140625,
"epoch": 0.16576086956521738,
"grad_norm": 4.509112434753636,
"kl": 0.203125,
"learning_rate": 8.344861660079052e-07,
"loss": 0.0081,
"reward": 2.5170774459838867,
"reward_std": 0.1543843299150467,
"rewards/accuracy_reward_stage2": 0.6420773267745972,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.390625,
"step": 671
},
{
"completion_length": 70.6875,
"epoch": 0.16600790513833993,
"grad_norm": 3.6089679100156515,
"kl": 0.310546875,
"learning_rate": 8.342391304347826e-07,
"loss": 0.0124,
"reward": 2.774411916732788,
"reward_std": 0.16703827679157257,
"rewards/accuracy_reward_stage2": 0.8994120359420776,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.3125,
"step": 672
},
{
"completion_length": 76.796875,
"epoch": 0.16625494071146246,
"grad_norm": 3.7602906391473985,
"kl": 0.220703125,
"learning_rate": 8.3399209486166e-07,
"loss": 0.0088,
"reward": 2.7770447731018066,
"reward_std": 0.1483728587627411,
"rewards/accuracy_reward_stage2": 0.8395448923110962,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.515625,
"step": 673
},
{
"completion_length": 105.421875,
"epoch": 0.166501976284585,
"grad_norm": 5.874770192574941,
"kl": 0.263671875,
"learning_rate": 8.337450592885376e-07,
"loss": 0.0105,
"reward": 2.7320079803466797,
"reward_std": 0.22740009427070618,
"rewards/accuracy_reward_stage2": 0.8517996072769165,
"rewards/format_reward_all_stage": 1.8802083730697632,
"scores/refine_times": 1.59375,
"step": 674
},
{
"completion_length": 55.96875,
"epoch": 0.16674901185770752,
"grad_norm": 6.3659434646120445,
"kl": 0.271484375,
"learning_rate": 8.33498023715415e-07,
"loss": 0.0108,
"reward": 2.706432580947876,
"reward_std": 0.12657755613327026,
"rewards/accuracy_reward_stage2": 0.7064326405525208,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 675
},
{
"completion_length": 62.515625,
"epoch": 0.16699604743083005,
"grad_norm": 3.602748165149088,
"kl": 0.2138671875,
"learning_rate": 8.332509881422924e-07,
"loss": 0.0086,
"reward": 2.6532373428344727,
"reward_std": 0.12004198879003525,
"rewards/accuracy_reward_stage2": 0.6532373428344727,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.203125,
"step": 676
},
{
"completion_length": 70.375,
"epoch": 0.16724308300395258,
"grad_norm": 3.767716762503837,
"kl": 0.1865234375,
"learning_rate": 8.330039525691699e-07,
"loss": 0.0075,
"reward": 2.7938568592071533,
"reward_std": 0.10841001570224762,
"rewards/accuracy_reward_stage2": 0.8094819784164429,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.171875,
"step": 677
},
{
"completion_length": 67.5,
"epoch": 0.1674901185770751,
"grad_norm": 5.950488721235662,
"kl": 0.19921875,
"learning_rate": 8.327569169960474e-07,
"loss": 0.008,
"reward": 2.2359559535980225,
"reward_std": 0.1503939926624298,
"rewards/accuracy_reward_stage2": 0.23595598340034485,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.359375,
"step": 678
},
{
"completion_length": 60.78125,
"epoch": 0.16773715415019763,
"grad_norm": 6.45907229221516,
"kl": 0.4375,
"learning_rate": 8.325098814229248e-07,
"loss": 0.0176,
"reward": 2.4382810592651367,
"reward_std": 0.24932968616485596,
"rewards/accuracy_reward_stage2": 0.5736978054046631,
"rewards/format_reward_all_stage": 1.8645833730697632,
"scores/refine_times": 1.375,
"step": 679
},
{
"completion_length": 70.515625,
"epoch": 0.16798418972332016,
"grad_norm": 3.8960198945184024,
"kl": 0.185546875,
"learning_rate": 8.322628458498023e-07,
"loss": 0.0074,
"reward": 2.7114639282226562,
"reward_std": 0.10305608808994293,
"rewards/accuracy_reward_stage2": 0.7114640474319458,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.4375,
"step": 680
},
{
"completion_length": 62.25,
"epoch": 0.1682312252964427,
"grad_norm": 2.3643728630811998,
"kl": 0.173828125,
"learning_rate": 8.320158102766798e-07,
"loss": 0.0069,
"reward": 2.6612939834594727,
"reward_std": 0.00641861604526639,
"rewards/accuracy_reward_stage2": 0.6612938642501831,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.203125,
"step": 681
},
{
"completion_length": 50.421875,
"epoch": 0.16847826086956522,
"grad_norm": 4.307232779181777,
"kl": 0.1630859375,
"learning_rate": 8.317687747035574e-07,
"loss": 0.0065,
"reward": 2.8651654720306396,
"reward_std": 0.07820230722427368,
"rewards/accuracy_reward_stage2": 0.8651654124259949,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.09375,
"step": 682
},
{
"completion_length": 62.4375,
"epoch": 0.16872529644268774,
"grad_norm": 4.587939777537952,
"kl": 0.2119140625,
"learning_rate": 8.315217391304348e-07,
"loss": 0.0085,
"reward": 2.7178468704223633,
"reward_std": 0.04774696007370949,
"rewards/accuracy_reward_stage2": 0.7178468704223633,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.265625,
"step": 683
},
{
"completion_length": 55.09375,
"epoch": 0.16897233201581027,
"grad_norm": 1.624931490884714,
"kl": 0.09228515625,
"learning_rate": 8.312747035573122e-07,
"loss": 0.0037,
"reward": 2.6188762187957764,
"reward_std": 0.0659729540348053,
"rewards/accuracy_reward_stage2": 0.6813762784004211,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.1875,
"step": 684
},
{
"completion_length": 44.921875,
"epoch": 0.1692193675889328,
"grad_norm": 2.746456916397565,
"kl": 0.177734375,
"learning_rate": 8.310276679841897e-07,
"loss": 0.0071,
"reward": 2.862575054168701,
"reward_std": 0.07828960567712784,
"rewards/accuracy_reward_stage2": 0.8781998753547668,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.140625,
"step": 685
},
{
"completion_length": 59.140625,
"epoch": 0.16946640316205533,
"grad_norm": 3.164878176588737,
"kl": 0.16015625,
"learning_rate": 8.307806324110671e-07,
"loss": 0.0064,
"reward": 2.7457258701324463,
"reward_std": 0.08368375897407532,
"rewards/accuracy_reward_stage2": 0.7613507509231567,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.203125,
"step": 686
},
{
"completion_length": 60.09375,
"epoch": 0.16971343873517786,
"grad_norm": 4.429099779687638,
"kl": 0.1953125,
"learning_rate": 8.305335968379446e-07,
"loss": 0.0078,
"reward": 2.592247486114502,
"reward_std": 0.12885862588882446,
"rewards/accuracy_reward_stage2": 0.6078723669052124,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.21875,
"step": 687
},
{
"completion_length": 53.765625,
"epoch": 0.16996047430830039,
"grad_norm": 5.471292386957025,
"kl": 0.173828125,
"learning_rate": 8.302865612648221e-07,
"loss": 0.007,
"reward": 2.387572765350342,
"reward_std": 0.07770496606826782,
"rewards/accuracy_reward_stage2": 0.3875727653503418,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.171875,
"step": 688
},
{
"completion_length": 48.375,
"epoch": 0.1702075098814229,
"grad_norm": 5.2370692806906325,
"kl": 0.1416015625,
"learning_rate": 8.300395256916995e-07,
"loss": 0.0057,
"reward": 2.590374231338501,
"reward_std": 0.030147546902298927,
"rewards/accuracy_reward_stage2": 0.5903741717338562,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 689
},
{
"completion_length": 68.265625,
"epoch": 0.17045454545454544,
"grad_norm": 3.3215452215659234,
"kl": 0.1416015625,
"learning_rate": 8.29792490118577e-07,
"loss": 0.0057,
"reward": 2.662672281265259,
"reward_std": 0.12791702151298523,
"rewards/accuracy_reward_stage2": 0.7303804159164429,
"rewards/format_reward_all_stage": 1.9322917461395264,
"scores/refine_times": 1.28125,
"step": 690
},
{
"completion_length": 72.71875,
"epoch": 0.170701581027668,
"grad_norm": 4.221926503688189,
"kl": 0.1806640625,
"learning_rate": 8.295454545454546e-07,
"loss": 0.0072,
"reward": 2.465261697769165,
"reward_std": 0.19621172547340393,
"rewards/accuracy_reward_stage2": 0.6475533246994019,
"rewards/format_reward_all_stage": 1.8177083730697632,
"scores/refine_times": 1.40625,
"step": 691
},
{
"completion_length": 58.15625,
"epoch": 0.17094861660079053,
"grad_norm": 4.999034875079111,
"kl": 0.1943359375,
"learning_rate": 8.29298418972332e-07,
"loss": 0.0078,
"reward": 2.3911185264587402,
"reward_std": 0.3185882568359375,
"rewards/accuracy_reward_stage2": 0.6567436456680298,
"rewards/format_reward_all_stage": 1.734375,
"scores/refine_times": 1.125,
"step": 692
},
{
"completion_length": 49.921875,
"epoch": 0.17119565217391305,
"grad_norm": 3.6830696834703565,
"kl": 0.2421875,
"learning_rate": 8.290513833992095e-07,
"loss": 0.0097,
"reward": 2.8560991287231445,
"reward_std": 0.08281519263982773,
"rewards/accuracy_reward_stage2": 0.856099009513855,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 693
},
{
"completion_length": 34.125,
"epoch": 0.17144268774703558,
"grad_norm": 0.9783630797309559,
"kl": 0.2275390625,
"learning_rate": 8.288043478260869e-07,
"loss": 0.0091,
"reward": 2.609375,
"reward_std": 0.04419417306780815,
"rewards/accuracy_reward_stage2": 0.734375,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.125,
"step": 694
},
{
"completion_length": 69.125,
"epoch": 0.1716897233201581,
"grad_norm": 3.0218653160299795,
"kl": 0.173828125,
"learning_rate": 8.285573122529644e-07,
"loss": 0.007,
"reward": 2.6367897987365723,
"reward_std": 0.12302423268556595,
"rewards/accuracy_reward_stage2": 0.7748106718063354,
"rewards/format_reward_all_stage": 1.8619791269302368,
"scores/refine_times": 1.265625,
"step": 695
},
{
"completion_length": 54.234375,
"epoch": 0.17193675889328064,
"grad_norm": 4.184913189641024,
"kl": 0.248046875,
"learning_rate": 8.283102766798419e-07,
"loss": 0.0099,
"reward": 2.395554542541504,
"reward_std": 0.39820319414138794,
"rewards/accuracy_reward_stage2": 0.6611795425415039,
"rewards/format_reward_all_stage": 1.734375,
"scores/refine_times": 1.203125,
"step": 696
},
{
"completion_length": 43.5,
"epoch": 0.17218379446640317,
"grad_norm": 2.595884108341551,
"kl": 0.248046875,
"learning_rate": 8.280632411067193e-07,
"loss": 0.0099,
"reward": 2.8225650787353516,
"reward_std": 0.17631623148918152,
"rewards/accuracy_reward_stage2": 0.9631900191307068,
"rewards/format_reward_all_stage": 1.859375,
"scores/refine_times": 1.171875,
"step": 697
},
{
"completion_length": 38.953125,
"epoch": 0.1724308300395257,
"grad_norm": 4.022940918000707,
"kl": 0.26171875,
"learning_rate": 8.278162055335967e-07,
"loss": 0.0105,
"reward": 2.7802834510803223,
"reward_std": 0.18306688964366913,
"rewards/accuracy_reward_stage2": 0.9052833318710327,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.125,
"step": 698
},
{
"completion_length": 42.90625,
"epoch": 0.17267786561264822,
"grad_norm": 3.4391934805126767,
"kl": 0.2490234375,
"learning_rate": 8.275691699604744e-07,
"loss": 0.0099,
"reward": 2.6163432598114014,
"reward_std": 0.23740006983280182,
"rewards/accuracy_reward_stage2": 0.6319682002067566,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.125,
"step": 699
},
{
"completion_length": 49.4375,
"epoch": 0.17292490118577075,
"grad_norm": 5.477631698074841,
"kl": 0.34375,
"learning_rate": 8.273221343873518e-07,
"loss": 0.0137,
"reward": 2.540916681289673,
"reward_std": 0.21585610508918762,
"rewards/accuracy_reward_stage2": 0.6034167408943176,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.203125,
"step": 700
},
{
"completion_length": 37.3125,
"epoch": 0.17317193675889328,
"grad_norm": 4.275784599530968,
"kl": 0.1767578125,
"learning_rate": 8.270750988142292e-07,
"loss": 0.0071,
"reward": 2.82668399810791,
"reward_std": 0.12060071527957916,
"rewards/accuracy_reward_stage2": 0.8266839385032654,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 701
},
{
"completion_length": 44.0,
"epoch": 0.1734189723320158,
"grad_norm": 5.929761937498036,
"kl": 0.1982421875,
"learning_rate": 8.268280632411067e-07,
"loss": 0.0079,
"reward": 2.5984463691711426,
"reward_std": 0.10440248996019363,
"rewards/accuracy_reward_stage2": 0.5984464883804321,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 702
},
{
"completion_length": 38.046875,
"epoch": 0.17366600790513834,
"grad_norm": 4.36820409837742,
"kl": 0.2177734375,
"learning_rate": 8.265810276679841e-07,
"loss": 0.0087,
"reward": 2.6032559871673584,
"reward_std": 0.021172545850276947,
"rewards/accuracy_reward_stage2": 0.6032558679580688,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 703
},
{
"completion_length": 29.8125,
"epoch": 0.17391304347826086,
"grad_norm": 4.34358181377926,
"kl": 0.263671875,
"learning_rate": 8.263339920948616e-07,
"loss": 0.0106,
"reward": 2.7768826484680176,
"reward_std": 0.04401562735438347,
"rewards/accuracy_reward_stage2": 0.7768827080726624,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 704
},
{
"completion_length": 39.890625,
"epoch": 0.1741600790513834,
"grad_norm": 3.8777171386458273,
"kl": 0.2314453125,
"learning_rate": 8.260869565217391e-07,
"loss": 0.0093,
"reward": 2.51078200340271,
"reward_std": 0.15417703986167908,
"rewards/accuracy_reward_stage2": 0.5784904360771179,
"rewards/format_reward_all_stage": 1.9322916269302368,
"scores/refine_times": 1.078125,
"step": 705
},
{
"completion_length": 29.34375,
"epoch": 0.17440711462450592,
"grad_norm": 5.0953232936524895,
"kl": 0.296875,
"learning_rate": 8.258399209486165e-07,
"loss": 0.0118,
"reward": 2.5522358417510986,
"reward_std": 0.07304012030363083,
"rewards/accuracy_reward_stage2": 0.5522358417510986,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 706
},
{
"completion_length": 49.625,
"epoch": 0.17465415019762845,
"grad_norm": 2.902160155311922,
"kl": 0.2373046875,
"learning_rate": 8.255928853754939e-07,
"loss": 0.0095,
"reward": 2.645291805267334,
"reward_std": 0.0809149295091629,
"rewards/accuracy_reward_stage2": 0.8327919244766235,
"rewards/format_reward_all_stage": 1.8125,
"scores/refine_times": 1.125,
"step": 707
},
{
"completion_length": 30.375,
"epoch": 0.17490118577075098,
"grad_norm": 3.3841190373178414,
"kl": 0.224609375,
"learning_rate": 8.253458498023716e-07,
"loss": 0.009,
"reward": 2.648444414138794,
"reward_std": 0.0884200856089592,
"rewards/accuracy_reward_stage2": 0.6484442949295044,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 708
},
{
"completion_length": 37.4375,
"epoch": 0.1751482213438735,
"grad_norm": 2.569762486464144,
"kl": 0.22265625,
"learning_rate": 8.25098814229249e-07,
"loss": 0.0089,
"reward": 2.728829860687256,
"reward_std": 0.07126966118812561,
"rewards/accuracy_reward_stage2": 0.7913298606872559,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.0625,
"step": 709
},
{
"completion_length": 29.625,
"epoch": 0.17539525691699603,
"grad_norm": 2.9841615476853205,
"kl": 0.2001953125,
"learning_rate": 8.248517786561265e-07,
"loss": 0.008,
"reward": 2.771129608154297,
"reward_std": 0.01384773664176464,
"rewards/accuracy_reward_stage2": 0.7711294889450073,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 710
},
{
"completion_length": 38.1875,
"epoch": 0.1756422924901186,
"grad_norm": 3.3518342412176336,
"kl": 0.2314453125,
"learning_rate": 8.246047430830039e-07,
"loss": 0.0092,
"reward": 2.5392909049987793,
"reward_std": 0.027141904458403587,
"rewards/accuracy_reward_stage2": 0.5392909049987793,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 711
},
{
"completion_length": 45.421875,
"epoch": 0.17588932806324112,
"grad_norm": 5.272718022007329,
"kl": 0.32421875,
"learning_rate": 8.243577075098814e-07,
"loss": 0.013,
"reward": 2.4325010776519775,
"reward_std": 0.3576674461364746,
"rewards/accuracy_reward_stage2": 0.7450010180473328,
"rewards/format_reward_all_stage": 1.6875,
"scores/refine_times": 1.125,
"step": 712
},
{
"completion_length": 44.828125,
"epoch": 0.17613636363636365,
"grad_norm": 4.860902186643657,
"kl": 0.2353515625,
"learning_rate": 8.241106719367589e-07,
"loss": 0.0094,
"reward": 2.4493470191955566,
"reward_std": 0.18647147715091705,
"rewards/accuracy_reward_stage2": 0.5743468999862671,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.078125,
"step": 713
},
{
"completion_length": 38.359375,
"epoch": 0.17638339920948617,
"grad_norm": 4.4256498100793324,
"kl": 0.23046875,
"learning_rate": 8.238636363636363e-07,
"loss": 0.0093,
"reward": 2.6608829498291016,
"reward_std": 0.08993560075759888,
"rewards/accuracy_reward_stage2": 0.7233830690383911,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.1875,
"step": 714
},
{
"completion_length": 33.0625,
"epoch": 0.1766304347826087,
"grad_norm": 4.296817652715915,
"kl": 0.27734375,
"learning_rate": 8.236166007905137e-07,
"loss": 0.0111,
"reward": 2.7572100162506104,
"reward_std": 0.1819225251674652,
"rewards/accuracy_reward_stage2": 0.8822100162506104,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0,
"step": 715
},
{
"completion_length": 39.65625,
"epoch": 0.17687747035573123,
"grad_norm": 1.246417044381739,
"kl": 0.2236328125,
"learning_rate": 8.233695652173913e-07,
"loss": 0.0089,
"reward": 2.941277265548706,
"reward_std": 0.04274333268404007,
"rewards/accuracy_reward_stage2": 0.956902265548706,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.125,
"step": 716
},
{
"completion_length": 62.546875,
"epoch": 0.17712450592885376,
"grad_norm": 3.592067481060388,
"kl": 0.15234375,
"learning_rate": 8.231225296442687e-07,
"loss": 0.0061,
"reward": 2.607104778289795,
"reward_std": 0.10167138278484344,
"rewards/accuracy_reward_stage2": 0.6696048378944397,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.0625,
"step": 717
},
{
"completion_length": 65.015625,
"epoch": 0.1773715415019763,
"grad_norm": 4.019521038571473,
"kl": 0.158203125,
"learning_rate": 8.228754940711462e-07,
"loss": 0.0063,
"reward": 2.6828722953796387,
"reward_std": 0.044261686503887177,
"rewards/accuracy_reward_stage2": 0.6828722357749939,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.15625,
"step": 718
},
{
"completion_length": 45.25,
"epoch": 0.17761857707509882,
"grad_norm": 6.127963836128111,
"kl": 0.17578125,
"learning_rate": 8.226284584980237e-07,
"loss": 0.007,
"reward": 2.5614614486694336,
"reward_std": 0.16553114354610443,
"rewards/accuracy_reward_stage2": 0.6239614486694336,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.0625,
"step": 719
},
{
"completion_length": 62.0625,
"epoch": 0.17786561264822134,
"grad_norm": 5.5689391101787225,
"kl": 0.162109375,
"learning_rate": 8.223814229249012e-07,
"loss": 0.0065,
"reward": 2.605252265930176,
"reward_std": 0.15683504939079285,
"rewards/accuracy_reward_stage2": 0.6677523255348206,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.21875,
"step": 720
},
{
"completion_length": 53.53125,
"epoch": 0.17811264822134387,
"grad_norm": 3.7608697254725154,
"kl": 0.1787109375,
"learning_rate": 8.221343873517787e-07,
"loss": 0.0071,
"reward": 2.357903480529785,
"reward_std": 0.037971869111061096,
"rewards/accuracy_reward_stage2": 0.35790371894836426,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 721
},
{
"completion_length": 60.46875,
"epoch": 0.1783596837944664,
"grad_norm": 2.9452310520498797,
"kl": 0.11962890625,
"learning_rate": 8.218873517786561e-07,
"loss": 0.0048,
"reward": 2.753098964691162,
"reward_std": 0.056902870535850525,
"rewards/accuracy_reward_stage2": 0.7530988454818726,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 722
},
{
"completion_length": 69.234375,
"epoch": 0.17860671936758893,
"grad_norm": 1.1382233123873722,
"kl": 0.10400390625,
"learning_rate": 8.216403162055335e-07,
"loss": 0.0042,
"reward": 2.576420545578003,
"reward_std": 0.004017648287117481,
"rewards/accuracy_reward_stage2": 0.5764204859733582,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 723
},
{
"completion_length": 75.015625,
"epoch": 0.17885375494071146,
"grad_norm": 2.173861915208193,
"kl": 0.1181640625,
"learning_rate": 8.21393280632411e-07,
"loss": 0.0047,
"reward": 2.8701119422912598,
"reward_std": 0.03073050081729889,
"rewards/accuracy_reward_stage2": 0.8701118230819702,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.28125,
"step": 724
},
{
"completion_length": 102.875,
"epoch": 0.17910079051383399,
"grad_norm": 2.5923588665830204,
"kl": 0.11181640625,
"learning_rate": 8.211462450592885e-07,
"loss": 0.0045,
"reward": 2.534066677093506,
"reward_std": 0.11862494796514511,
"rewards/accuracy_reward_stage2": 0.6069832444190979,
"rewards/format_reward_all_stage": 1.9270833730697632,
"scores/refine_times": 1.4375,
"step": 725
},
{
"completion_length": 80.375,
"epoch": 0.1793478260869565,
"grad_norm": 4.406094013771105,
"kl": 0.09521484375,
"learning_rate": 8.208992094861659e-07,
"loss": 0.0038,
"reward": 2.7627735137939453,
"reward_std": 0.05562632530927658,
"rewards/accuracy_reward_stage2": 0.7627733945846558,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 726
},
{
"completion_length": 78.71875,
"epoch": 0.17959486166007904,
"grad_norm": 4.420639598786895,
"kl": 0.13671875,
"learning_rate": 8.206521739130435e-07,
"loss": 0.0055,
"reward": 2.427016258239746,
"reward_std": 0.20195448398590088,
"rewards/accuracy_reward_stage2": 0.5520162582397461,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0625,
"step": 727
},
{
"completion_length": 102.9375,
"epoch": 0.17984189723320157,
"grad_norm": 3.7517930348608077,
"kl": 0.1044921875,
"learning_rate": 8.204051383399209e-07,
"loss": 0.0042,
"reward": 2.55942440032959,
"reward_std": 0.2075508087873459,
"rewards/accuracy_reward_stage2": 0.6844244003295898,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.140625,
"step": 728
},
{
"completion_length": 87.328125,
"epoch": 0.1800889328063241,
"grad_norm": 4.610010902240235,
"kl": 0.134765625,
"learning_rate": 8.201581027667984e-07,
"loss": 0.0054,
"reward": 2.5851545333862305,
"reward_std": 0.07183162122964859,
"rewards/accuracy_reward_stage2": 0.6476545333862305,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.1875,
"step": 729
},
{
"completion_length": 93.359375,
"epoch": 0.18033596837944665,
"grad_norm": 3.6922330703671884,
"kl": 0.0859375,
"learning_rate": 8.199110671936759e-07,
"loss": 0.0034,
"reward": 2.8179094791412354,
"reward_std": 0.018987158313393593,
"rewards/accuracy_reward_stage2": 0.8179094195365906,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 730
},
{
"completion_length": 87.75,
"epoch": 0.18058300395256918,
"grad_norm": 2.7718550607441177,
"kl": 0.08251953125,
"learning_rate": 8.196640316205533e-07,
"loss": 0.0033,
"reward": 2.840196371078491,
"reward_std": 0.13527555763721466,
"rewards/accuracy_reward_stage2": 0.9026963114738464,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.25,
"step": 731
},
{
"completion_length": 75.296875,
"epoch": 0.1808300395256917,
"grad_norm": 4.568714612629976,
"kl": 0.1865234375,
"learning_rate": 8.194169960474307e-07,
"loss": 0.0075,
"reward": 2.598619222640991,
"reward_std": 0.05828527733683586,
"rewards/accuracy_reward_stage2": 0.5986192226409912,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 732
},
{
"completion_length": 82.625,
"epoch": 0.18107707509881424,
"grad_norm": 3.3078917095674036,
"kl": 0.099609375,
"learning_rate": 8.191699604743083e-07,
"loss": 0.004,
"reward": 2.803985118865967,
"reward_std": 0.041249554604291916,
"rewards/accuracy_reward_stage2": 0.8039852380752563,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 733
},
{
"completion_length": 62.75,
"epoch": 0.18132411067193677,
"grad_norm": 4.665197863030277,
"kl": 0.1328125,
"learning_rate": 8.189229249011857e-07,
"loss": 0.0053,
"reward": 2.7739241123199463,
"reward_std": 0.09368322789669037,
"rewards/accuracy_reward_stage2": 0.7739241123199463,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 734
},
{
"completion_length": 87.03125,
"epoch": 0.1815711462450593,
"grad_norm": 5.230207042065446,
"kl": 0.1376953125,
"learning_rate": 8.186758893280632e-07,
"loss": 0.0055,
"reward": 2.6669585704803467,
"reward_std": 0.13236752152442932,
"rewards/accuracy_reward_stage2": 0.6669585704803467,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 735
},
{
"completion_length": 79.671875,
"epoch": 0.18181818181818182,
"grad_norm": 5.098012008996088,
"kl": 0.10400390625,
"learning_rate": 8.184288537549407e-07,
"loss": 0.0042,
"reward": 2.605584144592285,
"reward_std": 0.19998475909233093,
"rewards/accuracy_reward_stage2": 0.6055843830108643,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 736
},
{
"completion_length": 87.796875,
"epoch": 0.18206521739130435,
"grad_norm": 3.218557975199275,
"kl": 0.0927734375,
"learning_rate": 8.181818181818182e-07,
"loss": 0.0037,
"reward": 2.5339651107788086,
"reward_std": 0.09329190850257874,
"rewards/accuracy_reward_stage2": 0.6016733646392822,
"rewards/format_reward_all_stage": 1.9322917461395264,
"scores/refine_times": 1.140625,
"step": 737
},
{
"completion_length": 91.671875,
"epoch": 0.18231225296442688,
"grad_norm": 2.9728018450072504,
"kl": 0.1220703125,
"learning_rate": 8.179347826086957e-07,
"loss": 0.0049,
"reward": 2.5654635429382324,
"reward_std": 0.07684879750013351,
"rewards/accuracy_reward_stage2": 0.5654636025428772,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.3125,
"step": 738
},
{
"completion_length": 76.875,
"epoch": 0.1825592885375494,
"grad_norm": 4.098914602594136,
"kl": 0.1220703125,
"learning_rate": 8.176877470355731e-07,
"loss": 0.0049,
"reward": 2.6792609691619873,
"reward_std": 0.1466922014951706,
"rewards/accuracy_reward_stage2": 0.8042609691619873,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.0625,
"step": 739
},
{
"completion_length": 58.1875,
"epoch": 0.18280632411067194,
"grad_norm": 0.34555494426893213,
"kl": 0.1044921875,
"learning_rate": 8.174407114624505e-07,
"loss": 0.0042,
"reward": 2.6822760105133057,
"reward_std": 0.0,
"rewards/accuracy_reward_stage2": 0.6822760701179504,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 740
},
{
"completion_length": 71.609375,
"epoch": 0.18305335968379446,
"grad_norm": 4.662745533117084,
"kl": 0.119140625,
"learning_rate": 8.17193675889328e-07,
"loss": 0.0048,
"reward": 2.5369908809661865,
"reward_std": 0.11054471135139465,
"rewards/accuracy_reward_stage2": 0.5369909405708313,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 741
},
{
"completion_length": 69.75,
"epoch": 0.183300395256917,
"grad_norm": 2.243839387959608,
"kl": 0.11767578125,
"learning_rate": 8.169466403162055e-07,
"loss": 0.0047,
"reward": 2.672517776489258,
"reward_std": 0.029596952721476555,
"rewards/accuracy_reward_stage2": 0.6725177764892578,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 742
},
{
"completion_length": 83.4375,
"epoch": 0.18354743083003952,
"grad_norm": 3.3308848465244423,
"kl": 0.091796875,
"learning_rate": 8.166996047430829e-07,
"loss": 0.0037,
"reward": 2.738541603088379,
"reward_std": 0.1675356924533844,
"rewards/accuracy_reward_stage2": 0.7385417222976685,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.203125,
"step": 743
},
{
"completion_length": 80.859375,
"epoch": 0.18379446640316205,
"grad_norm": 4.228437836918937,
"kl": 0.13671875,
"learning_rate": 8.164525691699604e-07,
"loss": 0.0055,
"reward": 2.600447654724121,
"reward_std": 0.27785760164260864,
"rewards/accuracy_reward_stage2": 0.8035725355148315,
"rewards/format_reward_all_stage": 1.796875,
"scores/refine_times": 1.296875,
"step": 744
},
{
"completion_length": 77.90625,
"epoch": 0.18404150197628458,
"grad_norm": 2.75672529825747,
"kl": 0.0908203125,
"learning_rate": 8.162055335968378e-07,
"loss": 0.0036,
"reward": 2.814410448074341,
"reward_std": 0.03253195434808731,
"rewards/accuracy_reward_stage2": 0.8144104480743408,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 745
},
{
"completion_length": 64.78125,
"epoch": 0.1842885375494071,
"grad_norm": 2.6445111122830727,
"kl": 0.09716796875,
"learning_rate": 8.159584980237155e-07,
"loss": 0.0039,
"reward": 2.8770241737365723,
"reward_std": 0.07037458568811417,
"rewards/accuracy_reward_stage2": 0.9395240545272827,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.0625,
"step": 746
},
{
"completion_length": 69.5,
"epoch": 0.18453557312252963,
"grad_norm": 2.5683252121883693,
"kl": 0.1220703125,
"learning_rate": 8.157114624505929e-07,
"loss": 0.0049,
"reward": 2.676783561706543,
"reward_std": 0.033407654613256454,
"rewards/accuracy_reward_stage2": 0.6767836809158325,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 747
},
{
"completion_length": 75.671875,
"epoch": 0.18478260869565216,
"grad_norm": 1.7986182845085459,
"kl": 0.10986328125,
"learning_rate": 8.154644268774703e-07,
"loss": 0.0044,
"reward": 2.851139783859253,
"reward_std": 0.00538706174120307,
"rewards/accuracy_reward_stage2": 0.8511397838592529,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 748
},
{
"completion_length": 70.0,
"epoch": 0.1850296442687747,
"grad_norm": 5.4302296395693235,
"kl": 0.12158203125,
"learning_rate": 8.152173913043478e-07,
"loss": 0.0049,
"reward": 2.357592821121216,
"reward_std": 0.05461695045232773,
"rewards/accuracy_reward_stage2": 0.3575928211212158,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 749
},
{
"completion_length": 80.65625,
"epoch": 0.18527667984189725,
"grad_norm": 5.106369592091597,
"kl": 0.2060546875,
"learning_rate": 8.149703557312253e-07,
"loss": 0.0082,
"reward": 2.7984001636505127,
"reward_std": 0.09079733490943909,
"rewards/accuracy_reward_stage2": 0.7984002828598022,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 750
},
{
"completion_length": 95.71875,
"epoch": 0.18552371541501977,
"grad_norm": 3.9061202300388365,
"kl": 0.09033203125,
"learning_rate": 8.147233201581027e-07,
"loss": 0.0036,
"reward": 2.681049108505249,
"reward_std": 0.08273804187774658,
"rewards/accuracy_reward_stage2": 0.7018824219703674,
"rewards/format_reward_all_stage": 1.9791667461395264,
"scores/refine_times": 1.234375,
"step": 751
},
{
"completion_length": 87.859375,
"epoch": 0.1857707509881423,
"grad_norm": 4.0976955965839785,
"kl": 0.11181640625,
"learning_rate": 8.144762845849802e-07,
"loss": 0.0045,
"reward": 2.7674148082733154,
"reward_std": 0.1868867129087448,
"rewards/accuracy_reward_stage2": 0.7778315544128418,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.140625,
"step": 752
},
{
"completion_length": 88.75,
"epoch": 0.18601778656126483,
"grad_norm": 4.555919227092463,
"kl": 0.10888671875,
"learning_rate": 8.142292490118576e-07,
"loss": 0.0044,
"reward": 2.463618278503418,
"reward_std": 0.16842269897460938,
"rewards/accuracy_reward_stage2": 0.5886183977127075,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.234375,
"step": 753
},
{
"completion_length": 84.171875,
"epoch": 0.18626482213438736,
"grad_norm": 5.010271159735403,
"kl": 0.125,
"learning_rate": 8.139822134387351e-07,
"loss": 0.005,
"reward": 2.6570065021514893,
"reward_std": 0.05975431948900223,
"rewards/accuracy_reward_stage2": 0.657006561756134,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.203125,
"step": 754
},
{
"completion_length": 88.21875,
"epoch": 0.1865118577075099,
"grad_norm": 5.939693635822729,
"kl": 0.1064453125,
"learning_rate": 8.137351778656127e-07,
"loss": 0.0043,
"reward": 2.694657325744629,
"reward_std": 0.11330029368400574,
"rewards/accuracy_reward_stage2": 0.6946573257446289,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.25,
"step": 755
},
{
"completion_length": 90.359375,
"epoch": 0.18675889328063242,
"grad_norm": 3.393811545438695,
"kl": 0.130859375,
"learning_rate": 8.134881422924901e-07,
"loss": 0.0052,
"reward": 2.7568604946136475,
"reward_std": 0.1113467812538147,
"rewards/accuracy_reward_stage2": 0.819360613822937,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.265625,
"step": 756
},
{
"completion_length": 62.125,
"epoch": 0.18700592885375494,
"grad_norm": 3.0805751298380057,
"kl": 0.11865234375,
"learning_rate": 8.132411067193675e-07,
"loss": 0.0047,
"reward": 2.7386887073516846,
"reward_std": 0.0412348210811615,
"rewards/accuracy_reward_stage2": 0.7386887073516846,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 757
},
{
"completion_length": 81.953125,
"epoch": 0.18725296442687747,
"grad_norm": 3.183809799693692,
"kl": 0.1748046875,
"learning_rate": 8.129940711462451e-07,
"loss": 0.007,
"reward": 2.7576744556427,
"reward_std": 0.07029517740011215,
"rewards/accuracy_reward_stage2": 0.8149662017822266,
"rewards/format_reward_all_stage": 1.9427083730697632,
"scores/refine_times": 1.25,
"step": 758
},
{
"completion_length": 59.0625,
"epoch": 0.1875,
"grad_norm": 2.2324711430978,
"kl": 0.11083984375,
"learning_rate": 8.127470355731225e-07,
"loss": 0.0044,
"reward": 2.9286131858825684,
"reward_std": 0.00950054731220007,
"rewards/accuracy_reward_stage2": 0.9286130666732788,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 759
},
{
"completion_length": 76.4375,
"epoch": 0.18774703557312253,
"grad_norm": 5.275881267988074,
"kl": 0.1455078125,
"learning_rate": 8.125e-07,
"loss": 0.0058,
"reward": 2.655832290649414,
"reward_std": 0.12385143339633942,
"rewards/accuracy_reward_stage2": 0.6558322906494141,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.171875,
"step": 760
},
{
"completion_length": 72.859375,
"epoch": 0.18799407114624506,
"grad_norm": 2.148609964785086,
"kl": 0.1044921875,
"learning_rate": 8.122529644268774e-07,
"loss": 0.0042,
"reward": 2.609675884246826,
"reward_std": 0.002886358881369233,
"rewards/accuracy_reward_stage2": 0.6096760034561157,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 761
},
{
"completion_length": 73.375,
"epoch": 0.18824110671936758,
"grad_norm": 3.404198278689953,
"kl": 0.11962890625,
"learning_rate": 8.120059288537548e-07,
"loss": 0.0048,
"reward": 2.573883056640625,
"reward_std": 0.015004590153694153,
"rewards/accuracy_reward_stage2": 0.5738831758499146,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 762
},
{
"completion_length": 70.546875,
"epoch": 0.1884881422924901,
"grad_norm": 4.508482517354418,
"kl": 0.11279296875,
"learning_rate": 8.117588932806324e-07,
"loss": 0.0045,
"reward": 2.5378634929656982,
"reward_std": 0.10779360681772232,
"rewards/accuracy_reward_stage2": 0.5378634929656982,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.09375,
"step": 763
},
{
"completion_length": 84.3125,
"epoch": 0.18873517786561264,
"grad_norm": 3.1922499449312167,
"kl": 0.11669921875,
"learning_rate": 8.115118577075099e-07,
"loss": 0.0047,
"reward": 2.71618914604187,
"reward_std": 0.14423049986362457,
"rewards/accuracy_reward_stage2": 0.7422308325767517,
"rewards/format_reward_all_stage": 1.9739583730697632,
"scores/refine_times": 1.34375,
"step": 764
},
{
"completion_length": 94.890625,
"epoch": 0.18898221343873517,
"grad_norm": 2.7685322764181715,
"kl": 0.1953125,
"learning_rate": 8.112648221343873e-07,
"loss": 0.0078,
"reward": 2.739062547683716,
"reward_std": 0.11331679672002792,
"rewards/accuracy_reward_stage2": 0.7390625476837158,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.203125,
"step": 765
},
{
"completion_length": 77.0625,
"epoch": 0.1892292490118577,
"grad_norm": 3.9402093829627516,
"kl": 0.10546875,
"learning_rate": 8.110177865612648e-07,
"loss": 0.0042,
"reward": 2.555873155593872,
"reward_std": 0.16401194036006927,
"rewards/accuracy_reward_stage2": 0.6131649017333984,
"rewards/format_reward_all_stage": 1.9427083730697632,
"scores/refine_times": 1.15625,
"step": 766
},
{
"completion_length": 62.984375,
"epoch": 0.18947628458498023,
"grad_norm": 3.3270805279161335,
"kl": 0.1064453125,
"learning_rate": 8.107707509881423e-07,
"loss": 0.0043,
"reward": 2.593876361846924,
"reward_std": 0.14388912916183472,
"rewards/accuracy_reward_stage2": 0.7188762426376343,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.125,
"step": 767
},
{
"completion_length": 63.375,
"epoch": 0.18972332015810275,
"grad_norm": 4.575039803879313,
"kl": 0.12451171875,
"learning_rate": 8.105237154150197e-07,
"loss": 0.005,
"reward": 2.676608085632324,
"reward_std": 0.06587755680084229,
"rewards/accuracy_reward_stage2": 0.6766082048416138,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 768
},
{
"completion_length": 80.796875,
"epoch": 0.1899703557312253,
"grad_norm": 4.913789346219929,
"kl": 0.11181640625,
"learning_rate": 8.102766798418972e-07,
"loss": 0.0045,
"reward": 2.4672067165374756,
"reward_std": 0.14596593379974365,
"rewards/accuracy_reward_stage2": 0.5922067165374756,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.140625,
"step": 769
},
{
"completion_length": 64.25,
"epoch": 0.19021739130434784,
"grad_norm": 4.260844595348638,
"kl": 0.1357421875,
"learning_rate": 8.100296442687746e-07,
"loss": 0.0054,
"reward": 2.771862030029297,
"reward_std": 0.03155703470110893,
"rewards/accuracy_reward_stage2": 0.7718619108200073,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 770
},
{
"completion_length": 72.265625,
"epoch": 0.19046442687747037,
"grad_norm": 4.247208948225819,
"kl": 0.11181640625,
"learning_rate": 8.097826086956521e-07,
"loss": 0.0045,
"reward": 2.562706708908081,
"reward_std": 0.1308896243572235,
"rewards/accuracy_reward_stage2": 0.5627066493034363,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 771
},
{
"completion_length": 81.90625,
"epoch": 0.1907114624505929,
"grad_norm": 5.532229311907246,
"kl": 0.150390625,
"learning_rate": 8.095355731225296e-07,
"loss": 0.006,
"reward": 2.4436357021331787,
"reward_std": 0.2998715341091156,
"rewards/accuracy_reward_stage2": 0.6936356425285339,
"rewards/format_reward_all_stage": 1.75,
"scores/refine_times": 1.140625,
"step": 772
},
{
"completion_length": 96.015625,
"epoch": 0.19095849802371542,
"grad_norm": 3.8686169326260247,
"kl": 0.1005859375,
"learning_rate": 8.092885375494071e-07,
"loss": 0.004,
"reward": 2.8755273818969727,
"reward_std": 0.06159983575344086,
"rewards/accuracy_reward_stage2": 0.8859438896179199,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.4375,
"step": 773
},
{
"completion_length": 74.765625,
"epoch": 0.19120553359683795,
"grad_norm": 4.19101339210458,
"kl": 0.107421875,
"learning_rate": 8.090415019762846e-07,
"loss": 0.0043,
"reward": 2.772860527038574,
"reward_std": 0.14349845051765442,
"rewards/accuracy_reward_stage2": 0.772860586643219,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 774
},
{
"completion_length": 87.46875,
"epoch": 0.19145256916996048,
"grad_norm": 2.3971160307653996,
"kl": 0.11181640625,
"learning_rate": 8.087944664031621e-07,
"loss": 0.0045,
"reward": 2.9060990810394287,
"reward_std": 0.0646752119064331,
"rewards/accuracy_reward_stage2": 0.9165157079696655,
"rewards/format_reward_all_stage": 1.9895832538604736,
"scores/refine_times": 1.359375,
"step": 775
},
{
"completion_length": 81.578125,
"epoch": 0.191699604743083,
"grad_norm": 1.6984897790582976,
"kl": 0.10400390625,
"learning_rate": 8.085474308300395e-07,
"loss": 0.0042,
"reward": 2.6885228157043457,
"reward_std": 0.06918665766716003,
"rewards/accuracy_reward_stage2": 0.7432103157043457,
"rewards/format_reward_all_stage": 1.9453125,
"scores/refine_times": 1.359375,
"step": 776
},
{
"completion_length": 66.953125,
"epoch": 0.19194664031620554,
"grad_norm": 4.087458802427488,
"kl": 0.09814453125,
"learning_rate": 8.08300395256917e-07,
"loss": 0.0039,
"reward": 2.8248817920684814,
"reward_std": 0.12380017340183258,
"rewards/accuracy_reward_stage2": 0.8248817920684814,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.296875,
"step": 777
},
{
"completion_length": 88.25,
"epoch": 0.19219367588932806,
"grad_norm": 2.418293642121235,
"kl": 0.1201171875,
"learning_rate": 8.080533596837944e-07,
"loss": 0.0048,
"reward": 2.590397357940674,
"reward_std": 0.12917400896549225,
"rewards/accuracy_reward_stage2": 0.5903975367546082,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.296875,
"step": 778
},
{
"completion_length": 93.75,
"epoch": 0.1924407114624506,
"grad_norm": 3.2915590592064,
"kl": 0.1162109375,
"learning_rate": 8.078063241106719e-07,
"loss": 0.0047,
"reward": 2.8586063385009766,
"reward_std": 0.02293376252055168,
"rewards/accuracy_reward_stage2": 0.8586064577102661,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.359375,
"step": 779
},
{
"completion_length": 89.515625,
"epoch": 0.19268774703557312,
"grad_norm": 4.172983979860826,
"kl": 0.12890625,
"learning_rate": 8.075592885375494e-07,
"loss": 0.0051,
"reward": 2.6774847507476807,
"reward_std": 0.16821645200252533,
"rewards/accuracy_reward_stage2": 0.7931098341941833,
"rewards/format_reward_all_stage": 1.884374976158142,
"scores/refine_times": 1.515625,
"step": 780
},
{
"completion_length": 88.65625,
"epoch": 0.19293478260869565,
"grad_norm": 3.7850164391712373,
"kl": 0.1923828125,
"learning_rate": 8.073122529644268e-07,
"loss": 0.0077,
"reward": 2.632174015045166,
"reward_std": 0.18120327591896057,
"rewards/accuracy_reward_stage2": 0.6998822689056396,
"rewards/format_reward_all_stage": 1.9322916269302368,
"scores/refine_times": 1.328125,
"step": 781
},
{
"completion_length": 88.5,
"epoch": 0.19318181818181818,
"grad_norm": 2.957342350183697,
"kl": 0.1376953125,
"learning_rate": 8.070652173913042e-07,
"loss": 0.0055,
"reward": 2.5614137649536133,
"reward_std": 0.07992963492870331,
"rewards/accuracy_reward_stage2": 0.5614137053489685,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.34375,
"step": 782
},
{
"completion_length": 112.296875,
"epoch": 0.1934288537549407,
"grad_norm": 3.8559803732895497,
"kl": 0.09228515625,
"learning_rate": 8.068181818181818e-07,
"loss": 0.0037,
"reward": 2.7399330139160156,
"reward_std": 0.1664211004972458,
"rewards/accuracy_reward_stage2": 0.7399328947067261,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.71875,
"step": 783
},
{
"completion_length": 88.875,
"epoch": 0.19367588932806323,
"grad_norm": 4.502436247020987,
"kl": 0.1298828125,
"learning_rate": 8.065711462450593e-07,
"loss": 0.0052,
"reward": 2.3374903202056885,
"reward_std": 0.2821962237358093,
"rewards/accuracy_reward_stage2": 0.5354070663452148,
"rewards/format_reward_all_stage": 1.8020832538604736,
"scores/refine_times": 1.265625,
"step": 784
},
{
"completion_length": 80.90625,
"epoch": 0.19392292490118576,
"grad_norm": 3.7802077823940197,
"kl": 0.11279296875,
"learning_rate": 8.063241106719367e-07,
"loss": 0.0045,
"reward": 2.731360912322998,
"reward_std": 0.03464128449559212,
"rewards/accuracy_reward_stage2": 0.7313610315322876,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.359375,
"step": 785
},
{
"completion_length": 118.15625,
"epoch": 0.1941699604743083,
"grad_norm": 3.555349001791891,
"kl": 0.0888671875,
"learning_rate": 8.060770750988142e-07,
"loss": 0.0036,
"reward": 2.5097384452819824,
"reward_std": 0.14271043241024017,
"rewards/accuracy_reward_stage2": 0.5982798933982849,
"rewards/format_reward_all_stage": 1.9114582538604736,
"scores/refine_times": 1.640625,
"step": 786
},
{
"completion_length": 106.03125,
"epoch": 0.19441699604743082,
"grad_norm": 2.85710040568485,
"kl": 0.08203125,
"learning_rate": 8.058300395256916e-07,
"loss": 0.0033,
"reward": 2.8455610275268555,
"reward_std": 0.05401609092950821,
"rewards/accuracy_reward_stage2": 0.8559777736663818,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.515625,
"step": 787
},
{
"completion_length": 118.0625,
"epoch": 0.19466403162055335,
"grad_norm": 3.7964493038018245,
"kl": 0.1572265625,
"learning_rate": 8.055830039525692e-07,
"loss": 0.0063,
"reward": 2.6106669902801514,
"reward_std": 0.21474987268447876,
"rewards/accuracy_reward_stage2": 0.7356671094894409,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.53125,
"step": 788
},
{
"completion_length": 126.265625,
"epoch": 0.1949110671936759,
"grad_norm": 2.9916850524673837,
"kl": 0.1044921875,
"learning_rate": 8.053359683794466e-07,
"loss": 0.0042,
"reward": 2.669743537902832,
"reward_std": 0.17657098174095154,
"rewards/accuracy_reward_stage2": 0.7478686571121216,
"rewards/format_reward_all_stage": 1.921875,
"scores/refine_times": 1.53125,
"step": 789
},
{
"completion_length": 106.75,
"epoch": 0.19515810276679843,
"grad_norm": 3.059045891709165,
"kl": 0.1103515625,
"learning_rate": 8.05088932806324e-07,
"loss": 0.0044,
"reward": 2.5722293853759766,
"reward_std": 0.19657298922538757,
"rewards/accuracy_reward_stage2": 0.5847293138504028,
"rewards/format_reward_all_stage": 1.9874999523162842,
"scores/refine_times": 1.515625,
"step": 790
},
{
"completion_length": 115.171875,
"epoch": 0.19540513833992096,
"grad_norm": 4.038115134383084,
"kl": 0.134765625,
"learning_rate": 8.048418972332015e-07,
"loss": 0.0054,
"reward": 2.6473679542541504,
"reward_std": 0.3247292637825012,
"rewards/accuracy_reward_stage2": 0.838513970375061,
"rewards/format_reward_all_stage": 1.808854103088379,
"scores/refine_times": 1.734375,
"step": 791
},
{
"completion_length": 119.84375,
"epoch": 0.1956521739130435,
"grad_norm": 3.9096509332723537,
"kl": 0.1083984375,
"learning_rate": 8.045948616600791e-07,
"loss": 0.0043,
"reward": 2.6962451934814453,
"reward_std": 0.12264476716518402,
"rewards/accuracy_reward_stage2": 0.7597866058349609,
"rewards/format_reward_all_stage": 1.9364583492279053,
"scores/refine_times": 1.671875,
"step": 792
},
{
"completion_length": 123.046875,
"epoch": 0.19589920948616601,
"grad_norm": 3.453621564913509,
"kl": 0.1005859375,
"learning_rate": 8.043478260869565e-07,
"loss": 0.004,
"reward": 2.64453125,
"reward_std": 0.14773526787757874,
"rewards/accuracy_reward_stage2": 0.6445313096046448,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.671875,
"step": 793
},
{
"completion_length": 136.171875,
"epoch": 0.19614624505928854,
"grad_norm": 3.244903631558797,
"kl": 0.08203125,
"learning_rate": 8.04100790513834e-07,
"loss": 0.0033,
"reward": 2.459873676300049,
"reward_std": 0.250729501247406,
"rewards/accuracy_reward_stage2": 0.5989362001419067,
"rewards/format_reward_all_stage": 1.860937476158142,
"scores/refine_times": 1.828125,
"step": 794
},
{
"completion_length": 121.84375,
"epoch": 0.19639328063241107,
"grad_norm": 3.0812502917172724,
"kl": 0.1201171875,
"learning_rate": 8.038537549407114e-07,
"loss": 0.0048,
"reward": 2.6600656509399414,
"reward_std": 0.21765968203544617,
"rewards/accuracy_reward_stage2": 0.7225657105445862,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.8125,
"step": 795
},
{
"completion_length": 80.90625,
"epoch": 0.1966403162055336,
"grad_norm": 3.1572409050375336,
"kl": 0.1474609375,
"learning_rate": 8.036067193675889e-07,
"loss": 0.0059,
"reward": 2.801659345626831,
"reward_std": 0.1534295380115509,
"rewards/accuracy_reward_stage2": 0.8172844648361206,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.28125,
"step": 796
},
{
"completion_length": 102.078125,
"epoch": 0.19688735177865613,
"grad_norm": 3.677604625623705,
"kl": 0.091796875,
"learning_rate": 8.033596837944664e-07,
"loss": 0.0037,
"reward": 2.7125244140625,
"reward_std": 0.21848973631858826,
"rewards/accuracy_reward_stage2": 0.8437741994857788,
"rewards/format_reward_all_stage": 1.868749976158142,
"scores/refine_times": 1.375,
"step": 797
},
{
"completion_length": 115.859375,
"epoch": 0.19713438735177866,
"grad_norm": 4.2639279037433075,
"kl": 0.1005859375,
"learning_rate": 8.031126482213438e-07,
"loss": 0.004,
"reward": 2.6685781478881836,
"reward_std": 0.12759262323379517,
"rewards/accuracy_reward_stage2": 0.731078028678894,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.5,
"step": 798
},
{
"completion_length": 61.765625,
"epoch": 0.19738142292490118,
"grad_norm": 7.203216209798114,
"kl": 0.1357421875,
"learning_rate": 8.028656126482212e-07,
"loss": 0.0054,
"reward": 2.653932571411133,
"reward_std": 0.2906397581100464,
"rewards/accuracy_reward_stage2": 0.6539325714111328,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 799
},
{
"completion_length": 96.875,
"epoch": 0.1976284584980237,
"grad_norm": 5.129742489372007,
"kl": 0.12451171875,
"learning_rate": 8.026185770750987e-07,
"loss": 0.005,
"reward": 2.4581518173217773,
"reward_std": 0.11399400979280472,
"rewards/accuracy_reward_stage2": 0.5154435038566589,
"rewards/format_reward_all_stage": 1.9427083730697632,
"scores/refine_times": 1.359375,
"step": 800
},
{
"completion_length": 95.671875,
"epoch": 0.19787549407114624,
"grad_norm": 4.764586841064228,
"kl": 0.11181640625,
"learning_rate": 8.023715415019763e-07,
"loss": 0.0045,
"reward": 2.610954999923706,
"reward_std": 0.2346010059118271,
"rewards/accuracy_reward_stage2": 0.6734550595283508,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.265625,
"step": 801
},
{
"completion_length": 99.140625,
"epoch": 0.19812252964426877,
"grad_norm": 4.333849192860251,
"kl": 0.10107421875,
"learning_rate": 8.021245059288538e-07,
"loss": 0.004,
"reward": 2.6565942764282227,
"reward_std": 0.178715318441391,
"rewards/accuracy_reward_stage2": 0.6628443598747253,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.359375,
"step": 802
},
{
"completion_length": 128.1875,
"epoch": 0.1983695652173913,
"grad_norm": 4.089422740146013,
"kl": 0.119140625,
"learning_rate": 8.018774703557312e-07,
"loss": 0.0048,
"reward": 2.6466307640075684,
"reward_std": 0.2680560052394867,
"rewards/accuracy_reward_stage2": 0.7716308832168579,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.4375,
"step": 803
},
{
"completion_length": 81.5625,
"epoch": 0.19861660079051383,
"grad_norm": 4.131156028784667,
"kl": 0.1083984375,
"learning_rate": 8.016304347826086e-07,
"loss": 0.0043,
"reward": 2.7747063636779785,
"reward_std": 0.027518026530742645,
"rewards/accuracy_reward_stage2": 0.7747063636779785,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 804
},
{
"completion_length": 89.6875,
"epoch": 0.19886363636363635,
"grad_norm": 4.292467738406767,
"kl": 0.08740234375,
"learning_rate": 8.013833992094862e-07,
"loss": 0.0035,
"reward": 2.638862133026123,
"reward_std": 0.23011387884616852,
"rewards/accuracy_reward_stage2": 0.670112133026123,
"rewards/format_reward_all_stage": 1.96875,
"scores/refine_times": 1.25,
"step": 805
},
{
"completion_length": 87.78125,
"epoch": 0.19911067193675888,
"grad_norm": 3.6376261756995625,
"kl": 0.10205078125,
"learning_rate": 8.011363636363636e-07,
"loss": 0.0041,
"reward": 2.8962717056274414,
"reward_std": 0.030569197610020638,
"rewards/accuracy_reward_stage2": 0.8962716460227966,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.234375,
"step": 806
},
{
"completion_length": 86.96875,
"epoch": 0.1993577075098814,
"grad_norm": 4.069978684984678,
"kl": 0.10546875,
"learning_rate": 8.00889328063241e-07,
"loss": 0.0042,
"reward": 2.4667863845825195,
"reward_std": 0.08214451372623444,
"rewards/accuracy_reward_stage2": 0.4667862355709076,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.359375,
"step": 807
},
{
"completion_length": 124.8125,
"epoch": 0.19960474308300397,
"grad_norm": 3.6030777655101205,
"kl": 0.08935546875,
"learning_rate": 8.006422924901185e-07,
"loss": 0.0036,
"reward": 2.402216672897339,
"reward_std": 0.21567249298095703,
"rewards/accuracy_reward_stage2": 0.5272166132926941,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.453125,
"step": 808
},
{
"completion_length": 95.796875,
"epoch": 0.1998517786561265,
"grad_norm": 3.960382990766777,
"kl": 0.08154296875,
"learning_rate": 8.00395256916996e-07,
"loss": 0.0033,
"reward": 2.651146650314331,
"reward_std": 0.1337394118309021,
"rewards/accuracy_reward_stage2": 0.6511465907096863,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 809
},
{
"completion_length": 126.8125,
"epoch": 0.20009881422924902,
"grad_norm": 3.305927068278349,
"kl": 0.12060546875,
"learning_rate": 8.001482213438735e-07,
"loss": 0.0048,
"reward": 2.4156692028045654,
"reward_std": 0.21533656120300293,
"rewards/accuracy_reward_stage2": 0.5406690835952759,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.359375,
"step": 810
},
{
"completion_length": 111.78125,
"epoch": 0.20034584980237155,
"grad_norm": 2.2768402753567365,
"kl": 0.095703125,
"learning_rate": 7.99901185770751e-07,
"loss": 0.0038,
"reward": 2.449721336364746,
"reward_std": 0.011049442924559116,
"rewards/accuracy_reward_stage2": 0.44972118735313416,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.359375,
"step": 811
},
{
"completion_length": 88.09375,
"epoch": 0.20059288537549408,
"grad_norm": 4.766874200244394,
"kl": 0.0791015625,
"learning_rate": 7.996541501976284e-07,
"loss": 0.0032,
"reward": 2.599069595336914,
"reward_std": 0.21623960137367249,
"rewards/accuracy_reward_stage2": 0.7396947741508484,
"rewards/format_reward_all_stage": 1.859375,
"scores/refine_times": 1.125,
"step": 812
},
{
"completion_length": 89.125,
"epoch": 0.2008399209486166,
"grad_norm": 3.457097069645932,
"kl": 0.1435546875,
"learning_rate": 7.99407114624506e-07,
"loss": 0.0058,
"reward": 2.5180485248565674,
"reward_std": 0.2695591151714325,
"rewards/accuracy_reward_stage2": 0.7680485844612122,
"rewards/format_reward_all_stage": 1.75,
"scores/refine_times": 1.15625,
"step": 813
},
{
"completion_length": 109.375,
"epoch": 0.20108695652173914,
"grad_norm": 3.1285474295421047,
"kl": 0.07080078125,
"learning_rate": 7.991600790513834e-07,
"loss": 0.0028,
"reward": 2.4723763465881348,
"reward_std": 0.13593828678131104,
"rewards/accuracy_reward_stage2": 0.4723762571811676,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.28125,
"step": 814
},
{
"completion_length": 107.375,
"epoch": 0.20133399209486166,
"grad_norm": 4.642239916817279,
"kl": 0.09814453125,
"learning_rate": 7.989130434782608e-07,
"loss": 0.0039,
"reward": 2.4711804389953613,
"reward_std": 0.18163591623306274,
"rewards/accuracy_reward_stage2": 0.48159706592559814,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.296875,
"step": 815
},
{
"completion_length": 91.28125,
"epoch": 0.2015810276679842,
"grad_norm": 3.2296620896928436,
"kl": 0.10107421875,
"learning_rate": 7.986660079051383e-07,
"loss": 0.0041,
"reward": 2.847200870513916,
"reward_std": 0.04103899747133255,
"rewards/accuracy_reward_stage2": 0.8472008109092712,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.328125,
"step": 816
},
{
"completion_length": 91.984375,
"epoch": 0.20182806324110672,
"grad_norm": 4.058195341850405,
"kl": 0.12060546875,
"learning_rate": 7.984189723320158e-07,
"loss": 0.0048,
"reward": 2.5396575927734375,
"reward_std": 0.08468227833509445,
"rewards/accuracy_reward_stage2": 0.539657473564148,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 817
},
{
"completion_length": 87.640625,
"epoch": 0.20207509881422925,
"grad_norm": 3.411264589140427,
"kl": 0.10693359375,
"learning_rate": 7.981719367588932e-07,
"loss": 0.0043,
"reward": 2.590893268585205,
"reward_std": 0.013569341972470284,
"rewards/accuracy_reward_stage2": 0.5908934473991394,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 818
},
{
"completion_length": 118.203125,
"epoch": 0.20232213438735178,
"grad_norm": 3.0412069435274183,
"kl": 0.10595703125,
"learning_rate": 7.979249011857708e-07,
"loss": 0.0042,
"reward": 2.5969619750976562,
"reward_std": 0.06461979448795319,
"rewards/accuracy_reward_stage2": 0.5969619750976562,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.28125,
"step": 819
},
{
"completion_length": 106.609375,
"epoch": 0.2025691699604743,
"grad_norm": 3.9366533721995354,
"kl": 0.083984375,
"learning_rate": 7.976778656126482e-07,
"loss": 0.0034,
"reward": 2.7181453704833984,
"reward_std": 0.06762054562568665,
"rewards/accuracy_reward_stage2": 0.7754369974136353,
"rewards/format_reward_all_stage": 1.9427082538604736,
"scores/refine_times": 1.21875,
"step": 820
},
{
"completion_length": 101.75,
"epoch": 0.20281620553359683,
"grad_norm": 2.896684409936336,
"kl": 0.0869140625,
"learning_rate": 7.974308300395256e-07,
"loss": 0.0035,
"reward": 2.7476139068603516,
"reward_std": 0.13944795727729797,
"rewards/accuracy_reward_stage2": 0.8726138472557068,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.265625,
"step": 821
},
{
"completion_length": 90.9375,
"epoch": 0.20306324110671936,
"grad_norm": 4.01535366563514,
"kl": 0.0908203125,
"learning_rate": 7.971837944664032e-07,
"loss": 0.0036,
"reward": 2.506101131439209,
"reward_std": 0.08851330727338791,
"rewards/accuracy_reward_stage2": 0.5061010122299194,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 822
},
{
"completion_length": 98.8125,
"epoch": 0.2033102766798419,
"grad_norm": 3.350308544289007,
"kl": 0.1005859375,
"learning_rate": 7.969367588932806e-07,
"loss": 0.004,
"reward": 2.8014578819274902,
"reward_std": 0.08593001216650009,
"rewards/accuracy_reward_stage2": 0.8092702627182007,
"rewards/format_reward_all_stage": 1.9921875,
"scores/refine_times": 1.203125,
"step": 823
},
{
"completion_length": 107.828125,
"epoch": 0.20355731225296442,
"grad_norm": 3.0783484634726546,
"kl": 0.1005859375,
"learning_rate": 7.96689723320158e-07,
"loss": 0.004,
"reward": 2.5391650199890137,
"reward_std": 0.10030417889356613,
"rewards/accuracy_reward_stage2": 0.5391650795936584,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.359375,
"step": 824
},
{
"completion_length": 66.0625,
"epoch": 0.20380434782608695,
"grad_norm": 4.045892982620479,
"kl": 0.099609375,
"learning_rate": 7.964426877470355e-07,
"loss": 0.004,
"reward": 2.76053524017334,
"reward_std": 0.01285035815089941,
"rewards/accuracy_reward_stage2": 0.7605355381965637,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 825
},
{
"completion_length": 88.390625,
"epoch": 0.20405138339920947,
"grad_norm": 2.4907619826917724,
"kl": 0.0859375,
"learning_rate": 7.96195652173913e-07,
"loss": 0.0034,
"reward": 2.803270101547241,
"reward_std": 0.03927075117826462,
"rewards/accuracy_reward_stage2": 0.8032701015472412,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 826
},
{
"completion_length": 92.828125,
"epoch": 0.204298418972332,
"grad_norm": 3.734962052920799,
"kl": 0.091796875,
"learning_rate": 7.959486166007904e-07,
"loss": 0.0037,
"reward": 2.655478000640869,
"reward_std": 0.028529653325676918,
"rewards/accuracy_reward_stage2": 0.6554780602455139,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.234375,
"step": 827
},
{
"completion_length": 118.625,
"epoch": 0.20454545454545456,
"grad_norm": 3.570207149465539,
"kl": 0.0810546875,
"learning_rate": 7.957015810276679e-07,
"loss": 0.0032,
"reward": 2.61575984954834,
"reward_std": 0.04259338974952698,
"rewards/accuracy_reward_stage2": 0.6157597303390503,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.296875,
"step": 828
},
{
"completion_length": 107.734375,
"epoch": 0.2047924901185771,
"grad_norm": 3.982682150827258,
"kl": 0.068359375,
"learning_rate": 7.954545454545454e-07,
"loss": 0.0027,
"reward": 2.3884055614471436,
"reward_std": 0.16423478722572327,
"rewards/accuracy_reward_stage2": 0.45871806144714355,
"rewards/format_reward_all_stage": 1.9296875,
"scores/refine_times": 1.28125,
"step": 829
},
{
"completion_length": 80.328125,
"epoch": 0.20503952569169961,
"grad_norm": 0.8626963889416652,
"kl": 0.07568359375,
"learning_rate": 7.95207509881423e-07,
"loss": 0.003,
"reward": 2.832073211669922,
"reward_std": 0.012326827272772789,
"rewards/accuracy_reward_stage2": 0.8320731520652771,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 830
},
{
"completion_length": 111.59375,
"epoch": 0.20528656126482214,
"grad_norm": 1.9373113229582275,
"kl": 0.08447265625,
"learning_rate": 7.949604743083004e-07,
"loss": 0.0034,
"reward": 2.71048903465271,
"reward_std": 0.10930237174034119,
"rewards/accuracy_reward_stage2": 0.7365307211875916,
"rewards/format_reward_all_stage": 1.9739583730697632,
"scores/refine_times": 1.328125,
"step": 831
},
{
"completion_length": 108.078125,
"epoch": 0.20553359683794467,
"grad_norm": 3.4279539102400514,
"kl": 0.10302734375,
"learning_rate": 7.947134387351778e-07,
"loss": 0.0041,
"reward": 2.6778836250305176,
"reward_std": 0.05375240743160248,
"rewards/accuracy_reward_stage2": 0.6778836250305176,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.21875,
"step": 832
},
{
"completion_length": 104.078125,
"epoch": 0.2057806324110672,
"grad_norm": 12.44012981683348,
"kl": 0.470703125,
"learning_rate": 7.944664031620553e-07,
"loss": 0.0189,
"reward": 2.657538414001465,
"reward_std": 0.05722765997052193,
"rewards/accuracy_reward_stage2": 0.6575384140014648,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 833
},
{
"completion_length": 149.515625,
"epoch": 0.20602766798418973,
"grad_norm": 1.2161205838642917,
"kl": 0.068359375,
"learning_rate": 7.942193675889328e-07,
"loss": 0.0027,
"reward": 2.7292327880859375,
"reward_std": 0.020421404391527176,
"rewards/accuracy_reward_stage2": 0.7292326092720032,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.5625,
"step": 834
},
{
"completion_length": 120.046875,
"epoch": 0.20627470355731226,
"grad_norm": 4.827345708037369,
"kl": 0.09814453125,
"learning_rate": 7.939723320158102e-07,
"loss": 0.0039,
"reward": 2.404914379119873,
"reward_std": 0.24059876799583435,
"rewards/accuracy_reward_stage2": 0.598664402961731,
"rewards/format_reward_all_stage": 1.806249976158142,
"scores/refine_times": 1.328125,
"step": 835
},
{
"completion_length": 139.640625,
"epoch": 0.20652173913043478,
"grad_norm": 2.549571532494945,
"kl": 0.08642578125,
"learning_rate": 7.937252964426877e-07,
"loss": 0.0035,
"reward": 2.589641571044922,
"reward_std": 0.2173718512058258,
"rewards/accuracy_reward_stage2": 0.7146413922309875,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.515625,
"step": 836
},
{
"completion_length": 102.03125,
"epoch": 0.2067687747035573,
"grad_norm": 2.3032759644093197,
"kl": 0.07958984375,
"learning_rate": 7.934782608695651e-07,
"loss": 0.0032,
"reward": 2.698106050491333,
"reward_std": 0.1670789122581482,
"rewards/accuracy_reward_stage2": 0.760606050491333,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.28125,
"step": 837
},
{
"completion_length": 102.625,
"epoch": 0.20701581027667984,
"grad_norm": 2.7054313518366575,
"kl": 0.08935546875,
"learning_rate": 7.932312252964426e-07,
"loss": 0.0036,
"reward": 2.827049970626831,
"reward_std": 0.02332034707069397,
"rewards/accuracy_reward_stage2": 0.827049970626831,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.328125,
"step": 838
},
{
"completion_length": 134.90625,
"epoch": 0.20726284584980237,
"grad_norm": 3.525087666814418,
"kl": 0.083984375,
"learning_rate": 7.929841897233202e-07,
"loss": 0.0034,
"reward": 2.7634880542755127,
"reward_std": 0.16727206110954285,
"rewards/accuracy_reward_stage2": 0.8963003754615784,
"rewards/format_reward_all_stage": 1.8671875,
"scores/refine_times": 1.453125,
"step": 839
},
{
"completion_length": 133.34375,
"epoch": 0.2075098814229249,
"grad_norm": 2.803983186699194,
"kl": 0.0732421875,
"learning_rate": 7.927371541501976e-07,
"loss": 0.0029,
"reward": 2.6079330444335938,
"reward_std": 0.0722799152135849,
"rewards/accuracy_reward_stage2": 0.6235581040382385,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.703125,
"step": 840
},
{
"completion_length": 84.6875,
"epoch": 0.20775691699604742,
"grad_norm": 0.9087254095102063,
"kl": 0.08203125,
"learning_rate": 7.92490118577075e-07,
"loss": 0.0033,
"reward": 2.972470283508301,
"reward_std": 0.04419417306780815,
"rewards/accuracy_reward_stage2": 0.972470223903656,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 841
},
{
"completion_length": 96.421875,
"epoch": 0.20800395256916995,
"grad_norm": 4.268367932857312,
"kl": 0.1064453125,
"learning_rate": 7.922430830039525e-07,
"loss": 0.0043,
"reward": 2.7230210304260254,
"reward_std": 0.09623900800943375,
"rewards/accuracy_reward_stage2": 0.7308334112167358,
"rewards/format_reward_all_stage": 1.9921875,
"scores/refine_times": 1.1875,
"step": 842
},
{
"completion_length": 94.1875,
"epoch": 0.20825098814229248,
"grad_norm": 3.751185950471063,
"kl": 0.08740234375,
"learning_rate": 7.9199604743083e-07,
"loss": 0.0035,
"reward": 2.553612470626831,
"reward_std": 0.13736850023269653,
"rewards/accuracy_reward_stage2": 0.6192374229431152,
"rewards/format_reward_all_stage": 1.9343750476837158,
"scores/refine_times": 1.265625,
"step": 843
},
{
"completion_length": 111.6875,
"epoch": 0.208498023715415,
"grad_norm": 2.3607450354442774,
"kl": 0.09326171875,
"learning_rate": 7.917490118577075e-07,
"loss": 0.0037,
"reward": 2.6834843158721924,
"reward_std": 0.04238169267773628,
"rewards/accuracy_reward_stage2": 0.6897343993186951,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.421875,
"step": 844
},
{
"completion_length": 97.8125,
"epoch": 0.20874505928853754,
"grad_norm": 2.810706581086738,
"kl": 0.10302734375,
"learning_rate": 7.915019762845849e-07,
"loss": 0.0041,
"reward": 2.604111433029175,
"reward_std": 0.10309471189975739,
"rewards/accuracy_reward_stage2": 0.6614030599594116,
"rewards/format_reward_all_stage": 1.9427083730697632,
"scores/refine_times": 1.21875,
"step": 845
},
{
"completion_length": 117.25,
"epoch": 0.20899209486166007,
"grad_norm": 2.7354608601120294,
"kl": 0.12353515625,
"learning_rate": 7.912549407114623e-07,
"loss": 0.0049,
"reward": 2.600095272064209,
"reward_std": 0.10307259112596512,
"rewards/accuracy_reward_stage2": 0.6188453435897827,
"rewards/format_reward_all_stage": 1.9812500476837158,
"scores/refine_times": 1.703125,
"step": 846
},
{
"completion_length": 94.34375,
"epoch": 0.20923913043478262,
"grad_norm": 2.56722136517573,
"kl": 0.10107421875,
"learning_rate": 7.9100790513834e-07,
"loss": 0.0041,
"reward": 2.5527243614196777,
"reward_std": 0.12820011377334595,
"rewards/accuracy_reward_stage2": 0.5995994806289673,
"rewards/format_reward_all_stage": 1.953125,
"scores/refine_times": 1.4375,
"step": 847
},
{
"completion_length": 127.0625,
"epoch": 0.20948616600790515,
"grad_norm": 2.391728807768981,
"kl": 0.0751953125,
"learning_rate": 7.907608695652174e-07,
"loss": 0.003,
"reward": 2.7395739555358887,
"reward_std": 0.15919004380702972,
"rewards/accuracy_reward_stage2": 0.7614490985870361,
"rewards/format_reward_all_stage": 1.978124976158142,
"scores/refine_times": 1.6875,
"step": 848
},
{
"completion_length": 139.4375,
"epoch": 0.20973320158102768,
"grad_norm": 2.186650932071641,
"kl": 0.07763671875,
"learning_rate": 7.905138339920948e-07,
"loss": 0.0031,
"reward": 2.6006877422332764,
"reward_std": 0.20573003590106964,
"rewards/accuracy_reward_stage2": 0.7256878018379211,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.703125,
"step": 849
},
{
"completion_length": 120.609375,
"epoch": 0.2099802371541502,
"grad_norm": 1.7864914929400448,
"kl": 0.087890625,
"learning_rate": 7.902667984189723e-07,
"loss": 0.0035,
"reward": 2.523496627807617,
"reward_std": 0.06999015808105469,
"rewards/accuracy_reward_stage2": 0.5297467708587646,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.46875,
"step": 850
},
{
"completion_length": 106.8125,
"epoch": 0.21022727272727273,
"grad_norm": 1.806522155761803,
"kl": 0.078125,
"learning_rate": 7.900197628458498e-07,
"loss": 0.0031,
"reward": 2.625,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward_stage2": 0.75,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.234375,
"step": 851
},
{
"completion_length": 115.234375,
"epoch": 0.21047430830039526,
"grad_norm": 4.110304936221235,
"kl": 0.1533203125,
"learning_rate": 7.897727272727272e-07,
"loss": 0.0061,
"reward": 2.6135501861572266,
"reward_std": 0.0783662497997284,
"rewards/accuracy_reward_stage2": 0.6135500073432922,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.546875,
"step": 852
},
{
"completion_length": 97.859375,
"epoch": 0.2107213438735178,
"grad_norm": 2.359536619986747,
"kl": 0.078125,
"learning_rate": 7.895256916996047e-07,
"loss": 0.0031,
"reward": 2.7388830184936523,
"reward_std": 0.04497361183166504,
"rewards/accuracy_reward_stage2": 0.7451329827308655,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.3125,
"step": 853
},
{
"completion_length": 84.375,
"epoch": 0.21096837944664032,
"grad_norm": 1.9983144321891226,
"kl": 0.0830078125,
"learning_rate": 7.892786561264821e-07,
"loss": 0.0033,
"reward": 2.8331847190856934,
"reward_std": 0.003715165425091982,
"rewards/accuracy_reward_stage2": 0.8331847190856934,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.234375,
"step": 854
},
{
"completion_length": 95.53125,
"epoch": 0.21121541501976285,
"grad_norm": 4.26865105498033,
"kl": 0.1103515625,
"learning_rate": 7.890316205533597e-07,
"loss": 0.0044,
"reward": 2.5257091522216797,
"reward_std": 0.08476169407367706,
"rewards/accuracy_reward_stage2": 0.5934174060821533,
"rewards/format_reward_all_stage": 1.9322917461395264,
"scores/refine_times": 1.359375,
"step": 855
},
{
"completion_length": 84.3125,
"epoch": 0.21146245059288538,
"grad_norm": 3.9489524139672327,
"kl": 0.0693359375,
"learning_rate": 7.887845849802372e-07,
"loss": 0.0028,
"reward": 2.546104907989502,
"reward_std": 0.08534545451402664,
"rewards/accuracy_reward_stage2": 0.5461047291755676,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.15625,
"step": 856
},
{
"completion_length": 102.078125,
"epoch": 0.2117094861660079,
"grad_norm": 3.4011550909615917,
"kl": 0.087890625,
"learning_rate": 7.885375494071146e-07,
"loss": 0.0035,
"reward": 2.5729289054870605,
"reward_std": 0.024190258234739304,
"rewards/accuracy_reward_stage2": 0.5729289054870605,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.296875,
"step": 857
},
{
"completion_length": 94.0,
"epoch": 0.21195652173913043,
"grad_norm": 3.6691553352007062,
"kl": 0.09765625,
"learning_rate": 7.882905138339921e-07,
"loss": 0.0039,
"reward": 2.7282519340515137,
"reward_std": 0.14093205332756042,
"rewards/accuracy_reward_stage2": 0.7282518148422241,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.234375,
"step": 858
},
{
"completion_length": 136.171875,
"epoch": 0.21220355731225296,
"grad_norm": 3.438976115970983,
"kl": 0.0859375,
"learning_rate": 7.880434782608695e-07,
"loss": 0.0034,
"reward": 2.5337977409362793,
"reward_std": 0.17680373787879944,
"rewards/accuracy_reward_stage2": 0.6035893559455872,
"rewards/format_reward_all_stage": 1.9302083253860474,
"scores/refine_times": 1.65625,
"step": 859
},
{
"completion_length": 116.359375,
"epoch": 0.2124505928853755,
"grad_norm": 3.3182114661161433,
"kl": 0.087890625,
"learning_rate": 7.87796442687747e-07,
"loss": 0.0035,
"reward": 2.672604560852051,
"reward_std": 0.22977818548679352,
"rewards/accuracy_reward_stage2": 0.7351046800613403,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.453125,
"step": 860
},
{
"completion_length": 86.671875,
"epoch": 0.21269762845849802,
"grad_norm": 3.0669111128024706,
"kl": 0.0712890625,
"learning_rate": 7.875494071146245e-07,
"loss": 0.0029,
"reward": 2.748086452484131,
"reward_std": 0.03686128184199333,
"rewards/accuracy_reward_stage2": 0.7480865716934204,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 861
},
{
"completion_length": 104.65625,
"epoch": 0.21294466403162055,
"grad_norm": 2.7593142734907627,
"kl": 0.0888671875,
"learning_rate": 7.873023715415019e-07,
"loss": 0.0036,
"reward": 2.674123525619507,
"reward_std": 0.15257620811462402,
"rewards/accuracy_reward_stage2": 0.68818598985672,
"rewards/format_reward_all_stage": 1.985937476158142,
"scores/refine_times": 1.5625,
"step": 862
},
{
"completion_length": 110.3125,
"epoch": 0.21319169960474307,
"grad_norm": 2.715061158769814,
"kl": 0.0771484375,
"learning_rate": 7.870553359683793e-07,
"loss": 0.0031,
"reward": 2.671416759490967,
"reward_std": 0.09916723519563675,
"rewards/accuracy_reward_stage2": 0.6714168787002563,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.359375,
"step": 863
},
{
"completion_length": 140.625,
"epoch": 0.2134387351778656,
"grad_norm": 4.1276033550796765,
"kl": 0.080078125,
"learning_rate": 7.868083003952569e-07,
"loss": 0.0032,
"reward": 2.622908115386963,
"reward_std": 0.11840154230594635,
"rewards/accuracy_reward_stage2": 0.6229078769683838,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.546875,
"step": 864
},
{
"completion_length": 106.6875,
"epoch": 0.21368577075098813,
"grad_norm": 1.7834332667991692,
"kl": 0.07275390625,
"learning_rate": 7.865612648221343e-07,
"loss": 0.0029,
"reward": 2.72995662689209,
"reward_std": 0.06430064141750336,
"rewards/accuracy_reward_stage2": 0.7299565076828003,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.328125,
"step": 865
},
{
"completion_length": 109.0,
"epoch": 0.21393280632411066,
"grad_norm": 6.533478455009108,
"kl": 0.056396484375,
"learning_rate": 7.863142292490119e-07,
"loss": 0.0023,
"reward": 2.5403122901916504,
"reward_std": 0.14339181780815125,
"rewards/accuracy_reward_stage2": 0.5403121113777161,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.109375,
"step": 866
},
{
"completion_length": 127.734375,
"epoch": 0.21417984189723321,
"grad_norm": 2.927182477110509,
"kl": 0.09375,
"learning_rate": 7.860671936758893e-07,
"loss": 0.0038,
"reward": 2.6628847122192383,
"reward_std": 0.1707833856344223,
"rewards/accuracy_reward_stage2": 0.6847599744796753,
"rewards/format_reward_all_stage": 1.978124976158142,
"scores/refine_times": 1.5625,
"step": 867
},
{
"completion_length": 84.125,
"epoch": 0.21442687747035574,
"grad_norm": 2.022836352589753,
"kl": 0.0771484375,
"learning_rate": 7.858201581027668e-07,
"loss": 0.0031,
"reward": 2.7814769744873047,
"reward_std": 0.03681057691574097,
"rewards/accuracy_reward_stage2": 0.7814772129058838,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 868
},
{
"completion_length": 112.1875,
"epoch": 0.21467391304347827,
"grad_norm": 4.342396733664187,
"kl": 0.07177734375,
"learning_rate": 7.855731225296443e-07,
"loss": 0.0029,
"reward": 2.410027503967285,
"reward_std": 0.20548567175865173,
"rewards/accuracy_reward_stage2": 0.5131524801254272,
"rewards/format_reward_all_stage": 1.896875023841858,
"scores/refine_times": 1.4375,
"step": 869
},
{
"completion_length": 123.40625,
"epoch": 0.2149209486166008,
"grad_norm": 1.617395557985399,
"kl": 0.07373046875,
"learning_rate": 7.853260869565217e-07,
"loss": 0.003,
"reward": 2.830418348312378,
"reward_std": 0.01754125952720642,
"rewards/accuracy_reward_stage2": 0.8304183483123779,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.359375,
"step": 870
},
{
"completion_length": 75.0625,
"epoch": 0.21516798418972333,
"grad_norm": 4.3176509106435255,
"kl": 0.078125,
"learning_rate": 7.850790513833991e-07,
"loss": 0.0031,
"reward": 2.724806308746338,
"reward_std": 0.04878024384379387,
"rewards/accuracy_reward_stage2": 0.7248064875602722,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 871
},
{
"completion_length": 129.4375,
"epoch": 0.21541501976284586,
"grad_norm": 2.812378128182156,
"kl": 0.083984375,
"learning_rate": 7.848320158102767e-07,
"loss": 0.0034,
"reward": 2.668769359588623,
"reward_std": 0.09495694190263748,
"rewards/accuracy_reward_stage2": 0.6750193238258362,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.546875,
"step": 872
},
{
"completion_length": 89.0,
"epoch": 0.21566205533596838,
"grad_norm": 2.8304688457113945,
"kl": 0.0869140625,
"learning_rate": 7.845849802371541e-07,
"loss": 0.0035,
"reward": 2.8485946655273438,
"reward_std": 0.05919472128152847,
"rewards/accuracy_reward_stage2": 0.8485945463180542,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.21875,
"step": 873
},
{
"completion_length": 90.578125,
"epoch": 0.2159090909090909,
"grad_norm": 3.8502592160508633,
"kl": 0.06982421875,
"learning_rate": 7.843379446640315e-07,
"loss": 0.0028,
"reward": 2.465599298477173,
"reward_std": 0.2232595980167389,
"rewards/accuracy_reward_stage2": 0.5905991792678833,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.203125,
"step": 874
},
{
"completion_length": 107.96875,
"epoch": 0.21615612648221344,
"grad_norm": 2.163127681716706,
"kl": 0.08740234375,
"learning_rate": 7.840909090909091e-07,
"loss": 0.0035,
"reward": 2.5576090812683105,
"reward_std": 0.07345429062843323,
"rewards/accuracy_reward_stage2": 0.5576090812683105,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.40625,
"step": 875
},
{
"completion_length": 90.609375,
"epoch": 0.21640316205533597,
"grad_norm": 2.911026203681279,
"kl": 0.07568359375,
"learning_rate": 7.838438735177866e-07,
"loss": 0.003,
"reward": 2.7440195083618164,
"reward_std": 0.10027820616960526,
"rewards/accuracy_reward_stage2": 0.7440195083618164,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 876
},
{
"completion_length": 118.5625,
"epoch": 0.2166501976284585,
"grad_norm": 3.7722970526311936,
"kl": 0.078125,
"learning_rate": 7.83596837944664e-07,
"loss": 0.0031,
"reward": 2.705394744873047,
"reward_std": 0.2528733015060425,
"rewards/accuracy_reward_stage2": 0.7116448283195496,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.46875,
"step": 877
},
{
"completion_length": 79.015625,
"epoch": 0.21689723320158102,
"grad_norm": 2.2348619430388097,
"kl": 0.1025390625,
"learning_rate": 7.833498023715415e-07,
"loss": 0.0041,
"reward": 2.8125,
"reward_std": 0.06681530922651291,
"rewards/accuracy_reward_stage2": 0.8125,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 878
},
{
"completion_length": 101.59375,
"epoch": 0.21714426877470355,
"grad_norm": 1.7133622297820073,
"kl": 0.06201171875,
"learning_rate": 7.831027667984189e-07,
"loss": 0.0025,
"reward": 2.6325061321258545,
"reward_std": 0.13117417693138123,
"rewards/accuracy_reward_stage2": 0.7575061321258545,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.15625,
"step": 879
},
{
"completion_length": 126.6875,
"epoch": 0.21739130434782608,
"grad_norm": 3.445132940662978,
"kl": 0.0703125,
"learning_rate": 7.828557312252963e-07,
"loss": 0.0028,
"reward": 2.574112892150879,
"reward_std": 0.10674077272415161,
"rewards/accuracy_reward_stage2": 0.5741128921508789,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.46875,
"step": 880
},
{
"completion_length": 111.34375,
"epoch": 0.2176383399209486,
"grad_norm": 3.4192114253246424,
"kl": 0.07666015625,
"learning_rate": 7.826086956521739e-07,
"loss": 0.0031,
"reward": 2.6431050300598145,
"reward_std": 0.21736359596252441,
"rewards/accuracy_reward_stage2": 0.7681052088737488,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.3125,
"step": 881
},
{
"completion_length": 102.953125,
"epoch": 0.21788537549407114,
"grad_norm": 2.257112329346978,
"kl": 0.0791015625,
"learning_rate": 7.823616600790513e-07,
"loss": 0.0032,
"reward": 2.7524609565734863,
"reward_std": 0.0027502209413796663,
"rewards/accuracy_reward_stage2": 0.7524610161781311,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.34375,
"step": 882
},
{
"completion_length": 103.15625,
"epoch": 0.21813241106719367,
"grad_norm": 3.942982015966452,
"kl": 0.072265625,
"learning_rate": 7.821146245059288e-07,
"loss": 0.0029,
"reward": 2.6912498474121094,
"reward_std": 0.0877779871225357,
"rewards/accuracy_reward_stage2": 0.7016666531562805,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.296875,
"step": 883
},
{
"completion_length": 92.46875,
"epoch": 0.2183794466403162,
"grad_norm": 4.112406050123377,
"kl": 0.1376953125,
"learning_rate": 7.818675889328063e-07,
"loss": 0.0055,
"reward": 2.843651056289673,
"reward_std": 0.11524404585361481,
"rewards/accuracy_reward_stage2": 0.8436509966850281,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.234375,
"step": 884
},
{
"completion_length": 100.5625,
"epoch": 0.21862648221343872,
"grad_norm": 2.8425864804826904,
"kl": 0.099609375,
"learning_rate": 7.816205533596838e-07,
"loss": 0.004,
"reward": 2.7560110092163086,
"reward_std": 0.07648982852697372,
"rewards/accuracy_reward_stage2": 0.7560111284255981,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.296875,
"step": 885
},
{
"completion_length": 153.015625,
"epoch": 0.21887351778656128,
"grad_norm": 1.7320481010726945,
"kl": 0.07861328125,
"learning_rate": 7.813735177865613e-07,
"loss": 0.0031,
"reward": 2.765270233154297,
"reward_std": 0.033314161002635956,
"rewards/accuracy_reward_stage2": 0.7652702331542969,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.640625,
"step": 886
},
{
"completion_length": 165.3125,
"epoch": 0.2191205533596838,
"grad_norm": 3.2769897873042666,
"kl": 0.06396484375,
"learning_rate": 7.811264822134387e-07,
"loss": 0.0025,
"reward": 2.6706483364105225,
"reward_std": 0.09551921486854553,
"rewards/accuracy_reward_stage2": 0.6706483364105225,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.75,
"step": 887
},
{
"completion_length": 87.90625,
"epoch": 0.21936758893280633,
"grad_norm": 2.9755951499704385,
"kl": 0.09765625,
"learning_rate": 7.808794466403161e-07,
"loss": 0.0039,
"reward": 2.736006736755371,
"reward_std": 0.0732710063457489,
"rewards/accuracy_reward_stage2": 0.7360066771507263,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.171875,
"step": 888
},
{
"completion_length": 95.109375,
"epoch": 0.21961462450592886,
"grad_norm": 3.388507614969505,
"kl": 0.07568359375,
"learning_rate": 7.806324110671937e-07,
"loss": 0.003,
"reward": 2.7535786628723145,
"reward_std": 0.028518326580524445,
"rewards/accuracy_reward_stage2": 0.7535787224769592,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 889
},
{
"completion_length": 85.640625,
"epoch": 0.2198616600790514,
"grad_norm": 3.8655007295215915,
"kl": 0.06298828125,
"learning_rate": 7.803853754940711e-07,
"loss": 0.0025,
"reward": 2.7656240463256836,
"reward_std": 0.10700556635856628,
"rewards/accuracy_reward_stage2": 0.7656240463256836,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 890
},
{
"completion_length": 93.90625,
"epoch": 0.22010869565217392,
"grad_norm": 1.700698474645943,
"kl": 0.08544921875,
"learning_rate": 7.801383399209485e-07,
"loss": 0.0034,
"reward": 2.713747501373291,
"reward_std": 0.02260005660355091,
"rewards/accuracy_reward_stage2": 0.7137476205825806,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.25,
"step": 891
},
{
"completion_length": 118.640625,
"epoch": 0.22035573122529645,
"grad_norm": 2.2855748354605305,
"kl": 0.0634765625,
"learning_rate": 7.79891304347826e-07,
"loss": 0.0025,
"reward": 2.671875,
"reward_std": 0.11100947856903076,
"rewards/accuracy_reward_stage2": 0.671875,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.328125,
"step": 892
},
{
"completion_length": 88.109375,
"epoch": 0.22060276679841898,
"grad_norm": 3.9695782246909497,
"kl": 0.06005859375,
"learning_rate": 7.796442687747036e-07,
"loss": 0.0024,
"reward": 2.7079808712005615,
"reward_std": 0.1411241590976715,
"rewards/accuracy_reward_stage2": 0.7079808712005615,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 893
},
{
"completion_length": 100.046875,
"epoch": 0.2208498023715415,
"grad_norm": 3.3606486294352473,
"kl": 0.0810546875,
"learning_rate": 7.793972332015811e-07,
"loss": 0.0033,
"reward": 2.6410939693450928,
"reward_std": 0.11747156083583832,
"rewards/accuracy_reward_stage2": 0.6410939693450928,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 894
},
{
"completion_length": 84.78125,
"epoch": 0.22109683794466403,
"grad_norm": 3.441456907155892,
"kl": 0.0859375,
"learning_rate": 7.791501976284585e-07,
"loss": 0.0034,
"reward": 2.6340184211730957,
"reward_std": 0.08257357776165009,
"rewards/accuracy_reward_stage2": 0.6340183019638062,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 895
},
{
"completion_length": 110.4375,
"epoch": 0.22134387351778656,
"grad_norm": 2.6882059908650606,
"kl": 0.0634765625,
"learning_rate": 7.789031620553359e-07,
"loss": 0.0025,
"reward": 2.633941173553467,
"reward_std": 0.11902174353599548,
"rewards/accuracy_reward_stage2": 0.6912329196929932,
"rewards/format_reward_all_stage": 1.9427083730697632,
"scores/refine_times": 1.328125,
"step": 896
},
{
"completion_length": 111.765625,
"epoch": 0.2215909090909091,
"grad_norm": 2.5472008274936466,
"kl": 0.07763671875,
"learning_rate": 7.786561264822134e-07,
"loss": 0.0031,
"reward": 2.6494970321655273,
"reward_std": 0.1488344967365265,
"rewards/accuracy_reward_stage2": 0.6494969725608826,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.625,
"step": 897
},
{
"completion_length": 142.640625,
"epoch": 0.22183794466403162,
"grad_norm": 3.487433794581114,
"kl": 0.061767578125,
"learning_rate": 7.784090909090909e-07,
"loss": 0.0025,
"reward": 2.633687734603882,
"reward_std": 0.16814634203910828,
"rewards/accuracy_reward_stage2": 0.6336876749992371,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.515625,
"step": 898
},
{
"completion_length": 97.71875,
"epoch": 0.22208498023715414,
"grad_norm": 2.1276640745086226,
"kl": 0.068359375,
"learning_rate": 7.781620553359683e-07,
"loss": 0.0027,
"reward": 2.71793270111084,
"reward_std": 0.02380518987774849,
"rewards/accuracy_reward_stage2": 0.7179328799247742,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.234375,
"step": 899
},
{
"completion_length": 89.171875,
"epoch": 0.22233201581027667,
"grad_norm": 4.0853981081869435,
"kl": 0.08642578125,
"learning_rate": 7.779150197628458e-07,
"loss": 0.0035,
"reward": 2.381068229675293,
"reward_std": 0.2457825243473053,
"rewards/accuracy_reward_stage2": 0.5060682892799377,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.265625,
"step": 900
},
{
"completion_length": 86.390625,
"epoch": 0.2225790513833992,
"grad_norm": 3.8442263343748673,
"kl": 0.0703125,
"learning_rate": 7.776679841897232e-07,
"loss": 0.0028,
"reward": 2.7973074913024902,
"reward_std": 0.070428267121315,
"rewards/accuracy_reward_stage2": 0.7973074913024902,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 901
},
{
"completion_length": 109.34375,
"epoch": 0.22282608695652173,
"grad_norm": 2.3746786204682397,
"kl": 0.07861328125,
"learning_rate": 7.774209486166008e-07,
"loss": 0.0031,
"reward": 2.5825467109680176,
"reward_std": 0.22069254517555237,
"rewards/accuracy_reward_stage2": 0.7231717109680176,
"rewards/format_reward_all_stage": 1.859375,
"scores/refine_times": 1.28125,
"step": 902
},
{
"completion_length": 115.71875,
"epoch": 0.22307312252964426,
"grad_norm": 3.744151198948681,
"kl": 0.1181640625,
"learning_rate": 7.771739130434783e-07,
"loss": 0.0047,
"reward": 2.5876035690307617,
"reward_std": 0.13640564680099487,
"rewards/accuracy_reward_stage2": 0.6110408306121826,
"rewards/format_reward_all_stage": 1.9765625,
"scores/refine_times": 1.5625,
"step": 903
},
{
"completion_length": 95.859375,
"epoch": 0.22332015810276679,
"grad_norm": 3.9864543123830023,
"kl": 0.0888671875,
"learning_rate": 7.769268774703557e-07,
"loss": 0.0036,
"reward": 2.503157615661621,
"reward_std": 0.07907243072986603,
"rewards/accuracy_reward_stage2": 0.5031576156616211,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.203125,
"step": 904
},
{
"completion_length": 124.15625,
"epoch": 0.22356719367588934,
"grad_norm": 3.414528616278631,
"kl": 0.10009765625,
"learning_rate": 7.766798418972331e-07,
"loss": 0.004,
"reward": 2.6220712661743164,
"reward_std": 0.03463301062583923,
"rewards/accuracy_reward_stage2": 0.6220711469650269,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.59375,
"step": 905
},
{
"completion_length": 108.421875,
"epoch": 0.22381422924901187,
"grad_norm": 2.65452224241206,
"kl": 0.07080078125,
"learning_rate": 7.764328063241107e-07,
"loss": 0.0028,
"reward": 2.69258713722229,
"reward_std": 0.15411412715911865,
"rewards/accuracy_reward_stage2": 0.75508713722229,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.4375,
"step": 906
},
{
"completion_length": 120.15625,
"epoch": 0.2240612648221344,
"grad_norm": 3.8526319437643717,
"kl": 0.09619140625,
"learning_rate": 7.761857707509881e-07,
"loss": 0.0038,
"reward": 2.6302669048309326,
"reward_std": 0.13176321983337402,
"rewards/accuracy_reward_stage2": 0.6302669048309326,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.59375,
"step": 907
},
{
"completion_length": 103.4375,
"epoch": 0.22430830039525693,
"grad_norm": 2.8425333906095998,
"kl": 0.083984375,
"learning_rate": 7.759387351778656e-07,
"loss": 0.0034,
"reward": 2.5473570823669434,
"reward_std": 0.11453460156917572,
"rewards/accuracy_reward_stage2": 0.5473569631576538,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.421875,
"step": 908
},
{
"completion_length": 108.078125,
"epoch": 0.22455533596837945,
"grad_norm": 4.249231689338272,
"kl": 0.09716796875,
"learning_rate": 7.75691699604743e-07,
"loss": 0.0039,
"reward": 2.6726608276367188,
"reward_std": 0.05191066488623619,
"rewards/accuracy_reward_stage2": 0.6726609468460083,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.46875,
"step": 909
},
{
"completion_length": 100.75,
"epoch": 0.22480237154150198,
"grad_norm": 1.3264075118453993,
"kl": 0.0751953125,
"learning_rate": 7.754446640316205e-07,
"loss": 0.003,
"reward": 2.7188069820404053,
"reward_std": 0.10090002417564392,
"rewards/accuracy_reward_stage2": 0.7344319820404053,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.375,
"step": 910
},
{
"completion_length": 100.671875,
"epoch": 0.2250494071146245,
"grad_norm": 2.6499971246685643,
"kl": 0.0966796875,
"learning_rate": 7.75197628458498e-07,
"loss": 0.0039,
"reward": 2.651829481124878,
"reward_std": 0.05644657090306282,
"rewards/accuracy_reward_stage2": 0.6518294215202332,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.375,
"step": 911
},
{
"completion_length": 82.625,
"epoch": 0.22529644268774704,
"grad_norm": 4.2859115031657,
"kl": 0.0908203125,
"learning_rate": 7.749505928853755e-07,
"loss": 0.0036,
"reward": 2.6028127670288086,
"reward_std": 0.09801940619945526,
"rewards/accuracy_reward_stage2": 0.6106254458427429,
"rewards/format_reward_all_stage": 1.9921875,
"scores/refine_times": 1.234375,
"step": 912
},
{
"completion_length": 82.1875,
"epoch": 0.22554347826086957,
"grad_norm": 3.7686117517627125,
"kl": 0.07421875,
"learning_rate": 7.747035573122529e-07,
"loss": 0.003,
"reward": 2.5454444885253906,
"reward_std": 0.11515636742115021,
"rewards/accuracy_reward_stage2": 0.5454442501068115,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.078125,
"step": 913
},
{
"completion_length": 111.703125,
"epoch": 0.2257905138339921,
"grad_norm": 2.298219131706107,
"kl": 0.0732421875,
"learning_rate": 7.744565217391305e-07,
"loss": 0.0029,
"reward": 2.8503479957580566,
"reward_std": 0.055034905672073364,
"rewards/accuracy_reward_stage2": 0.8503477573394775,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.4375,
"step": 914
},
{
"completion_length": 121.0,
"epoch": 0.22603754940711462,
"grad_norm": 2.4396894747569893,
"kl": 0.10302734375,
"learning_rate": 7.742094861660079e-07,
"loss": 0.0041,
"reward": 2.753645658493042,
"reward_std": 0.07929170876741409,
"rewards/accuracy_reward_stage2": 0.753645658493042,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.515625,
"step": 915
},
{
"completion_length": 118.328125,
"epoch": 0.22628458498023715,
"grad_norm": 1.9607094755268186,
"kl": 0.09375,
"learning_rate": 7.739624505928853e-07,
"loss": 0.0038,
"reward": 2.658937454223633,
"reward_std": 0.02175423502922058,
"rewards/accuracy_reward_stage2": 0.658937394618988,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.34375,
"step": 916
},
{
"completion_length": 93.40625,
"epoch": 0.22653162055335968,
"grad_norm": 2.5887397012016566,
"kl": 0.07958984375,
"learning_rate": 7.737154150197628e-07,
"loss": 0.0032,
"reward": 2.5921926498413086,
"reward_std": 0.07951997220516205,
"rewards/accuracy_reward_stage2": 0.5921927690505981,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.15625,
"step": 917
},
{
"completion_length": 102.40625,
"epoch": 0.2267786561264822,
"grad_norm": 2.6225329982689742,
"kl": 0.0732421875,
"learning_rate": 7.734683794466402e-07,
"loss": 0.0029,
"reward": 2.766453266143799,
"reward_std": 0.06807538866996765,
"rewards/accuracy_reward_stage2": 0.7664532661437988,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.265625,
"step": 918
},
{
"completion_length": 87.078125,
"epoch": 0.22702569169960474,
"grad_norm": 2.812032447985376,
"kl": 0.06884765625,
"learning_rate": 7.732213438735177e-07,
"loss": 0.0027,
"reward": 2.8405091762542725,
"reward_std": 0.07255198061466217,
"rewards/accuracy_reward_stage2": 0.9030092358589172,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.125,
"step": 919
},
{
"completion_length": 88.578125,
"epoch": 0.22727272727272727,
"grad_norm": 3.352416052089907,
"kl": 0.08056640625,
"learning_rate": 7.729743083003952e-07,
"loss": 0.0032,
"reward": 2.8867108821868896,
"reward_std": 0.13572408258914948,
"rewards/accuracy_reward_stage2": 0.8971275687217712,
"rewards/format_reward_all_stage": 1.9895832538604736,
"scores/refine_times": 1.140625,
"step": 920
},
{
"completion_length": 120.65625,
"epoch": 0.2275197628458498,
"grad_norm": 1.9280858288695926,
"kl": 0.08544921875,
"learning_rate": 7.727272727272727e-07,
"loss": 0.0034,
"reward": 2.597029209136963,
"reward_std": 0.10120301693677902,
"rewards/accuracy_reward_stage2": 0.5970292687416077,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.546875,
"step": 921
},
{
"completion_length": 95.28125,
"epoch": 0.22776679841897232,
"grad_norm": 3.494588928223786,
"kl": 0.0849609375,
"learning_rate": 7.724802371541502e-07,
"loss": 0.0034,
"reward": 2.81472110748291,
"reward_std": 0.046646032482385635,
"rewards/accuracy_reward_stage2": 0.8147209882736206,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.3125,
"step": 922
},
{
"completion_length": 145.453125,
"epoch": 0.22801383399209485,
"grad_norm": 4.451245280949107,
"kl": 0.251953125,
"learning_rate": 7.722332015810277e-07,
"loss": 0.01,
"reward": 2.779994010925293,
"reward_std": 0.1533237248659134,
"rewards/accuracy_reward_stage2": 0.9049938917160034,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.65625,
"step": 923
},
{
"completion_length": 141.109375,
"epoch": 0.22826086956521738,
"grad_norm": 3.10942872267095,
"kl": 0.08740234375,
"learning_rate": 7.719861660079051e-07,
"loss": 0.0035,
"reward": 2.6243720054626465,
"reward_std": 0.08140174299478531,
"rewards/accuracy_reward_stage2": 0.6243720650672913,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.453125,
"step": 924
},
{
"completion_length": 113.125,
"epoch": 0.22850790513833993,
"grad_norm": 2.764415964686548,
"kl": 0.0986328125,
"learning_rate": 7.717391304347826e-07,
"loss": 0.0039,
"reward": 2.570035457611084,
"reward_std": 0.056792110204696655,
"rewards/accuracy_reward_stage2": 0.5700353384017944,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.34375,
"step": 925
},
{
"completion_length": 107.390625,
"epoch": 0.22875494071146246,
"grad_norm": 3.2431842066672534,
"kl": 0.123046875,
"learning_rate": 7.7149209486166e-07,
"loss": 0.0049,
"reward": 2.628124952316284,
"reward_std": 0.16987210512161255,
"rewards/accuracy_reward_stage2": 0.690625011920929,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.4375,
"step": 926
},
{
"completion_length": 116.078125,
"epoch": 0.229001976284585,
"grad_norm": 3.0388485387607407,
"kl": 0.0859375,
"learning_rate": 7.712450592885375e-07,
"loss": 0.0034,
"reward": 2.7118468284606934,
"reward_std": 0.11388581991195679,
"rewards/accuracy_reward_stage2": 0.7274720668792725,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.453125,
"step": 927
},
{
"completion_length": 125.390625,
"epoch": 0.22924901185770752,
"grad_norm": 2.4235888830830064,
"kl": 0.07373046875,
"learning_rate": 7.70998023715415e-07,
"loss": 0.0029,
"reward": 2.7621376514434814,
"reward_std": 0.01607050932943821,
"rewards/accuracy_reward_stage2": 0.7621376514434814,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.40625,
"step": 928
},
{
"completion_length": 152.75,
"epoch": 0.22949604743083005,
"grad_norm": 3.48783524178748,
"kl": 0.11083984375,
"learning_rate": 7.707509881422924e-07,
"loss": 0.0044,
"reward": 2.7437613010406494,
"reward_std": 0.1258169412612915,
"rewards/accuracy_reward_stage2": 0.7437613010406494,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.59375,
"step": 929
},
{
"completion_length": 125.78125,
"epoch": 0.22974308300395258,
"grad_norm": 2.84328279429593,
"kl": 0.07763671875,
"learning_rate": 7.705039525691699e-07,
"loss": 0.0031,
"reward": 2.6663095951080322,
"reward_std": 0.07182341814041138,
"rewards/accuracy_reward_stage2": 0.6819344162940979,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.609375,
"step": 930
},
{
"completion_length": 92.515625,
"epoch": 0.2299901185770751,
"grad_norm": 4.25905140589049,
"kl": 0.107421875,
"learning_rate": 7.702569169960475e-07,
"loss": 0.0043,
"reward": 2.658834218978882,
"reward_std": 0.1202336847782135,
"rewards/accuracy_reward_stage2": 0.6588343381881714,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 931
},
{
"completion_length": 131.265625,
"epoch": 0.23023715415019763,
"grad_norm": 3.87126995736488,
"kl": 0.083984375,
"learning_rate": 7.700098814229249e-07,
"loss": 0.0034,
"reward": 2.730388641357422,
"reward_std": 0.14922493696212769,
"rewards/accuracy_reward_stage2": 0.7928886413574219,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.5,
"step": 932
},
{
"completion_length": 153.6875,
"epoch": 0.23048418972332016,
"grad_norm": 3.205279272182687,
"kl": 0.0966796875,
"learning_rate": 7.697628458498024e-07,
"loss": 0.0039,
"reward": 2.547888994216919,
"reward_std": 0.1733008772134781,
"rewards/accuracy_reward_stage2": 0.6728890538215637,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.671875,
"step": 933
},
{
"completion_length": 126.84375,
"epoch": 0.2307312252964427,
"grad_norm": 3.400946207231384,
"kl": 0.083984375,
"learning_rate": 7.695158102766798e-07,
"loss": 0.0033,
"reward": 2.578770160675049,
"reward_std": 0.13955532014369965,
"rewards/accuracy_reward_stage2": 0.6516869068145752,
"rewards/format_reward_all_stage": 1.9270832538604736,
"scores/refine_times": 1.453125,
"step": 934
},
{
"completion_length": 122.65625,
"epoch": 0.23097826086956522,
"grad_norm": 3.248359221948695,
"kl": 0.0830078125,
"learning_rate": 7.692687747035573e-07,
"loss": 0.0033,
"reward": 2.5240418910980225,
"reward_std": 0.21152980625629425,
"rewards/accuracy_reward_stage2": 0.6438335180282593,
"rewards/format_reward_all_stage": 1.8802083730697632,
"scores/refine_times": 1.359375,
"step": 935
},
{
"completion_length": 133.171875,
"epoch": 0.23122529644268774,
"grad_norm": 2.4067173262093635,
"kl": 0.0810546875,
"learning_rate": 7.690217391304348e-07,
"loss": 0.0032,
"reward": 2.5248830318450928,
"reward_std": 0.36290836334228516,
"rewards/accuracy_reward_stage2": 0.7384247183799744,
"rewards/format_reward_all_stage": 1.7864583730697632,
"scores/refine_times": 1.59375,
"step": 936
},
{
"completion_length": 100.078125,
"epoch": 0.23147233201581027,
"grad_norm": 3.3677672306290787,
"kl": 0.06591796875,
"learning_rate": 7.687747035573122e-07,
"loss": 0.0026,
"reward": 2.5898051261901855,
"reward_std": 0.08437584340572357,
"rewards/accuracy_reward_stage2": 0.5898053646087646,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.28125,
"step": 937
},
{
"completion_length": 104.90625,
"epoch": 0.2317193675889328,
"grad_norm": 4.578017762931248,
"kl": 0.0986328125,
"learning_rate": 7.685276679841896e-07,
"loss": 0.0039,
"reward": 2.447653293609619,
"reward_std": 0.44203102588653564,
"rewards/accuracy_reward_stage2": 0.7757784128189087,
"rewards/format_reward_all_stage": 1.671875,
"scores/refine_times": 1.296875,
"step": 938
},
{
"completion_length": 101.359375,
"epoch": 0.23196640316205533,
"grad_norm": 4.2459965005065925,
"kl": 0.1328125,
"learning_rate": 7.682806324110671e-07,
"loss": 0.0053,
"reward": 2.704395294189453,
"reward_std": 0.073523610830307,
"rewards/accuracy_reward_stage2": 0.7200204133987427,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.1875,
"step": 939
},
{
"completion_length": 141.59375,
"epoch": 0.23221343873517786,
"grad_norm": 3.0089788909015347,
"kl": 0.11181640625,
"learning_rate": 7.680335968379447e-07,
"loss": 0.0045,
"reward": 2.509549140930176,
"reward_std": 0.21922683715820312,
"rewards/accuracy_reward_stage2": 0.6345490217208862,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.53125,
"step": 940
},
{
"completion_length": 88.828125,
"epoch": 0.23246047430830039,
"grad_norm": 3.976294816968691,
"kl": 0.07861328125,
"learning_rate": 7.677865612648221e-07,
"loss": 0.0031,
"reward": 2.5723557472229004,
"reward_std": 0.0782502293586731,
"rewards/accuracy_reward_stage2": 0.5723556280136108,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 941
},
{
"completion_length": 87.703125,
"epoch": 0.2327075098814229,
"grad_norm": 4.189264645949934,
"kl": 0.1025390625,
"learning_rate": 7.675395256916996e-07,
"loss": 0.0041,
"reward": 2.315624952316284,
"reward_std": 0.4645420014858246,
"rewards/accuracy_reward_stage2": 0.690625011920929,
"rewards/format_reward_all_stage": 1.625,
"scores/refine_times": 1.265625,
"step": 942
},
{
"completion_length": 107.46875,
"epoch": 0.23295454545454544,
"grad_norm": 3.1924104886203106,
"kl": 0.08935546875,
"learning_rate": 7.67292490118577e-07,
"loss": 0.0036,
"reward": 2.8017683029174805,
"reward_std": 0.05147233232855797,
"rewards/accuracy_reward_stage2": 0.8017681837081909,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.484375,
"step": 943
},
{
"completion_length": 116.75,
"epoch": 0.233201581027668,
"grad_norm": 2.5798367724537314,
"kl": 0.08349609375,
"learning_rate": 7.670454545454545e-07,
"loss": 0.0033,
"reward": 2.5186541080474854,
"reward_std": 0.1710503101348877,
"rewards/accuracy_reward_stage2": 0.6436540484428406,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.59375,
"step": 944
},
{
"completion_length": 92.71875,
"epoch": 0.23344861660079053,
"grad_norm": 2.7387857072229442,
"kl": 0.07177734375,
"learning_rate": 7.66798418972332e-07,
"loss": 0.0029,
"reward": 2.7367796897888184,
"reward_std": 0.0510886088013649,
"rewards/accuracy_reward_stage2": 0.7367798089981079,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.3125,
"step": 945
},
{
"completion_length": 122.171875,
"epoch": 0.23369565217391305,
"grad_norm": 4.013586606877289,
"kl": 0.08154296875,
"learning_rate": 7.665513833992094e-07,
"loss": 0.0033,
"reward": 2.650787353515625,
"reward_std": 0.2529640793800354,
"rewards/accuracy_reward_stage2": 0.7757871150970459,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.484375,
"step": 946
},
{
"completion_length": 123.203125,
"epoch": 0.23394268774703558,
"grad_norm": 3.047862851518325,
"kl": 0.08251953125,
"learning_rate": 7.663043478260868e-07,
"loss": 0.0033,
"reward": 2.8821022510528564,
"reward_std": 0.11100947856903076,
"rewards/accuracy_reward_stage2": 0.8821022510528564,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.53125,
"step": 947
},
{
"completion_length": 114.65625,
"epoch": 0.2341897233201581,
"grad_norm": 3.592191483783041,
"kl": 0.07958984375,
"learning_rate": 7.660573122529644e-07,
"loss": 0.0032,
"reward": 2.639193534851074,
"reward_std": 0.07999169081449509,
"rewards/accuracy_reward_stage2": 0.6391934156417847,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.4375,
"step": 948
},
{
"completion_length": 84.3125,
"epoch": 0.23443675889328064,
"grad_norm": 2.1159024076815345,
"kl": 0.0751953125,
"learning_rate": 7.658102766798419e-07,
"loss": 0.003,
"reward": 2.8071794509887695,
"reward_std": 0.0024097806308418512,
"rewards/accuracy_reward_stage2": 0.8071794509887695,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.203125,
"step": 949
},
{
"completion_length": 115.6875,
"epoch": 0.23468379446640317,
"grad_norm": 2.8719086261669236,
"kl": 0.0888671875,
"learning_rate": 7.655632411067194e-07,
"loss": 0.0036,
"reward": 2.5976603031158447,
"reward_std": 0.19923239946365356,
"rewards/accuracy_reward_stage2": 0.7226603031158447,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.578125,
"step": 950
},
{
"completion_length": 130.765625,
"epoch": 0.2349308300395257,
"grad_norm": 2.8415116889005954,
"kl": 0.08203125,
"learning_rate": 7.653162055335968e-07,
"loss": 0.0033,
"reward": 2.8011350631713867,
"reward_std": 0.024894852191209793,
"rewards/accuracy_reward_stage2": 0.8011349439620972,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.484375,
"step": 951
},
{
"completion_length": 113.90625,
"epoch": 0.23517786561264822,
"grad_norm": 1.8274315476260714,
"kl": 0.09228515625,
"learning_rate": 7.650691699604743e-07,
"loss": 0.0037,
"reward": 2.7534830570220947,
"reward_std": 0.07084871828556061,
"rewards/accuracy_reward_stage2": 0.75348299741745,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.265625,
"step": 952
},
{
"completion_length": 111.015625,
"epoch": 0.23542490118577075,
"grad_norm": 4.2390429237173235,
"kl": 0.0927734375,
"learning_rate": 7.648221343873518e-07,
"loss": 0.0037,
"reward": 2.6470580101013184,
"reward_std": 0.18395009636878967,
"rewards/accuracy_reward_stage2": 0.7199746370315552,
"rewards/format_reward_all_stage": 1.9270833730697632,
"scores/refine_times": 1.515625,
"step": 953
},
{
"completion_length": 77.625,
"epoch": 0.23567193675889328,
"grad_norm": 4.251078064094217,
"kl": 0.10302734375,
"learning_rate": 7.645750988142292e-07,
"loss": 0.0041,
"reward": 2.7399420738220215,
"reward_std": 0.034756097942590714,
"rewards/accuracy_reward_stage2": 0.7399421334266663,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 954
},
{
"completion_length": 83.3125,
"epoch": 0.2359189723320158,
"grad_norm": 4.688994500071449,
"kl": 0.099609375,
"learning_rate": 7.643280632411066e-07,
"loss": 0.004,
"reward": 2.4010202884674072,
"reward_std": 0.06971799582242966,
"rewards/accuracy_reward_stage2": 0.40102022886276245,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.21875,
"step": 955
},
{
"completion_length": 90.90625,
"epoch": 0.23616600790513834,
"grad_norm": 3.0926787814732064,
"kl": 0.068359375,
"learning_rate": 7.640810276679841e-07,
"loss": 0.0027,
"reward": 2.7051496505737305,
"reward_std": 0.012495389208197594,
"rewards/accuracy_reward_stage2": 0.7051496505737305,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.203125,
"step": 956
},
{
"completion_length": 120.546875,
"epoch": 0.23641304347826086,
"grad_norm": 3.619910409738279,
"kl": 0.11669921875,
"learning_rate": 7.638339920948616e-07,
"loss": 0.0047,
"reward": 2.355205535888672,
"reward_std": 0.16237197816371918,
"rewards/accuracy_reward_stage2": 0.469789057970047,
"rewards/format_reward_all_stage": 1.8854166269302368,
"scores/refine_times": 1.53125,
"step": 957
},
{
"completion_length": 104.4375,
"epoch": 0.2366600790513834,
"grad_norm": 4.1469106870051835,
"kl": 0.109375,
"learning_rate": 7.635869565217391e-07,
"loss": 0.0044,
"reward": 2.559445858001709,
"reward_std": 0.15328779816627502,
"rewards/accuracy_reward_stage2": 0.5698623061180115,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.53125,
"step": 958
},
{
"completion_length": 79.09375,
"epoch": 0.23690711462450592,
"grad_norm": 1.1230604076719064,
"kl": 0.08935546875,
"learning_rate": 7.633399209486166e-07,
"loss": 0.0036,
"reward": 2.766244649887085,
"reward_std": 0.012647372670471668,
"rewards/accuracy_reward_stage2": 0.7662445902824402,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.15625,
"step": 959
},
{
"completion_length": 81.59375,
"epoch": 0.23715415019762845,
"grad_norm": 4.099677035647309,
"kl": 0.095703125,
"learning_rate": 7.63092885375494e-07,
"loss": 0.0038,
"reward": 2.7637486457824707,
"reward_std": 0.053550224751234055,
"rewards/accuracy_reward_stage2": 0.7637484073638916,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 960
},
{
"completion_length": 117.5,
"epoch": 0.23740118577075098,
"grad_norm": 2.4667498900768488,
"kl": 0.07568359375,
"learning_rate": 7.628458498023716e-07,
"loss": 0.003,
"reward": 2.700864791870117,
"reward_std": 0.13465310633182526,
"rewards/accuracy_reward_stage2": 0.7008647918701172,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.3125,
"step": 961
},
{
"completion_length": 68.125,
"epoch": 0.2376482213438735,
"grad_norm": 4.113070453187623,
"kl": 0.09033203125,
"learning_rate": 7.62598814229249e-07,
"loss": 0.0036,
"reward": 2.504481077194214,
"reward_std": 0.016891546547412872,
"rewards/accuracy_reward_stage2": 0.5044810771942139,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0,
"step": 962
},
{
"completion_length": 92.0625,
"epoch": 0.23789525691699603,
"grad_norm": 3.146318360416011,
"kl": 0.0703125,
"learning_rate": 7.623517786561264e-07,
"loss": 0.0028,
"reward": 2.723379135131836,
"reward_std": 0.16081024706363678,
"rewards/accuracy_reward_stage2": 0.8483791351318359,
"rewards/format_reward_all_stage": 1.875,
"scores/refine_times": 1.203125,
"step": 963
},
{
"completion_length": 101.640625,
"epoch": 0.2381422924901186,
"grad_norm": 1.2878437891896688,
"kl": 0.0810546875,
"learning_rate": 7.621047430830039e-07,
"loss": 0.0032,
"reward": 2.9084339141845703,
"reward_std": 0.018986623734235764,
"rewards/accuracy_reward_stage2": 0.9084337949752808,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.46875,
"step": 964
},
{
"completion_length": 95.4375,
"epoch": 0.23838932806324112,
"grad_norm": 3.417312289254013,
"kl": 0.08740234375,
"learning_rate": 7.618577075098814e-07,
"loss": 0.0035,
"reward": 2.6807949542999268,
"reward_std": 0.14062517881393433,
"rewards/accuracy_reward_stage2": 0.6964200735092163,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.296875,
"step": 965
},
{
"completion_length": 77.859375,
"epoch": 0.23863636363636365,
"grad_norm": 4.137033586610404,
"kl": 0.08251953125,
"learning_rate": 7.616106719367588e-07,
"loss": 0.0033,
"reward": 2.4498672485351562,
"reward_std": 0.012593725696206093,
"rewards/accuracy_reward_stage2": 0.4498673677444458,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 966
},
{
"completion_length": 97.296875,
"epoch": 0.23888339920948617,
"grad_norm": 3.496193472267087,
"kl": 0.0927734375,
"learning_rate": 7.613636363636364e-07,
"loss": 0.0037,
"reward": 2.7825238704681396,
"reward_std": 0.07081930339336395,
"rewards/accuracy_reward_stage2": 0.7825238704681396,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.28125,
"step": 967
},
{
"completion_length": 91.4375,
"epoch": 0.2391304347826087,
"grad_norm": 3.9040594654045284,
"kl": 0.0849609375,
"learning_rate": 7.611166007905138e-07,
"loss": 0.0034,
"reward": 2.6640286445617676,
"reward_std": 0.02129337564110756,
"rewards/accuracy_reward_stage2": 0.6640284657478333,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.265625,
"step": 968
},
{
"completion_length": 85.109375,
"epoch": 0.23937747035573123,
"grad_norm": 4.528559323672272,
"kl": 0.083984375,
"learning_rate": 7.608695652173913e-07,
"loss": 0.0034,
"reward": 2.4769628047943115,
"reward_std": 0.14590422809123993,
"rewards/accuracy_reward_stage2": 0.4769628345966339,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.203125,
"step": 969
},
{
"completion_length": 85.4375,
"epoch": 0.23962450592885376,
"grad_norm": 2.181031316308596,
"kl": 0.11279296875,
"learning_rate": 7.606225296442688e-07,
"loss": 0.0045,
"reward": 2.686516284942627,
"reward_std": 0.03785046935081482,
"rewards/accuracy_reward_stage2": 0.6865162253379822,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.3125,
"step": 970
},
{
"completion_length": 105.796875,
"epoch": 0.2398715415019763,
"grad_norm": 2.563633299576689,
"kl": 0.1435546875,
"learning_rate": 7.603754940711462e-07,
"loss": 0.0057,
"reward": 2.738003730773926,
"reward_std": 0.037225548177957535,
"rewards/accuracy_reward_stage2": 0.7380036115646362,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.515625,
"step": 971
},
{
"completion_length": 85.921875,
"epoch": 0.24011857707509882,
"grad_norm": 5.044397747161594,
"kl": 0.10302734375,
"learning_rate": 7.601284584980236e-07,
"loss": 0.0041,
"reward": 2.5778369903564453,
"reward_std": 0.07295674085617065,
"rewards/accuracy_reward_stage2": 0.5934619307518005,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.203125,
"step": 972
},
{
"completion_length": 94.234375,
"epoch": 0.24036561264822134,
"grad_norm": 4.030016499108354,
"kl": 0.09228515625,
"learning_rate": 7.598814229249012e-07,
"loss": 0.0037,
"reward": 2.612272262573242,
"reward_std": 0.03439907357096672,
"rewards/accuracy_reward_stage2": 0.6122722029685974,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.421875,
"step": 973
},
{
"completion_length": 100.96875,
"epoch": 0.24061264822134387,
"grad_norm": 2.7858570805718648,
"kl": 0.0751953125,
"learning_rate": 7.596343873517786e-07,
"loss": 0.003,
"reward": 2.638523578643799,
"reward_std": 0.0695050060749054,
"rewards/accuracy_reward_stage2": 0.6385236978530884,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.40625,
"step": 974
},
{
"completion_length": 100.5,
"epoch": 0.2408596837944664,
"grad_norm": 3.356791391332148,
"kl": 0.1005859375,
"learning_rate": 7.59387351778656e-07,
"loss": 0.004,
"reward": 2.85272216796875,
"reward_std": 0.05522162467241287,
"rewards/accuracy_reward_stage2": 0.85272216796875,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.453125,
"step": 975
},
{
"completion_length": 90.484375,
"epoch": 0.24110671936758893,
"grad_norm": 2.5461528562751194,
"kl": 0.099609375,
"learning_rate": 7.591403162055335e-07,
"loss": 0.004,
"reward": 2.7427287101745605,
"reward_std": 0.06681530922651291,
"rewards/accuracy_reward_stage2": 0.7427287697792053,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.375,
"step": 976
},
{
"completion_length": 99.84375,
"epoch": 0.24135375494071146,
"grad_norm": 3.1758314697986822,
"kl": 0.1171875,
"learning_rate": 7.58893280632411e-07,
"loss": 0.0047,
"reward": 2.753756523132324,
"reward_std": 0.08295577764511108,
"rewards/accuracy_reward_stage2": 0.7537566423416138,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.46875,
"step": 977
},
{
"completion_length": 69.4375,
"epoch": 0.24160079051383399,
"grad_norm": 2.2364912811701285,
"kl": 0.109375,
"learning_rate": 7.586462450592886e-07,
"loss": 0.0044,
"reward": 2.868748188018799,
"reward_std": 0.00799154955893755,
"rewards/accuracy_reward_stage2": 0.8687483072280884,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 978
},
{
"completion_length": 92.125,
"epoch": 0.2418478260869565,
"grad_norm": 3.7954929280625547,
"kl": 0.08251953125,
"learning_rate": 7.58399209486166e-07,
"loss": 0.0033,
"reward": 2.7547996044158936,
"reward_std": 0.14635036885738373,
"rewards/accuracy_reward_stage2": 0.8199037313461304,
"rewards/format_reward_all_stage": 1.9348958730697632,
"scores/refine_times": 1.40625,
"step": 979
},
{
"completion_length": 75.46875,
"epoch": 0.24209486166007904,
"grad_norm": 2.5535180288665376,
"kl": 0.09130859375,
"learning_rate": 7.581521739130434e-07,
"loss": 0.0037,
"reward": 2.840176582336426,
"reward_std": 0.07781560719013214,
"rewards/accuracy_reward_stage2": 0.8401765823364258,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.296875,
"step": 980
},
{
"completion_length": 133.421875,
"epoch": 0.24234189723320157,
"grad_norm": 2.3731209551138543,
"kl": 0.0712890625,
"learning_rate": 7.579051383399209e-07,
"loss": 0.0029,
"reward": 2.651949882507324,
"reward_std": 0.11739328503608704,
"rewards/accuracy_reward_stage2": 0.7144500017166138,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.90625,
"step": 981
},
{
"completion_length": 84.84375,
"epoch": 0.2425889328063241,
"grad_norm": 5.0187188109372975,
"kl": 0.0966796875,
"learning_rate": 7.576581027667984e-07,
"loss": 0.0039,
"reward": 2.6951634883880615,
"reward_std": 0.04134798049926758,
"rewards/accuracy_reward_stage2": 0.6951634883880615,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.296875,
"step": 982
},
{
"completion_length": 84.671875,
"epoch": 0.24283596837944665,
"grad_norm": 2.31892736773026,
"kl": 0.07080078125,
"learning_rate": 7.574110671936758e-07,
"loss": 0.0028,
"reward": 2.881075859069824,
"reward_std": 0.06218536198139191,
"rewards/accuracy_reward_stage2": 0.8914925456047058,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.1875,
"step": 983
},
{
"completion_length": 101.5,
"epoch": 0.24308300395256918,
"grad_norm": 4.532001278651413,
"kl": 0.09228515625,
"learning_rate": 7.571640316205533e-07,
"loss": 0.0037,
"reward": 2.686784505844116,
"reward_std": 0.14913466572761536,
"rewards/accuracy_reward_stage2": 0.6945970058441162,
"rewards/format_reward_all_stage": 1.9921875,
"scores/refine_times": 1.359375,
"step": 984
},
{
"completion_length": 78.65625,
"epoch": 0.2433300395256917,
"grad_norm": 2.427351371197481,
"kl": 0.1025390625,
"learning_rate": 7.569169960474307e-07,
"loss": 0.0041,
"reward": 2.7506766319274902,
"reward_std": 0.001765109016560018,
"rewards/accuracy_reward_stage2": 0.7506764531135559,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.25,
"step": 985
},
{
"completion_length": 69.65625,
"epoch": 0.24357707509881424,
"grad_norm": 4.71948540499666,
"kl": 0.095703125,
"learning_rate": 7.566699604743084e-07,
"loss": 0.0038,
"reward": 2.7605538368225098,
"reward_std": 0.06787852942943573,
"rewards/accuracy_reward_stage2": 0.7605538964271545,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 986
},
{
"completion_length": 62.53125,
"epoch": 0.24382411067193677,
"grad_norm": 2.790062335645781,
"kl": 0.095703125,
"learning_rate": 7.564229249011858e-07,
"loss": 0.0038,
"reward": 2.872288703918457,
"reward_std": 0.0004953596508130431,
"rewards/accuracy_reward_stage2": 0.8722887635231018,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.0625,
"step": 987
},
{
"completion_length": 92.09375,
"epoch": 0.2440711462450593,
"grad_norm": 2.5955491599396074,
"kl": 0.0712890625,
"learning_rate": 7.561758893280632e-07,
"loss": 0.0029,
"reward": 2.7611703872680664,
"reward_std": 0.0292732622474432,
"rewards/accuracy_reward_stage2": 0.7611702680587769,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.34375,
"step": 988
},
{
"completion_length": 124.15625,
"epoch": 0.24431818181818182,
"grad_norm": 2.4770893409000316,
"kl": 0.07470703125,
"learning_rate": 7.559288537549407e-07,
"loss": 0.003,
"reward": 2.566910743713379,
"reward_std": 0.079023078083992,
"rewards/accuracy_reward_stage2": 0.5669107437133789,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.65625,
"step": 989
},
{
"completion_length": 88.0,
"epoch": 0.24456521739130435,
"grad_norm": 4.556477660838416,
"kl": 0.0869140625,
"learning_rate": 7.556818181818182e-07,
"loss": 0.0035,
"reward": 2.6633074283599854,
"reward_std": 0.09606172144412994,
"rewards/accuracy_reward_stage2": 0.6633073687553406,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.125,
"step": 990
},
{
"completion_length": 104.796875,
"epoch": 0.24481225296442688,
"grad_norm": 2.6122086287602713,
"kl": 0.07666015625,
"learning_rate": 7.554347826086956e-07,
"loss": 0.0031,
"reward": 2.571780204772949,
"reward_std": 0.07648254930973053,
"rewards/accuracy_reward_stage2": 0.5717802047729492,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.453125,
"step": 991
},
{
"completion_length": 76.59375,
"epoch": 0.2450592885375494,
"grad_norm": 4.7804674694975775,
"kl": 0.07080078125,
"learning_rate": 7.551877470355731e-07,
"loss": 0.0028,
"reward": 2.714232921600342,
"reward_std": 0.06315543502569199,
"rewards/accuracy_reward_stage2": 0.7142329216003418,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.109375,
"step": 992
},
{
"completion_length": 95.734375,
"epoch": 0.24530632411067194,
"grad_norm": 2.5192200109285294,
"kl": 0.08984375,
"learning_rate": 7.549407114624505e-07,
"loss": 0.0036,
"reward": 2.724212408065796,
"reward_std": 0.016075868159532547,
"rewards/accuracy_reward_stage2": 0.7242124676704407,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.328125,
"step": 993
},
{
"completion_length": 91.53125,
"epoch": 0.24555335968379446,
"grad_norm": 3.1808436621622116,
"kl": 0.07470703125,
"learning_rate": 7.546936758893279e-07,
"loss": 0.003,
"reward": 2.5958046913146973,
"reward_std": 0.02057287096977234,
"rewards/accuracy_reward_stage2": 0.595804750919342,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.234375,
"step": 994
},
{
"completion_length": 88.078125,
"epoch": 0.245800395256917,
"grad_norm": 3.075094133501221,
"kl": 0.08544921875,
"learning_rate": 7.544466403162056e-07,
"loss": 0.0034,
"reward": 2.7611076831817627,
"reward_std": 0.09097757935523987,
"rewards/accuracy_reward_stage2": 0.7611076831817627,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.234375,
"step": 995
},
{
"completion_length": 106.84375,
"epoch": 0.24604743083003952,
"grad_norm": 2.2773682917414493,
"kl": 0.060546875,
"learning_rate": 7.54199604743083e-07,
"loss": 0.0024,
"reward": 2.7901453971862793,
"reward_std": 0.0059069436974823475,
"rewards/accuracy_reward_stage2": 0.7901455163955688,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.328125,
"step": 996
},
{
"completion_length": 103.9375,
"epoch": 0.24629446640316205,
"grad_norm": 3.5787808365072875,
"kl": 0.09765625,
"learning_rate": 7.539525691699604e-07,
"loss": 0.0039,
"reward": 2.538865566253662,
"reward_std": 0.17112267017364502,
"rewards/accuracy_reward_stage2": 0.5388656258583069,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.375,
"step": 997
},
{
"completion_length": 122.25,
"epoch": 0.24654150197628458,
"grad_norm": 3.092097391385804,
"kl": 0.0625,
"learning_rate": 7.537055335968379e-07,
"loss": 0.0025,
"reward": 2.701768636703491,
"reward_std": 0.07486184686422348,
"rewards/accuracy_reward_stage2": 0.7017685770988464,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.453125,
"step": 998
},
{
"completion_length": 106.09375,
"epoch": 0.2467885375494071,
"grad_norm": 3.818115552269006,
"kl": 0.08837890625,
"learning_rate": 7.534584980237154e-07,
"loss": 0.0035,
"reward": 2.655691385269165,
"reward_std": 0.1008155569434166,
"rewards/accuracy_reward_stage2": 0.6556915044784546,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.390625,
"step": 999
},
{
"completion_length": 75.1875,
"epoch": 0.24703557312252963,
"grad_norm": 1.9883109917462498,
"kl": 0.076171875,
"learning_rate": 7.532114624505929e-07,
"loss": 0.003,
"reward": 2.7738916873931885,
"reward_std": 0.03292723000049591,
"rewards/accuracy_reward_stage2": 0.7738916873931885,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.171875,
"step": 1000
},
{
"completion_length": 115.71875,
"epoch": 0.24728260869565216,
"grad_norm": 3.1982546373064786,
"kl": 0.09033203125,
"learning_rate": 7.529644268774703e-07,
"loss": 0.0036,
"reward": 2.843109607696533,
"reward_std": 0.12261004000902176,
"rewards/accuracy_reward_stage2": 0.8431097269058228,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.484375,
"step": 1001
},
{
"completion_length": 101.109375,
"epoch": 0.2475296442687747,
"grad_norm": 2.574042445943342,
"kl": 0.06689453125,
"learning_rate": 7.527173913043477e-07,
"loss": 0.0027,
"reward": 2.5342187881469727,
"reward_std": 0.12293697893619537,
"rewards/accuracy_reward_stage2": 0.5420314073562622,
"rewards/format_reward_all_stage": 1.9921875,
"scores/refine_times": 1.390625,
"step": 1002
},
{
"completion_length": 93.890625,
"epoch": 0.24777667984189725,
"grad_norm": 0.19752743577346163,
"kl": 0.0615234375,
"learning_rate": 7.524703557312253e-07,
"loss": 0.0025,
"reward": 2.5,
"reward_std": 0.0,
"rewards/accuracy_reward_stage2": 0.5,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.3125,
"step": 1003
},
{
"completion_length": 74.140625,
"epoch": 0.24802371541501977,
"grad_norm": 3.25297354456302,
"kl": 0.115234375,
"learning_rate": 7.522233201581028e-07,
"loss": 0.0046,
"reward": 2.7395200729370117,
"reward_std": 0.0691099762916565,
"rewards/accuracy_reward_stage2": 0.7395201921463013,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 1004
},
{
"completion_length": 101.265625,
"epoch": 0.2482707509881423,
"grad_norm": 2.786549692375035,
"kl": 0.07421875,
"learning_rate": 7.519762845849802e-07,
"loss": 0.003,
"reward": 2.7277092933654785,
"reward_std": 0.08703543990850449,
"rewards/accuracy_reward_stage2": 0.7277094125747681,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.34375,
"step": 1005
},
{
"completion_length": 73.265625,
"epoch": 0.24851778656126483,
"grad_norm": 2.9933686564396313,
"kl": 0.06396484375,
"learning_rate": 7.517292490118577e-07,
"loss": 0.0026,
"reward": 2.625016212463379,
"reward_std": 0.05336294695734978,
"rewards/accuracy_reward_stage2": 0.6250161528587341,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.140625,
"step": 1006
},
{
"completion_length": 90.71875,
"epoch": 0.24876482213438736,
"grad_norm": 3.37668174197228,
"kl": 0.07275390625,
"learning_rate": 7.514822134387352e-07,
"loss": 0.0029,
"reward": 2.533514976501465,
"reward_std": 0.07456710934638977,
"rewards/accuracy_reward_stage2": 0.5335150957107544,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.203125,
"step": 1007
},
{
"completion_length": 109.453125,
"epoch": 0.2490118577075099,
"grad_norm": 3.9869505928266453,
"kl": 0.10546875,
"learning_rate": 7.512351778656126e-07,
"loss": 0.0042,
"reward": 2.6633753776550293,
"reward_std": 0.08245876431465149,
"rewards/accuracy_reward_stage2": 0.6633754372596741,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.359375,
"step": 1008
},
{
"completion_length": 101.828125,
"epoch": 0.24925889328063242,
"grad_norm": 3.3323257698984188,
"kl": 0.06884765625,
"learning_rate": 7.509881422924901e-07,
"loss": 0.0028,
"reward": 2.789440870285034,
"reward_std": 0.11122827976942062,
"rewards/accuracy_reward_stage2": 0.7894407510757446,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.28125,
"step": 1009
},
{
"completion_length": 154.296875,
"epoch": 0.24950592885375494,
"grad_norm": 3.1765086057799063,
"kl": 0.07373046875,
"learning_rate": 7.507411067193675e-07,
"loss": 0.0029,
"reward": 2.5902161598205566,
"reward_std": 0.2405627965927124,
"rewards/accuracy_reward_stage2": 0.615216076374054,
"rewards/format_reward_all_stage": 1.975000023841858,
"scores/refine_times": 2.078125,
"step": 1010
},
{
"completion_length": 113.5625,
"epoch": 0.24975296442687747,
"grad_norm": 3.620470729082483,
"kl": 0.0830078125,
"learning_rate": 7.50494071146245e-07,
"loss": 0.0033,
"reward": 2.6796677112579346,
"reward_std": 0.16890506446361542,
"rewards/accuracy_reward_stage2": 0.6796677112579346,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.5625,
"step": 1011
},
{
"completion_length": 124.171875,
"epoch": 0.25,
"grad_norm": 3.01631508432083,
"kl": 0.10400390625,
"learning_rate": 7.502470355731225e-07,
"loss": 0.0042,
"reward": 2.738471269607544,
"reward_std": 0.10241246223449707,
"rewards/accuracy_reward_stage2": 0.7384714484214783,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.859375,
"step": 1012
},
{
"completion_length": 148.671875,
"epoch": 0.25024703557312256,
"grad_norm": 1.4692891485044033,
"kl": 0.076171875,
"learning_rate": 7.5e-07,
"loss": 0.003,
"reward": 2.6656787395477295,
"reward_std": 0.08242332935333252,
"rewards/accuracy_reward_stage2": 0.6656786799430847,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.984375,
"step": 1013
},
{
"completion_length": 130.828125,
"epoch": 0.25049407114624506,
"grad_norm": 2.7604690193081596,
"kl": 0.07666015625,
"learning_rate": 7.497529644268775e-07,
"loss": 0.0031,
"reward": 2.571310520172119,
"reward_std": 0.060206782072782516,
"rewards/accuracy_reward_stage2": 0.5713105201721191,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.875,
"step": 1014
},
{
"completion_length": 90.125,
"epoch": 0.2507411067193676,
"grad_norm": 3.249774020301462,
"kl": 0.0703125,
"learning_rate": 7.495059288537549e-07,
"loss": 0.0028,
"reward": 2.7852025032043457,
"reward_std": 0.005425943061709404,
"rewards/accuracy_reward_stage2": 0.7852025628089905,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.265625,
"step": 1015
},
{
"completion_length": 90.21875,
"epoch": 0.2509881422924901,
"grad_norm": 3.406147860849275,
"kl": 0.0791015625,
"learning_rate": 7.492588932806324e-07,
"loss": 0.0032,
"reward": 2.6671500205993652,
"reward_std": 0.06550855934619904,
"rewards/accuracy_reward_stage2": 0.6671501994132996,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.296875,
"step": 1016
},
{
"completion_length": 119.359375,
"epoch": 0.25123517786561267,
"grad_norm": 2.5853308765349716,
"kl": 0.0634765625,
"learning_rate": 7.490118577075099e-07,
"loss": 0.0025,
"reward": 2.4825072288513184,
"reward_std": 0.08852247148752213,
"rewards/accuracy_reward_stage2": 0.4825071692466736,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.46875,
"step": 1017
},
{
"completion_length": 108.234375,
"epoch": 0.25148221343873517,
"grad_norm": 0.17500328397226975,
"kl": 0.04931640625,
"learning_rate": 7.487648221343873e-07,
"loss": 0.002,
"reward": 3.0,
"reward_std": 0.0,
"rewards/accuracy_reward_stage2": 1.0,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.421875,
"step": 1018
},
{
"completion_length": 114.9375,
"epoch": 0.2517292490118577,
"grad_norm": 1.8677219538650585,
"kl": 0.06298828125,
"learning_rate": 7.485177865612647e-07,
"loss": 0.0025,
"reward": 2.8387131690979004,
"reward_std": 0.03201249614357948,
"rewards/accuracy_reward_stage2": 0.8387130498886108,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.515625,
"step": 1019
},
{
"completion_length": 116.5,
"epoch": 0.2519762845849802,
"grad_norm": 3.918756506685844,
"kl": 0.08935546875,
"learning_rate": 7.482707509881423e-07,
"loss": 0.0036,
"reward": 2.706343650817871,
"reward_std": 0.1193799301981926,
"rewards/accuracy_reward_stage2": 0.7688437700271606,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.546875,
"step": 1020
},
{
"completion_length": 103.046875,
"epoch": 0.2522233201581028,
"grad_norm": 2.2787870909974424,
"kl": 0.099609375,
"learning_rate": 7.480237154150197e-07,
"loss": 0.004,
"reward": 2.6051716804504395,
"reward_std": 0.0634067952632904,
"rewards/accuracy_reward_stage2": 0.6114215850830078,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.546875,
"step": 1021
},
{
"completion_length": 140.375,
"epoch": 0.2524703557312253,
"grad_norm": 1.5786423143571975,
"kl": 0.0634765625,
"learning_rate": 7.477766798418971e-07,
"loss": 0.0025,
"reward": 2.8234591484069824,
"reward_std": 0.1224951446056366,
"rewards/accuracy_reward_stage2": 0.8234590888023376,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.90625,
"step": 1022
},
{
"completion_length": 130.015625,
"epoch": 0.25271739130434784,
"grad_norm": 3.6531054074473466,
"kl": 0.0654296875,
"learning_rate": 7.475296442687747e-07,
"loss": 0.0026,
"reward": 2.7177672386169434,
"reward_std": 0.05522045120596886,
"rewards/accuracy_reward_stage2": 0.7177671194076538,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.53125,
"step": 1023
},
{
"completion_length": 138.125,
"epoch": 0.25296442687747034,
"grad_norm": 1.4829799008640552,
"kl": 0.09619140625,
"learning_rate": 7.472826086956522e-07,
"loss": 0.0038,
"reward": 2.8771204948425293,
"reward_std": 0.04724188148975372,
"rewards/accuracy_reward_stage2": 0.8911830186843872,
"rewards/format_reward_all_stage": 1.985937476158142,
"scores/refine_times": 1.96875,
"step": 1024
},
{
"completion_length": 127.140625,
"epoch": 0.2532114624505929,
"grad_norm": 1.5881287321745357,
"kl": 0.0771484375,
"learning_rate": 7.470355731225296e-07,
"loss": 0.0031,
"reward": 2.7447822093963623,
"reward_std": 0.0155083192512393,
"rewards/accuracy_reward_stage2": 0.7447823286056519,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.546875,
"step": 1025
},
{
"completion_length": 84.609375,
"epoch": 0.2534584980237154,
"grad_norm": 3.0329888153387974,
"kl": 0.07080078125,
"learning_rate": 7.467885375494071e-07,
"loss": 0.0028,
"reward": 2.5835673809051514,
"reward_std": 0.016321806237101555,
"rewards/accuracy_reward_stage2": 0.5835674405097961,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.203125,
"step": 1026
},
{
"completion_length": 115.34375,
"epoch": 0.25370553359683795,
"grad_norm": 3.298502539047111,
"kl": 0.0654296875,
"learning_rate": 7.465415019762845e-07,
"loss": 0.0026,
"reward": 2.6484375,
"reward_std": 0.12232004851102829,
"rewards/accuracy_reward_stage2": 0.65625,
"rewards/format_reward_all_stage": 1.9921875,
"scores/refine_times": 1.515625,
"step": 1027
},
{
"completion_length": 131.609375,
"epoch": 0.25395256916996045,
"grad_norm": 1.8281242653098133,
"kl": 0.05712890625,
"learning_rate": 7.462944664031621e-07,
"loss": 0.0023,
"reward": 2.6590847969055176,
"reward_std": 0.05560882389545441,
"rewards/accuracy_reward_stage2": 0.6590847373008728,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.625,
"step": 1028
},
{
"completion_length": 123.59375,
"epoch": 0.254199604743083,
"grad_norm": 4.069162484990196,
"kl": 0.078125,
"learning_rate": 7.460474308300395e-07,
"loss": 0.0031,
"reward": 2.5520071983337402,
"reward_std": 0.1841985285282135,
"rewards/accuracy_reward_stage2": 0.5582571029663086,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.625,
"step": 1029
},
{
"completion_length": 110.25,
"epoch": 0.2544466403162055,
"grad_norm": 3.208104168556757,
"kl": 0.1005859375,
"learning_rate": 7.458003952569169e-07,
"loss": 0.004,
"reward": 2.6219356060028076,
"reward_std": 0.02025969699025154,
"rewards/accuracy_reward_stage2": 0.6219354867935181,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.390625,
"step": 1030
},
{
"completion_length": 92.171875,
"epoch": 0.25469367588932806,
"grad_norm": 2.5573874648011263,
"kl": 0.07666015625,
"learning_rate": 7.455533596837944e-07,
"loss": 0.0031,
"reward": 2.6007657051086426,
"reward_std": 0.07939323782920837,
"rewards/accuracy_reward_stage2": 0.6007658243179321,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.1875,
"step": 1031
},
{
"completion_length": 114.59375,
"epoch": 0.2549407114624506,
"grad_norm": 1.6819453240515354,
"kl": 0.06884765625,
"learning_rate": 7.45306324110672e-07,
"loss": 0.0028,
"reward": 2.874180555343628,
"reward_std": 0.039452213793992996,
"rewards/accuracy_reward_stage2": 0.8845971822738647,
"rewards/format_reward_all_stage": 1.9895832538604736,
"scores/refine_times": 1.546875,
"step": 1032
},
{
"completion_length": 94.640625,
"epoch": 0.2551877470355731,
"grad_norm": 3.445762215503828,
"kl": 0.09033203125,
"learning_rate": 7.450592885375494e-07,
"loss": 0.0036,
"reward": 2.756856918334961,
"reward_std": 0.0909591019153595,
"rewards/accuracy_reward_stage2": 0.7568570375442505,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.375,
"step": 1033
},
{
"completion_length": 117.0625,
"epoch": 0.2554347826086957,
"grad_norm": 3.6510651933840377,
"kl": 0.07958984375,
"learning_rate": 7.448122529644269e-07,
"loss": 0.0032,
"reward": 2.7670958042144775,
"reward_std": 0.1569576859474182,
"rewards/accuracy_reward_stage2": 0.7733457684516907,
"rewards/format_reward_all_stage": 1.993749976158142,
"scores/refine_times": 1.40625,
"step": 1034
},
{
"completion_length": 84.296875,
"epoch": 0.2556818181818182,
"grad_norm": 2.941993649127529,
"kl": 0.10400390625,
"learning_rate": 7.445652173913043e-07,
"loss": 0.0042,
"reward": 2.608677625656128,
"reward_std": 0.01940278336405754,
"rewards/accuracy_reward_stage2": 0.6086776256561279,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.09375,
"step": 1035
},
{
"completion_length": 141.015625,
"epoch": 0.25592885375494073,
"grad_norm": 2.011819562020041,
"kl": 0.091796875,
"learning_rate": 7.443181818181817e-07,
"loss": 0.0037,
"reward": 2.705474853515625,
"reward_std": 0.15190356969833374,
"rewards/accuracy_reward_stage2": 0.7408912777900696,
"rewards/format_reward_all_stage": 1.964583396911621,
"scores/refine_times": 1.75,
"step": 1036
},
{
"completion_length": 112.796875,
"epoch": 0.25617588932806323,
"grad_norm": 1.4643025390300515,
"kl": 0.099609375,
"learning_rate": 7.440711462450593e-07,
"loss": 0.004,
"reward": 2.752537727355957,
"reward_std": 0.04632541537284851,
"rewards/accuracy_reward_stage2": 0.7603504061698914,
"rewards/format_reward_all_stage": 1.9921875,
"scores/refine_times": 1.578125,
"step": 1037
},
{
"completion_length": 119.546875,
"epoch": 0.2564229249011858,
"grad_norm": 2.068330713002787,
"kl": 0.0654296875,
"learning_rate": 7.438241106719367e-07,
"loss": 0.0026,
"reward": 2.9245011806488037,
"reward_std": 0.0003982662165071815,
"rewards/accuracy_reward_stage2": 0.9245011806488037,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.484375,
"step": 1038
},
{
"completion_length": 76.03125,
"epoch": 0.2566699604743083,
"grad_norm": 3.220272666537123,
"kl": 0.08154296875,
"learning_rate": 7.435770750988141e-07,
"loss": 0.0033,
"reward": 2.675968885421753,
"reward_std": 0.027604416012763977,
"rewards/accuracy_reward_stage2": 0.6759688854217529,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.09375,
"step": 1039
},
{
"completion_length": 114.046875,
"epoch": 0.25691699604743085,
"grad_norm": 2.6078505140682116,
"kl": 0.09619140625,
"learning_rate": 7.433300395256916e-07,
"loss": 0.0039,
"reward": 2.783844232559204,
"reward_std": 0.16356155276298523,
"rewards/accuracy_reward_stage2": 0.7838441133499146,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.53125,
"step": 1040
},
{
"completion_length": 104.578125,
"epoch": 0.25716403162055335,
"grad_norm": 2.81805356497672,
"kl": 0.09228515625,
"learning_rate": 7.430830039525692e-07,
"loss": 0.0037,
"reward": 2.5050501823425293,
"reward_std": 0.08182928711175919,
"rewards/accuracy_reward_stage2": 0.5050500631332397,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.4375,
"step": 1041
},
{
"completion_length": 95.8125,
"epoch": 0.2574110671936759,
"grad_norm": 2.0889313582541154,
"kl": 0.08154296875,
"learning_rate": 7.428359683794467e-07,
"loss": 0.0033,
"reward": 2.8057987689971924,
"reward_std": 0.03332037478685379,
"rewards/accuracy_reward_stage2": 0.8057988882064819,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.40625,
"step": 1042
},
{
"completion_length": 105.6875,
"epoch": 0.2576581027667984,
"grad_norm": 3.2267311653851065,
"kl": 0.1650390625,
"learning_rate": 7.425889328063241e-07,
"loss": 0.0066,
"reward": 2.6126112937927246,
"reward_std": 0.17364341020584106,
"rewards/accuracy_reward_stage2": 0.7454237937927246,
"rewards/format_reward_all_stage": 1.8671875,
"scores/refine_times": 1.5625,
"step": 1043
},
{
"completion_length": 100.609375,
"epoch": 0.25790513833992096,
"grad_norm": 1.925286487853523,
"kl": 0.09912109375,
"learning_rate": 7.423418972332015e-07,
"loss": 0.004,
"reward": 2.8033337593078613,
"reward_std": 0.09121645987033844,
"rewards/accuracy_reward_stage2": 0.8189588189125061,
"rewards/format_reward_all_stage": 1.984375,
"scores/refine_times": 1.4375,
"step": 1044
},
{
"completion_length": 91.796875,
"epoch": 0.25815217391304346,
"grad_norm": 0.7462047196287118,
"kl": 0.0908203125,
"learning_rate": 7.420948616600791e-07,
"loss": 0.0036,
"reward": 2.6150918006896973,
"reward_std": 0.02063605561852455,
"rewards/accuracy_reward_stage2": 0.6150918006896973,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.25,
"step": 1045
},
{
"completion_length": 118.390625,
"epoch": 0.258399209486166,
"grad_norm": 2.5501600272917697,
"kl": 0.08837890625,
"learning_rate": 7.418478260869565e-07,
"loss": 0.0035,
"reward": 2.7188854217529297,
"reward_std": 0.07711507380008698,
"rewards/accuracy_reward_stage2": 0.7293022274971008,
"rewards/format_reward_all_stage": 1.9895833730697632,
"scores/refine_times": 1.546875,
"step": 1046
},
{
"completion_length": 96.46875,
"epoch": 0.2586462450592885,
"grad_norm": 2.45339267390217,
"kl": 0.0869140625,
"learning_rate": 7.416007905138339e-07,
"loss": 0.0035,
"reward": 2.712198257446289,
"reward_std": 0.06328752636909485,
"rewards/accuracy_reward_stage2": 0.7121983766555786,
"rewards/format_reward_all_stage": 2.0,
"scores/refine_times": 1.359375,
"step": 1047
},
{
"completion_length": 89.875,
"epoch": 0.25889328063241107,
"grad_norm": 3.8252492083394243,
"kl": 0.09814453125,
"learning_rate": 7.413537549407114e-07,
"loss": 0.0039,
"reward": 2.5751380920410156,
"reward_std": 0.16470584273338318,
"rewards/accuracy_reward_stage2": 0.6376380920410156,
"rewards/format_reward_all_stage": 1.9375,
"scores/refine_times": 1.203125,
"step": 1048
},
{
"completion_length": 138.625,
"epoch": 0.25914031620553357,
"grad_norm": 2.602693075049154,
"kl": 0.0869140625,
"learning_rate": 7.411067193675889e-07,
"loss": 0.0035,
"reward": 2.7950634956359863,
"reward_std": 0.13027089834213257,
"rewards/accuracy_reward_stage2": 0.8653761148452759,
"rewards/format_reward_all_stage": 1.9296875,
"scores/refine_times": 1.6875,
"step": 1049
},
{
"completion_length": 105.390625,
"epoch": 0.25938735177865613,
"grad_norm": 2.8055130377676445,
"kl": 0.0810546875,
"learning_rate": 7.408596837944664e-07,
"loss": 0.0032,
"reward": 2.751202344894409,
"reward_std": 0.08926315605640411,
"rewards/accuracy_reward_stage2": 0.818910539150238,
"rewards/format_reward_all_stage": 1.9322917461395264,
"scores/refine_times": 1.359375,
"step": 1050
}
],
"logging_steps": 1.0,
"max_steps": 4048,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}