| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0180722891566263, |
| "eval_steps": 500, |
| "global_step": 501, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 285.25, |
| "epoch": 0.0029850746268656717, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 1.4705882352941178e-07, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 1 |
| }, |
| { |
| "completion_length": 427.875, |
| "epoch": 0.005970149253731343, |
| "grad_norm": 0.22855468094348907, |
| "kl": 0.0, |
| "learning_rate": 2.9411764705882356e-07, |
| "loss": -0.0, |
| "reward": -0.24074998497962952, |
| "reward_std": 0.6006079912185669, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.2407499998807907, |
| "step": 2 |
| }, |
| { |
| "completion_length": 296.5, |
| "epoch": 0.008955223880597015, |
| "grad_norm": 8.133344090310857e-05, |
| "kl": 4.377714503789321e-06, |
| "learning_rate": 4.4117647058823536e-07, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 3 |
| }, |
| { |
| "completion_length": 238.375, |
| "epoch": 0.011940298507462687, |
| "grad_norm": 0.0003321934782434255, |
| "kl": 1.140108270192286e-05, |
| "learning_rate": 5.882352941176471e-07, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 4 |
| }, |
| { |
| "completion_length": 182.5, |
| "epoch": 0.014925373134328358, |
| "grad_norm": 0.3859254717826843, |
| "kl": 1.0751177796919364e-05, |
| "learning_rate": 7.352941176470589e-07, |
| "loss": 0.0, |
| "reward": -0.07850000262260437, |
| "reward_std": 0.22203154861927032, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.07850000262260437, |
| "step": 5 |
| }, |
| { |
| "completion_length": 311.75, |
| "epoch": 0.01791044776119403, |
| "grad_norm": 0.34084582328796387, |
| "kl": 8.012263606360648e-06, |
| "learning_rate": 8.823529411764707e-07, |
| "loss": 0.0, |
| "reward": -0.015250000171363354, |
| "reward_std": 0.04313351586461067, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.015250000171363354, |
| "step": 6 |
| }, |
| { |
| "completion_length": 190.75, |
| "epoch": 0.020895522388059702, |
| "grad_norm": 0.5919244885444641, |
| "kl": 9.190077435050625e-06, |
| "learning_rate": 1.0294117647058825e-06, |
| "loss": 0.0, |
| "reward": -0.013125000521540642, |
| "reward_std": 0.04896482080221176, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.013125000521540642, |
| "step": 7 |
| }, |
| { |
| "completion_length": 226.75, |
| "epoch": 0.023880597014925373, |
| "grad_norm": 0.31808775663375854, |
| "kl": 4.68343068860122e-06, |
| "learning_rate": 1.1764705882352942e-06, |
| "loss": 0.0, |
| "reward": -0.03175000101327896, |
| "reward_std": 0.0898025631904602, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.03175000101327896, |
| "step": 8 |
| }, |
| { |
| "completion_length": 418.75, |
| "epoch": 0.026865671641791045, |
| "grad_norm": 0.0001131733224610798, |
| "kl": 8.002484719327185e-06, |
| "learning_rate": 1.323529411764706e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 9 |
| }, |
| { |
| "completion_length": 579.625, |
| "epoch": 0.029850746268656716, |
| "grad_norm": 0.16198158264160156, |
| "kl": 6.150351055111969e-06, |
| "learning_rate": 1.4705882352941177e-06, |
| "loss": 0.0, |
| "reward": 0.015625, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015625, |
| "step": 10 |
| }, |
| { |
| "completion_length": 514.0, |
| "epoch": 0.03283582089552239, |
| "grad_norm": 0.1703915297985077, |
| "kl": 6.072848009353038e-06, |
| "learning_rate": 1.6176470588235297e-06, |
| "loss": 0.0, |
| "reward": -0.21412500739097595, |
| "reward_std": 0.6056370139122009, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.21412500739097595, |
| "step": 11 |
| }, |
| { |
| "completion_length": 459.875, |
| "epoch": 0.03582089552238806, |
| "grad_norm": 6.846042379038408e-05, |
| "kl": 4.143657861277461e-06, |
| "learning_rate": 1.7647058823529414e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 12 |
| }, |
| { |
| "completion_length": 530.375, |
| "epoch": 0.03880597014925373, |
| "grad_norm": 7.420629845000803e-05, |
| "kl": 5.574378064920893e-06, |
| "learning_rate": 1.9117647058823528e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 13 |
| }, |
| { |
| "completion_length": 493.875, |
| "epoch": 0.041791044776119404, |
| "grad_norm": 0.0001040869319695048, |
| "kl": 8.489936590194702e-06, |
| "learning_rate": 2.058823529411765e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 14 |
| }, |
| { |
| "completion_length": 614.125, |
| "epoch": 0.04477611940298507, |
| "grad_norm": 0.20984017848968506, |
| "kl": 1.4391247532330453e-05, |
| "learning_rate": 2.2058823529411767e-06, |
| "loss": 0.0, |
| "reward": -0.26112499833106995, |
| "reward_std": 0.5091128349304199, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.26112499833106995, |
| "step": 15 |
| }, |
| { |
| "completion_length": 512.375, |
| "epoch": 0.04776119402985075, |
| "grad_norm": 0.19785551726818085, |
| "kl": 8.07260767032858e-06, |
| "learning_rate": 2.3529411764705885e-06, |
| "loss": 0.0, |
| "reward": -0.2941250205039978, |
| "reward_std": 0.5475807785987854, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.2941250205039978, |
| "step": 16 |
| }, |
| { |
| "completion_length": 308.375, |
| "epoch": 0.050746268656716415, |
| "grad_norm": 0.33640098571777344, |
| "kl": 2.1692663722205907e-05, |
| "learning_rate": 2.5e-06, |
| "loss": 0.0, |
| "reward": 0.18700000643730164, |
| "reward_std": 0.7908055782318115, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.06300000101327896, |
| "step": 17 |
| }, |
| { |
| "completion_length": 663.125, |
| "epoch": 0.05373134328358209, |
| "grad_norm": 0.17784741520881653, |
| "kl": 1.2350877113931347e-05, |
| "learning_rate": 2.647058823529412e-06, |
| "loss": 0.0, |
| "reward": -0.1081250011920929, |
| "reward_std": 0.35900595784187317, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.1081250011920929, |
| "step": 18 |
| }, |
| { |
| "completion_length": 245.5, |
| "epoch": 0.056716417910447764, |
| "grad_norm": 0.0005393940955400467, |
| "kl": 1.7745833247317933e-05, |
| "learning_rate": 2.7941176470588237e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 19 |
| }, |
| { |
| "completion_length": 203.25, |
| "epoch": 0.05970149253731343, |
| "grad_norm": 0.35458770394325256, |
| "kl": 8.00752459326759e-05, |
| "learning_rate": 2.9411764705882355e-06, |
| "loss": 0.0, |
| "reward": -0.12062501162290573, |
| "reward_std": 0.5963314175605774, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.12062500417232513, |
| "step": 20 |
| }, |
| { |
| "completion_length": 381.25, |
| "epoch": 0.0626865671641791, |
| "grad_norm": 0.22898265719413757, |
| "kl": 3.1856085115578026e-05, |
| "learning_rate": 3.0882352941176476e-06, |
| "loss": 0.0, |
| "reward": -0.0364999994635582, |
| "reward_std": 0.1032375916838646, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.0364999994635582, |
| "step": 21 |
| }, |
| { |
| "completion_length": 369.5, |
| "epoch": 0.06567164179104477, |
| "grad_norm": 0.00023389812849927694, |
| "kl": 1.4598858797398861e-05, |
| "learning_rate": 3.2352941176470594e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 22 |
| }, |
| { |
| "completion_length": 350.625, |
| "epoch": 0.06865671641791045, |
| "grad_norm": 0.21955104172229767, |
| "kl": 7.147570431698114e-05, |
| "learning_rate": 3.382352941176471e-06, |
| "loss": 0.0, |
| "reward": -0.2084999978542328, |
| "reward_std": 0.5897271037101746, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.2084999978542328, |
| "step": 23 |
| }, |
| { |
| "completion_length": 418.25, |
| "epoch": 0.07164179104477612, |
| "grad_norm": 0.27754807472229004, |
| "kl": 6.455584662035108e-05, |
| "learning_rate": 3.529411764705883e-06, |
| "loss": 0.0, |
| "reward": -0.046875, |
| "reward_std": 0.48620080947875977, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.296875, |
| "step": 24 |
| }, |
| { |
| "completion_length": 342.625, |
| "epoch": 0.07462686567164178, |
| "grad_norm": 0.33675655722618103, |
| "kl": 0.000209408113732934, |
| "learning_rate": 3.6764705882352946e-06, |
| "loss": 0.0, |
| "reward": -0.242374986410141, |
| "reward_std": 0.45189568400382996, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.24237500131130219, |
| "step": 25 |
| }, |
| { |
| "completion_length": 249.25, |
| "epoch": 0.07761194029850746, |
| "grad_norm": 0.000760515860747546, |
| "kl": 6.706830026814714e-05, |
| "learning_rate": 3.8235294117647055e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 26 |
| }, |
| { |
| "completion_length": 261.125, |
| "epoch": 0.08059701492537313, |
| "grad_norm": 0.5024670362472534, |
| "kl": 0.00022520618222188205, |
| "learning_rate": 3.970588235294118e-06, |
| "loss": 0.0, |
| "reward": -0.016625000163912773, |
| "reward_std": 0.04702260345220566, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.016625000163912773, |
| "step": 27 |
| }, |
| { |
| "completion_length": 551.375, |
| "epoch": 0.08358208955223881, |
| "grad_norm": 0.20812274515628815, |
| "kl": 0.00023469090228900313, |
| "learning_rate": 4.11764705882353e-06, |
| "loss": 0.0, |
| "reward": -0.296750009059906, |
| "reward_std": 0.557201087474823, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.296750009059906, |
| "step": 28 |
| }, |
| { |
| "completion_length": 286.625, |
| "epoch": 0.08656716417910448, |
| "grad_norm": 0.48242834210395813, |
| "kl": 0.00043666691635735333, |
| "learning_rate": 4.264705882352942e-06, |
| "loss": 0.0, |
| "reward": 0.015625, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015625, |
| "step": 29 |
| }, |
| { |
| "completion_length": 320.25, |
| "epoch": 0.08955223880597014, |
| "grad_norm": 0.2443850338459015, |
| "kl": 0.001531625515781343, |
| "learning_rate": 4.411764705882353e-06, |
| "loss": 0.0001, |
| "reward": -0.11225000023841858, |
| "reward_std": 0.31749093532562256, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.11225000023841858, |
| "step": 30 |
| }, |
| { |
| "completion_length": 370.125, |
| "epoch": 0.09253731343283582, |
| "grad_norm": 0.28154098987579346, |
| "kl": 0.0002926041779574007, |
| "learning_rate": 4.558823529411765e-06, |
| "loss": 0.0, |
| "reward": 0.03137499839067459, |
| "reward_std": 0.05809582397341728, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03137499839067459, |
| "step": 31 |
| }, |
| { |
| "completion_length": 249.625, |
| "epoch": 0.0955223880597015, |
| "grad_norm": 0.0027966005727648735, |
| "kl": 0.0003874501271639019, |
| "learning_rate": 4.705882352941177e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 32 |
| }, |
| { |
| "completion_length": 282.625, |
| "epoch": 0.09850746268656717, |
| "grad_norm": 0.0010501514188945293, |
| "kl": 0.0001470946444896981, |
| "learning_rate": 4.852941176470589e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 33 |
| }, |
| { |
| "completion_length": 439.625, |
| "epoch": 0.10149253731343283, |
| "grad_norm": 0.0016689057229086757, |
| "kl": 0.00023773338762111962, |
| "learning_rate": 5e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 34 |
| }, |
| { |
| "completion_length": 677.375, |
| "epoch": 0.1044776119402985, |
| "grad_norm": 0.0020174484234303236, |
| "kl": 0.00034356946707703173, |
| "learning_rate": 4.999863832700438e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 35 |
| }, |
| { |
| "completion_length": 524.25, |
| "epoch": 0.10746268656716418, |
| "grad_norm": 0.24175743758678436, |
| "kl": 0.0005913617205806077, |
| "learning_rate": 4.9994553456349785e-06, |
| "loss": 0.0, |
| "reward": -0.1913750022649765, |
| "reward_std": 0.541290283203125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.1913750022649765, |
| "step": 36 |
| }, |
| { |
| "completion_length": 498.5, |
| "epoch": 0.11044776119402985, |
| "grad_norm": 0.007789173629134893, |
| "kl": 0.0014796899631619453, |
| "learning_rate": 4.998774583301685e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 37 |
| }, |
| { |
| "completion_length": 402.5, |
| "epoch": 0.11343283582089553, |
| "grad_norm": 0.2350546419620514, |
| "kl": 0.001118505373597145, |
| "learning_rate": 4.997821619858614e-06, |
| "loss": 0.0, |
| "reward": 0.04012499749660492, |
| "reward_std": 0.11349063366651535, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.20987500250339508, |
| "step": 38 |
| }, |
| { |
| "completion_length": 212.625, |
| "epoch": 0.11641791044776119, |
| "grad_norm": 0.594977855682373, |
| "kl": 0.0025897223968058825, |
| "learning_rate": 4.9965965591157314e-06, |
| "loss": 0.0001, |
| "reward": 0.21799999475479126, |
| "reward_std": 0.7819932699203491, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.03200000151991844, |
| "step": 39 |
| }, |
| { |
| "completion_length": 171.125, |
| "epoch": 0.11940298507462686, |
| "grad_norm": 0.3670198321342468, |
| "kl": 0.002329748822376132, |
| "learning_rate": 4.995099534523608e-06, |
| "loss": 0.0001, |
| "reward": -0.00037499889731407166, |
| "reward_std": 0.08900070190429688, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.00037499889731407166, |
| "step": 40 |
| }, |
| { |
| "completion_length": 363.75, |
| "epoch": 0.12238805970149254, |
| "grad_norm": 0.162104994058609, |
| "kl": 0.0005531070055440068, |
| "learning_rate": 4.993330709158879e-06, |
| "loss": 0.0, |
| "reward": 0.03125, |
| "reward_std": 0.0883883461356163, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03125, |
| "step": 41 |
| }, |
| { |
| "completion_length": 370.75, |
| "epoch": 0.1253731343283582, |
| "grad_norm": 0.22303998470306396, |
| "kl": 0.0011743077775463462, |
| "learning_rate": 4.991290275706486e-06, |
| "loss": 0.0, |
| "reward": 0.17225000262260437, |
| "reward_std": 0.8312016725540161, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.07774999737739563, |
| "step": 42 |
| }, |
| { |
| "completion_length": 585.75, |
| "epoch": 0.12835820895522387, |
| "grad_norm": 0.0013642405392602086, |
| "kl": 0.00035193481016904116, |
| "learning_rate": 4.988978456438678e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 43 |
| }, |
| { |
| "completion_length": 299.75, |
| "epoch": 0.13134328358208955, |
| "grad_norm": 0.0038114753551781178, |
| "kl": 0.0007628994644619524, |
| "learning_rate": 4.986395503190805e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 44 |
| }, |
| { |
| "completion_length": 483.875, |
| "epoch": 0.13432835820895522, |
| "grad_norm": 0.0009761088294908404, |
| "kl": 0.000256923318374902, |
| "learning_rate": 4.9835416973338815e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 45 |
| }, |
| { |
| "completion_length": 625.75, |
| "epoch": 0.1373134328358209, |
| "grad_norm": 0.0015355540672317147, |
| "kl": 0.0003070808306802064, |
| "learning_rate": 4.980417349743936e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 46 |
| }, |
| { |
| "completion_length": 371.0, |
| "epoch": 0.14029850746268657, |
| "grad_norm": 0.29057756066322327, |
| "kl": 0.0009468809003010392, |
| "learning_rate": 4.97702280076815e-06, |
| "loss": 0.0, |
| "reward": -0.05824999883770943, |
| "reward_std": 0.16475588083267212, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.05824999883770943, |
| "step": 47 |
| }, |
| { |
| "completion_length": 352.375, |
| "epoch": 0.14328358208955225, |
| "grad_norm": 0.2544783651828766, |
| "kl": 0.0009582972852513194, |
| "learning_rate": 4.973358420187776e-06, |
| "loss": 0.0, |
| "reward": 0.015625, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015625, |
| "step": 48 |
| }, |
| { |
| "completion_length": 366.375, |
| "epoch": 0.14626865671641792, |
| "grad_norm": 0.38926711678504944, |
| "kl": 0.001495029078796506, |
| "learning_rate": 4.969424607177861e-06, |
| "loss": 0.0001, |
| "reward": 0.03125, |
| "reward_std": 0.0883883461356163, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03125, |
| "step": 49 |
| }, |
| { |
| "completion_length": 339.0, |
| "epoch": 0.14925373134328357, |
| "grad_norm": 0.0019049796974286437, |
| "kl": 0.00044422244536690414, |
| "learning_rate": 4.96522179026376e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 50 |
| }, |
| { |
| "completion_length": 277.875, |
| "epoch": 0.15223880597014924, |
| "grad_norm": 0.28892993927001953, |
| "kl": 0.0017293869750574231, |
| "learning_rate": 4.960750427274458e-06, |
| "loss": 0.0001, |
| "reward": -0.054374996572732925, |
| "reward_std": 0.3191067576408386, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.054374996572732925, |
| "step": 51 |
| }, |
| { |
| "completion_length": 292.75, |
| "epoch": 0.15522388059701492, |
| "grad_norm": 0.0033381686080247164, |
| "kl": 0.0008934027864597738, |
| "learning_rate": 4.956011005292693e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 52 |
| }, |
| { |
| "completion_length": 356.875, |
| "epoch": 0.1582089552238806, |
| "grad_norm": 0.0015895323595032096, |
| "kl": 0.000459035363746807, |
| "learning_rate": 4.951004040601898e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 53 |
| }, |
| { |
| "completion_length": 355.0, |
| "epoch": 0.16119402985074627, |
| "grad_norm": 0.24429979920387268, |
| "kl": 0.0013310480862855911, |
| "learning_rate": 4.945730078629965e-06, |
| "loss": 0.0001, |
| "reward": -0.31862500309944153, |
| "reward_std": 0.4402158558368683, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.31862500309944153, |
| "step": 54 |
| }, |
| { |
| "completion_length": 471.125, |
| "epoch": 0.16417910447761194, |
| "grad_norm": 0.17072314023971558, |
| "kl": 0.0008681550971232355, |
| "learning_rate": 4.940189693889819e-06, |
| "loss": 0.0, |
| "reward": 0.01549999974668026, |
| "reward_std": 0.10450837016105652, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.01549999974668026, |
| "step": 55 |
| }, |
| { |
| "completion_length": 393.75, |
| "epoch": 0.16716417910447762, |
| "grad_norm": 0.0035993217024952173, |
| "kl": 0.00099479453638196, |
| "learning_rate": 4.934383489916843e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 56 |
| }, |
| { |
| "completion_length": 335.375, |
| "epoch": 0.1701492537313433, |
| "grad_norm": 0.004349116701632738, |
| "kl": 0.00097746797837317, |
| "learning_rate": 4.928312099203131e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 57 |
| }, |
| { |
| "completion_length": 476.0, |
| "epoch": 0.17313432835820897, |
| "grad_norm": 0.002000964479520917, |
| "kl": 0.0002213095867773518, |
| "learning_rate": 4.921976183128585e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 58 |
| }, |
| { |
| "completion_length": 160.75, |
| "epoch": 0.1761194029850746, |
| "grad_norm": 0.00427111005410552, |
| "kl": 0.001069595105946064, |
| "learning_rate": 4.915376431888871e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 59 |
| }, |
| { |
| "completion_length": 300.125, |
| "epoch": 0.1791044776119403, |
| "grad_norm": 0.003434703918173909, |
| "kl": 0.000937220233026892, |
| "learning_rate": 4.908513564420231e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 60 |
| }, |
| { |
| "completion_length": 369.875, |
| "epoch": 0.18208955223880596, |
| "grad_norm": 0.27929064631462097, |
| "kl": 0.0012021824950352311, |
| "learning_rate": 4.9013883283211705e-06, |
| "loss": 0.0, |
| "reward": -0.019999999552965164, |
| "reward_std": 0.11566577106714249, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.019999999552965164, |
| "step": 61 |
| }, |
| { |
| "completion_length": 362.875, |
| "epoch": 0.18507462686567164, |
| "grad_norm": 0.00122266192920506, |
| "kl": 0.0004295996914152056, |
| "learning_rate": 4.894001499771015e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 62 |
| }, |
| { |
| "completion_length": 325.125, |
| "epoch": 0.1880597014925373, |
| "grad_norm": 0.0036570935044437647, |
| "kl": 0.0008783735684119165, |
| "learning_rate": 4.886353883445363e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 63 |
| }, |
| { |
| "completion_length": 423.625, |
| "epoch": 0.191044776119403, |
| "grad_norm": 0.0054810927249491215, |
| "kl": 0.0009283218532800674, |
| "learning_rate": 4.878446312428424e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 64 |
| }, |
| { |
| "completion_length": 342.625, |
| "epoch": 0.19402985074626866, |
| "grad_norm": 0.19578829407691956, |
| "kl": 0.0010514282621443272, |
| "learning_rate": 4.870279648122271e-06, |
| "loss": 0.0, |
| "reward": -0.1782499998807907, |
| "reward_std": 0.5041671395301819, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.1782499998807907, |
| "step": 65 |
| }, |
| { |
| "completion_length": 334.875, |
| "epoch": 0.19701492537313434, |
| "grad_norm": 0.00260003749281168, |
| "kl": 0.0005953626241534948, |
| "learning_rate": 4.8618547801530045e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 66 |
| }, |
| { |
| "completion_length": 365.5, |
| "epoch": 0.2, |
| "grad_norm": 0.3674154579639435, |
| "kl": 0.0011260712053626776, |
| "learning_rate": 4.853172626273841e-06, |
| "loss": 0.0, |
| "reward": -0.21437500417232513, |
| "reward_std": 0.4378893971443176, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.21437500417232513, |
| "step": 67 |
| }, |
| { |
| "completion_length": 432.875, |
| "epoch": 0.20298507462686566, |
| "grad_norm": 0.001397120882757008, |
| "kl": 0.00025881017791107297, |
| "learning_rate": 4.844234132265139e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 68 |
| }, |
| { |
| "completion_length": 364.875, |
| "epoch": 0.20597014925373133, |
| "grad_norm": 0.0018441714346408844, |
| "kl": 0.00031896165455691516, |
| "learning_rate": 4.835040271831371e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 69 |
| }, |
| { |
| "completion_length": 584.875, |
| "epoch": 0.208955223880597, |
| "grad_norm": 0.001404198701493442, |
| "kl": 0.0003437511913944036, |
| "learning_rate": 4.8255920464950545e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 70 |
| }, |
| { |
| "completion_length": 287.875, |
| "epoch": 0.21194029850746268, |
| "grad_norm": 0.23131783306598663, |
| "kl": 0.00048740627244114876, |
| "learning_rate": 4.8158904854876555e-06, |
| "loss": 0.0, |
| "reward": 0.015625, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015625, |
| "step": 71 |
| }, |
| { |
| "completion_length": 481.875, |
| "epoch": 0.21492537313432836, |
| "grad_norm": 0.27868691086769104, |
| "kl": 0.0005523943109437823, |
| "learning_rate": 4.805936645637463e-06, |
| "loss": 0.0, |
| "reward": 0.015625, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015625, |
| "step": 72 |
| }, |
| { |
| "completion_length": 498.375, |
| "epoch": 0.21791044776119403, |
| "grad_norm": 0.0016176890349015594, |
| "kl": 0.0005396915948949754, |
| "learning_rate": 4.795731611254473e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 73 |
| }, |
| { |
| "completion_length": 300.875, |
| "epoch": 0.2208955223880597, |
| "grad_norm": 0.24887824058532715, |
| "kl": 0.0005686509539373219, |
| "learning_rate": 4.7852764940122636e-06, |
| "loss": 0.0, |
| "reward": 0.22587499022483826, |
| "reward_std": 0.5445812344551086, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.024125002324581146, |
| "step": 74 |
| }, |
| { |
| "completion_length": 177.75, |
| "epoch": 0.22388059701492538, |
| "grad_norm": 0.0012561667244881392, |
| "kl": 0.00028040556935593486, |
| "learning_rate": 4.7745724328269e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 75 |
| }, |
| { |
| "completion_length": 410.625, |
| "epoch": 0.22686567164179106, |
| "grad_norm": 0.2470964640378952, |
| "kl": 0.000733887602109462, |
| "learning_rate": 4.763620593732867e-06, |
| "loss": 0.0, |
| "reward": 0.015625, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015625, |
| "step": 76 |
| }, |
| { |
| "completion_length": 278.125, |
| "epoch": 0.2298507462686567, |
| "grad_norm": 0.2767794728279114, |
| "kl": 0.0013002330670133233, |
| "learning_rate": 4.752422169756048e-06, |
| "loss": 0.0001, |
| "reward": -0.025499999523162842, |
| "reward_std": 0.0721248909831047, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.025499999523162842, |
| "step": 77 |
| }, |
| { |
| "completion_length": 296.625, |
| "epoch": 0.23283582089552238, |
| "grad_norm": 0.0011332810390740633, |
| "kl": 0.0002715845184866339, |
| "learning_rate": 4.7409783807837654e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 78 |
| }, |
| { |
| "completion_length": 248.625, |
| "epoch": 0.23582089552238805, |
| "grad_norm": 0.001428711460903287, |
| "kl": 0.0004016221791971475, |
| "learning_rate": 4.729290473431892e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 79 |
| }, |
| { |
| "completion_length": 492.875, |
| "epoch": 0.23880597014925373, |
| "grad_norm": 0.24282296001911163, |
| "kl": 0.0003622036019805819, |
| "learning_rate": 4.717359720909053e-06, |
| "loss": 0.0, |
| "reward": 0.03125, |
| "reward_std": 0.0883883461356163, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03125, |
| "step": 80 |
| }, |
| { |
| "completion_length": 241.875, |
| "epoch": 0.2417910447761194, |
| "grad_norm": 0.2795076370239258, |
| "kl": 0.0006243584793992341, |
| "learning_rate": 4.705187422877931e-06, |
| "loss": 0.0, |
| "reward": 0.01575000025331974, |
| "reward_std": 0.044547729194164276, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.01575000025331974, |
| "step": 81 |
| }, |
| { |
| "completion_length": 405.0, |
| "epoch": 0.24477611940298508, |
| "grad_norm": 0.3399125039577484, |
| "kl": 0.00068903889041394, |
| "learning_rate": 4.692774905313687e-06, |
| "loss": 0.0, |
| "reward": -0.051500000059604645, |
| "reward_std": 0.2009889781475067, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.051500000059604645, |
| "step": 82 |
| }, |
| { |
| "completion_length": 334.5, |
| "epoch": 0.24776119402985075, |
| "grad_norm": 0.0014138659462332726, |
| "kl": 0.0003496205317787826, |
| "learning_rate": 4.68012352035952e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 83 |
| }, |
| { |
| "completion_length": 446.0, |
| "epoch": 0.2507462686567164, |
| "grad_norm": 0.0010413933778181672, |
| "kl": 0.0003284848644398153, |
| "learning_rate": 4.667234646179368e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 84 |
| }, |
| { |
| "completion_length": 487.0, |
| "epoch": 0.2537313432835821, |
| "grad_norm": 0.001018830225802958, |
| "kl": 0.0002749576815403998, |
| "learning_rate": 4.654109686807787e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 85 |
| }, |
| { |
| "completion_length": 295.5, |
| "epoch": 0.25671641791044775, |
| "grad_norm": 0.0019163364777341485, |
| "kl": 0.0005629758234135807, |
| "learning_rate": 4.640750071996995e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 86 |
| }, |
| { |
| "completion_length": 318.25, |
| "epoch": 0.25970149253731345, |
| "grad_norm": 0.5596569180488586, |
| "kl": 0.0005682025803253055, |
| "learning_rate": 4.62715725706113e-06, |
| "loss": 0.0, |
| "reward": 0.015625, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015625, |
| "step": 87 |
| }, |
| { |
| "completion_length": 361.0, |
| "epoch": 0.2626865671641791, |
| "grad_norm": 0.0013579176738858223, |
| "kl": 0.0004695644020102918, |
| "learning_rate": 4.613332722717714e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 88 |
| }, |
| { |
| "completion_length": 555.625, |
| "epoch": 0.2656716417910448, |
| "grad_norm": 0.22543174028396606, |
| "kl": 0.0003525623178575188, |
| "learning_rate": 4.599277974926355e-06, |
| "loss": 0.0, |
| "reward": -0.05037499964237213, |
| "reward_std": 0.14248201251029968, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.05037499964237213, |
| "step": 89 |
| }, |
| { |
| "completion_length": 300.625, |
| "epoch": 0.26865671641791045, |
| "grad_norm": 0.21516487002372742, |
| "kl": 0.0005444185808300972, |
| "learning_rate": 4.584994544724695e-06, |
| "loss": 0.0, |
| "reward": 0.19550000131130219, |
| "reward_std": 0.5529574751853943, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.054499998688697815, |
| "step": 90 |
| }, |
| { |
| "completion_length": 495.25, |
| "epoch": 0.2716417910447761, |
| "grad_norm": 0.22942061722278595, |
| "kl": 0.0016139632789418101, |
| "learning_rate": 4.57048398806163e-06, |
| "loss": 0.0001, |
| "reward": -0.07262499630451202, |
| "reward_std": 0.3230404257774353, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.07262499630451202, |
| "step": 91 |
| }, |
| { |
| "completion_length": 333.375, |
| "epoch": 0.2746268656716418, |
| "grad_norm": 0.001300108153373003, |
| "kl": 0.0003806241147685796, |
| "learning_rate": 4.555747885627812e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 92 |
| }, |
| { |
| "completion_length": 285.125, |
| "epoch": 0.27761194029850744, |
| "grad_norm": 0.0014808428240939975, |
| "kl": 0.00030873267678543925, |
| "learning_rate": 4.540787842683459e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 93 |
| }, |
| { |
| "completion_length": 148.375, |
| "epoch": 0.28059701492537314, |
| "grad_norm": 0.005105325020849705, |
| "kl": 0.0015376665396615863, |
| "learning_rate": 4.525605488883493e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 94 |
| }, |
| { |
| "completion_length": 302.125, |
| "epoch": 0.2835820895522388, |
| "grad_norm": 0.00199362775310874, |
| "kl": 0.00041642598807811737, |
| "learning_rate": 4.510202478100008e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 95 |
| }, |
| { |
| "completion_length": 182.625, |
| "epoch": 0.2865671641791045, |
| "grad_norm": 0.0016307829646393657, |
| "kl": 0.0004398175806272775, |
| "learning_rate": 4.494580488242109e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 96 |
| }, |
| { |
| "completion_length": 226.875, |
| "epoch": 0.28955223880597014, |
| "grad_norm": 0.41053909063339233, |
| "kl": 0.0007566440617665648, |
| "learning_rate": 4.478741221073136e-06, |
| "loss": 0.0, |
| "reward": 0.015625, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015625, |
| "step": 97 |
| }, |
| { |
| "completion_length": 375.25, |
| "epoch": 0.29253731343283584, |
| "grad_norm": 0.17745234072208405, |
| "kl": 0.00043587430263869464, |
| "learning_rate": 4.462686402025277e-06, |
| "loss": 0.0, |
| "reward": 0.015625, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015625, |
| "step": 98 |
| }, |
| { |
| "completion_length": 546.25, |
| "epoch": 0.2955223880597015, |
| "grad_norm": 0.1550239473581314, |
| "kl": 0.00024276912154164165, |
| "learning_rate": 4.446417780011618e-06, |
| "loss": 0.0, |
| "reward": 0.015625, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015625, |
| "step": 99 |
| }, |
| { |
| "completion_length": 316.25, |
| "epoch": 0.29850746268656714, |
| "grad_norm": 0.22868771851062775, |
| "kl": 0.000838124135043472, |
| "learning_rate": 4.42993712723562e-06, |
| "loss": 0.0, |
| "reward": 0.04699999839067459, |
| "reward_std": 0.13293607532978058, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.04699999839067459, |
| "step": 100 |
| }, |
| { |
| "completion_length": 645.0, |
| "epoch": 0.30149253731343284, |
| "grad_norm": 0.1801333874464035, |
| "kl": 0.00038152255001477897, |
| "learning_rate": 4.413246238998069e-06, |
| "loss": 0.0, |
| "reward": -0.06024999916553497, |
| "reward_std": 0.17041273415088654, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.06024999916553497, |
| "step": 101 |
| }, |
| { |
| "completion_length": 220.875, |
| "epoch": 0.3044776119402985, |
| "grad_norm": 0.0017630718648433685, |
| "kl": 0.00038715871050953865, |
| "learning_rate": 4.396346933501508e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 102 |
| }, |
| { |
| "completion_length": 508.125, |
| "epoch": 0.3074626865671642, |
| "grad_norm": 0.20094433426856995, |
| "kl": 0.0006657220656052232, |
| "learning_rate": 4.379241051652174e-06, |
| "loss": 0.0, |
| "reward": 0.03125, |
| "reward_std": 0.0883883461356163, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03125, |
| "step": 103 |
| }, |
| { |
| "completion_length": 501.5, |
| "epoch": 0.31044776119402984, |
| "grad_norm": 0.17015448212623596, |
| "kl": 0.000687247549649328, |
| "learning_rate": 4.361930456859455e-06, |
| "loss": 0.0, |
| "reward": 0.04699999839067459, |
| "reward_std": 0.13293607532978058, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.04699999839067459, |
| "step": 104 |
| }, |
| { |
| "completion_length": 414.875, |
| "epoch": 0.31343283582089554, |
| "grad_norm": 0.0016345038311555982, |
| "kl": 0.00038708740612491965, |
| "learning_rate": 4.3444170348329095e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 105 |
| }, |
| { |
| "completion_length": 371.25, |
| "epoch": 0.3164179104477612, |
| "grad_norm": 0.00202854722738266, |
| "kl": 0.0006857087719254196, |
| "learning_rate": 4.326702693376844e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 106 |
| }, |
| { |
| "completion_length": 615.875, |
| "epoch": 0.3194029850746269, |
| "grad_norm": 0.1853620409965515, |
| "kl": 0.0010476625757291913, |
| "learning_rate": 4.308789362182492e-06, |
| "loss": 0.0, |
| "reward": -0.25699999928474426, |
| "reward_std": 0.7269057631492615, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.25699999928474426, |
| "step": 107 |
| }, |
| { |
| "completion_length": 626.625, |
| "epoch": 0.32238805970149254, |
| "grad_norm": 0.22608496248722076, |
| "kl": 0.0009553819545544684, |
| "learning_rate": 4.290678992617797e-06, |
| "loss": 0.0, |
| "reward": 0.03137499839067459, |
| "reward_std": 0.05809582397341728, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03137499839067459, |
| "step": 108 |
| }, |
| { |
| "completion_length": 374.0, |
| "epoch": 0.3253731343283582, |
| "grad_norm": 0.003469746559858322, |
| "kl": 0.001379699446260929, |
| "learning_rate": 4.2723735575148585e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 109 |
| }, |
| { |
| "completion_length": 412.875, |
| "epoch": 0.3283582089552239, |
| "grad_norm": 0.19724370539188385, |
| "kl": 0.0006244105170480907, |
| "learning_rate": 4.253875050955005e-06, |
| "loss": 0.0, |
| "reward": 0.015625, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015625, |
| "step": 110 |
| }, |
| { |
| "completion_length": 362.375, |
| "epoch": 0.33134328358208953, |
| "grad_norm": 0.17809787392616272, |
| "kl": 0.0009810776682570577, |
| "learning_rate": 4.2351854880515856e-06, |
| "loss": 0.0, |
| "reward": -0.34437501430511475, |
| "reward_std": 0.9740396738052368, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.34437501430511475, |
| "step": 111 |
| }, |
| { |
| "completion_length": 254.375, |
| "epoch": 0.33432835820895523, |
| "grad_norm": 0.28602710366249084, |
| "kl": 0.0038207019679248333, |
| "learning_rate": 4.216306904730448e-06, |
| "loss": 0.0002, |
| "reward": 0.015625, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015625, |
| "step": 112 |
| }, |
| { |
| "completion_length": 304.25, |
| "epoch": 0.3373134328358209, |
| "grad_norm": 0.005425428505986929, |
| "kl": 0.0019627241417765617, |
| "learning_rate": 4.197241357508159e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 113 |
| }, |
| { |
| "completion_length": 281.0, |
| "epoch": 0.3402985074626866, |
| "grad_norm": 0.47344425320625305, |
| "kl": 0.010544579476118088, |
| "learning_rate": 4.177990923267986e-06, |
| "loss": 0.0004, |
| "reward": -0.07524999976158142, |
| "reward_std": 0.13945992290973663, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.07524999976158142, |
| "step": 114 |
| }, |
| { |
| "completion_length": 486.5, |
| "epoch": 0.34328358208955223, |
| "grad_norm": 0.1647060066461563, |
| "kl": 0.003495145123451948, |
| "learning_rate": 4.158557699033644e-06, |
| "loss": 0.0001, |
| "reward": -0.11837500333786011, |
| "reward_std": 0.33481505513191223, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.11837500333786011, |
| "step": 115 |
| }, |
| { |
| "completion_length": 316.125, |
| "epoch": 0.34626865671641793, |
| "grad_norm": 0.00556787708774209, |
| "kl": 0.002718576230108738, |
| "learning_rate": 4.138943801740865e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 116 |
| }, |
| { |
| "completion_length": 307.25, |
| "epoch": 0.3492537313432836, |
| "grad_norm": 0.3354603350162506, |
| "kl": 0.0044554159976542, |
| "learning_rate": 4.119151368006793e-06, |
| "loss": 0.0002, |
| "reward": 0.039750002324581146, |
| "reward_std": 0.07568685710430145, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.039750002324581146, |
| "step": 117 |
| }, |
| { |
| "completion_length": 527.75, |
| "epoch": 0.3522388059701492, |
| "grad_norm": 0.18804031610488892, |
| "kl": 0.0013875120785087347, |
| "learning_rate": 4.099182553897228e-06, |
| "loss": 0.0001, |
| "reward": 0.03125, |
| "reward_std": 0.0883883461356163, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03125, |
| "step": 118 |
| }, |
| { |
| "completion_length": 233.625, |
| "epoch": 0.35522388059701493, |
| "grad_norm": 0.3928685188293457, |
| "kl": 0.004655573982745409, |
| "learning_rate": 4.0790395346917674e-06, |
| "loss": 0.0002, |
| "reward": 0.06274999678134918, |
| "reward_std": 0.11619041860103607, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.06274999678134918, |
| "step": 119 |
| }, |
| { |
| "completion_length": 455.0, |
| "epoch": 0.3582089552238806, |
| "grad_norm": 0.24913738667964935, |
| "kl": 0.0021684349048882723, |
| "learning_rate": 4.058724504646834e-06, |
| "loss": 0.0001, |
| "reward": -0.5518749952316284, |
| "reward_std": 0.8652141690254211, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.5518749952316284, |
| "step": 120 |
| }, |
| { |
| "completion_length": 504.75, |
| "epoch": 0.3611940298507463, |
| "grad_norm": 0.25603148341178894, |
| "kl": 0.0012253073509782553, |
| "learning_rate": 4.038239676756654e-06, |
| "loss": 0.0, |
| "reward": -0.05650000274181366, |
| "reward_std": 0.2148142158985138, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.05650000274181366, |
| "step": 121 |
| }, |
| { |
| "completion_length": 479.125, |
| "epoch": 0.3641791044776119, |
| "grad_norm": 0.2229258269071579, |
| "kl": 0.0028934157453477383, |
| "learning_rate": 4.017587282512181e-06, |
| "loss": 0.0001, |
| "reward": -0.1338750123977661, |
| "reward_std": 0.5096223950386047, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.13387499749660492, |
| "step": 122 |
| }, |
| { |
| "completion_length": 372.875, |
| "epoch": 0.36716417910447763, |
| "grad_norm": 0.27619239687919617, |
| "kl": 0.002216364722698927, |
| "learning_rate": 3.996769571658022e-06, |
| "loss": 0.0001, |
| "reward": -0.226624995470047, |
| "reward_std": 0.6928819417953491, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.226624995470047, |
| "step": 123 |
| }, |
| { |
| "completion_length": 461.5, |
| "epoch": 0.3701492537313433, |
| "grad_norm": 0.19344386458396912, |
| "kl": 0.002842097310349345, |
| "learning_rate": 3.975788811947351e-06, |
| "loss": 0.0001, |
| "reward": 0.281125009059906, |
| "reward_std": 0.745917558670044, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03112499974668026, |
| "step": 124 |
| }, |
| { |
| "completion_length": 309.75, |
| "epoch": 0.373134328358209, |
| "grad_norm": 0.2874149978160858, |
| "kl": 0.0032137392554432154, |
| "learning_rate": 3.9546472888948825e-06, |
| "loss": 0.0001, |
| "reward": 0.015625, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015625, |
| "step": 125 |
| }, |
| { |
| "completion_length": 221.375, |
| "epoch": 0.3761194029850746, |
| "grad_norm": 0.009057161398231983, |
| "kl": 0.005167881492525339, |
| "learning_rate": 3.933347305527898e-06, |
| "loss": 0.0002, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 126 |
| }, |
| { |
| "completion_length": 435.875, |
| "epoch": 0.37910447761194027, |
| "grad_norm": 0.002526797354221344, |
| "kl": 0.0017403013771399856, |
| "learning_rate": 3.911891182135371e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 127 |
| }, |
| { |
| "completion_length": 449.75, |
| "epoch": 0.382089552238806, |
| "grad_norm": 0.0034000773448497057, |
| "kl": 0.00144351611379534, |
| "learning_rate": 3.890281256015207e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 128 |
| }, |
| { |
| "completion_length": 330.875, |
| "epoch": 0.3850746268656716, |
| "grad_norm": 0.18436795473098755, |
| "kl": 0.0032451425213366747, |
| "learning_rate": 3.868519881219631e-06, |
| "loss": 0.0001, |
| "reward": -0.11350000649690628, |
| "reward_std": 0.43147987127304077, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.11350000649690628, |
| "step": 129 |
| }, |
| { |
| "completion_length": 265.625, |
| "epoch": 0.3880597014925373, |
| "grad_norm": 0.011368674226105213, |
| "kl": 0.005110865458846092, |
| "learning_rate": 3.8466094282987575e-06, |
| "loss": 0.0002, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 130 |
| }, |
| { |
| "completion_length": 261.625, |
| "epoch": 0.39104477611940297, |
| "grad_norm": 0.3966436982154846, |
| "kl": 0.004083710256963968, |
| "learning_rate": 3.824552284042351e-06, |
| "loss": 0.0002, |
| "reward": 0.01575000025331974, |
| "reward_std": 0.044547729194164276, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.01575000025331974, |
| "step": 131 |
| }, |
| { |
| "completion_length": 371.125, |
| "epoch": 0.3940298507462687, |
| "grad_norm": 0.0057938722893595695, |
| "kl": 0.003004050347954035, |
| "learning_rate": 3.802350851219826e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 132 |
| }, |
| { |
| "completion_length": 496.75, |
| "epoch": 0.3970149253731343, |
| "grad_norm": 0.24294711649417877, |
| "kl": 0.004811118356883526, |
| "learning_rate": 3.7800075483185073e-06, |
| "loss": 0.0002, |
| "reward": 0.016750000417232513, |
| "reward_std": 0.1915021389722824, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.016750000417232513, |
| "step": 133 |
| }, |
| { |
| "completion_length": 292.5, |
| "epoch": 0.4, |
| "grad_norm": 0.004627264104783535, |
| "kl": 0.002543792361393571, |
| "learning_rate": 3.7575248092801686e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 134 |
| }, |
| { |
| "completion_length": 304.125, |
| "epoch": 0.40298507462686567, |
| "grad_norm": 0.007394778076559305, |
| "kl": 0.003581845434382558, |
| "learning_rate": 3.734905083235901e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 135 |
| }, |
| { |
| "completion_length": 365.375, |
| "epoch": 0.4059701492537313, |
| "grad_norm": 0.2609359323978424, |
| "kl": 0.004685729276388884, |
| "learning_rate": 3.712150834239313e-06, |
| "loss": 0.0002, |
| "reward": 0.015625, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015625, |
| "step": 136 |
| }, |
| { |
| "completion_length": 521.25, |
| "epoch": 0.408955223880597, |
| "grad_norm": 0.31001636385917664, |
| "kl": 0.002258291933685541, |
| "learning_rate": 3.6892645409981166e-06, |
| "loss": 0.0001, |
| "reward": 0.046875, |
| "reward_std": 0.09300297498703003, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.046875, |
| "step": 137 |
| }, |
| { |
| "completion_length": 237.875, |
| "epoch": 0.41194029850746267, |
| "grad_norm": 0.24746236205101013, |
| "kl": 0.007873887196183205, |
| "learning_rate": 3.6662486966041104e-06, |
| "loss": 0.0003, |
| "reward": -0.011750001460313797, |
| "reward_std": 0.14610245823860168, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.011749999597668648, |
| "step": 138 |
| }, |
| { |
| "completion_length": 510.875, |
| "epoch": 0.41492537313432837, |
| "grad_norm": 0.2037159949541092, |
| "kl": 0.0031850896775722504, |
| "learning_rate": 3.6431058082615966e-06, |
| "loss": 0.0001, |
| "reward": 0.015625, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015625, |
| "step": 139 |
| }, |
| { |
| "completion_length": 371.125, |
| "epoch": 0.417910447761194, |
| "grad_norm": 0.25025588274002075, |
| "kl": 0.0022993905004113913, |
| "learning_rate": 3.619838397014263e-06, |
| "loss": 0.0001, |
| "reward": 0.03137499839067459, |
| "reward_std": 0.08874189853668213, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03137499839067459, |
| "step": 140 |
| }, |
| { |
| "completion_length": 451.0, |
| "epoch": 0.4208955223880597, |
| "grad_norm": 0.0034099018666893244, |
| "kl": 0.0028660029638558626, |
| "learning_rate": 3.5964489974705553e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 141 |
| }, |
| { |
| "completion_length": 497.75, |
| "epoch": 0.42388059701492536, |
| "grad_norm": 0.4137725234031677, |
| "kl": 0.0026251752860844135, |
| "learning_rate": 3.5729401575275724e-06, |
| "loss": 0.0001, |
| "reward": 0.015625, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015625, |
| "step": 142 |
| }, |
| { |
| "completion_length": 320.625, |
| "epoch": 0.42686567164179107, |
| "grad_norm": 0.004924552049487829, |
| "kl": 0.0024821085389703512, |
| "learning_rate": 3.5493144380935155e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 143 |
| }, |
| { |
| "completion_length": 297.125, |
| "epoch": 0.4298507462686567, |
| "grad_norm": 0.006190278101712465, |
| "kl": 0.0024942473974078894, |
| "learning_rate": 3.5255744128087175e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 144 |
| }, |
| { |
| "completion_length": 484.0, |
| "epoch": 0.43283582089552236, |
| "grad_norm": 0.32550495862960815, |
| "kl": 0.0038871050346642733, |
| "learning_rate": 3.501722667765286e-06, |
| "loss": 0.0002, |
| "reward": 0.046875, |
| "reward_std": 0.0646936446428299, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.046875, |
| "step": 145 |
| }, |
| { |
| "completion_length": 325.5, |
| "epoch": 0.43582089552238806, |
| "grad_norm": 0.39831942319869995, |
| "kl": 0.00664663827046752, |
| "learning_rate": 3.47776180122539e-06, |
| "loss": 0.0003, |
| "reward": 0.007000000216066837, |
| "reward_std": 0.019798990339040756, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.007000000216066837, |
| "step": 146 |
| }, |
| { |
| "completion_length": 716.875, |
| "epoch": 0.4388059701492537, |
| "grad_norm": 0.0020847241394221783, |
| "kl": 0.0007909125415608287, |
| "learning_rate": 3.4536944233382248e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 147 |
| }, |
| { |
| "completion_length": 581.875, |
| "epoch": 0.4417910447761194, |
| "grad_norm": 0.21988965570926666, |
| "kl": 0.0023626910988241434, |
| "learning_rate": 3.429523155855672e-06, |
| "loss": 0.0001, |
| "reward": 0.046875, |
| "reward_std": 0.0646936446428299, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.046875, |
| "step": 148 |
| }, |
| { |
| "completion_length": 259.625, |
| "epoch": 0.44477611940298506, |
| "grad_norm": 0.26226532459259033, |
| "kl": 0.0048947930335998535, |
| "learning_rate": 3.405250631846708e-06, |
| "loss": 0.0002, |
| "reward": 0.06274999678134918, |
| "reward_std": 0.0948694571852684, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.06274999678134918, |
| "step": 149 |
| }, |
| { |
| "completion_length": 632.375, |
| "epoch": 0.44776119402985076, |
| "grad_norm": 0.20669730007648468, |
| "kl": 0.0018882754957303405, |
| "learning_rate": 3.3808794954105716e-06, |
| "loss": 0.0001, |
| "reward": 0.03125, |
| "reward_std": 0.0578637570142746, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03125, |
| "step": 150 |
| }, |
| { |
| "completion_length": 290.875, |
| "epoch": 0.4507462686567164, |
| "grad_norm": 0.003552640788257122, |
| "kl": 0.0016134099569171667, |
| "learning_rate": 3.3564124013887324e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 151 |
| }, |
| { |
| "completion_length": 177.75, |
| "epoch": 0.4537313432835821, |
| "grad_norm": 0.009745140559971333, |
| "kl": 0.004595119506120682, |
| "learning_rate": 3.331852015075685e-06, |
| "loss": 0.0002, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 152 |
| }, |
| { |
| "completion_length": 479.5, |
| "epoch": 0.45671641791044776, |
| "grad_norm": 0.27110740542411804, |
| "kl": 0.0047483891248703, |
| "learning_rate": 3.3072010119286156e-06, |
| "loss": 0.0002, |
| "reward": 0.02812499925494194, |
| "reward_std": 0.05250425264239311, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.02812499925494194, |
| "step": 153 |
| }, |
| { |
| "completion_length": 220.5, |
| "epoch": 0.4597014925373134, |
| "grad_norm": 0.4211854636669159, |
| "kl": 0.006590801756829023, |
| "learning_rate": 3.2824620772759475e-06, |
| "loss": 0.0003, |
| "reward": 0.015625, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015625, |
| "step": 154 |
| }, |
| { |
| "completion_length": 241.625, |
| "epoch": 0.4626865671641791, |
| "grad_norm": 0.3244410753250122, |
| "kl": 0.003724359441548586, |
| "learning_rate": 3.257637906024822e-06, |
| "loss": 0.0001, |
| "reward": 0.015625, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015625, |
| "step": 155 |
| }, |
| { |
| "completion_length": 414.5, |
| "epoch": 0.46567164179104475, |
| "grad_norm": 0.2715875506401062, |
| "kl": 0.0029947988223284483, |
| "learning_rate": 3.2327312023675287e-06, |
| "loss": 0.0001, |
| "reward": 0.03125, |
| "reward_std": 0.0883883461356163, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03125, |
| "step": 156 |
| }, |
| { |
| "completion_length": 196.375, |
| "epoch": 0.46865671641791046, |
| "grad_norm": 0.007158320862799883, |
| "kl": 0.0028687817975878716, |
| "learning_rate": 3.20774467948693e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 157 |
| }, |
| { |
| "completion_length": 275.375, |
| "epoch": 0.4716417910447761, |
| "grad_norm": 0.3656024932861328, |
| "kl": 0.007250001188367605, |
| "learning_rate": 3.1826810592609036e-06, |
| "loss": 0.0003, |
| "reward": 0.046875, |
| "reward_std": 0.0646936446428299, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.046875, |
| "step": 158 |
| }, |
| { |
| "completion_length": 285.875, |
| "epoch": 0.4746268656716418, |
| "grad_norm": 0.4725809693336487, |
| "kl": 0.003180581144988537, |
| "learning_rate": 3.157543071965835e-06, |
| "loss": 0.0001, |
| "reward": 0.03125, |
| "reward_std": 0.0883883461356163, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03125, |
| "step": 159 |
| }, |
| { |
| "completion_length": 580.125, |
| "epoch": 0.47761194029850745, |
| "grad_norm": 0.30390796065330505, |
| "kl": 0.004015565849840641, |
| "learning_rate": 3.132333455979202e-06, |
| "loss": 0.0002, |
| "reward": -0.017625000327825546, |
| "reward_std": 0.17501260340213776, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.017625000327825546, |
| "step": 160 |
| }, |
| { |
| "completion_length": 292.75, |
| "epoch": 0.48059701492537316, |
| "grad_norm": 0.010895282961428165, |
| "kl": 0.004954541102051735, |
| "learning_rate": 3.107054957481271e-06, |
| "loss": 0.0002, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 161 |
| }, |
| { |
| "completion_length": 260.25, |
| "epoch": 0.4835820895522388, |
| "grad_norm": 0.29293543100357056, |
| "kl": 0.006690036039799452, |
| "learning_rate": 3.0817103301559422e-06, |
| "loss": 0.0003, |
| "reward": 0.06262499839067459, |
| "reward_std": 0.13396474719047546, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.06262499839067459, |
| "step": 162 |
| }, |
| { |
| "completion_length": 329.25, |
| "epoch": 0.48656716417910445, |
| "grad_norm": 0.28982600569725037, |
| "kl": 0.004020814783871174, |
| "learning_rate": 3.056302334890786e-06, |
| "loss": 0.0002, |
| "reward": 0.046875, |
| "reward_std": 0.09300297498703003, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.046875, |
| "step": 163 |
| }, |
| { |
| "completion_length": 211.25, |
| "epoch": 0.48955223880597015, |
| "grad_norm": 0.5855104327201843, |
| "kl": 0.017538880929350853, |
| "learning_rate": 3.030833739476285e-06, |
| "loss": 0.0007, |
| "reward": 0.9138749837875366, |
| "reward_std": 1.1292022466659546, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.16387498378753662, |
| "step": 164 |
| }, |
| { |
| "completion_length": 488.75, |
| "epoch": 0.4925373134328358, |
| "grad_norm": 0.21862053871154785, |
| "kl": 0.0072796959429979324, |
| "learning_rate": 3.0053073183043257e-06, |
| "loss": 0.0003, |
| "reward": 0.0298750102519989, |
| "reward_std": 1.1403151750564575, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.2201249897480011, |
| "step": 165 |
| }, |
| { |
| "completion_length": 499.75, |
| "epoch": 0.4955223880597015, |
| "grad_norm": 0.2587985396385193, |
| "kl": 0.006187543738633394, |
| "learning_rate": 2.979725852065981e-06, |
| "loss": 0.0002, |
| "reward": -0.5041249990463257, |
| "reward_std": 1.0025380849838257, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.5041249990463257, |
| "step": 166 |
| }, |
| { |
| "completion_length": 473.375, |
| "epoch": 0.49850746268656715, |
| "grad_norm": 0.25732216238975525, |
| "kl": 0.0039559113793075085, |
| "learning_rate": 2.9540921274485913e-06, |
| "loss": 0.0002, |
| "reward": -0.11225000023841858, |
| "reward_std": 0.370589017868042, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.11225000023841858, |
| "step": 167 |
| }, |
| { |
| "completion_length": 487.625, |
| "epoch": 0.5014925373134328, |
| "grad_norm": 0.2064179629087448, |
| "kl": 0.003909274935722351, |
| "learning_rate": 2.9284089368322044e-06, |
| "loss": 0.0002, |
| "reward": 0.04699999839067459, |
| "reward_std": 0.13293607532978058, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.04699999839067459, |
| "step": 168 |
| }, |
| { |
| "completion_length": 692.75, |
| "epoch": 0.5044776119402985, |
| "grad_norm": 0.17778423428535461, |
| "kl": 0.0032577835954725742, |
| "learning_rate": 2.9026790779853877e-06, |
| "loss": 0.0001, |
| "reward": 0.015625, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015625, |
| "step": 169 |
| }, |
| { |
| "completion_length": 338.625, |
| "epoch": 0.5074626865671642, |
| "grad_norm": 0.4213738739490509, |
| "kl": 0.004577150102704763, |
| "learning_rate": 2.876905353760459e-06, |
| "loss": 0.0002, |
| "reward": -0.00612499937415123, |
| "reward_std": 0.19183358550071716, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.00612499937415123, |
| "step": 170 |
| }, |
| { |
| "completion_length": 142.875, |
| "epoch": 0.5104477611940299, |
| "grad_norm": 0.38132330775260925, |
| "kl": 0.02254224196076393, |
| "learning_rate": 2.8510905717881615e-06, |
| "loss": 0.0009, |
| "reward": 0.31575000286102295, |
| "reward_std": 0.783475935459137, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.06575000286102295, |
| "step": 171 |
| }, |
| { |
| "completion_length": 542.0, |
| "epoch": 0.5134328358208955, |
| "grad_norm": 0.28400880098342896, |
| "kl": 0.006651477422565222, |
| "learning_rate": 2.8252375441718137e-06, |
| "loss": 0.0003, |
| "reward": 0.07837499678134918, |
| "reward_std": 0.09333952516317368, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07837499678134918, |
| "step": 172 |
| }, |
| { |
| "completion_length": 460.5, |
| "epoch": 0.5164179104477612, |
| "grad_norm": 0.21609985828399658, |
| "kl": 0.007514914497733116, |
| "learning_rate": 2.7993490871809808e-06, |
| "loss": 0.0003, |
| "reward": -0.07925000041723251, |
| "reward_std": 0.27812162041664124, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.07925000041723251, |
| "step": 173 |
| }, |
| { |
| "completion_length": 187.5, |
| "epoch": 0.5194029850746269, |
| "grad_norm": 0.42248091101646423, |
| "kl": 0.007775603327900171, |
| "learning_rate": 2.773428020944687e-06, |
| "loss": 0.0003, |
| "reward": 0.015625, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015625, |
| "step": 174 |
| }, |
| { |
| "completion_length": 276.75, |
| "epoch": 0.5223880597014925, |
| "grad_norm": 0.3561972677707672, |
| "kl": 0.013817128725349903, |
| "learning_rate": 2.747477169144202e-06, |
| "loss": 0.0006, |
| "reward": 0.140625, |
| "reward_std": 0.12387890368700027, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.140625, |
| "step": 175 |
| }, |
| { |
| "completion_length": 217.875, |
| "epoch": 0.5253731343283582, |
| "grad_norm": 0.5078591108322144, |
| "kl": 0.01669302023947239, |
| "learning_rate": 2.721499358705458e-06, |
| "loss": 0.0007, |
| "reward": 0.046875, |
| "reward_std": 0.0646936446428299, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.046875, |
| "step": 176 |
| }, |
| { |
| "completion_length": 516.125, |
| "epoch": 0.5283582089552239, |
| "grad_norm": 0.2574947476387024, |
| "kl": 0.005081856623291969, |
| "learning_rate": 2.695497419491089e-06, |
| "loss": 0.0002, |
| "reward": -0.08912499994039536, |
| "reward_std": 0.572532594203949, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.08912500739097595, |
| "step": 177 |
| }, |
| { |
| "completion_length": 460.75, |
| "epoch": 0.5313432835820896, |
| "grad_norm": 0.22305476665496826, |
| "kl": 0.015017062425613403, |
| "learning_rate": 2.6694741839921734e-06, |
| "loss": 0.0006, |
| "reward": 0.34375, |
| "reward_std": 0.7756046056747437, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.09375, |
| "step": 178 |
| }, |
| { |
| "completion_length": 358.875, |
| "epoch": 0.5343283582089552, |
| "grad_norm": 0.351471871137619, |
| "kl": 0.01629817485809326, |
| "learning_rate": 2.6434324870196746e-06, |
| "loss": 0.0007, |
| "reward": 0.007375001907348633, |
| "reward_std": 0.4031447470188141, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.007374994456768036, |
| "step": 179 |
| }, |
| { |
| "completion_length": 592.0, |
| "epoch": 0.5373134328358209, |
| "grad_norm": 0.2376508116722107, |
| "kl": 0.0033545636106282473, |
| "learning_rate": 2.617375165395634e-06, |
| "loss": 0.0001, |
| "reward": 0.07824999839067459, |
| "reward_std": 0.0932672768831253, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07824999839067459, |
| "step": 180 |
| }, |
| { |
| "completion_length": 243.375, |
| "epoch": 0.5402985074626866, |
| "grad_norm": 0.38199833035469055, |
| "kl": 0.013456111773848534, |
| "learning_rate": 2.591305057644148e-06, |
| "loss": 0.0005, |
| "reward": 0.625124990940094, |
| "reward_std": 0.9728089570999146, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.125124990940094, |
| "step": 181 |
| }, |
| { |
| "completion_length": 369.75, |
| "epoch": 0.5432835820895522, |
| "grad_norm": 0.38759690523147583, |
| "kl": 0.035215843468904495, |
| "learning_rate": 2.5652250036821522e-06, |
| "loss": 0.0014, |
| "reward": 0.03612499684095383, |
| "reward_std": 0.2709351181983948, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.036125000566244125, |
| "step": 182 |
| }, |
| { |
| "completion_length": 232.25, |
| "epoch": 0.5462686567164179, |
| "grad_norm": 0.37896519899368286, |
| "kl": 0.021390574052929878, |
| "learning_rate": 2.5391378445100646e-06, |
| "loss": 0.0009, |
| "reward": 0.17350000143051147, |
| "reward_std": 0.14292055368423462, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.17350000143051147, |
| "step": 183 |
| }, |
| { |
| "completion_length": 409.125, |
| "epoch": 0.5492537313432836, |
| "grad_norm": 0.3088514506816864, |
| "kl": 0.014571096748113632, |
| "learning_rate": 2.5130464219022994e-06, |
| "loss": 0.0006, |
| "reward": 0.1146249994635582, |
| "reward_std": 0.1522197127342224, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1146249994635582, |
| "step": 184 |
| }, |
| { |
| "completion_length": 242.25, |
| "epoch": 0.5522388059701493, |
| "grad_norm": 0.3677271604537964, |
| "kl": 0.012131447903811932, |
| "learning_rate": 2.4869535780977023e-06, |
| "loss": 0.0005, |
| "reward": 0.18474999070167542, |
| "reward_std": 0.0707283467054367, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.18474999070167542, |
| "step": 185 |
| }, |
| { |
| "completion_length": 421.875, |
| "epoch": 0.5552238805970149, |
| "grad_norm": 0.19892147183418274, |
| "kl": 0.009639453142881393, |
| "learning_rate": 2.460862155489936e-06, |
| "loss": 0.0004, |
| "reward": 0.125, |
| "reward_std": 0.09449111670255661, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.125, |
| "step": 186 |
| }, |
| { |
| "completion_length": 615.375, |
| "epoch": 0.5582089552238806, |
| "grad_norm": 0.18018686771392822, |
| "kl": 0.005218564532697201, |
| "learning_rate": 2.4347749963178486e-06, |
| "loss": 0.0002, |
| "reward": 0.01262500137090683, |
| "reward_std": 0.3315926492214203, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.01262500137090683, |
| "step": 187 |
| }, |
| { |
| "completion_length": 409.875, |
| "epoch": 0.5611940298507463, |
| "grad_norm": 0.27012187242507935, |
| "kl": 0.02046685479581356, |
| "learning_rate": 2.408694942355853e-06, |
| "loss": 0.0008, |
| "reward": -0.04087501019239426, |
| "reward_std": 0.6781936883926392, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.04087500274181366, |
| "step": 188 |
| }, |
| { |
| "completion_length": 394.125, |
| "epoch": 0.564179104477612, |
| "grad_norm": 0.30240562558174133, |
| "kl": 0.013552379794418812, |
| "learning_rate": 2.3826248346043664e-06, |
| "loss": 0.0005, |
| "reward": 0.3344999849796295, |
| "reward_std": 0.8115267157554626, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.08449999988079071, |
| "step": 189 |
| }, |
| { |
| "completion_length": 498.125, |
| "epoch": 0.5671641791044776, |
| "grad_norm": 0.22516800463199615, |
| "kl": 0.007915965281426907, |
| "learning_rate": 2.356567512980326e-06, |
| "loss": 0.0003, |
| "reward": -0.17287498712539673, |
| "reward_std": 0.37088099122047424, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.17287500202655792, |
| "step": 190 |
| }, |
| { |
| "completion_length": 437.375, |
| "epoch": 0.5701492537313433, |
| "grad_norm": 0.26963454484939575, |
| "kl": 0.013868818990886211, |
| "learning_rate": 2.3305258160078274e-06, |
| "loss": 0.0006, |
| "reward": 0.671999990940094, |
| "reward_std": 0.979500949382782, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.171999990940094, |
| "step": 191 |
| }, |
| { |
| "completion_length": 486.75, |
| "epoch": 0.573134328358209, |
| "grad_norm": 0.2408941686153412, |
| "kl": 0.007514089345932007, |
| "learning_rate": 2.304502580508912e-06, |
| "loss": 0.0003, |
| "reward": 0.29612499475479126, |
| "reward_std": 0.5014508962631226, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.04612499475479126, |
| "step": 192 |
| }, |
| { |
| "completion_length": 664.25, |
| "epoch": 0.5761194029850746, |
| "grad_norm": 0.7232910394668579, |
| "kl": 0.009346898645162582, |
| "learning_rate": 2.278500641294543e-06, |
| "loss": 0.0004, |
| "reward": 0.046875, |
| "reward_std": 0.09300297498703003, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.046875, |
| "step": 193 |
| }, |
| { |
| "completion_length": 361.5, |
| "epoch": 0.5791044776119403, |
| "grad_norm": 0.2798665165901184, |
| "kl": 0.01410368550568819, |
| "learning_rate": 2.252522830855798e-06, |
| "loss": 0.0006, |
| "reward": 0.10837499797344208, |
| "reward_std": 0.14315219223499298, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.10837499797344208, |
| "step": 194 |
| }, |
| { |
| "completion_length": 366.75, |
| "epoch": 0.582089552238806, |
| "grad_norm": 0.4407714009284973, |
| "kl": 0.016179384663701057, |
| "learning_rate": 2.2265719790553147e-06, |
| "loss": 0.0006, |
| "reward": -0.09425000846385956, |
| "reward_std": 0.33279111981391907, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.09425000101327896, |
| "step": 195 |
| }, |
| { |
| "completion_length": 483.625, |
| "epoch": 0.5850746268656717, |
| "grad_norm": 0.23635739088058472, |
| "kl": 0.009301292710006237, |
| "learning_rate": 2.2006509128190196e-06, |
| "loss": 0.0004, |
| "reward": -0.10637500137090683, |
| "reward_std": 0.4708363115787506, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.10637500137090683, |
| "step": 196 |
| }, |
| { |
| "completion_length": 275.5, |
| "epoch": 0.5880597014925373, |
| "grad_norm": 0.30557289719581604, |
| "kl": 0.026175374165177345, |
| "learning_rate": 2.1747624558281867e-06, |
| "loss": 0.001, |
| "reward": 0.20949998497962952, |
| "reward_std": 0.15469324588775635, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2094999998807907, |
| "step": 197 |
| }, |
| { |
| "completion_length": 318.25, |
| "epoch": 0.591044776119403, |
| "grad_norm": 0.426329642534256, |
| "kl": 0.019156118854880333, |
| "learning_rate": 2.1489094282118393e-06, |
| "loss": 0.0008, |
| "reward": 0.28349998593330383, |
| "reward_std": 0.6506469249725342, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03349999710917473, |
| "step": 198 |
| }, |
| { |
| "completion_length": 658.0, |
| "epoch": 0.5940298507462687, |
| "grad_norm": 0.15060272812843323, |
| "kl": 0.004683774430304766, |
| "learning_rate": 2.1230946462395412e-06, |
| "loss": 0.0002, |
| "reward": 0.125, |
| "reward_std": 0.06681530922651291, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.125, |
| "step": 199 |
| }, |
| { |
| "completion_length": 305.0, |
| "epoch": 0.5970149253731343, |
| "grad_norm": 0.40997669100761414, |
| "kl": 0.01804080791771412, |
| "learning_rate": 2.0973209220146135e-06, |
| "loss": 0.0007, |
| "reward": 0.33412498235702515, |
| "reward_std": 0.7774099111557007, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.08412499725818634, |
| "step": 200 |
| }, |
| { |
| "completion_length": 590.0, |
| "epoch": 0.6, |
| "grad_norm": 0.2662385106086731, |
| "kl": 0.006966865621507168, |
| "learning_rate": 2.071591063167797e-06, |
| "loss": 0.0003, |
| "reward": -0.23762500286102295, |
| "reward_std": 0.4790639281272888, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.23762500286102295, |
| "step": 201 |
| }, |
| { |
| "completion_length": 240.75, |
| "epoch": 0.6029850746268657, |
| "grad_norm": 0.43724602460861206, |
| "kl": 0.027211960405111313, |
| "learning_rate": 2.045907872551409e-06, |
| "loss": 0.0011, |
| "reward": 0.0846249908208847, |
| "reward_std": 0.24422587454319, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0846249982714653, |
| "step": 202 |
| }, |
| { |
| "completion_length": 463.0, |
| "epoch": 0.6059701492537314, |
| "grad_norm": 0.33706098794937134, |
| "kl": 0.012281076051294804, |
| "learning_rate": 2.0202741479340193e-06, |
| "loss": 0.0005, |
| "reward": 0.23412498831748962, |
| "reward_std": 0.1411113291978836, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.23412498831748962, |
| "step": 203 |
| }, |
| { |
| "completion_length": 223.625, |
| "epoch": 0.608955223880597, |
| "grad_norm": 0.32199856638908386, |
| "kl": 0.02061256766319275, |
| "learning_rate": 1.9946926816956743e-06, |
| "loss": 0.0008, |
| "reward": 0.15049999952316284, |
| "reward_std": 0.21179774403572083, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.15049999952316284, |
| "step": 204 |
| }, |
| { |
| "completion_length": 433.75, |
| "epoch": 0.6119402985074627, |
| "grad_norm": 0.16691480576992035, |
| "kl": 0.015048968605697155, |
| "learning_rate": 1.969166260523717e-06, |
| "loss": 0.0006, |
| "reward": 0.13649998605251312, |
| "reward_std": 0.33145737648010254, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13650000095367432, |
| "step": 205 |
| }, |
| { |
| "completion_length": 363.125, |
| "epoch": 0.6149253731343284, |
| "grad_norm": 0.24160078167915344, |
| "kl": 0.010224804282188416, |
| "learning_rate": 1.9436976651092143e-06, |
| "loss": 0.0004, |
| "reward": 0.13512499630451202, |
| "reward_std": 0.08309278637170792, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13512499630451202, |
| "step": 206 |
| }, |
| { |
| "completion_length": 616.625, |
| "epoch": 0.6179104477611941, |
| "grad_norm": 0.24529774487018585, |
| "kl": 0.009581487625837326, |
| "learning_rate": 1.918289669844058e-06, |
| "loss": 0.0004, |
| "reward": 0.12787500023841858, |
| "reward_std": 0.11601285636425018, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.12787500023841858, |
| "step": 207 |
| }, |
| { |
| "completion_length": 285.25, |
| "epoch": 0.6208955223880597, |
| "grad_norm": 0.4515547454357147, |
| "kl": 0.03280240669846535, |
| "learning_rate": 1.8929450425187298e-06, |
| "loss": 0.0013, |
| "reward": 0.13300000131130219, |
| "reward_std": 0.052966292947530746, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13300000131130219, |
| "step": 208 |
| }, |
| { |
| "completion_length": 143.125, |
| "epoch": 0.6238805970149254, |
| "grad_norm": 0.4566623866558075, |
| "kl": 0.028078753501176834, |
| "learning_rate": 1.8676665440207982e-06, |
| "loss": 0.0011, |
| "reward": 0.8535000085830688, |
| "reward_std": 1.0311086177825928, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.10350000858306885, |
| "step": 209 |
| }, |
| { |
| "completion_length": 301.375, |
| "epoch": 0.6268656716417911, |
| "grad_norm": 0.35615095496177673, |
| "kl": 0.019768187776207924, |
| "learning_rate": 1.8424569280341653e-06, |
| "loss": 0.0008, |
| "reward": 0.5614999532699585, |
| "reward_std": 0.9637962579727173, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.06149999797344208, |
| "step": 210 |
| }, |
| { |
| "completion_length": 205.625, |
| "epoch": 0.6298507462686567, |
| "grad_norm": 0.572158932685852, |
| "kl": 0.03914448618888855, |
| "learning_rate": 1.817318940739098e-06, |
| "loss": 0.0016, |
| "reward": 1.1281249523162842, |
| "reward_std": 1.0498979091644287, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.12812499701976776, |
| "step": 211 |
| }, |
| { |
| "completion_length": 202.625, |
| "epoch": 0.6328358208955224, |
| "grad_norm": 0.41864466667175293, |
| "kl": 0.033789440989494324, |
| "learning_rate": 1.7922553205130708e-06, |
| "loss": 0.0014, |
| "reward": 0.453249990940094, |
| "reward_std": 0.7317192554473877, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.203249990940094, |
| "step": 212 |
| }, |
| { |
| "completion_length": 479.375, |
| "epoch": 0.6358208955223881, |
| "grad_norm": 0.19674842059612274, |
| "kl": 0.01116740982979536, |
| "learning_rate": 1.767268797632472e-06, |
| "loss": 0.0004, |
| "reward": 0.028875000774860382, |
| "reward_std": 0.4791833460330963, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.028875000774860382, |
| "step": 213 |
| }, |
| { |
| "completion_length": 228.375, |
| "epoch": 0.6388059701492538, |
| "grad_norm": 0.38439103960990906, |
| "kl": 0.026051824912428856, |
| "learning_rate": 1.7423620939751787e-06, |
| "loss": 0.001, |
| "reward": 0.029624998569488525, |
| "reward_std": 0.3076816201210022, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.029624998569488525, |
| "step": 214 |
| }, |
| { |
| "completion_length": 267.625, |
| "epoch": 0.6417910447761194, |
| "grad_norm": 0.42961663007736206, |
| "kl": 0.015496465377509594, |
| "learning_rate": 1.7175379227240524e-06, |
| "loss": 0.0006, |
| "reward": 0.28037500381469727, |
| "reward_std": 0.08738410472869873, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.28037500381469727, |
| "step": 215 |
| }, |
| { |
| "completion_length": 480.875, |
| "epoch": 0.6447761194029851, |
| "grad_norm": 0.2381271868944168, |
| "kl": 0.0213899165391922, |
| "learning_rate": 1.6927989880713852e-06, |
| "loss": 0.0009, |
| "reward": -0.02775000035762787, |
| "reward_std": 0.5771975517272949, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.02775000035762787, |
| "step": 216 |
| }, |
| { |
| "completion_length": 406.75, |
| "epoch": 0.6477611940298508, |
| "grad_norm": 0.2900620102882385, |
| "kl": 0.015803145244717598, |
| "learning_rate": 1.6681479849243153e-06, |
| "loss": 0.0006, |
| "reward": 0.218874990940094, |
| "reward_std": 0.05794192850589752, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.218874990940094, |
| "step": 217 |
| }, |
| { |
| "completion_length": 457.5, |
| "epoch": 0.6507462686567164, |
| "grad_norm": 0.22088056802749634, |
| "kl": 0.014149850234389305, |
| "learning_rate": 1.6435875986112685e-06, |
| "loss": 0.0006, |
| "reward": 0.08675000071525574, |
| "reward_std": 0.2752093970775604, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.08675000071525574, |
| "step": 218 |
| }, |
| { |
| "completion_length": 466.0, |
| "epoch": 0.6537313432835821, |
| "grad_norm": 0.2606593668460846, |
| "kl": 0.010915166698396206, |
| "learning_rate": 1.6191205045894283e-06, |
| "loss": 0.0004, |
| "reward": 0.08262500166893005, |
| "reward_std": 0.21830511093139648, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.08262500166893005, |
| "step": 219 |
| }, |
| { |
| "completion_length": 549.875, |
| "epoch": 0.6567164179104478, |
| "grad_norm": 0.2776827812194824, |
| "kl": 0.01001064758747816, |
| "learning_rate": 1.594749368153292e-06, |
| "loss": 0.0004, |
| "reward": 0.10012499988079071, |
| "reward_std": 0.11426840722560883, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.10012499988079071, |
| "step": 220 |
| }, |
| { |
| "completion_length": 383.75, |
| "epoch": 0.6597014925373135, |
| "grad_norm": 0.3466421961784363, |
| "kl": 0.011461976915597916, |
| "learning_rate": 1.570476844144329e-06, |
| "loss": 0.0005, |
| "reward": 0.1875, |
| "reward_std": 0.06681530922651291, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1875, |
| "step": 221 |
| }, |
| { |
| "completion_length": 349.625, |
| "epoch": 0.6626865671641791, |
| "grad_norm": 0.34904956817626953, |
| "kl": 0.023631775751709938, |
| "learning_rate": 1.5463055766617763e-06, |
| "loss": 0.0009, |
| "reward": 0.11737500131130219, |
| "reward_std": 0.25096383690834045, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.11737500131130219, |
| "step": 222 |
| }, |
| { |
| "completion_length": 427.75, |
| "epoch": 0.6656716417910448, |
| "grad_norm": 0.28804051876068115, |
| "kl": 0.02174968458712101, |
| "learning_rate": 1.5222381987746104e-06, |
| "loss": 0.0009, |
| "reward": 0.656374990940094, |
| "reward_std": 0.910563051700592, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.156374990940094, |
| "step": 223 |
| }, |
| { |
| "completion_length": 303.625, |
| "epoch": 0.6686567164179105, |
| "grad_norm": 0.24906383454799652, |
| "kl": 0.017897039651870728, |
| "learning_rate": 1.4982773322347144e-06, |
| "loss": 0.0007, |
| "reward": 0.453249990940094, |
| "reward_std": 0.7347634434700012, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.203249990940094, |
| "step": 224 |
| }, |
| { |
| "completion_length": 492.25, |
| "epoch": 0.6716417910447762, |
| "grad_norm": 0.24582958221435547, |
| "kl": 0.013822609558701515, |
| "learning_rate": 1.4744255871912825e-06, |
| "loss": 0.0006, |
| "reward": -0.2888749837875366, |
| "reward_std": 0.9461946487426758, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.2888749837875366, |
| "step": 225 |
| }, |
| { |
| "completion_length": 240.5, |
| "epoch": 0.6746268656716418, |
| "grad_norm": 0.380571573972702, |
| "kl": 0.020389374345541, |
| "learning_rate": 1.4506855619064847e-06, |
| "loss": 0.0008, |
| "reward": 0.6603749990463257, |
| "reward_std": 0.9848659038543701, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.16037499904632568, |
| "step": 226 |
| }, |
| { |
| "completion_length": 142.75, |
| "epoch": 0.6776119402985075, |
| "grad_norm": 0.4420141875743866, |
| "kl": 0.029349079355597496, |
| "learning_rate": 1.4270598424724291e-06, |
| "loss": 0.0012, |
| "reward": 0.9844999313354492, |
| "reward_std": 0.9861009120941162, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234499990940094, |
| "step": 227 |
| }, |
| { |
| "completion_length": 311.375, |
| "epoch": 0.6805970149253732, |
| "grad_norm": 0.35497111082077026, |
| "kl": 0.01766645722091198, |
| "learning_rate": 1.4035510025294463e-06, |
| "loss": 0.0007, |
| "reward": 0.140625, |
| "reward_std": 0.12387890368700027, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.140625, |
| "step": 228 |
| }, |
| { |
| "completion_length": 233.875, |
| "epoch": 0.6835820895522388, |
| "grad_norm": 0.30006691813468933, |
| "kl": 0.02256627008318901, |
| "learning_rate": 1.380161602985738e-06, |
| "loss": 0.0009, |
| "reward": 0.05274999886751175, |
| "reward_std": 0.35022878646850586, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.05274999886751175, |
| "step": 229 |
| }, |
| { |
| "completion_length": 427.625, |
| "epoch": 0.6865671641791045, |
| "grad_norm": 0.22094956040382385, |
| "kl": 0.014623177237808704, |
| "learning_rate": 1.3568941917384038e-06, |
| "loss": 0.0006, |
| "reward": -0.6439999938011169, |
| "reward_std": 1.0302884578704834, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.6440000534057617, |
| "step": 230 |
| }, |
| { |
| "completion_length": 151.0, |
| "epoch": 0.6895522388059702, |
| "grad_norm": 0.669421374797821, |
| "kl": 0.046193912625312805, |
| "learning_rate": 1.3337513033958904e-06, |
| "loss": 0.0018, |
| "reward": 1.355125069618225, |
| "reward_std": 0.9819782376289368, |
| "rewards/correctness_reward_func": 1.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.10512500256299973, |
| "step": 231 |
| }, |
| { |
| "completion_length": 524.875, |
| "epoch": 0.6925373134328359, |
| "grad_norm": 0.1570989489555359, |
| "kl": 0.016627641394734383, |
| "learning_rate": 1.310735459001884e-06, |
| "loss": 0.0007, |
| "reward": 0.31312498450279236, |
| "reward_std": 1.1351596117019653, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.18687501549720764, |
| "step": 232 |
| }, |
| { |
| "completion_length": 471.875, |
| "epoch": 0.6955223880597015, |
| "grad_norm": 0.32607302069664, |
| "kl": 0.0247778482735157, |
| "learning_rate": 1.2878491657606874e-06, |
| "loss": 0.001, |
| "reward": 0.187624990940094, |
| "reward_std": 0.1159592717885971, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.187624990940094, |
| "step": 233 |
| }, |
| { |
| "completion_length": 376.875, |
| "epoch": 0.6985074626865672, |
| "grad_norm": 0.5452307462692261, |
| "kl": 0.018254172056913376, |
| "learning_rate": 1.2650949167640997e-06, |
| "loss": 0.0007, |
| "reward": 0.171875, |
| "reward_std": 0.09300297498703003, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.171875, |
| "step": 234 |
| }, |
| { |
| "completion_length": 281.25, |
| "epoch": 0.7014925373134329, |
| "grad_norm": 0.276838093996048, |
| "kl": 0.0188542939722538, |
| "learning_rate": 1.2424751907198312e-06, |
| "loss": 0.0008, |
| "reward": 0.750124990940094, |
| "reward_std": 0.9281703233718872, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.250124990940094, |
| "step": 235 |
| }, |
| { |
| "completion_length": 550.625, |
| "epoch": 0.7044776119402985, |
| "grad_norm": 0.29794007539749146, |
| "kl": 0.015997443348169327, |
| "learning_rate": 1.219992451681494e-06, |
| "loss": 0.0006, |
| "reward": 0.171999990940094, |
| "reward_std": 0.09312357008457184, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.171999990940094, |
| "step": 236 |
| }, |
| { |
| "completion_length": 263.625, |
| "epoch": 0.7074626865671642, |
| "grad_norm": 0.4172457158565521, |
| "kl": 0.021728359162807465, |
| "learning_rate": 1.1976491487801747e-06, |
| "loss": 0.0009, |
| "reward": 0.140749990940094, |
| "reward_std": 0.1409870833158493, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.140749990940094, |
| "step": 237 |
| }, |
| { |
| "completion_length": 365.0, |
| "epoch": 0.7104477611940299, |
| "grad_norm": 0.2539860010147095, |
| "kl": 0.015010641887784004, |
| "learning_rate": 1.17544771595765e-06, |
| "loss": 0.0006, |
| "reward": 0.2433750182390213, |
| "reward_std": 0.9216052293777466, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.006624996662139893, |
| "step": 238 |
| }, |
| { |
| "completion_length": 206.5, |
| "epoch": 0.7134328358208956, |
| "grad_norm": 0.33303573727607727, |
| "kl": 0.020331714302301407, |
| "learning_rate": 1.1533905717012425e-06, |
| "loss": 0.0008, |
| "reward": 0.07275000214576721, |
| "reward_std": 0.3491785228252411, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07275000214576721, |
| "step": 239 |
| }, |
| { |
| "completion_length": 528.25, |
| "epoch": 0.7164179104477612, |
| "grad_norm": 0.380811870098114, |
| "kl": 0.012931251898407936, |
| "learning_rate": 1.1314801187803687e-06, |
| "loss": 0.0005, |
| "reward": 0.13875000178813934, |
| "reward_std": 0.23173367977142334, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13874998688697815, |
| "step": 240 |
| }, |
| { |
| "completion_length": 333.125, |
| "epoch": 0.7194029850746269, |
| "grad_norm": 0.4135821759700775, |
| "kl": 0.01626397855579853, |
| "learning_rate": 1.109718743984794e-06, |
| "loss": 0.0007, |
| "reward": 0.4181250035762787, |
| "reward_std": 0.7555687427520752, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1681250035762787, |
| "step": 241 |
| }, |
| { |
| "completion_length": 135.125, |
| "epoch": 0.7223880597014926, |
| "grad_norm": 0.41655805706977844, |
| "kl": 0.05684615299105644, |
| "learning_rate": 1.0881088178646291e-06, |
| "loss": 0.0023, |
| "reward": 0.4477500021457672, |
| "reward_std": 0.7376521229743958, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1977500021457672, |
| "step": 242 |
| }, |
| { |
| "completion_length": 531.875, |
| "epoch": 0.7253731343283583, |
| "grad_norm": 0.47717010974884033, |
| "kl": 0.019140595570206642, |
| "learning_rate": 1.0666526944721017e-06, |
| "loss": 0.0008, |
| "reward": 0.1667499989271164, |
| "reward_std": 0.20734770596027374, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1667499989271164, |
| "step": 243 |
| }, |
| { |
| "completion_length": 257.875, |
| "epoch": 0.7283582089552239, |
| "grad_norm": 0.2818719446659088, |
| "kl": 0.017366956919431686, |
| "learning_rate": 1.0453527111051183e-06, |
| "loss": 0.0007, |
| "reward": 0.700249969959259, |
| "reward_std": 1.00717031955719, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.20024999976158142, |
| "step": 244 |
| }, |
| { |
| "completion_length": 336.625, |
| "epoch": 0.7313432835820896, |
| "grad_norm": 0.2551906108856201, |
| "kl": 0.017408214509487152, |
| "learning_rate": 1.0242111880526495e-06, |
| "loss": 0.0007, |
| "reward": 0.5737500190734863, |
| "reward_std": 0.954799234867096, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07374998927116394, |
| "step": 245 |
| }, |
| { |
| "completion_length": 246.75, |
| "epoch": 0.7343283582089553, |
| "grad_norm": 0.2634367048740387, |
| "kl": 0.020746061578392982, |
| "learning_rate": 1.0032304283419792e-06, |
| "loss": 0.0008, |
| "reward": 0.6588749885559082, |
| "reward_std": 0.8327701687812805, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.0911249965429306, |
| "step": 246 |
| }, |
| { |
| "completion_length": 262.25, |
| "epoch": 0.7373134328358208, |
| "grad_norm": 0.2753911316394806, |
| "kl": 0.025336353108286858, |
| "learning_rate": 9.824127174878196e-07, |
| "loss": 0.001, |
| "reward": 0.10499998927116394, |
| "reward_std": 0.3267969787120819, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.10499998927116394, |
| "step": 247 |
| }, |
| { |
| "completion_length": 434.625, |
| "epoch": 0.7402985074626866, |
| "grad_norm": 0.2999410927295685, |
| "kl": 0.013439298607409, |
| "learning_rate": 9.617603232433475e-07, |
| "loss": 0.0005, |
| "reward": 0.890625, |
| "reward_std": 1.1268585920333862, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.140625, |
| "step": 248 |
| }, |
| { |
| "completion_length": 507.375, |
| "epoch": 0.7432835820895523, |
| "grad_norm": 0.2620807886123657, |
| "kl": 0.012813393957912922, |
| "learning_rate": 9.412754953531664e-07, |
| "loss": 0.0005, |
| "reward": 0.034749992191791534, |
| "reward_std": 0.5604296326637268, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03474998474121094, |
| "step": 249 |
| }, |
| { |
| "completion_length": 249.125, |
| "epoch": 0.746268656716418, |
| "grad_norm": 0.5003867149353027, |
| "kl": 0.024488642811775208, |
| "learning_rate": 9.209604653082326e-07, |
| "loss": 0.001, |
| "reward": 0.7232499718666077, |
| "reward_std": 1.039300799369812, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.026749998331069946, |
| "step": 250 |
| }, |
| { |
| "completion_length": 453.375, |
| "epoch": 0.7492537313432835, |
| "grad_norm": 0.2221699208021164, |
| "kl": 0.01704486459493637, |
| "learning_rate": 9.008174461027724e-07, |
| "loss": 0.0007, |
| "reward": -0.044124990701675415, |
| "reward_std": 0.879956841468811, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.044124990701675415, |
| "step": 251 |
| }, |
| { |
| "completion_length": 199.25, |
| "epoch": 0.7522388059701492, |
| "grad_norm": 0.3644316792488098, |
| "kl": 0.036831244826316833, |
| "learning_rate": 8.808486319932083e-07, |
| "loss": 0.0015, |
| "reward": 0.07199999690055847, |
| "reward_std": 0.20690439641475677, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07199999690055847, |
| "step": 252 |
| }, |
| { |
| "completion_length": 140.875, |
| "epoch": 0.755223880597015, |
| "grad_norm": 0.45422235131263733, |
| "kl": 0.040265657007694244, |
| "learning_rate": 8.610561982591356e-07, |
| "loss": 0.0016, |
| "reward": 0.703125, |
| "reward_std": 0.8811962008476257, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.203125, |
| "step": 253 |
| }, |
| { |
| "completion_length": 460.625, |
| "epoch": 0.7582089552238805, |
| "grad_norm": 0.22300752997398376, |
| "kl": 0.011045263148844242, |
| "learning_rate": 8.414423009663564e-07, |
| "loss": 0.0004, |
| "reward": -0.3152500092983246, |
| "reward_std": 0.8068054914474487, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.3152500092983246, |
| "step": 254 |
| }, |
| { |
| "completion_length": 443.375, |
| "epoch": 0.7611940298507462, |
| "grad_norm": 0.4048740863800049, |
| "kl": 0.020228559151291847, |
| "learning_rate": 8.220090767320138e-07, |
| "loss": 0.0008, |
| "reward": 0.234624981880188, |
| "reward_std": 0.08016578108072281, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.23462499678134918, |
| "step": 255 |
| }, |
| { |
| "completion_length": 372.75, |
| "epoch": 0.764179104477612, |
| "grad_norm": 0.2398412674665451, |
| "kl": 0.014233190566301346, |
| "learning_rate": 8.027586424918413e-07, |
| "loss": 0.0006, |
| "reward": 0.03700000047683716, |
| "reward_std": 0.41604670882225037, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03700000047683716, |
| "step": 256 |
| }, |
| { |
| "completion_length": 321.125, |
| "epoch": 0.7671641791044777, |
| "grad_norm": 0.2411520928144455, |
| "kl": 0.020644700154662132, |
| "learning_rate": 7.836930952695535e-07, |
| "loss": 0.0008, |
| "reward": 0.12312500178813934, |
| "reward_std": 0.2639526128768921, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.12312500178813934, |
| "step": 257 |
| }, |
| { |
| "completion_length": 130.5, |
| "epoch": 0.7701492537313432, |
| "grad_norm": 0.49876320362091064, |
| "kl": 0.035634126514196396, |
| "learning_rate": 7.648145119484152e-07, |
| "loss": 0.0014, |
| "reward": 0.9228750467300415, |
| "reward_std": 1.0723521709442139, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.17287498712539673, |
| "step": 258 |
| }, |
| { |
| "completion_length": 359.375, |
| "epoch": 0.7731343283582089, |
| "grad_norm": 0.22742308676242828, |
| "kl": 0.018567712977528572, |
| "learning_rate": 7.461249490449954e-07, |
| "loss": 0.0007, |
| "reward": 0.33787500858306885, |
| "reward_std": 0.8415562510490417, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.08787500113248825, |
| "step": 259 |
| }, |
| { |
| "completion_length": 508.75, |
| "epoch": 0.7761194029850746, |
| "grad_norm": 0.19451585412025452, |
| "kl": 0.016084793955087662, |
| "learning_rate": 7.276264424851426e-07, |
| "loss": 0.0006, |
| "reward": 0.1028750017285347, |
| "reward_std": 0.32013678550720215, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1028750017285347, |
| "step": 260 |
| }, |
| { |
| "completion_length": 326.75, |
| "epoch": 0.7791044776119403, |
| "grad_norm": 0.27631670236587524, |
| "kl": 0.018710516393184662, |
| "learning_rate": 7.093210073822027e-07, |
| "loss": 0.0007, |
| "reward": -0.4192500114440918, |
| "reward_std": 0.6906015872955322, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.4192500114440918, |
| "step": 261 |
| }, |
| { |
| "completion_length": 413.625, |
| "epoch": 0.7820895522388059, |
| "grad_norm": 0.2365199774503708, |
| "kl": 0.01709006167948246, |
| "learning_rate": 6.912106378175098e-07, |
| "loss": 0.0007, |
| "reward": 0.234375, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234375, |
| "step": 262 |
| }, |
| { |
| "completion_length": 264.5, |
| "epoch": 0.7850746268656716, |
| "grad_norm": 0.3793802261352539, |
| "kl": 0.02793733775615692, |
| "learning_rate": 6.732973066231563e-07, |
| "loss": 0.0011, |
| "reward": 0.6676249504089355, |
| "reward_std": 0.9522945284843445, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.16762499511241913, |
| "step": 263 |
| }, |
| { |
| "completion_length": 505.625, |
| "epoch": 0.7880597014925373, |
| "grad_norm": 0.2976485788822174, |
| "kl": 0.010713014751672745, |
| "learning_rate": 6.555829651670912e-07, |
| "loss": 0.0004, |
| "reward": 0.08262500166893005, |
| "reward_std": 0.2939401865005493, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.08262500166893005, |
| "step": 264 |
| }, |
| { |
| "completion_length": 249.25, |
| "epoch": 0.7910447761194029, |
| "grad_norm": 0.27218976616859436, |
| "kl": 0.029523877426981926, |
| "learning_rate": 6.380695431405453e-07, |
| "loss": 0.0012, |
| "reward": 0.05262499302625656, |
| "reward_std": 0.3924490809440613, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.05262500047683716, |
| "step": 265 |
| }, |
| { |
| "completion_length": 333.25, |
| "epoch": 0.7940298507462686, |
| "grad_norm": 0.25616171956062317, |
| "kl": 0.022499781101942062, |
| "learning_rate": 6.207589483478266e-07, |
| "loss": 0.0009, |
| "reward": 0.5889999866485596, |
| "reward_std": 0.9673720002174377, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.08900000154972076, |
| "step": 266 |
| }, |
| { |
| "completion_length": 257.625, |
| "epoch": 0.7970149253731343, |
| "grad_norm": 0.32119879126548767, |
| "kl": 0.028798656538128853, |
| "learning_rate": 6.036530664984922e-07, |
| "loss": 0.0012, |
| "reward": 0.7595000267028809, |
| "reward_std": 1.2280067205429077, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.00950000062584877, |
| "step": 267 |
| }, |
| { |
| "completion_length": 330.125, |
| "epoch": 0.8, |
| "grad_norm": 0.4567493498325348, |
| "kl": 0.03090793453156948, |
| "learning_rate": 5.867537610019317e-07, |
| "loss": 0.0012, |
| "reward": 0.06974999606609344, |
| "reward_std": 0.1446550041437149, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.06975000351667404, |
| "step": 268 |
| }, |
| { |
| "completion_length": 483.25, |
| "epoch": 0.8029850746268656, |
| "grad_norm": 0.1947696954011917, |
| "kl": 0.014657390303909779, |
| "learning_rate": 5.700628727643806e-07, |
| "loss": 0.0006, |
| "reward": 0.2797499895095825, |
| "reward_std": 0.4771743416786194, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.029750000685453415, |
| "step": 269 |
| }, |
| { |
| "completion_length": 390.5, |
| "epoch": 0.8059701492537313, |
| "grad_norm": 0.28339794278144836, |
| "kl": 0.01822579652070999, |
| "learning_rate": 5.53582219988382e-07, |
| "loss": 0.0007, |
| "reward": 0.0495000034570694, |
| "reward_std": 0.2241402566432953, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0494999997317791, |
| "step": 270 |
| }, |
| { |
| "completion_length": 612.375, |
| "epoch": 0.808955223880597, |
| "grad_norm": 0.19297142326831818, |
| "kl": 0.008483514189720154, |
| "learning_rate": 5.373135979747226e-07, |
| "loss": 0.0003, |
| "reward": 0.234375, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234375, |
| "step": 271 |
| }, |
| { |
| "completion_length": 442.25, |
| "epoch": 0.8119402985074626, |
| "grad_norm": 0.18323534727096558, |
| "kl": 0.015557816252112389, |
| "learning_rate": 5.21258778926865e-07, |
| "loss": 0.0006, |
| "reward": 0.4873749911785126, |
| "reward_std": 0.7220054864883423, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.23737499117851257, |
| "step": 272 |
| }, |
| { |
| "completion_length": 420.375, |
| "epoch": 0.8149253731343283, |
| "grad_norm": 0.29259246587753296, |
| "kl": 0.014818736352026463, |
| "learning_rate": 5.054195117578914e-07, |
| "loss": 0.0006, |
| "reward": 0.10899998992681503, |
| "reward_std": 0.30351370573043823, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.10899999737739563, |
| "step": 273 |
| }, |
| { |
| "completion_length": 144.875, |
| "epoch": 0.817910447761194, |
| "grad_norm": 0.4706750810146332, |
| "kl": 0.04596470296382904, |
| "learning_rate": 4.897975218999926e-07, |
| "loss": 0.0018, |
| "reward": 0.7043750286102295, |
| "reward_std": 0.9201574325561523, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2043750137090683, |
| "step": 274 |
| }, |
| { |
| "completion_length": 296.75, |
| "epoch": 0.8208955223880597, |
| "grad_norm": 0.24620626866817474, |
| "kl": 0.02245059236884117, |
| "learning_rate": 4.7439451111650685e-07, |
| "loss": 0.0009, |
| "reward": 0.40625, |
| "reward_std": 0.752228856086731, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.15625, |
| "step": 275 |
| }, |
| { |
| "completion_length": 202.875, |
| "epoch": 0.8238805970149253, |
| "grad_norm": 0.4145122766494751, |
| "kl": 0.024612464010715485, |
| "learning_rate": 4.5921215731654144e-07, |
| "loss": 0.001, |
| "reward": 0.12150000035762787, |
| "reward_std": 0.18457983434200287, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.12150000035762787, |
| "step": 276 |
| }, |
| { |
| "completion_length": 325.25, |
| "epoch": 0.826865671641791, |
| "grad_norm": 0.4407789409160614, |
| "kl": 0.03133422136306763, |
| "learning_rate": 4.4425211437218926e-07, |
| "loss": 0.0013, |
| "reward": 0.20337499678134918, |
| "reward_std": 0.09237955510616302, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.20337499678134918, |
| "step": 277 |
| }, |
| { |
| "completion_length": 278.375, |
| "epoch": 0.8298507462686567, |
| "grad_norm": 0.39592450857162476, |
| "kl": 0.024857958778738976, |
| "learning_rate": 4.2951601193837124e-07, |
| "loss": 0.001, |
| "reward": 0.43712499737739563, |
| "reward_std": 1.1832053661346436, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.06287499517202377, |
| "step": 278 |
| }, |
| { |
| "completion_length": 557.625, |
| "epoch": 0.8328358208955224, |
| "grad_norm": 0.21943753957748413, |
| "kl": 0.008740151301026344, |
| "learning_rate": 4.150054552753055e-07, |
| "loss": 0.0003, |
| "reward": -0.22687500715255737, |
| "reward_std": 0.8986358642578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.22687500715255737, |
| "step": 279 |
| }, |
| { |
| "completion_length": 147.125, |
| "epoch": 0.835820895522388, |
| "grad_norm": 0.452767550945282, |
| "kl": 0.03773313760757446, |
| "learning_rate": 4.0072202507364543e-07, |
| "loss": 0.0015, |
| "reward": 1.2141249179840088, |
| "reward_std": 1.0455961227416992, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21412500739097595, |
| "step": 280 |
| }, |
| { |
| "completion_length": 446.25, |
| "epoch": 0.8388059701492537, |
| "grad_norm": 0.19403225183486938, |
| "kl": 0.01679888553917408, |
| "learning_rate": 3.866672772822863e-07, |
| "loss": 0.0007, |
| "reward": 0.056999996304512024, |
| "reward_std": 0.3133218586444855, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.056999996304512024, |
| "step": 281 |
| }, |
| { |
| "completion_length": 200.25, |
| "epoch": 0.8417910447761194, |
| "grad_norm": 0.60846346616745, |
| "kl": 0.029500508680939674, |
| "learning_rate": 3.728427429388709e-07, |
| "loss": 0.0012, |
| "reward": 0.7407500147819519, |
| "reward_std": 0.9356353878974915, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2407499998807907, |
| "step": 282 |
| }, |
| { |
| "completion_length": 339.125, |
| "epoch": 0.844776119402985, |
| "grad_norm": 0.25175222754478455, |
| "kl": 0.02122483216226101, |
| "learning_rate": 3.592499280030057e-07, |
| "loss": 0.0008, |
| "reward": 0.07399998605251312, |
| "reward_std": 0.5027032494544983, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07399999350309372, |
| "step": 283 |
| }, |
| { |
| "completion_length": 275.625, |
| "epoch": 0.8477611940298507, |
| "grad_norm": 0.8914472460746765, |
| "kl": 0.029980091378092766, |
| "learning_rate": 3.458903131922134e-07, |
| "loss": 0.0012, |
| "reward": 0.2407499998807907, |
| "reward_std": 0.11513439565896988, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2407499998807907, |
| "step": 284 |
| }, |
| { |
| "completion_length": 620.875, |
| "epoch": 0.8507462686567164, |
| "grad_norm": 0.16470280289649963, |
| "kl": 0.006700257305055857, |
| "learning_rate": 3.3276535382063184e-07, |
| "loss": 0.0003, |
| "reward": 0.06174999475479126, |
| "reward_std": 0.33658909797668457, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.06174999475479126, |
| "step": 285 |
| }, |
| { |
| "completion_length": 506.25, |
| "epoch": 0.8537313432835821, |
| "grad_norm": 0.26415377855300903, |
| "kl": 0.01859319768846035, |
| "learning_rate": 3.1987647964048075e-07, |
| "loss": 0.0007, |
| "reward": 0.06987500190734863, |
| "reward_std": 0.2795202136039734, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.06987500190734863, |
| "step": 286 |
| }, |
| { |
| "completion_length": 318.625, |
| "epoch": 0.8567164179104477, |
| "grad_norm": 0.41256532073020935, |
| "kl": 0.016763173043727875, |
| "learning_rate": 3.07225094686314e-07, |
| "loss": 0.0007, |
| "reward": 0.5, |
| "reward_std": 0.7071067690849304, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 287 |
| }, |
| { |
| "completion_length": 248.25, |
| "epoch": 0.8597014925373134, |
| "grad_norm": 0.3886096775531769, |
| "kl": 0.0391070619225502, |
| "learning_rate": 2.9481257712206974e-07, |
| "loss": 0.0016, |
| "reward": 1.424375057220459, |
| "reward_std": 1.079121470451355, |
| "rewards/correctness_reward_func": 1.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1743749976158142, |
| "step": 288 |
| }, |
| { |
| "completion_length": 198.875, |
| "epoch": 0.8626865671641791, |
| "grad_norm": 0.3038368821144104, |
| "kl": 0.028413468971848488, |
| "learning_rate": 2.8264027909094715e-07, |
| "loss": 0.0011, |
| "reward": 0.04787500202655792, |
| "reward_std": 0.277681440114975, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.04787500202655792, |
| "step": 289 |
| }, |
| { |
| "completion_length": 241.375, |
| "epoch": 0.8656716417910447, |
| "grad_norm": 0.6994856595993042, |
| "kl": 0.025603370741009712, |
| "learning_rate": 2.707095265681081e-07, |
| "loss": 0.001, |
| "reward": 0.36537498235702515, |
| "reward_std": 1.275699257850647, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.13462501764297485, |
| "step": 290 |
| }, |
| { |
| "completion_length": 269.75, |
| "epoch": 0.8686567164179104, |
| "grad_norm": 0.5225069522857666, |
| "kl": 0.021305205300450325, |
| "learning_rate": 2.5902161921623454e-07, |
| "loss": 0.0009, |
| "reward": 0.24774998426437378, |
| "reward_std": 0.13129328191280365, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.24774998426437378, |
| "step": 291 |
| }, |
| { |
| "completion_length": 301.875, |
| "epoch": 0.8716417910447761, |
| "grad_norm": 0.3499370813369751, |
| "kl": 0.019313901662826538, |
| "learning_rate": 2.4757783024395244e-07, |
| "loss": 0.0008, |
| "reward": 0.5285000205039978, |
| "reward_std": 0.8365910053253174, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.02850000374019146, |
| "step": 292 |
| }, |
| { |
| "completion_length": 382.125, |
| "epoch": 0.8746268656716418, |
| "grad_norm": 0.3204389214515686, |
| "kl": 0.01630595698952675, |
| "learning_rate": 2.3637940626713346e-07, |
| "loss": 0.0007, |
| "reward": 0.8981249928474426, |
| "reward_std": 1.1263054609298706, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.14812499284744263, |
| "step": 293 |
| }, |
| { |
| "completion_length": 364.625, |
| "epoch": 0.8776119402985074, |
| "grad_norm": 0.43380382657051086, |
| "kl": 0.014025033451616764, |
| "learning_rate": 2.254275671731007e-07, |
| "loss": 0.0006, |
| "reward": 0.13312500715255737, |
| "reward_std": 0.20087197422981262, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13312500715255737, |
| "step": 294 |
| }, |
| { |
| "completion_length": 362.125, |
| "epoch": 0.8805970149253731, |
| "grad_norm": 0.25492313504219055, |
| "kl": 0.02461504563689232, |
| "learning_rate": 2.14723505987737e-07, |
| "loss": 0.001, |
| "reward": 1.11899995803833, |
| "reward_std": 1.0727145671844482, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.11900000274181366, |
| "step": 295 |
| }, |
| { |
| "completion_length": 239.625, |
| "epoch": 0.8835820895522388, |
| "grad_norm": 0.2532467842102051, |
| "kl": 0.019054116681218147, |
| "learning_rate": 2.0426838874552713e-07, |
| "loss": 0.0008, |
| "reward": -0.02237500250339508, |
| "reward_std": 0.34399375319480896, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.022374995052814484, |
| "step": 296 |
| }, |
| { |
| "completion_length": 406.125, |
| "epoch": 0.8865671641791045, |
| "grad_norm": 0.3017323613166809, |
| "kl": 0.014569297432899475, |
| "learning_rate": 1.9406335436253727e-07, |
| "loss": 0.0006, |
| "reward": 0.1875, |
| "reward_std": 0.09449111670255661, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1875, |
| "step": 297 |
| }, |
| { |
| "completion_length": 419.625, |
| "epoch": 0.8895522388059701, |
| "grad_norm": 0.27221450209617615, |
| "kl": 0.017288224771618843, |
| "learning_rate": 1.8410951451234533e-07, |
| "loss": 0.0007, |
| "reward": 0.06112499535083771, |
| "reward_std": 0.48568522930145264, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.06112499535083771, |
| "step": 298 |
| }, |
| { |
| "completion_length": 499.625, |
| "epoch": 0.8925373134328358, |
| "grad_norm": 0.2659822404384613, |
| "kl": 0.013604966923594475, |
| "learning_rate": 1.7440795350494588e-07, |
| "loss": 0.0005, |
| "reward": 0.45787498354911804, |
| "reward_std": 0.7346572875976562, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.20787498354911804, |
| "step": 299 |
| }, |
| { |
| "completion_length": 575.25, |
| "epoch": 0.8955223880597015, |
| "grad_norm": 0.40231838822364807, |
| "kl": 0.014159854501485825, |
| "learning_rate": 1.649597281686302e-07, |
| "loss": 0.0006, |
| "reward": 0.875, |
| "reward_std": 1.1417405605316162, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.125, |
| "step": 300 |
| }, |
| { |
| "completion_length": 499.0, |
| "epoch": 0.8985074626865671, |
| "grad_norm": 0.21424373984336853, |
| "kl": 0.032591626048088074, |
| "learning_rate": 1.5576586773486198e-07, |
| "loss": 0.0013, |
| "reward": -0.4646250009536743, |
| "reward_std": 0.961997926235199, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.4646250009536743, |
| "step": 301 |
| }, |
| { |
| "completion_length": 374.375, |
| "epoch": 0.9014925373134328, |
| "grad_norm": 0.24450530111789703, |
| "kl": 0.012314786203205585, |
| "learning_rate": 1.4682737372615968e-07, |
| "loss": 0.0005, |
| "reward": -0.10649999976158142, |
| "reward_std": 0.7154986262321472, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.10650000721216202, |
| "step": 302 |
| }, |
| { |
| "completion_length": 450.25, |
| "epoch": 0.9044776119402985, |
| "grad_norm": 0.22723200917243958, |
| "kl": 0.015731429681181908, |
| "learning_rate": 1.3814521984699597e-07, |
| "loss": 0.0006, |
| "reward": 0.28125, |
| "reward_std": 0.0883931964635849, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.28125, |
| "step": 303 |
| }, |
| { |
| "completion_length": 432.0, |
| "epoch": 0.9074626865671642, |
| "grad_norm": 0.22048191726207733, |
| "kl": 0.01865359954535961, |
| "learning_rate": 1.297203518777293e-07, |
| "loss": 0.0007, |
| "reward": 0.924375057220459, |
| "reward_std": 1.1127361059188843, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1743749976158142, |
| "step": 304 |
| }, |
| { |
| "completion_length": 216.75, |
| "epoch": 0.9104477611940298, |
| "grad_norm": 0.37429243326187134, |
| "kl": 0.023615093901753426, |
| "learning_rate": 1.2155368757157644e-07, |
| "loss": 0.0009, |
| "reward": 0.21875, |
| "reward_std": 0.0578637570142746, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21875, |
| "step": 305 |
| }, |
| { |
| "completion_length": 363.0, |
| "epoch": 0.9134328358208955, |
| "grad_norm": 0.25917860865592957, |
| "kl": 0.016996635124087334, |
| "learning_rate": 1.1364611655463737e-07, |
| "loss": 0.0007, |
| "reward": 0.2605000138282776, |
| "reward_std": 0.029698481783270836, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2605000138282776, |
| "step": 306 |
| }, |
| { |
| "completion_length": 429.875, |
| "epoch": 0.9164179104477612, |
| "grad_norm": 0.32845067977905273, |
| "kl": 0.006652818527072668, |
| "learning_rate": 1.0599850022898539e-07, |
| "loss": 0.0003, |
| "reward": 0.13762500882148743, |
| "reward_std": 0.2873464524745941, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13762499392032623, |
| "step": 307 |
| }, |
| { |
| "completion_length": 425.875, |
| "epoch": 0.9194029850746268, |
| "grad_norm": 0.45313775539398193, |
| "kl": 0.022167667746543884, |
| "learning_rate": 9.861167167883046e-08, |
| "loss": 0.0009, |
| "reward": 0.26487499475479126, |
| "reward_std": 0.042072843760252, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.26487499475479126, |
| "step": 308 |
| }, |
| { |
| "completion_length": 801.125, |
| "epoch": 0.9223880597014925, |
| "grad_norm": 0.16913585364818573, |
| "kl": 0.004942856729030609, |
| "learning_rate": 9.148643557976955e-08, |
| "loss": 0.0002, |
| "reward": -0.06262499839067459, |
| "reward_std": 0.6919310092926025, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.062624990940094, |
| "step": 309 |
| }, |
| { |
| "completion_length": 368.875, |
| "epoch": 0.9253731343283582, |
| "grad_norm": 0.5053035020828247, |
| "kl": 0.026447134092450142, |
| "learning_rate": 8.462356811112987e-08, |
| "loss": 0.0011, |
| "reward": 0.9963749647140503, |
| "reward_std": 1.0054038763046265, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2463749796152115, |
| "step": 310 |
| }, |
| { |
| "completion_length": 188.875, |
| "epoch": 0.9283582089552239, |
| "grad_norm": 0.3338444232940674, |
| "kl": 0.027413196861743927, |
| "learning_rate": 7.802381687141537e-08, |
| "loss": 0.0011, |
| "reward": 0.19200000166893005, |
| "reward_std": 0.18996766209602356, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.19200000166893005, |
| "step": 311 |
| }, |
| { |
| "completion_length": 450.875, |
| "epoch": 0.9313432835820895, |
| "grad_norm": 0.27691736817359924, |
| "kl": 0.016083208844065666, |
| "learning_rate": 7.168790079686932e-08, |
| "loss": 0.0006, |
| "reward": 0.234375, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234375, |
| "step": 312 |
| }, |
| { |
| "completion_length": 151.375, |
| "epoch": 0.9343283582089552, |
| "grad_norm": 0.48099038004875183, |
| "kl": 0.039979659020900726, |
| "learning_rate": 6.561651008315739e-08, |
| "loss": 0.0016, |
| "reward": 1.2289999723434448, |
| "reward_std": 1.0798243284225464, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2290000021457672, |
| "step": 313 |
| }, |
| { |
| "completion_length": 550.25, |
| "epoch": 0.9373134328358209, |
| "grad_norm": 0.23031727969646454, |
| "kl": 0.008523189462721348, |
| "learning_rate": 5.981030611018235e-08, |
| "loss": 0.0003, |
| "reward": -0.05887500196695328, |
| "reward_std": 0.525315523147583, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.05887499451637268, |
| "step": 314 |
| }, |
| { |
| "completion_length": 265.125, |
| "epoch": 0.9402985074626866, |
| "grad_norm": 0.395933598279953, |
| "kl": 0.024086318910121918, |
| "learning_rate": 5.426992137003623e-08, |
| "loss": 0.001, |
| "reward": 0.44862499833106995, |
| "reward_std": 0.7450435757637024, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.19862499833106995, |
| "step": 315 |
| }, |
| { |
| "completion_length": 305.75, |
| "epoch": 0.9432835820895522, |
| "grad_norm": 0.2765781283378601, |
| "kl": 0.02224023826420307, |
| "learning_rate": 4.899595939810237e-08, |
| "loss": 0.0009, |
| "reward": 0.22337499260902405, |
| "reward_std": 0.06204591691493988, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.22337499260902405, |
| "step": 316 |
| }, |
| { |
| "completion_length": 154.875, |
| "epoch": 0.9462686567164179, |
| "grad_norm": 0.6462637782096863, |
| "kl": 0.04086542874574661, |
| "learning_rate": 4.3988994707308274e-08, |
| "loss": 0.0016, |
| "reward": 1.187749981880188, |
| "reward_std": 1.1077649593353271, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.18774999678134918, |
| "step": 317 |
| }, |
| { |
| "completion_length": 366.25, |
| "epoch": 0.9492537313432836, |
| "grad_norm": 0.315337598323822, |
| "kl": 0.01272547710686922, |
| "learning_rate": 3.92495727255432e-08, |
| "loss": 0.0005, |
| "reward": 0.171875, |
| "reward_std": 0.0646936446428299, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.171875, |
| "step": 318 |
| }, |
| { |
| "completion_length": 252.375, |
| "epoch": 0.9522388059701492, |
| "grad_norm": 0.3268197774887085, |
| "kl": 0.012484841048717499, |
| "learning_rate": 3.4778209736240633e-08, |
| "loss": 0.0005, |
| "reward": 0.1706250011920929, |
| "reward_std": 0.17941246926784515, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1706250011920929, |
| "step": 319 |
| }, |
| { |
| "completion_length": 782.25, |
| "epoch": 0.9552238805970149, |
| "grad_norm": 0.17558734118938446, |
| "kl": 0.004195680376142263, |
| "learning_rate": 3.057539282213973e-08, |
| "loss": 0.0002, |
| "reward": 0.203249990940094, |
| "reward_std": 0.0932672768831253, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.203249990940094, |
| "step": 320 |
| }, |
| { |
| "completion_length": 441.75, |
| "epoch": 0.9582089552238806, |
| "grad_norm": 0.21393883228302002, |
| "kl": 0.01248833630234003, |
| "learning_rate": 2.6641579812224373e-08, |
| "loss": 0.0005, |
| "reward": 0.07662499696016312, |
| "reward_std": 0.29490867257118225, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07662499696016312, |
| "step": 321 |
| }, |
| { |
| "completion_length": 269.625, |
| "epoch": 0.9611940298507463, |
| "grad_norm": 0.3684273064136505, |
| "kl": 0.034730613231658936, |
| "learning_rate": 2.2977199231850323e-08, |
| "loss": 0.0014, |
| "reward": 0.43137499690055847, |
| "reward_std": 0.7381420731544495, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.18137499690055847, |
| "step": 322 |
| }, |
| { |
| "completion_length": 272.75, |
| "epoch": 0.9641791044776119, |
| "grad_norm": 0.4992087185382843, |
| "kl": 0.021383756771683693, |
| "learning_rate": 1.9582650256064206e-08, |
| "loss": 0.0009, |
| "reward": 0.20925000309944153, |
| "reward_std": 0.13599659502506256, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.20924998819828033, |
| "step": 323 |
| }, |
| { |
| "completion_length": 278.0, |
| "epoch": 0.9671641791044776, |
| "grad_norm": 0.2685709595680237, |
| "kl": 0.021522654220461845, |
| "learning_rate": 1.6458302666119142e-08, |
| "loss": 0.0009, |
| "reward": 0.4894999861717224, |
| "reward_std": 0.7203638553619385, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2395000010728836, |
| "step": 324 |
| }, |
| { |
| "completion_length": 325.375, |
| "epoch": 0.9701492537313433, |
| "grad_norm": 0.44149407744407654, |
| "kl": 0.012358118779957294, |
| "learning_rate": 1.3604496809195289e-08, |
| "loss": 0.0005, |
| "reward": 0.13312500715255737, |
| "reward_std": 0.23681482672691345, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13312500715255737, |
| "step": 325 |
| }, |
| { |
| "completion_length": 322.25, |
| "epoch": 0.9731343283582089, |
| "grad_norm": 0.2590884268283844, |
| "kl": 0.010153868235647678, |
| "learning_rate": 1.1021543561322012e-08, |
| "loss": 0.0004, |
| "reward": 0.06912499666213989, |
| "reward_std": 0.4021095931529999, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.06912501156330109, |
| "step": 326 |
| }, |
| { |
| "completion_length": 300.875, |
| "epoch": 0.9761194029850746, |
| "grad_norm": 0.259194016456604, |
| "kl": 0.035336654633283615, |
| "learning_rate": 8.709724293513855e-09, |
| "loss": 0.0014, |
| "reward": 0.6192499995231628, |
| "reward_std": 0.7979370951652527, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.11924999952316284, |
| "step": 327 |
| }, |
| { |
| "completion_length": 224.5, |
| "epoch": 0.9791044776119403, |
| "grad_norm": 0.3342638909816742, |
| "kl": 0.046725135296583176, |
| "learning_rate": 6.66929084112089e-09, |
| "loss": 0.0019, |
| "reward": 0.41975000500679016, |
| "reward_std": 0.7516071796417236, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.16975000500679016, |
| "step": 328 |
| }, |
| { |
| "completion_length": 346.0, |
| "epoch": 0.982089552238806, |
| "grad_norm": 0.31775063276290894, |
| "kl": 0.03616100549697876, |
| "learning_rate": 4.900465476393168e-09, |
| "loss": 0.0014, |
| "reward": 0.6196249723434448, |
| "reward_std": 0.8508279323577881, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.11962500214576721, |
| "step": 329 |
| }, |
| { |
| "completion_length": 183.25, |
| "epoch": 0.9850746268656716, |
| "grad_norm": 0.5430064797401428, |
| "kl": 0.029290281236171722, |
| "learning_rate": 3.4034408842695264e-09, |
| "loss": 0.0012, |
| "reward": 1.1321250200271606, |
| "reward_std": 0.9209251403808594, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13212500512599945, |
| "step": 330 |
| }, |
| { |
| "completion_length": 308.625, |
| "epoch": 0.9880597014925373, |
| "grad_norm": 0.36556127667427063, |
| "kl": 0.021233482286334038, |
| "learning_rate": 2.1783801413866044e-09, |
| "loss": 0.0008, |
| "reward": 0.12475001811981201, |
| "reward_std": 0.9743397235870361, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.12524999678134918, |
| "step": 331 |
| }, |
| { |
| "completion_length": 334.125, |
| "epoch": 0.991044776119403, |
| "grad_norm": 0.44806885719299316, |
| "kl": 0.03072451800107956, |
| "learning_rate": 1.2254166983152737e-09, |
| "loss": 0.0012, |
| "reward": 0.16687500476837158, |
| "reward_std": 0.08920031785964966, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.16687500476837158, |
| "step": 332 |
| }, |
| { |
| "completion_length": 501.25, |
| "epoch": 0.9940298507462687, |
| "grad_norm": 0.18010862171649933, |
| "kl": 0.009232274256646633, |
| "learning_rate": 5.446543650219905e-10, |
| "loss": 0.0004, |
| "reward": -0.12587499618530273, |
| "reward_std": 0.5240494608879089, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.12587499618530273, |
| "step": 333 |
| }, |
| { |
| "completion_length": 265.25, |
| "epoch": 0.9970149253731343, |
| "grad_norm": 0.42972275614738464, |
| "kl": 0.01687278039753437, |
| "learning_rate": 1.3616729956228425e-10, |
| "loss": 0.0007, |
| "reward": 0.21875, |
| "reward_std": 0.0578637570142746, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21875, |
| "step": 334 |
| }, |
| { |
| "completion_length": 280.625, |
| "epoch": 1.0, |
| "grad_norm": 0.3359426259994507, |
| "kl": 0.017340093851089478, |
| "learning_rate": 0.0, |
| "loss": 0.0007, |
| "reward": 0.43549999594688416, |
| "reward_std": 0.7575456500053406, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.18549999594688416, |
| "step": 335 |
| }, |
| { |
| "completion_length": 655.25, |
| "epoch": 2.0240963855421685, |
| "grad_norm": 0.20819780230522156, |
| "kl": 0.007548333611339331, |
| "learning_rate": 1.4831583923105e-06, |
| "loss": 0.0003, |
| "reward": 0.140749990940094, |
| "reward_std": 0.12400546669960022, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.140749990940094, |
| "step": 336 |
| }, |
| { |
| "completion_length": 217.375, |
| "epoch": 2.0301204819277108, |
| "grad_norm": 0.3707653880119324, |
| "kl": 0.034900326281785965, |
| "learning_rate": 1.467238925438646e-06, |
| "loss": 0.0014, |
| "reward": 0.6980000138282776, |
| "reward_std": 1.0477970838546753, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1979999989271164, |
| "step": 337 |
| }, |
| { |
| "completion_length": 726.25, |
| "epoch": 2.036144578313253, |
| "grad_norm": 0.2153882533311844, |
| "kl": 0.006285813637077808, |
| "learning_rate": 1.4513697938845571e-06, |
| "loss": 0.0003, |
| "reward": 0.1875, |
| "reward_std": 0.06681530922651291, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1875, |
| "step": 338 |
| }, |
| { |
| "completion_length": 299.25, |
| "epoch": 2.0421686746987953, |
| "grad_norm": 0.28123152256011963, |
| "kl": 0.019150294363498688, |
| "learning_rate": 1.4355517710873184e-06, |
| "loss": 0.0008, |
| "reward": 0.2615000009536743, |
| "reward_std": 0.8954194784164429, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.011500000953674316, |
| "step": 339 |
| }, |
| { |
| "completion_length": 220.25, |
| "epoch": 2.0481927710843375, |
| "grad_norm": 0.3615806996822357, |
| "kl": 0.02874070033431053, |
| "learning_rate": 1.419785627995044e-06, |
| "loss": 0.0011, |
| "reward": 1.5212500095367432, |
| "reward_std": 1.0071378946304321, |
| "rewards/correctness_reward_func": 1.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.27125000953674316, |
| "step": 340 |
| }, |
| { |
| "completion_length": 278.0, |
| "epoch": 2.0542168674698793, |
| "grad_norm": 0.4012732207775116, |
| "kl": 0.023837679997086525, |
| "learning_rate": 1.4040721330273063e-06, |
| "loss": 0.001, |
| "reward": 0.484375, |
| "reward_std": 0.6629126071929932, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234375, |
| "step": 341 |
| }, |
| { |
| "completion_length": 411.25, |
| "epoch": 2.0602409638554215, |
| "grad_norm": 0.27616533637046814, |
| "kl": 0.017284037545323372, |
| "learning_rate": 1.388412052037682e-06, |
| "loss": 0.0007, |
| "reward": 0.09037499129772186, |
| "reward_std": 0.2937574088573456, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.09037499129772186, |
| "step": 342 |
| }, |
| { |
| "completion_length": 509.375, |
| "epoch": 2.066265060240964, |
| "grad_norm": 0.652441680431366, |
| "kl": 0.041089244186878204, |
| "learning_rate": 1.3728061482764238e-06, |
| "loss": 0.0016, |
| "reward": 0.17987500131130219, |
| "reward_std": 0.10290833562612534, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.17987500131130219, |
| "step": 343 |
| }, |
| { |
| "completion_length": 900.0, |
| "epoch": 2.072289156626506, |
| "grad_norm": 0.15484927594661713, |
| "kl": 0.0041319322772324085, |
| "learning_rate": 1.3572551823532654e-06, |
| "loss": 0.0002, |
| "reward": 0.13512499630451202, |
| "reward_std": 0.09873548895120621, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13512499630451202, |
| "step": 344 |
| }, |
| { |
| "completion_length": 401.5, |
| "epoch": 2.0783132530120483, |
| "grad_norm": 0.3243913948535919, |
| "kl": 0.026439081877470016, |
| "learning_rate": 1.3417599122003464e-06, |
| "loss": 0.0011, |
| "reward": -0.03162500262260437, |
| "reward_std": 0.5784817934036255, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.03162500262260437, |
| "step": 345 |
| }, |
| { |
| "completion_length": 315.75, |
| "epoch": 2.0843373493975905, |
| "grad_norm": 0.29938212037086487, |
| "kl": 0.03102310746908188, |
| "learning_rate": 1.3263210930352737e-06, |
| "loss": 0.0012, |
| "reward": 0.5933749675750732, |
| "reward_std": 0.8945927619934082, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.09337499737739563, |
| "step": 346 |
| }, |
| { |
| "completion_length": 418.75, |
| "epoch": 2.0903614457831323, |
| "grad_norm": 0.24994419515132904, |
| "kl": 0.015923254191875458, |
| "learning_rate": 1.3109394773243117e-06, |
| "loss": 0.0006, |
| "reward": 0.014375001192092896, |
| "reward_std": 0.5910600423812866, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.014375001192092896, |
| "step": 347 |
| }, |
| { |
| "completion_length": 377.0, |
| "epoch": 2.0963855421686746, |
| "grad_norm": 0.19088208675384521, |
| "kl": 0.018240461125969887, |
| "learning_rate": 1.2956158147457116e-06, |
| "loss": 0.0007, |
| "reward": 0.234375, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234375, |
| "step": 348 |
| }, |
| { |
| "completion_length": 397.875, |
| "epoch": 2.102409638554217, |
| "grad_norm": 0.1473723202943802, |
| "kl": 0.019901467487215996, |
| "learning_rate": 1.280350852153168e-06, |
| "loss": 0.0008, |
| "reward": 0.22362500429153442, |
| "reward_std": 0.9566056132316589, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.026375003159046173, |
| "step": 349 |
| }, |
| { |
| "completion_length": 554.875, |
| "epoch": 2.108433734939759, |
| "grad_norm": 0.1978859007358551, |
| "kl": 0.009672200307250023, |
| "learning_rate": 1.2651453335394232e-06, |
| "loss": 0.0004, |
| "reward": 0.46875, |
| "reward_std": 0.7219455242156982, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21875, |
| "step": 350 |
| }, |
| { |
| "completion_length": 436.5, |
| "epoch": 2.1144578313253013, |
| "grad_norm": 0.18422538042068481, |
| "kl": 0.016564225777983665, |
| "learning_rate": 1.2500000000000007e-06, |
| "loss": 0.0007, |
| "reward": 0.03349999338388443, |
| "reward_std": 0.4848186671733856, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03349999338388443, |
| "step": 351 |
| }, |
| { |
| "completion_length": 345.875, |
| "epoch": 2.1204819277108435, |
| "grad_norm": 0.22677145898342133, |
| "kl": 0.024902012199163437, |
| "learning_rate": 1.234915589697091e-06, |
| "loss": 0.001, |
| "reward": -0.13850000500679016, |
| "reward_std": 0.9170010089874268, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.13850000500679016, |
| "step": 352 |
| }, |
| { |
| "completion_length": 593.625, |
| "epoch": 2.1265060240963853, |
| "grad_norm": 0.14154018461704254, |
| "kl": 0.010703539475798607, |
| "learning_rate": 1.2198928378235717e-06, |
| "loss": 0.0004, |
| "reward": 0.20762500166893005, |
| "reward_std": 0.0964038223028183, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.20762500166893005, |
| "step": 353 |
| }, |
| { |
| "completion_length": 380.375, |
| "epoch": 2.1325301204819276, |
| "grad_norm": 0.3278507590293884, |
| "kl": 0.01132420264184475, |
| "learning_rate": 1.204932476567175e-06, |
| "loss": 0.0005, |
| "reward": 0.07412499934434891, |
| "reward_std": 0.38485187292099, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07412499934434891, |
| "step": 354 |
| }, |
| { |
| "completion_length": 470.125, |
| "epoch": 2.13855421686747, |
| "grad_norm": 0.23620018362998962, |
| "kl": 0.01436136569827795, |
| "learning_rate": 1.1900352350748026e-06, |
| "loss": 0.0006, |
| "reward": 0.250124990940094, |
| "reward_std": 0.0670829713344574, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.250124990940094, |
| "step": 355 |
| }, |
| { |
| "completion_length": 416.125, |
| "epoch": 2.144578313253012, |
| "grad_norm": 0.3359208405017853, |
| "kl": 0.014237859286367893, |
| "learning_rate": 1.1752018394169882e-06, |
| "loss": 0.0006, |
| "reward": 0.32512497901916504, |
| "reward_std": 0.8155838251113892, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07512499392032623, |
| "step": 356 |
| }, |
| { |
| "completion_length": 327.875, |
| "epoch": 2.1506024096385543, |
| "grad_norm": 0.2985382676124573, |
| "kl": 0.026154540479183197, |
| "learning_rate": 1.160433012552508e-06, |
| "loss": 0.001, |
| "reward": 0.203125, |
| "reward_std": 0.09300297498703003, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.203125, |
| "step": 357 |
| }, |
| { |
| "completion_length": 770.75, |
| "epoch": 2.1566265060240966, |
| "grad_norm": 0.2134159803390503, |
| "kl": 0.0066121285781264305, |
| "learning_rate": 1.1457294742931508e-06, |
| "loss": 0.0003, |
| "reward": 0.171875, |
| "reward_std": 0.09300297498703003, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.171875, |
| "step": 358 |
| }, |
| { |
| "completion_length": 594.625, |
| "epoch": 2.1626506024096384, |
| "grad_norm": 0.2000584453344345, |
| "kl": 0.007993868552148342, |
| "learning_rate": 1.1310919412686248e-06, |
| "loss": 0.0003, |
| "reward": -0.28224998712539673, |
| "reward_std": 0.9825229644775391, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.28224998712539673, |
| "step": 359 |
| }, |
| { |
| "completion_length": 262.375, |
| "epoch": 2.1686746987951806, |
| "grad_norm": 0.28641971945762634, |
| "kl": 0.03493226692080498, |
| "learning_rate": 1.11652112689164e-06, |
| "loss": 0.0014, |
| "reward": 0.675125002861023, |
| "reward_std": 0.9916070103645325, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.17512500286102295, |
| "step": 360 |
| }, |
| { |
| "completion_length": 362.75, |
| "epoch": 2.174698795180723, |
| "grad_norm": 0.29293859004974365, |
| "kl": 0.011597638949751854, |
| "learning_rate": 1.1020177413231334e-06, |
| "loss": 0.0005, |
| "reward": 0.17649999260902405, |
| "reward_std": 0.21882934868335724, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.17649999260902405, |
| "step": 361 |
| }, |
| { |
| "completion_length": 336.375, |
| "epoch": 2.180722891566265, |
| "grad_norm": 0.006778970826417208, |
| "kl": 0.01573561131954193, |
| "learning_rate": 1.0875824914376555e-06, |
| "loss": 0.0006, |
| "reward": 0.25, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 362 |
| }, |
| { |
| "completion_length": 622.125, |
| "epoch": 2.1867469879518073, |
| "grad_norm": 0.2229079008102417, |
| "kl": 0.017959900200366974, |
| "learning_rate": 1.073216080788921e-06, |
| "loss": 0.0007, |
| "reward": 0.18774999678134918, |
| "reward_std": 0.13389840722084045, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.187749981880188, |
| "step": 363 |
| }, |
| { |
| "completion_length": 559.0, |
| "epoch": 2.1927710843373496, |
| "grad_norm": 0.20531107485294342, |
| "kl": 0.020543191581964493, |
| "learning_rate": 1.0589192095755172e-06, |
| "loss": 0.0008, |
| "reward": 0.5233749747276306, |
| "reward_std": 1.107194185256958, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.226624995470047, |
| "step": 364 |
| }, |
| { |
| "completion_length": 298.375, |
| "epoch": 2.1987951807228914, |
| "grad_norm": 0.3007963299751282, |
| "kl": 0.02151988446712494, |
| "learning_rate": 1.0446925746067768e-06, |
| "loss": 0.0009, |
| "reward": -0.07700000703334808, |
| "reward_std": 0.4532400965690613, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.07699999958276749, |
| "step": 365 |
| }, |
| { |
| "completion_length": 506.875, |
| "epoch": 2.2048192771084336, |
| "grad_norm": 0.22336696088314056, |
| "kl": 0.008836156688630581, |
| "learning_rate": 1.0305368692688175e-06, |
| "loss": 0.0004, |
| "reward": -0.44300001859664917, |
| "reward_std": 1.2848740816116333, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.4429999589920044, |
| "step": 366 |
| }, |
| { |
| "completion_length": 231.375, |
| "epoch": 2.210843373493976, |
| "grad_norm": 0.36337810754776, |
| "kl": 0.03387583792209625, |
| "learning_rate": 1.0164527834907468e-06, |
| "loss": 0.0014, |
| "reward": 0.18512499332427979, |
| "reward_std": 0.09986624866724014, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.18512499332427979, |
| "step": 367 |
| }, |
| { |
| "completion_length": 518.625, |
| "epoch": 2.216867469879518, |
| "grad_norm": 0.14192265272140503, |
| "kl": 0.010741294361650944, |
| "learning_rate": 1.0024410037110358e-06, |
| "loss": 0.0004, |
| "reward": 0.421999990940094, |
| "reward_std": 0.7438033819198608, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.171999990940094, |
| "step": 368 |
| }, |
| { |
| "completion_length": 653.75, |
| "epoch": 2.2228915662650603, |
| "grad_norm": 0.15892748534679413, |
| "kl": 0.010834253393113613, |
| "learning_rate": 9.88502212844063e-07, |
| "loss": 0.0004, |
| "reward": 0.453249990940094, |
| "reward_std": 0.7316948771476746, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.203249990940094, |
| "step": 369 |
| }, |
| { |
| "completion_length": 343.625, |
| "epoch": 2.2289156626506026, |
| "grad_norm": 0.26513054966926575, |
| "kl": 0.024884607642889023, |
| "learning_rate": 9.746370902468311e-07, |
| "loss": 0.001, |
| "reward": 0.26649999618530273, |
| "reward_std": 0.07925906032323837, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.26649999618530273, |
| "step": 370 |
| }, |
| { |
| "completion_length": 390.625, |
| "epoch": 2.2349397590361444, |
| "grad_norm": 0.2124333679676056, |
| "kl": 0.014746158383786678, |
| "learning_rate": 9.608463116858544e-07, |
| "loss": 0.0006, |
| "reward": 0.984375, |
| "reward_std": 1.0488886833190918, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234375, |
| "step": 371 |
| }, |
| { |
| "completion_length": 248.0, |
| "epoch": 2.2409638554216866, |
| "grad_norm": 0.29546263813972473, |
| "kl": 0.021351803094148636, |
| "learning_rate": 9.471305493042243e-07, |
| "loss": 0.0009, |
| "reward": 0.33912500739097595, |
| "reward_std": 0.8690047860145569, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.08912500739097595, |
| "step": 372 |
| }, |
| { |
| "completion_length": 288.0, |
| "epoch": 2.246987951807229, |
| "grad_norm": 0.3159976601600647, |
| "kl": 0.020781053230166435, |
| "learning_rate": 9.334904715888496e-07, |
| "loss": 0.0008, |
| "reward": 1.21875, |
| "reward_std": 1.0706098079681396, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21875, |
| "step": 373 |
| }, |
| { |
| "completion_length": 780.0, |
| "epoch": 2.253012048192771, |
| "grad_norm": 0.1493767648935318, |
| "kl": 0.006651751697063446, |
| "learning_rate": 9.199267433378728e-07, |
| "loss": 0.0003, |
| "reward": 0.140749990940094, |
| "reward_std": 0.21551980078220367, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.140749990940094, |
| "step": 374 |
| }, |
| { |
| "completion_length": 366.875, |
| "epoch": 2.2590361445783134, |
| "grad_norm": 0.2316281646490097, |
| "kl": 0.0141691192984581, |
| "learning_rate": 9.064400256282757e-07, |
| "loss": 0.0006, |
| "reward": 0.71875, |
| "reward_std": 0.9466812610626221, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21875, |
| "step": 375 |
| }, |
| { |
| "completion_length": 387.625, |
| "epoch": 2.2650602409638556, |
| "grad_norm": 0.2875477373600006, |
| "kl": 0.012642526999115944, |
| "learning_rate": 8.930309757836517e-07, |
| "loss": 0.0005, |
| "reward": 0.11912500113248825, |
| "reward_std": 0.3765973746776581, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.11912499368190765, |
| "step": 376 |
| }, |
| { |
| "completion_length": 228.375, |
| "epoch": 2.2710843373493974, |
| "grad_norm": 0.3087570369243622, |
| "kl": 0.0358150415122509, |
| "learning_rate": 8.797002473421729e-07, |
| "loss": 0.0014, |
| "reward": 1.5157499313354492, |
| "reward_std": 1.0141069889068604, |
| "rewards/correctness_reward_func": 1.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.265749990940094, |
| "step": 377 |
| }, |
| { |
| "completion_length": 228.375, |
| "epoch": 2.2771084337349397, |
| "grad_norm": 0.36816123127937317, |
| "kl": 0.03920350223779678, |
| "learning_rate": 8.664484900247363e-07, |
| "loss": 0.0016, |
| "reward": 1.0, |
| "reward_std": 1.0350983142852783, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 378 |
| }, |
| { |
| "completion_length": 339.0, |
| "epoch": 2.283132530120482, |
| "grad_norm": 0.2836088240146637, |
| "kl": 0.028930485248565674, |
| "learning_rate": 8.532763497032987e-07, |
| "loss": 0.0012, |
| "reward": 0.734375, |
| "reward_std": 0.9364577531814575, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234375, |
| "step": 379 |
| }, |
| { |
| "completion_length": 530.125, |
| "epoch": 2.289156626506024, |
| "grad_norm": 0.21830621361732483, |
| "kl": 0.009584854356944561, |
| "learning_rate": 8.40184468369396e-07, |
| "loss": 0.0004, |
| "reward": 0.008124999701976776, |
| "reward_std": 0.736358642578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.00812499225139618, |
| "step": 380 |
| }, |
| { |
| "completion_length": 542.625, |
| "epoch": 2.2951807228915664, |
| "grad_norm": 0.13557778298854828, |
| "kl": 0.009539497084915638, |
| "learning_rate": 8.271734841028553e-07, |
| "loss": 0.0004, |
| "reward": 0.250124990940094, |
| "reward_std": 0.0670829713344574, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.250124990940094, |
| "step": 381 |
| }, |
| { |
| "completion_length": 508.75, |
| "epoch": 2.3012048192771086, |
| "grad_norm": 0.24548353254795074, |
| "kl": 0.014915486797690392, |
| "learning_rate": 8.142440310406923e-07, |
| "loss": 0.0006, |
| "reward": 0.203125, |
| "reward_std": 0.09300297498703003, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.203125, |
| "step": 382 |
| }, |
| { |
| "completion_length": 754.875, |
| "epoch": 2.3072289156626504, |
| "grad_norm": 0.1483210176229477, |
| "kl": 0.0073943245224654675, |
| "learning_rate": 8.013967393462094e-07, |
| "loss": 0.0003, |
| "reward": -0.22075000405311584, |
| "reward_std": 0.8405213952064514, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.22075000405311584, |
| "step": 383 |
| }, |
| { |
| "completion_length": 506.125, |
| "epoch": 2.3132530120481927, |
| "grad_norm": 0.1965951770544052, |
| "kl": 0.00697671715170145, |
| "learning_rate": 7.886322351782782e-07, |
| "loss": 0.0003, |
| "reward": -0.23000000417232513, |
| "reward_std": 0.8609561324119568, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.23000001907348633, |
| "step": 384 |
| }, |
| { |
| "completion_length": 167.625, |
| "epoch": 2.319277108433735, |
| "grad_norm": 0.5482251644134521, |
| "kl": 0.04781109094619751, |
| "learning_rate": 7.759511406608255e-07, |
| "loss": 0.0019, |
| "reward": 1.4592499732971191, |
| "reward_std": 1.0946043729782104, |
| "rewards/correctness_reward_func": 1.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.20925000309944153, |
| "step": 385 |
| }, |
| { |
| "completion_length": 344.875, |
| "epoch": 2.325301204819277, |
| "grad_norm": 0.3512763977050781, |
| "kl": 0.015125907957553864, |
| "learning_rate": 7.633540738525066e-07, |
| "loss": 0.0006, |
| "reward": 0.21875, |
| "reward_std": 0.0578637570142746, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21875, |
| "step": 386 |
| }, |
| { |
| "completion_length": 469.25, |
| "epoch": 2.3313253012048194, |
| "grad_norm": 0.1568324714899063, |
| "kl": 0.00942917913198471, |
| "learning_rate": 7.508416487165862e-07, |
| "loss": 0.0004, |
| "reward": 0.2574999928474426, |
| "reward_std": 0.02121320180594921, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2574999928474426, |
| "step": 387 |
| }, |
| { |
| "completion_length": 239.5, |
| "epoch": 2.337349397590361, |
| "grad_norm": 0.5860251188278198, |
| "kl": 0.018855417147278786, |
| "learning_rate": 7.384144750910133e-07, |
| "loss": 0.0008, |
| "reward": 0.484375, |
| "reward_std": 0.714759886264801, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234375, |
| "step": 388 |
| }, |
| { |
| "completion_length": 602.25, |
| "epoch": 2.3433734939759034, |
| "grad_norm": 0.24529512226581573, |
| "kl": 0.01155894249677658, |
| "learning_rate": 7.260731586586983e-07, |
| "loss": 0.0005, |
| "reward": 0.1875, |
| "reward_std": 0.09449111670255661, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1875, |
| "step": 389 |
| }, |
| { |
| "completion_length": 466.25, |
| "epoch": 2.3493975903614457, |
| "grad_norm": 0.16617080569267273, |
| "kl": 0.015526726841926575, |
| "learning_rate": 7.138183009179922e-07, |
| "loss": 0.0006, |
| "reward": 0.234499990940094, |
| "reward_std": 0.10450837016105652, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234499990940094, |
| "step": 390 |
| }, |
| { |
| "completion_length": 501.875, |
| "epoch": 2.355421686746988, |
| "grad_norm": 0.17602694034576416, |
| "kl": 0.015567664988338947, |
| "learning_rate": 7.016504991533727e-07, |
| "loss": 0.0006, |
| "reward": 0.203125, |
| "reward_std": 0.09300297498703003, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.203125, |
| "step": 391 |
| }, |
| { |
| "completion_length": 552.125, |
| "epoch": 2.36144578313253, |
| "grad_norm": 0.2334936112165451, |
| "kl": 0.005338181275874376, |
| "learning_rate": 6.895703464063319e-07, |
| "loss": 0.0002, |
| "reward": 0.051375001668930054, |
| "reward_std": 0.4206688106060028, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.051375001668930054, |
| "step": 392 |
| }, |
| { |
| "completion_length": 602.5, |
| "epoch": 2.3674698795180724, |
| "grad_norm": 0.2039727419614792, |
| "kl": 0.008196253329515457, |
| "learning_rate": 6.775784314464717e-07, |
| "loss": 0.0003, |
| "reward": 0.08312499523162842, |
| "reward_std": 0.5247610211372375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.08312500268220901, |
| "step": 393 |
| }, |
| { |
| "completion_length": 165.75, |
| "epoch": 2.3734939759036147, |
| "grad_norm": 0.42460864782333374, |
| "kl": 0.03733068332076073, |
| "learning_rate": 6.656753387428089e-07, |
| "loss": 0.0015, |
| "reward": 0.9026249647140503, |
| "reward_std": 1.0384610891342163, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.15262499451637268, |
| "step": 394 |
| }, |
| { |
| "completion_length": 462.875, |
| "epoch": 2.3795180722891565, |
| "grad_norm": 0.37091249227523804, |
| "kl": 0.022075312212109566, |
| "learning_rate": 6.538616484352902e-07, |
| "loss": 0.0009, |
| "reward": 0.453125, |
| "reward_std": 0.6811803579330444, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.203125, |
| "step": 395 |
| }, |
| { |
| "completion_length": 587.875, |
| "epoch": 2.3855421686746987, |
| "grad_norm": 0.2852852940559387, |
| "kl": 0.020712627097964287, |
| "learning_rate": 6.421379363065142e-07, |
| "loss": 0.0008, |
| "reward": 0.195374995470047, |
| "reward_std": 0.09100539982318878, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.195374995470047, |
| "step": 396 |
| }, |
| { |
| "completion_length": 381.125, |
| "epoch": 2.391566265060241, |
| "grad_norm": 0.36693301796913147, |
| "kl": 0.017576757818460464, |
| "learning_rate": 6.305047737536707e-07, |
| "loss": 0.0007, |
| "reward": 0.16775000095367432, |
| "reward_std": 0.11186311393976212, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.16775000095367432, |
| "step": 397 |
| }, |
| { |
| "completion_length": 335.125, |
| "epoch": 2.397590361445783, |
| "grad_norm": 0.006105829495936632, |
| "kl": 0.021454401314258575, |
| "learning_rate": 6.189627277606894e-07, |
| "loss": 0.0009, |
| "reward": 0.25, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 398 |
| }, |
| { |
| "completion_length": 568.25, |
| "epoch": 2.4036144578313254, |
| "grad_norm": 0.19439569115638733, |
| "kl": 0.005857834126800299, |
| "learning_rate": 6.075123608706093e-07, |
| "loss": 0.0002, |
| "reward": -0.14375001192092896, |
| "reward_std": 1.0640850067138672, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.14375001192092896, |
| "step": 399 |
| }, |
| { |
| "completion_length": 556.25, |
| "epoch": 2.4096385542168672, |
| "grad_norm": 0.19129334390163422, |
| "kl": 0.012769855558872223, |
| "learning_rate": 5.961542311581586e-07, |
| "loss": 0.0005, |
| "reward": 0.0625, |
| "reward_std": 0.481812059879303, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0625, |
| "step": 400 |
| }, |
| { |
| "completion_length": 678.0, |
| "epoch": 2.4156626506024095, |
| "grad_norm": 0.10480325669050217, |
| "kl": 0.011414820328354836, |
| "learning_rate": 5.848888922025553e-07, |
| "loss": 0.0005, |
| "reward": 0.250249981880188, |
| "reward_std": 0.06708363443613052, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.250249981880188, |
| "step": 401 |
| }, |
| { |
| "completion_length": 369.125, |
| "epoch": 2.4216867469879517, |
| "grad_norm": 0.19944852590560913, |
| "kl": 0.013226395472884178, |
| "learning_rate": 5.737168930605272e-07, |
| "loss": 0.0005, |
| "reward": 0.5, |
| "reward_std": 0.7071067690849304, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 402 |
| }, |
| { |
| "completion_length": 698.625, |
| "epoch": 2.427710843373494, |
| "grad_norm": 0.18083348870277405, |
| "kl": 0.006614608224481344, |
| "learning_rate": 5.626387782395512e-07, |
| "loss": 0.0003, |
| "reward": 0.24324999749660492, |
| "reward_std": 0.053853634744882584, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.24324999749660492, |
| "step": 403 |
| }, |
| { |
| "completion_length": 585.25, |
| "epoch": 2.433734939759036, |
| "grad_norm": 0.28779125213623047, |
| "kl": 0.009438473731279373, |
| "learning_rate": 5.516550876713142e-07, |
| "loss": 0.0004, |
| "reward": 0.09562499821186066, |
| "reward_std": 0.3886004090309143, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.09562499821186066, |
| "step": 404 |
| }, |
| { |
| "completion_length": 585.125, |
| "epoch": 2.4397590361445785, |
| "grad_norm": 0.2503024637699127, |
| "kl": 0.010052215307950974, |
| "learning_rate": 5.407663566854008e-07, |
| "loss": 0.0004, |
| "reward": 0.2006250023841858, |
| "reward_std": 0.06298738718032837, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2006250023841858, |
| "step": 405 |
| }, |
| { |
| "completion_length": 449.875, |
| "epoch": 2.4457831325301207, |
| "grad_norm": 0.19435061514377594, |
| "kl": 0.01912347972393036, |
| "learning_rate": 5.299731159831953e-07, |
| "loss": 0.0008, |
| "reward": 0.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21875, |
| "step": 406 |
| }, |
| { |
| "completion_length": 335.125, |
| "epoch": 2.4518072289156625, |
| "grad_norm": 0.3011760413646698, |
| "kl": 0.022993480786681175, |
| "learning_rate": 5.192758916120236e-07, |
| "loss": 0.0009, |
| "reward": 0.2548750042915344, |
| "reward_std": 0.013788589276373386, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2548750042915344, |
| "step": 407 |
| }, |
| { |
| "completion_length": 235.75, |
| "epoch": 2.4578313253012047, |
| "grad_norm": 0.509082555770874, |
| "kl": 0.024164235219359398, |
| "learning_rate": 5.086752049395094e-07, |
| "loss": 0.001, |
| "reward": 0.19037500023841858, |
| "reward_std": 0.08244986832141876, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.19037500023841858, |
| "step": 408 |
| }, |
| { |
| "completion_length": 533.625, |
| "epoch": 2.463855421686747, |
| "grad_norm": 0.2011859267950058, |
| "kl": 0.012068904004991055, |
| "learning_rate": 4.981715726281666e-07, |
| "loss": 0.0005, |
| "reward": 0.2277500033378601, |
| "reward_std": 0.06811072677373886, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2277500033378601, |
| "step": 409 |
| }, |
| { |
| "completion_length": 506.5, |
| "epoch": 2.4698795180722892, |
| "grad_norm": 0.20587992668151855, |
| "kl": 0.012153996154665947, |
| "learning_rate": 4.87765506610215e-07, |
| "loss": 0.0005, |
| "reward": 0.203249990940094, |
| "reward_std": 0.09307561814785004, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.203249990940094, |
| "step": 410 |
| }, |
| { |
| "completion_length": 322.5, |
| "epoch": 2.4759036144578315, |
| "grad_norm": 0.34467822313308716, |
| "kl": 0.028947116807103157, |
| "learning_rate": 4.774575140626317e-07, |
| "loss": 0.0012, |
| "reward": 0.16899999976158142, |
| "reward_std": 0.22910259664058685, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.16899999976158142, |
| "step": 411 |
| }, |
| { |
| "completion_length": 378.5, |
| "epoch": 2.4819277108433733, |
| "grad_norm": 0.2705584168434143, |
| "kl": 0.017314178869128227, |
| "learning_rate": 4.672480973824312e-07, |
| "loss": 0.0007, |
| "reward": -0.12187500298023224, |
| "reward_std": 0.5589302182197571, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.12187501043081284, |
| "step": 412 |
| }, |
| { |
| "completion_length": 420.75, |
| "epoch": 2.4879518072289155, |
| "grad_norm": 0.25909072160720825, |
| "kl": 0.01942913979291916, |
| "learning_rate": 4.5713775416217884e-07, |
| "loss": 0.0008, |
| "reward": 0.21875, |
| "reward_std": 0.0578637570142746, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21875, |
| "step": 413 |
| }, |
| { |
| "completion_length": 513.375, |
| "epoch": 2.4939759036144578, |
| "grad_norm": 0.2140943855047226, |
| "kl": 0.01646452210843563, |
| "learning_rate": 4.4712697716573994e-07, |
| "loss": 0.0007, |
| "reward": -0.06062500178813934, |
| "reward_std": 0.7784535884857178, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.060624998062849045, |
| "step": 414 |
| }, |
| { |
| "completion_length": 450.25, |
| "epoch": 2.5, |
| "grad_norm": 0.20752555131912231, |
| "kl": 0.01588767021894455, |
| "learning_rate": 4.372162543042624e-07, |
| "loss": 0.0006, |
| "reward": 0.08799999952316284, |
| "reward_std": 0.3619901239871979, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.08799999952316284, |
| "step": 415 |
| }, |
| { |
| "completion_length": 317.875, |
| "epoch": 2.5060240963855422, |
| "grad_norm": 0.32861968874931335, |
| "kl": 0.02544497698545456, |
| "learning_rate": 4.27406068612396e-07, |
| "loss": 0.001, |
| "reward": 0.6762499809265137, |
| "reward_std": 0.9924419522285461, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.17624999582767487, |
| "step": 416 |
| }, |
| { |
| "completion_length": 545.875, |
| "epoch": 2.5120481927710845, |
| "grad_norm": 0.18259470164775848, |
| "kl": 0.01542817335575819, |
| "learning_rate": 4.1769689822475147e-07, |
| "loss": 0.0006, |
| "reward": -0.025749996304512024, |
| "reward_std": 0.6845365166664124, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.025749996304512024, |
| "step": 417 |
| }, |
| { |
| "completion_length": 240.875, |
| "epoch": 2.5180722891566267, |
| "grad_norm": 0.305070698261261, |
| "kl": 0.025023318827152252, |
| "learning_rate": 4.0808921635259595e-07, |
| "loss": 0.001, |
| "reward": 1.21875, |
| "reward_std": 1.1054855585098267, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21875, |
| "step": 418 |
| }, |
| { |
| "completion_length": 457.125, |
| "epoch": 2.5240963855421685, |
| "grad_norm": 0.22730815410614014, |
| "kl": 0.0132564352825284, |
| "learning_rate": 3.9858349126078945e-07, |
| "loss": 0.0005, |
| "reward": 0.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21875, |
| "step": 419 |
| }, |
| { |
| "completion_length": 304.5, |
| "epoch": 2.5301204819277108, |
| "grad_norm": 0.3886808156967163, |
| "kl": 0.017254332080483437, |
| "learning_rate": 3.891801862449629e-07, |
| "loss": 0.0007, |
| "reward": 0.47075000405311584, |
| "reward_std": 0.720878541469574, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.22075000405311584, |
| "step": 420 |
| }, |
| { |
| "completion_length": 396.875, |
| "epoch": 2.536144578313253, |
| "grad_norm": 0.23020797967910767, |
| "kl": 0.01623387821018696, |
| "learning_rate": 3.798797596089351e-07, |
| "loss": 0.0006, |
| "reward": 0.234499990940094, |
| "reward_std": 0.10450837016105652, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234499990940094, |
| "step": 421 |
| }, |
| { |
| "completion_length": 416.5, |
| "epoch": 2.5421686746987953, |
| "grad_norm": 0.30661872029304504, |
| "kl": 0.019575409591197968, |
| "learning_rate": 3.7068266464238085e-07, |
| "loss": 0.0008, |
| "reward": 0.765749990940094, |
| "reward_std": 0.9171299934387207, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.265749990940094, |
| "step": 422 |
| }, |
| { |
| "completion_length": 597.25, |
| "epoch": 2.5481927710843375, |
| "grad_norm": 0.2281297743320465, |
| "kl": 0.008763095363974571, |
| "learning_rate": 3.615893495987335e-07, |
| "loss": 0.0004, |
| "reward": 0.12849999964237213, |
| "reward_std": 0.1161501482129097, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.12849999964237213, |
| "step": 423 |
| }, |
| { |
| "completion_length": 568.125, |
| "epoch": 2.5542168674698793, |
| "grad_norm": 0.24079188704490662, |
| "kl": 0.008225774392485619, |
| "learning_rate": 3.5260025767333894e-07, |
| "loss": 0.0003, |
| "reward": -0.5896250009536743, |
| "reward_std": 1.1861082315444946, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.5896250009536743, |
| "step": 424 |
| }, |
| { |
| "completion_length": 526.25, |
| "epoch": 2.5602409638554215, |
| "grad_norm": 0.19694213569164276, |
| "kl": 0.014266147278249264, |
| "learning_rate": 3.4371582698185636e-07, |
| "loss": 0.0006, |
| "reward": 0.4375, |
| "reward_std": 0.7379992604255676, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1875, |
| "step": 425 |
| }, |
| { |
| "completion_length": 352.5, |
| "epoch": 2.566265060240964, |
| "grad_norm": 0.31402555108070374, |
| "kl": 0.012443519197404385, |
| "learning_rate": 3.3493649053890325e-07, |
| "loss": 0.0005, |
| "reward": 1.5001249313354492, |
| "reward_std": 1.034925937652588, |
| "rewards/correctness_reward_func": 1.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.250124990940094, |
| "step": 426 |
| }, |
| { |
| "completion_length": 362.25, |
| "epoch": 2.572289156626506, |
| "grad_norm": 0.37671875953674316, |
| "kl": 0.015330496244132519, |
| "learning_rate": 3.262626762369525e-07, |
| "loss": 0.0006, |
| "reward": 0.08500000089406967, |
| "reward_std": 0.41847509145736694, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.08500000089406967, |
| "step": 427 |
| }, |
| { |
| "completion_length": 224.5, |
| "epoch": 2.5783132530120483, |
| "grad_norm": 0.28751739859580994, |
| "kl": 0.031113017350435257, |
| "learning_rate": 3.176948068254762e-07, |
| "loss": 0.0012, |
| "reward": 0.734375, |
| "reward_std": 0.9364577531814575, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234375, |
| "step": 428 |
| }, |
| { |
| "completion_length": 252.375, |
| "epoch": 2.5843373493975905, |
| "grad_norm": 0.4819236099720001, |
| "kl": 0.015910452231764793, |
| "learning_rate": 3.092332998903416e-07, |
| "loss": 0.0006, |
| "reward": 0.6326249837875366, |
| "reward_std": 1.0495240688323975, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13262499868869781, |
| "step": 429 |
| }, |
| { |
| "completion_length": 490.125, |
| "epoch": 2.5903614457831328, |
| "grad_norm": 0.006184490397572517, |
| "kl": 0.011435314081609249, |
| "learning_rate": 3.0087856783345916e-07, |
| "loss": 0.0005, |
| "reward": 0.25, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 430 |
| }, |
| { |
| "completion_length": 196.25, |
| "epoch": 2.5963855421686746, |
| "grad_norm": 0.3237186074256897, |
| "kl": 0.057896729558706284, |
| "learning_rate": 2.9263101785268253e-07, |
| "loss": 0.0023, |
| "reward": 0.9521250128746033, |
| "reward_std": 1.0781170129776, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.20212499797344208, |
| "step": 431 |
| }, |
| { |
| "completion_length": 677.5, |
| "epoch": 2.602409638554217, |
| "grad_norm": 0.1705317348241806, |
| "kl": 0.010733860544860363, |
| "learning_rate": 2.844910519219632e-07, |
| "loss": 0.0004, |
| "reward": -0.016249999403953552, |
| "reward_std": 0.7049353718757629, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.016249999403953552, |
| "step": 432 |
| }, |
| { |
| "completion_length": 402.25, |
| "epoch": 2.608433734939759, |
| "grad_norm": 0.33006930351257324, |
| "kl": 0.009483945555984974, |
| "learning_rate": 2.764590667717562e-07, |
| "loss": 0.0004, |
| "reward": 0.71875, |
| "reward_std": 0.949036180973053, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21875, |
| "step": 433 |
| }, |
| { |
| "completion_length": 393.75, |
| "epoch": 2.6144578313253013, |
| "grad_norm": 0.2071043848991394, |
| "kl": 0.028593460097908974, |
| "learning_rate": 2.6853545386968607e-07, |
| "loss": 0.0011, |
| "reward": 0.011875003576278687, |
| "reward_std": 0.6735191941261292, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.011875003576278687, |
| "step": 434 |
| }, |
| { |
| "completion_length": 496.125, |
| "epoch": 2.6204819277108435, |
| "grad_norm": 0.003087718039751053, |
| "kl": 0.013737998902797699, |
| "learning_rate": 2.6072059940146775e-07, |
| "loss": 0.0005, |
| "reward": 0.25, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 435 |
| }, |
| { |
| "completion_length": 286.25, |
| "epoch": 2.6265060240963853, |
| "grad_norm": 0.43593427538871765, |
| "kl": 0.01645815744996071, |
| "learning_rate": 2.53014884252083e-07, |
| "loss": 0.0007, |
| "reward": 0.17550000548362732, |
| "reward_std": 0.07825781404972076, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.17550000548362732, |
| "step": 436 |
| }, |
| { |
| "completion_length": 526.875, |
| "epoch": 2.6325301204819276, |
| "grad_norm": 0.2566601634025574, |
| "kl": 0.019549060612916946, |
| "learning_rate": 2.454186839872158e-07, |
| "loss": 0.0008, |
| "reward": 0.1875, |
| "reward_std": 0.1157275140285492, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1875, |
| "step": 437 |
| }, |
| { |
| "completion_length": 452.5, |
| "epoch": 2.63855421686747, |
| "grad_norm": 0.20801463723182678, |
| "kl": 0.006738721393048763, |
| "learning_rate": 2.3793236883495164e-07, |
| "loss": 0.0003, |
| "reward": 0.24324998259544373, |
| "reward_std": 0.08273148536682129, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.24324999749660492, |
| "step": 438 |
| }, |
| { |
| "completion_length": 221.375, |
| "epoch": 2.644578313253012, |
| "grad_norm": 0.3174714744091034, |
| "kl": 0.0349612832069397, |
| "learning_rate": 2.3055630366772857e-07, |
| "loss": 0.0014, |
| "reward": 1.2501249313354492, |
| "reward_std": 1.0710142850875854, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.250124990940094, |
| "step": 439 |
| }, |
| { |
| "completion_length": 533.25, |
| "epoch": 2.6506024096385543, |
| "grad_norm": 0.3137078583240509, |
| "kl": 0.013247030787169933, |
| "learning_rate": 2.2329084798455747e-07, |
| "loss": 0.0005, |
| "reward": 0.07062499225139618, |
| "reward_std": 0.504204273223877, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07062499225139618, |
| "step": 440 |
| }, |
| { |
| "completion_length": 158.375, |
| "epoch": 2.6566265060240966, |
| "grad_norm": 0.44325655698776245, |
| "kl": 0.03579949215054512, |
| "learning_rate": 2.1613635589349756e-07, |
| "loss": 0.0014, |
| "reward": 0.234375, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234375, |
| "step": 441 |
| }, |
| { |
| "completion_length": 335.625, |
| "epoch": 2.662650602409639, |
| "grad_norm": 0.00665801577270031, |
| "kl": 0.02036757580935955, |
| "learning_rate": 2.0909317609440093e-07, |
| "loss": 0.0008, |
| "reward": 0.25, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 442 |
| }, |
| { |
| "completion_length": 249.25, |
| "epoch": 2.6686746987951806, |
| "grad_norm": 0.35808223485946655, |
| "kl": 0.03606145828962326, |
| "learning_rate": 2.0216165186191406e-07, |
| "loss": 0.0014, |
| "reward": 0.11924999952316284, |
| "reward_std": 0.28267890214920044, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.11924999952316284, |
| "step": 443 |
| }, |
| { |
| "completion_length": 594.0, |
| "epoch": 2.674698795180723, |
| "grad_norm": 0.1513955146074295, |
| "kl": 0.010759763419628143, |
| "learning_rate": 1.95342121028749e-07, |
| "loss": 0.0004, |
| "reward": -0.018874995410442352, |
| "reward_std": 0.7062042355537415, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.018874995410442352, |
| "step": 444 |
| }, |
| { |
| "completion_length": 105.25, |
| "epoch": 2.680722891566265, |
| "grad_norm": 0.5154651999473572, |
| "kl": 0.0334286130964756, |
| "learning_rate": 1.8863491596921745e-07, |
| "loss": 0.0013, |
| "reward": 0.75, |
| "reward_std": 0.9258201122283936, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 445 |
| }, |
| { |
| "completion_length": 227.25, |
| "epoch": 2.6867469879518073, |
| "grad_norm": 0.2869022488594055, |
| "kl": 0.043689362704753876, |
| "learning_rate": 1.8204036358303173e-07, |
| "loss": 0.0017, |
| "reward": 1.1618750095367432, |
| "reward_std": 1.1721407175064087, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.16187500953674316, |
| "step": 446 |
| }, |
| { |
| "completion_length": 451.625, |
| "epoch": 2.692771084337349, |
| "grad_norm": 0.23079638183116913, |
| "kl": 0.01949518732726574, |
| "learning_rate": 1.7555878527937164e-07, |
| "loss": 0.0008, |
| "reward": 0.21875, |
| "reward_std": 0.0578637570142746, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21875, |
| "step": 447 |
| }, |
| { |
| "completion_length": 363.625, |
| "epoch": 2.6987951807228914, |
| "grad_norm": 0.2220403254032135, |
| "kl": 0.01806565374135971, |
| "learning_rate": 1.6919049696121957e-07, |
| "loss": 0.0007, |
| "reward": 1.2344999313354492, |
| "reward_std": 1.090507984161377, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234499990940094, |
| "step": 448 |
| }, |
| { |
| "completion_length": 269.125, |
| "epoch": 2.7048192771084336, |
| "grad_norm": 0.33472809195518494, |
| "kl": 0.026786940172314644, |
| "learning_rate": 1.629358090099639e-07, |
| "loss": 0.0011, |
| "reward": 0.13474999368190765, |
| "reward_std": 0.24177010357379913, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13474999368190765, |
| "step": 449 |
| }, |
| { |
| "completion_length": 487.0, |
| "epoch": 2.710843373493976, |
| "grad_norm": 0.18152180314064026, |
| "kl": 0.013146109879016876, |
| "learning_rate": 1.567950262702714e-07, |
| "loss": 0.0005, |
| "reward": -0.03137500584125519, |
| "reward_std": 0.5211835503578186, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.03137500584125519, |
| "step": 450 |
| }, |
| { |
| "completion_length": 437.125, |
| "epoch": 2.716867469879518, |
| "grad_norm": 0.22847002744674683, |
| "kl": 0.00889852736145258, |
| "learning_rate": 1.507684480352292e-07, |
| "loss": 0.0004, |
| "reward": 0.265749990940094, |
| "reward_std": 0.04454772174358368, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.265749990940094, |
| "step": 451 |
| }, |
| { |
| "completion_length": 586.125, |
| "epoch": 2.7228915662650603, |
| "grad_norm": 0.18244458734989166, |
| "kl": 0.012827915139496326, |
| "learning_rate": 1.4485636803175828e-07, |
| "loss": 0.0005, |
| "reward": -0.10275000333786011, |
| "reward_std": 0.6561883091926575, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.10275000333786011, |
| "step": 452 |
| }, |
| { |
| "completion_length": 549.625, |
| "epoch": 2.7289156626506026, |
| "grad_norm": 0.20121344923973083, |
| "kl": 0.012550292536616325, |
| "learning_rate": 1.3905907440629752e-07, |
| "loss": 0.0005, |
| "reward": 0.218874990940094, |
| "reward_std": 0.05794193223118782, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.218874990940094, |
| "step": 453 |
| }, |
| { |
| "completion_length": 351.75, |
| "epoch": 2.734939759036145, |
| "grad_norm": 0.26031172275543213, |
| "kl": 0.020667368546128273, |
| "learning_rate": 1.3337684971075932e-07, |
| "loss": 0.0008, |
| "reward": 0.734375, |
| "reward_std": 0.9364577531814575, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234375, |
| "step": 454 |
| }, |
| { |
| "completion_length": 427.5, |
| "epoch": 2.7409638554216866, |
| "grad_norm": 0.20729762315750122, |
| "kl": 0.009626028127968311, |
| "learning_rate": 1.278099708887587e-07, |
| "loss": 0.0004, |
| "reward": 0.20737498998641968, |
| "reward_std": 0.15404817461967468, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.20737498998641968, |
| "step": 455 |
| }, |
| { |
| "completion_length": 419.875, |
| "epoch": 2.746987951807229, |
| "grad_norm": 0.28601571917533875, |
| "kl": 0.017598113045096397, |
| "learning_rate": 1.223587092621162e-07, |
| "loss": 0.0007, |
| "reward": 0.234375, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234375, |
| "step": 456 |
| }, |
| { |
| "completion_length": 352.5, |
| "epoch": 2.753012048192771, |
| "grad_norm": 0.3590512275695801, |
| "kl": 0.012438575737178326, |
| "learning_rate": 1.1702333051763271e-07, |
| "loss": 0.0005, |
| "reward": 0.234375, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234375, |
| "step": 457 |
| }, |
| { |
| "completion_length": 570.0, |
| "epoch": 2.7590361445783134, |
| "grad_norm": 0.22990113496780396, |
| "kl": 0.009037652052938938, |
| "learning_rate": 1.1180409469414094e-07, |
| "loss": 0.0004, |
| "reward": 0.23362499475479126, |
| "reward_std": 0.08174863457679749, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.23362499475479126, |
| "step": 458 |
| }, |
| { |
| "completion_length": 242.0, |
| "epoch": 2.765060240963855, |
| "grad_norm": 0.3716026842594147, |
| "kl": 0.029650242999196053, |
| "learning_rate": 1.067012561698319e-07, |
| "loss": 0.0012, |
| "reward": 0.250124990940094, |
| "reward_std": 0.0670829713344574, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.250124990940094, |
| "step": 459 |
| }, |
| { |
| "completion_length": 646.5, |
| "epoch": 2.7710843373493974, |
| "grad_norm": 0.13931992650032043, |
| "kl": 0.005772537086158991, |
| "learning_rate": 1.0171506364985622e-07, |
| "loss": 0.0002, |
| "reward": -0.022499993443489075, |
| "reward_std": 0.575958788394928, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.022499993443489075, |
| "step": 460 |
| }, |
| { |
| "completion_length": 245.375, |
| "epoch": 2.7771084337349397, |
| "grad_norm": 0.2136068344116211, |
| "kl": 0.018364811316132545, |
| "learning_rate": 9.684576015420277e-08, |
| "loss": 0.0007, |
| "reward": 0.5, |
| "reward_std": 0.7071067690849304, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 461 |
| }, |
| { |
| "completion_length": 667.75, |
| "epoch": 2.783132530120482, |
| "grad_norm": 0.25874850153923035, |
| "kl": 0.011082552373409271, |
| "learning_rate": 9.209358300585474e-08, |
| "loss": 0.0004, |
| "reward": 0.1875, |
| "reward_std": 0.06681530922651291, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1875, |
| "step": 462 |
| }, |
| { |
| "completion_length": 432.125, |
| "epoch": 2.789156626506024, |
| "grad_norm": 0.2160276621580124, |
| "kl": 0.015586335211992264, |
| "learning_rate": 8.745876381922147e-08, |
| "loss": 0.0006, |
| "reward": -0.11900000274181366, |
| "reward_std": 0.6917340755462646, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.11900000274181366, |
| "step": 463 |
| }, |
| { |
| "completion_length": 385.875, |
| "epoch": 2.7951807228915664, |
| "grad_norm": 0.28281015157699585, |
| "kl": 0.013250474818050861, |
| "learning_rate": 8.294152848885156e-08, |
| "loss": 0.0005, |
| "reward": 0.46875, |
| "reward_std": 0.6706539988517761, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21875, |
| "step": 464 |
| }, |
| { |
| "completion_length": 576.25, |
| "epoch": 2.8012048192771086, |
| "grad_norm": 0.3380723297595978, |
| "kl": 0.012658186256885529, |
| "learning_rate": 7.854209717842231e-08, |
| "loss": 0.0005, |
| "reward": 0.203125, |
| "reward_std": 0.0646936446428299, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.203125, |
| "step": 465 |
| }, |
| { |
| "completion_length": 456.75, |
| "epoch": 2.807228915662651, |
| "grad_norm": 0.17416658997535706, |
| "kl": 0.016306335106492043, |
| "learning_rate": 7.426068431000883e-08, |
| "loss": 0.0007, |
| "reward": 0.250124990940094, |
| "reward_std": 0.0670829638838768, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.250124990940094, |
| "step": 466 |
| }, |
| { |
| "completion_length": 387.875, |
| "epoch": 2.8132530120481927, |
| "grad_norm": 0.3264393210411072, |
| "kl": 0.018369905650615692, |
| "learning_rate": 7.009749855363457e-08, |
| "loss": 0.0007, |
| "reward": 0.058250002562999725, |
| "reward_std": 0.31987351179122925, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.058250002562999725, |
| "step": 467 |
| }, |
| { |
| "completion_length": 620.625, |
| "epoch": 2.819277108433735, |
| "grad_norm": 0.17490556836128235, |
| "kl": 0.01565961353480816, |
| "learning_rate": 6.605274281709929e-08, |
| "loss": 0.0006, |
| "reward": -0.017000004649162292, |
| "reward_std": 0.6620731353759766, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.017000004649162292, |
| "step": 468 |
| }, |
| { |
| "completion_length": 317.375, |
| "epoch": 2.825301204819277, |
| "grad_norm": 0.26670563220977783, |
| "kl": 0.021924596279859543, |
| "learning_rate": 6.212661423609184e-08, |
| "loss": 0.0009, |
| "reward": 0.5152499675750732, |
| "reward_std": 0.7022190690040588, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.26524999737739563, |
| "step": 469 |
| }, |
| { |
| "completion_length": 433.75, |
| "epoch": 2.8313253012048194, |
| "grad_norm": 0.3066515326499939, |
| "kl": 0.016122814267873764, |
| "learning_rate": 5.83193041645802e-08, |
| "loss": 0.0006, |
| "reward": 0.421875, |
| "reward_std": 0.7468311190605164, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.171875, |
| "step": 470 |
| }, |
| { |
| "completion_length": 216.0, |
| "epoch": 2.837349397590361, |
| "grad_norm": 0.39599770307540894, |
| "kl": 0.043192628771066666, |
| "learning_rate": 5.463099816548578e-08, |
| "loss": 0.0017, |
| "reward": 0.75, |
| "reward_std": 0.9258201122283936, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 471 |
| }, |
| { |
| "completion_length": 381.75, |
| "epoch": 2.8433734939759034, |
| "grad_norm": 0.4189399182796478, |
| "kl": 0.017973648384213448, |
| "learning_rate": 5.106187600163987e-08, |
| "loss": 0.0007, |
| "reward": 0.10974998772144318, |
| "reward_std": 0.30874618887901306, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.10974999517202377, |
| "step": 472 |
| }, |
| { |
| "completion_length": 767.25, |
| "epoch": 2.8493975903614457, |
| "grad_norm": 0.2473141849040985, |
| "kl": 0.006026203744113445, |
| "learning_rate": 4.761211162702117e-08, |
| "loss": 0.0002, |
| "reward": 0.171875, |
| "reward_std": 0.09300297498703003, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.171875, |
| "step": 473 |
| }, |
| { |
| "completion_length": 321.0, |
| "epoch": 2.855421686746988, |
| "grad_norm": 0.2813480794429779, |
| "kl": 0.02672071009874344, |
| "learning_rate": 4.428187317827848e-08, |
| "loss": 0.0011, |
| "reward": 0.71875, |
| "reward_std": 0.9081721901893616, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21875, |
| "step": 474 |
| }, |
| { |
| "completion_length": 395.0, |
| "epoch": 2.86144578313253, |
| "grad_norm": 0.3251534402370453, |
| "kl": 0.016685165464878082, |
| "learning_rate": 4.1071322966535487e-08, |
| "loss": 0.0007, |
| "reward": 0.257999986410141, |
| "reward_std": 0.9987041354179382, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.007999997586011887, |
| "step": 475 |
| }, |
| { |
| "completion_length": 495.0, |
| "epoch": 2.8674698795180724, |
| "grad_norm": 0.24719910323619843, |
| "kl": 0.03484595939517021, |
| "learning_rate": 3.798061746947995e-08, |
| "loss": 0.0014, |
| "reward": 0.06424999982118607, |
| "reward_std": 0.3966638445854187, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.06424999982118607, |
| "step": 476 |
| }, |
| { |
| "completion_length": 289.125, |
| "epoch": 2.8734939759036147, |
| "grad_norm": 0.28067588806152344, |
| "kl": 0.020761603489518166, |
| "learning_rate": 3.5009907323737826e-08, |
| "loss": 0.0008, |
| "reward": 1.0, |
| "reward_std": 1.0350983142852783, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 477 |
| }, |
| { |
| "completion_length": 171.25, |
| "epoch": 2.8795180722891565, |
| "grad_norm": 0.37774914503097534, |
| "kl": 0.03348740190267563, |
| "learning_rate": 3.2159337317530234e-08, |
| "loss": 0.0013, |
| "reward": 0.234375, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234375, |
| "step": 478 |
| }, |
| { |
| "completion_length": 237.5, |
| "epoch": 2.8855421686746987, |
| "grad_norm": 0.29570528864860535, |
| "kl": 0.024981847032904625, |
| "learning_rate": 2.9429046383618042e-08, |
| "loss": 0.001, |
| "reward": 0.643750011920929, |
| "reward_std": 1.033868432044983, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.14374999701976776, |
| "step": 479 |
| }, |
| { |
| "completion_length": 357.75, |
| "epoch": 2.891566265060241, |
| "grad_norm": 0.20856322348117828, |
| "kl": 0.03461308404803276, |
| "learning_rate": 2.681916759252917e-08, |
| "loss": 0.0014, |
| "reward": -0.07449999451637268, |
| "reward_std": 0.8214806914329529, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.07449999451637268, |
| "step": 480 |
| }, |
| { |
| "completion_length": 395.625, |
| "epoch": 2.897590361445783, |
| "grad_norm": 0.23173460364341736, |
| "kl": 0.02344985119998455, |
| "learning_rate": 2.4329828146074096e-08, |
| "loss": 0.0009, |
| "reward": 0.1002500057220459, |
| "reward_std": 0.4639940559864044, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1002500057220459, |
| "step": 481 |
| }, |
| { |
| "completion_length": 470.875, |
| "epoch": 2.9036144578313254, |
| "grad_norm": 0.26940688490867615, |
| "kl": 0.025867372751235962, |
| "learning_rate": 2.1961149371145795e-08, |
| "loss": 0.001, |
| "reward": 0.484499990940094, |
| "reward_std": 0.7209571003913879, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234499990940094, |
| "step": 482 |
| }, |
| { |
| "completion_length": 367.625, |
| "epoch": 2.9096385542168672, |
| "grad_norm": 0.36178308725357056, |
| "kl": 0.0094979889690876, |
| "learning_rate": 1.9713246713805588e-08, |
| "loss": 0.0004, |
| "reward": 0.5625, |
| "reward_std": 1.1630470752716064, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0625, |
| "step": 483 |
| }, |
| { |
| "completion_length": 495.25, |
| "epoch": 2.9156626506024095, |
| "grad_norm": 0.16893015801906586, |
| "kl": 0.021265776827931404, |
| "learning_rate": 1.7586229733657646e-08, |
| "loss": 0.0009, |
| "reward": 0.234375, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234375, |
| "step": 484 |
| }, |
| { |
| "completion_length": 166.625, |
| "epoch": 2.9216867469879517, |
| "grad_norm": 0.3928318917751312, |
| "kl": 0.029352502897381783, |
| "learning_rate": 1.5580202098509078e-08, |
| "loss": 0.0012, |
| "reward": 1.2105000019073486, |
| "reward_std": 1.1130155324935913, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21050000190734863, |
| "step": 485 |
| }, |
| { |
| "completion_length": 379.75, |
| "epoch": 2.927710843373494, |
| "grad_norm": 0.29452574253082275, |
| "kl": 0.028082868084311485, |
| "learning_rate": 1.3695261579316776e-08, |
| "loss": 0.0011, |
| "reward": 0.484375, |
| "reward_std": 0.714759886264801, |
| "rewards/correctness_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234375, |
| "step": 486 |
| }, |
| { |
| "completion_length": 620.25, |
| "epoch": 2.933734939759036, |
| "grad_norm": 0.19965560734272003, |
| "kl": 0.010307521559298038, |
| "learning_rate": 1.193150004542204e-08, |
| "loss": 0.0004, |
| "reward": 0.037125006318092346, |
| "reward_std": 0.553325355052948, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.037125006318092346, |
| "step": 487 |
| }, |
| { |
| "completion_length": 771.875, |
| "epoch": 2.9397590361445785, |
| "grad_norm": 0.1377246379852295, |
| "kl": 0.007695982698351145, |
| "learning_rate": 1.0289003460074165e-08, |
| "loss": 0.0003, |
| "reward": 0.203125, |
| "reward_std": 0.0646936446428299, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.203125, |
| "step": 488 |
| }, |
| { |
| "completion_length": 417.875, |
| "epoch": 2.9457831325301207, |
| "grad_norm": 0.2390993982553482, |
| "kl": 0.02972429431974888, |
| "learning_rate": 8.767851876239075e-09, |
| "loss": 0.0012, |
| "reward": 0.703125, |
| "reward_std": 0.9588346481323242, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.203125, |
| "step": 489 |
| }, |
| { |
| "completion_length": 542.75, |
| "epoch": 2.9518072289156625, |
| "grad_norm": 0.2091827541589737, |
| "kl": 0.012457402423024178, |
| "learning_rate": 7.368119432699383e-09, |
| "loss": 0.0005, |
| "reward": 0.203125, |
| "reward_std": 0.09300297498703003, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.203125, |
| "step": 490 |
| }, |
| { |
| "completion_length": 520.625, |
| "epoch": 2.9578313253012047, |
| "grad_norm": 0.13813644647598267, |
| "kl": 0.012580258771777153, |
| "learning_rate": 6.089874350439507e-09, |
| "loss": 0.0005, |
| "reward": 0.234375, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.234375, |
| "step": 491 |
| }, |
| { |
| "completion_length": 326.25, |
| "epoch": 2.963855421686747, |
| "grad_norm": 0.2626722455024719, |
| "kl": 0.01562521792948246, |
| "learning_rate": 4.933178929321103e-09, |
| "loss": 0.0006, |
| "reward": 0.250124990940094, |
| "reward_std": 0.0670829713344574, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.250124990940094, |
| "step": 492 |
| }, |
| { |
| "completion_length": 457.25, |
| "epoch": 2.9698795180722892, |
| "grad_norm": 0.24207204580307007, |
| "kl": 0.011039088480174541, |
| "learning_rate": 3.8980895450474455e-09, |
| "loss": 0.0004, |
| "reward": -0.34562498331069946, |
| "reward_std": 0.5172874331474304, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.34562501311302185, |
| "step": 493 |
| }, |
| { |
| "completion_length": 300.25, |
| "epoch": 2.9759036144578315, |
| "grad_norm": 0.3652004599571228, |
| "kl": 0.038945458829402924, |
| "learning_rate": 2.984656646415063e-09, |
| "loss": 0.0016, |
| "reward": 1.1518750190734863, |
| "reward_std": 1.0214972496032715, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.15187500417232513, |
| "step": 494 |
| }, |
| { |
| "completion_length": 272.25, |
| "epoch": 2.9819277108433733, |
| "grad_norm": 0.3992674946784973, |
| "kl": 0.020435592159628868, |
| "learning_rate": 2.192924752854042e-09, |
| "loss": 0.0008, |
| "reward": 1.71875, |
| "reward_std": 0.9106267690658569, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21875, |
| "step": 495 |
| }, |
| { |
| "completion_length": 276.0, |
| "epoch": 2.9879518072289155, |
| "grad_norm": 0.009798334911465645, |
| "kl": 0.028400778770446777, |
| "learning_rate": 1.5229324522605949e-09, |
| "loss": 0.0011, |
| "reward": 0.25, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 496 |
| }, |
| { |
| "completion_length": 422.75, |
| "epoch": 2.9939759036144578, |
| "grad_norm": 0.26674267649650574, |
| "kl": 0.0230987798422575, |
| "learning_rate": 9.747123991141193e-10, |
| "loss": 0.0009, |
| "reward": -0.2436250001192093, |
| "reward_std": 0.653490424156189, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.24362501502037048, |
| "step": 497 |
| }, |
| { |
| "completion_length": 523.125, |
| "epoch": 3.0, |
| "grad_norm": 0.17998848855495453, |
| "kl": 0.015568609349429607, |
| "learning_rate": 5.48291312886251e-10, |
| "loss": 0.0006, |
| "reward": -0.16837498545646667, |
| "reward_std": 1.085857629776001, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.16837498545646667, |
| "step": 498 |
| }, |
| { |
| "completion_length": 511.625, |
| "epoch": 3.0060240963855422, |
| "grad_norm": 0.20502515137195587, |
| "kl": 0.010177758522331715, |
| "learning_rate": 2.43689976739403e-10, |
| "loss": 0.0004, |
| "reward": -0.0806250125169754, |
| "reward_std": 0.8039577007293701, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.08062499761581421, |
| "step": 499 |
| }, |
| { |
| "completion_length": 536.75, |
| "epoch": 3.0120481927710845, |
| "grad_norm": 0.1854425072669983, |
| "kl": 0.007698327302932739, |
| "learning_rate": 6.092323651313293e-11, |
| "loss": 0.0003, |
| "reward": -0.23737502098083496, |
| "reward_std": 1.3410615921020508, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.23737502098083496, |
| "step": 500 |
| }, |
| { |
| "completion_length": 328.625, |
| "epoch": 3.0180722891566263, |
| "grad_norm": 0.39686211943626404, |
| "kl": 0.02566150762140751, |
| "learning_rate": 0.0, |
| "loss": 0.001, |
| "reward": 0.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21875, |
| "step": 501 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 501, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|