| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.032, |
| "eval_steps": 500, |
| "global_step": 126, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 142.08333587646484, |
| "epoch": 0.016, |
| "grad_norm": 0.9348842823066913, |
| "kl": 0.0, |
| "learning_rate": 1.25e-07, |
| "loss": -0.0, |
| "reward": 0.6988552212715149, |
| "reward_std": 0.5108021795749664, |
| "rewards/correct_code_reward_func": 0.4583333432674408, |
| "rewards/len_reward_func": 0.24052191525697708, |
| "step": 1 |
| }, |
| { |
| "completion_length": 126.56250762939453, |
| "epoch": 0.032, |
| "grad_norm": 0.8243337506143691, |
| "kl": 0.0, |
| "learning_rate": 2.5e-07, |
| "loss": -0.0, |
| "reward": 0.6203328222036362, |
| "reward_std": 0.45215627551078796, |
| "rewards/correct_code_reward_func": 0.3541666865348816, |
| "rewards/len_reward_func": 0.26616617292165756, |
| "step": 2 |
| }, |
| { |
| "completion_length": 108.79166793823242, |
| "epoch": 0.048, |
| "grad_norm": 0.9128499702856003, |
| "kl": 7.30752944946289e-05, |
| "learning_rate": 3.75e-07, |
| "loss": 0.0, |
| "reward": 0.6249927878379822, |
| "reward_std": 0.5788741409778595, |
| "rewards/correct_code_reward_func": 0.3958333432674408, |
| "rewards/len_reward_func": 0.22915946692228317, |
| "step": 3 |
| }, |
| { |
| "completion_length": 142.2291717529297, |
| "epoch": 0.064, |
| "grad_norm": 0.9975440173663436, |
| "kl": 0.00012683868408203125, |
| "learning_rate": 5e-07, |
| "loss": 0.0, |
| "reward": 0.4923219382762909, |
| "reward_std": 0.5618958175182343, |
| "rewards/correct_code_reward_func": 0.2708333358168602, |
| "rewards/len_reward_func": 0.2214886099100113, |
| "step": 4 |
| }, |
| { |
| "completion_length": 167.9166717529297, |
| "epoch": 0.08, |
| "grad_norm": 0.6875029348712502, |
| "kl": 7.176399230957031e-05, |
| "learning_rate": 4.999157413258781e-07, |
| "loss": 0.0, |
| "reward": 0.3392188400030136, |
| "reward_std": 0.3500474542379379, |
| "rewards/correct_code_reward_func": 0.0833333358168602, |
| "rewards/len_reward_func": 0.255885511636734, |
| "step": 5 |
| }, |
| { |
| "completion_length": 132.9166717529297, |
| "epoch": 0.096, |
| "grad_norm": 0.7328221161754551, |
| "kl": 0.0001456737518310547, |
| "learning_rate": 4.996630220997057e-07, |
| "loss": 0.0, |
| "reward": 0.7516234815120697, |
| "reward_std": 0.5683842748403549, |
| "rewards/correct_code_reward_func": 0.4375000298023224, |
| "rewards/len_reward_func": 0.3141235262155533, |
| "step": 6 |
| }, |
| { |
| "completion_length": 178.0416717529297, |
| "epoch": 0.112, |
| "grad_norm": 0.7294488262352424, |
| "kl": 0.00011301040649414062, |
| "learning_rate": 4.992420126717784e-07, |
| "loss": 0.0, |
| "reward": 0.4081832021474838, |
| "reward_std": 0.4984392523765564, |
| "rewards/correct_code_reward_func": 0.1666666716337204, |
| "rewards/len_reward_func": 0.24151653796434402, |
| "step": 7 |
| }, |
| { |
| "completion_length": 190.81250762939453, |
| "epoch": 0.128, |
| "grad_norm": 0.774451422596215, |
| "kl": 0.00012731552124023438, |
| "learning_rate": 4.986529968316653e-07, |
| "loss": 0.0, |
| "reward": 0.5889585018157959, |
| "reward_std": 0.5029059052467346, |
| "rewards/correct_code_reward_func": 0.2500000149011612, |
| "rewards/len_reward_func": 0.3389585465192795, |
| "step": 8 |
| }, |
| { |
| "completion_length": 203.1041717529297, |
| "epoch": 0.144, |
| "grad_norm": 0.7610759960843761, |
| "kl": 0.0001308917999267578, |
| "learning_rate": 4.978963716169165e-07, |
| "loss": 0.0, |
| "reward": 0.5821144282817841, |
| "reward_std": 0.5114115178585052, |
| "rewards/correct_code_reward_func": 0.3333333432674408, |
| "rewards/len_reward_func": 0.24878107011318207, |
| "step": 9 |
| }, |
| { |
| "completion_length": 119.56250762939453, |
| "epoch": 0.16, |
| "grad_norm": 0.9290248509168798, |
| "kl": 0.00010132789611816406, |
| "learning_rate": 4.969726470454313e-07, |
| "loss": 0.0, |
| "reward": 0.7238170504570007, |
| "reward_std": 0.5344631671905518, |
| "rewards/correct_code_reward_func": 0.4166666716337204, |
| "rewards/len_reward_func": 0.30715033411979675, |
| "step": 10 |
| }, |
| { |
| "completion_length": 150.9166717529297, |
| "epoch": 0.176, |
| "grad_norm": 1.3590266587813116, |
| "kl": 0.00018453598022460938, |
| "learning_rate": 4.958824457716706e-07, |
| "loss": 0.0, |
| "reward": 0.6649805009365082, |
| "reward_std": 0.4508441388607025, |
| "rewards/correct_code_reward_func": 0.375, |
| "rewards/len_reward_func": 0.289980486035347, |
| "step": 11 |
| }, |
| { |
| "completion_length": 206.8541717529297, |
| "epoch": 0.192, |
| "grad_norm": 0.6744815448192079, |
| "kl": 0.000133514404296875, |
| "learning_rate": 4.946265026669454e-07, |
| "loss": 0.0, |
| "reward": 0.44046473503112793, |
| "reward_std": 0.4918256551027298, |
| "rewards/correct_code_reward_func": 0.2083333432674408, |
| "rewards/len_reward_func": 0.23213139921426773, |
| "step": 12 |
| }, |
| { |
| "completion_length": 100.33333587646484, |
| "epoch": 0.208, |
| "grad_norm": 1.1473694035346043, |
| "kl": 0.00011897087097167969, |
| "learning_rate": 4.932056643240618e-07, |
| "loss": 0.0, |
| "reward": 0.8099721968173981, |
| "reward_std": 0.5177792608737946, |
| "rewards/correct_code_reward_func": 0.4791666716337204, |
| "rewards/len_reward_func": 0.3308054953813553, |
| "step": 13 |
| }, |
| { |
| "completion_length": 174.52084350585938, |
| "epoch": 0.224, |
| "grad_norm": 0.7173952224852224, |
| "kl": 0.00014257431030273438, |
| "learning_rate": 4.916208884866592e-07, |
| "loss": 0.0, |
| "reward": 0.5219025313854218, |
| "reward_std": 0.43796107172966003, |
| "rewards/correct_code_reward_func": 0.229166679084301, |
| "rewards/len_reward_func": 0.29273584485054016, |
| "step": 14 |
| }, |
| { |
| "completion_length": 118.60417175292969, |
| "epoch": 0.24, |
| "grad_norm": 1.2540281005004774, |
| "kl": 0.00018072128295898438, |
| "learning_rate": 4.898732434036243e-07, |
| "loss": 0.0, |
| "reward": 0.7924503684043884, |
| "reward_std": 0.506152406334877, |
| "rewards/correct_code_reward_func": 0.4375000149011612, |
| "rewards/len_reward_func": 0.3549504280090332, |
| "step": 15 |
| }, |
| { |
| "completion_length": 105.31250381469727, |
| "epoch": 0.256, |
| "grad_norm": 0.9431489918373908, |
| "kl": 0.00018644332885742188, |
| "learning_rate": 4.879639071090173e-07, |
| "loss": 0.0, |
| "reward": 0.5001727938652039, |
| "reward_std": 0.45510660111904144, |
| "rewards/correct_code_reward_func": 0.2291666716337204, |
| "rewards/len_reward_func": 0.27100610733032227, |
| "step": 16 |
| }, |
| { |
| "completion_length": 130.85416793823242, |
| "epoch": 0.272, |
| "grad_norm": 0.7237713131170656, |
| "kl": 0.00011801719665527344, |
| "learning_rate": 4.858941666279955e-07, |
| "loss": 0.0, |
| "reward": 0.7888750731945038, |
| "reward_std": 0.5659129619598389, |
| "rewards/correct_code_reward_func": 0.5208333432674408, |
| "rewards/len_reward_func": 0.2680417224764824, |
| "step": 17 |
| }, |
| { |
| "completion_length": 134.89583587646484, |
| "epoch": 0.288, |
| "grad_norm": 0.8717728732040717, |
| "kl": 0.000164031982421875, |
| "learning_rate": 4.836654171092682e-07, |
| "loss": 0.0, |
| "reward": 0.5576134622097015, |
| "reward_std": 0.5418040752410889, |
| "rewards/correct_code_reward_func": 0.3125000149011612, |
| "rewards/len_reward_func": 0.24511344730854034, |
| "step": 18 |
| }, |
| { |
| "completion_length": 116.54167175292969, |
| "epoch": 0.304, |
| "grad_norm": 0.9377250817808462, |
| "kl": 0.0002789497375488281, |
| "learning_rate": 4.812791608846709e-07, |
| "loss": 0.0, |
| "reward": 0.7832874357700348, |
| "reward_std": 0.5703642070293427, |
| "rewards/correct_code_reward_func": 0.4583333432674408, |
| "rewards/len_reward_func": 0.3249540776014328, |
| "step": 19 |
| }, |
| { |
| "completion_length": 186.06250762939453, |
| "epoch": 0.32, |
| "grad_norm": 0.9812718853662215, |
| "kl": 0.00038433074951171875, |
| "learning_rate": 4.787370064564882e-07, |
| "loss": 0.0, |
| "reward": 0.5579635202884674, |
| "reward_std": 0.519572913646698, |
| "rewards/correct_code_reward_func": 0.3125, |
| "rewards/len_reward_func": 0.2454635202884674, |
| "step": 20 |
| }, |
| { |
| "completion_length": 110.75, |
| "epoch": 0.336, |
| "grad_norm": 1.0782358974361461, |
| "kl": 0.00030231475830078125, |
| "learning_rate": 4.7604066741321253e-07, |
| "loss": 0.0, |
| "reward": 0.9209870100021362, |
| "reward_std": 0.5739021599292755, |
| "rewards/correct_code_reward_func": 0.6458333432674408, |
| "rewards/len_reward_func": 0.2751536965370178, |
| "step": 21 |
| }, |
| { |
| "completion_length": 162.08334350585938, |
| "epoch": 0.352, |
| "grad_norm": 0.874006092063079, |
| "kl": 0.00022363662719726562, |
| "learning_rate": 4.731919612744659e-07, |
| "loss": 0.0, |
| "reward": 0.7241076529026031, |
| "reward_std": 0.5581964254379272, |
| "rewards/correct_code_reward_func": 0.4791666865348816, |
| "rewards/len_reward_func": 0.24494098126888275, |
| "step": 22 |
| }, |
| { |
| "completion_length": 204.14584350585938, |
| "epoch": 0.368, |
| "grad_norm": 0.82204687952658, |
| "kl": 0.00034046173095703125, |
| "learning_rate": 4.7019280826586604e-07, |
| "loss": 0.0, |
| "reward": 0.532179206609726, |
| "reward_std": 0.34643039107322693, |
| "rewards/correct_code_reward_func": 0.1666666716337204, |
| "rewards/len_reward_func": 0.36551257967948914, |
| "step": 23 |
| }, |
| { |
| "completion_length": 151.0208396911621, |
| "epoch": 0.384, |
| "grad_norm": 0.9960569412421229, |
| "kl": 0.00042057037353515625, |
| "learning_rate": 4.6704523002466094e-07, |
| "loss": 0.0, |
| "reward": 0.5193642377853394, |
| "reward_std": 0.40249721705913544, |
| "rewards/correct_code_reward_func": 0.2083333395421505, |
| "rewards/len_reward_func": 0.31103089451789856, |
| "step": 24 |
| }, |
| { |
| "completion_length": 187.20834350585938, |
| "epoch": 0.4, |
| "grad_norm": 0.6829787161734556, |
| "kl": 0.0003147125244140625, |
| "learning_rate": 4.6375134823700503e-07, |
| "loss": 0.0, |
| "reward": 0.43366140127182007, |
| "reward_std": 0.4143451601266861, |
| "rewards/correct_code_reward_func": 0.1666666716337204, |
| "rewards/len_reward_func": 0.2669947147369385, |
| "step": 25 |
| }, |
| { |
| "completion_length": 99.33333587646484, |
| "epoch": 0.416, |
| "grad_norm": 1.0546549364927, |
| "kl": 0.0007066726684570312, |
| "learning_rate": 4.603133832077953e-07, |
| "loss": 0.0, |
| "reward": 0.6213856935501099, |
| "reward_std": 0.5858824849128723, |
| "rewards/correct_code_reward_func": 0.395833358168602, |
| "rewards/len_reward_func": 0.22555235773324966, |
| "step": 26 |
| }, |
| { |
| "completion_length": 119.04167175292969, |
| "epoch": 0.432, |
| "grad_norm": 1.0435674697004191, |
| "kl": 0.000743865966796875, |
| "learning_rate": 4.5673365236403216e-07, |
| "loss": 0.0, |
| "reward": 0.6226212680339813, |
| "reward_std": 0.515766441822052, |
| "rewards/correct_code_reward_func": 0.4791666865348816, |
| "rewards/len_reward_func": 0.14345459267497063, |
| "step": 27 |
| }, |
| { |
| "completion_length": 204.02083587646484, |
| "epoch": 0.448, |
| "grad_norm": 1.1685217734586066, |
| "kl": 0.000659942626953125, |
| "learning_rate": 4.530145686927125e-07, |
| "loss": 0.0, |
| "reward": 0.6347978413105011, |
| "reward_std": 0.4856678545475006, |
| "rewards/correct_code_reward_func": 0.3333333432674408, |
| "rewards/len_reward_func": 0.3014644980430603, |
| "step": 28 |
| }, |
| { |
| "completion_length": 149.87500762939453, |
| "epoch": 0.464, |
| "grad_norm": 0.7804112511261988, |
| "kl": 0.0006847381591796875, |
| "learning_rate": 4.4915863911430897e-07, |
| "loss": 0.0, |
| "reward": 0.4445287883281708, |
| "reward_std": 0.4996263086795807, |
| "rewards/correct_code_reward_func": 0.1458333358168602, |
| "rewards/len_reward_func": 0.29869547486305237, |
| "step": 29 |
| }, |
| { |
| "completion_length": 132.0833396911621, |
| "epoch": 0.48, |
| "grad_norm": 0.8535635529221804, |
| "kl": 0.0006618499755859375, |
| "learning_rate": 4.45168462792932e-07, |
| "loss": 0.0, |
| "reward": 0.6259033381938934, |
| "reward_std": 0.4563398212194443, |
| "rewards/correct_code_reward_func": 0.3750000149011612, |
| "rewards/len_reward_func": 0.2509033679962158, |
| "step": 30 |
| }, |
| { |
| "completion_length": 126.79166793823242, |
| "epoch": 0.496, |
| "grad_norm": 0.8761490464403178, |
| "kl": 0.0010280609130859375, |
| "learning_rate": 4.4104672938431223e-07, |
| "loss": 0.0, |
| "reward": 0.7820720672607422, |
| "reward_std": 0.4476567506790161, |
| "rewards/correct_code_reward_func": 0.583333358168602, |
| "rewards/len_reward_func": 0.1987387351691723, |
| "step": 31 |
| }, |
| { |
| "completion_length": 95.87500381469727, |
| "epoch": 0.512, |
| "grad_norm": 1.0760263673729638, |
| "kl": 0.0013580322265625, |
| "learning_rate": 4.367962172227866e-07, |
| "loss": 0.0, |
| "reward": 0.7095580399036407, |
| "reward_std": 0.4921827018260956, |
| "rewards/correct_code_reward_func": 0.5000000149011612, |
| "rewards/len_reward_func": 0.20955805480480194, |
| "step": 32 |
| }, |
| { |
| "completion_length": 155.02083587646484, |
| "epoch": 0.528, |
| "grad_norm": 0.9305216291102568, |
| "kl": 0.001430511474609375, |
| "learning_rate": 4.324197914485075e-07, |
| "loss": 0.0, |
| "reward": 0.6416721642017365, |
| "reward_std": 0.4772767722606659, |
| "rewards/correct_code_reward_func": 0.3958333432674408, |
| "rewards/len_reward_func": 0.24583880603313446, |
| "step": 33 |
| }, |
| { |
| "completion_length": 215.0416717529297, |
| "epoch": 0.544, |
| "grad_norm": 0.8211396277152214, |
| "kl": 0.0010585784912109375, |
| "learning_rate": 4.2792040207614e-07, |
| "loss": 0.0, |
| "reward": 0.7173371911048889, |
| "reward_std": 0.5010073632001877, |
| "rewards/correct_code_reward_func": 0.3958333432674408, |
| "rewards/len_reward_func": 0.3215038478374481, |
| "step": 34 |
| }, |
| { |
| "completion_length": 128.8958396911621, |
| "epoch": 0.56, |
| "grad_norm": 0.9266230852523272, |
| "kl": 0.001445770263671875, |
| "learning_rate": 4.2330108200634723e-07, |
| "loss": 0.0, |
| "reward": 0.6981382668018341, |
| "reward_std": 0.5356450974941254, |
| "rewards/correct_code_reward_func": 0.4166666716337204, |
| "rewards/len_reward_func": 0.2814715951681137, |
| "step": 35 |
| }, |
| { |
| "completion_length": 174.62500762939453, |
| "epoch": 0.576, |
| "grad_norm": 0.8774350304368993, |
| "kl": 0.00160980224609375, |
| "learning_rate": 4.185649449814045e-07, |
| "loss": 0.0, |
| "reward": 0.819614589214325, |
| "reward_std": 0.5062630474567413, |
| "rewards/correct_code_reward_func": 0.5000000223517418, |
| "rewards/len_reward_func": 0.3196146488189697, |
| "step": 36 |
| }, |
| { |
| "completion_length": 79.29166793823242, |
| "epoch": 0.592, |
| "grad_norm": 1.1529766368540975, |
| "kl": 0.0024871826171875, |
| "learning_rate": 4.137151834863213e-07, |
| "loss": 0.0, |
| "reward": 0.615891844034195, |
| "reward_std": 0.6233284175395966, |
| "rewards/correct_code_reward_func": 0.4166666716337204, |
| "rewards/len_reward_func": 0.19922512769699097, |
| "step": 37 |
| }, |
| { |
| "completion_length": 85.97917175292969, |
| "epoch": 0.608, |
| "grad_norm": 1.2549480144397245, |
| "kl": 0.0028076171875, |
| "learning_rate": 4.087550665968846e-07, |
| "loss": 0.0, |
| "reward": 0.5825270563364029, |
| "reward_std": 0.45819054543972015, |
| "rewards/correct_code_reward_func": 0.35416667722165585, |
| "rewards/len_reward_func": 0.2283603698015213, |
| "step": 38 |
| }, |
| { |
| "completion_length": 85.68750381469727, |
| "epoch": 0.624, |
| "grad_norm": 1.4005267604268767, |
| "kl": 0.003387451171875, |
| "learning_rate": 4.036879377760752e-07, |
| "loss": 0.0, |
| "reward": 0.8216443359851837, |
| "reward_std": 0.548437625169754, |
| "rewards/correct_code_reward_func": 0.5833333432674408, |
| "rewards/len_reward_func": 0.23831100016832352, |
| "step": 39 |
| }, |
| { |
| "completion_length": 113.16667175292969, |
| "epoch": 0.64, |
| "grad_norm": 1.0341725323218647, |
| "kl": 0.003143310546875, |
| "learning_rate": 3.9851721262034157e-07, |
| "loss": 0.0, |
| "reward": 0.520989790558815, |
| "reward_std": 0.47860175371170044, |
| "rewards/correct_code_reward_func": 0.25, |
| "rewards/len_reward_func": 0.270989790558815, |
| "step": 40 |
| }, |
| { |
| "completion_length": 109.70833587646484, |
| "epoch": 0.656, |
| "grad_norm": 1.0974308815666283, |
| "kl": 0.00255584716796875, |
| "learning_rate": 3.932463765572505e-07, |
| "loss": 0.0, |
| "reward": 0.5007437467575073, |
| "reward_std": 0.4586540758609772, |
| "rewards/correct_code_reward_func": 0.3333333432674408, |
| "rewards/len_reward_func": 0.16741041094064713, |
| "step": 41 |
| }, |
| { |
| "completion_length": 113.50000381469727, |
| "epoch": 0.672, |
| "grad_norm": 0.8007467983812053, |
| "kl": 0.00237274169921875, |
| "learning_rate": 3.8787898249606767e-07, |
| "loss": 0.0, |
| "reward": 0.4927902817726135, |
| "reward_std": 0.4271247088909149, |
| "rewards/correct_code_reward_func": 0.2708333395421505, |
| "rewards/len_reward_func": 0.22195692360401154, |
| "step": 42 |
| }, |
| { |
| "completion_length": 79.85416793823242, |
| "epoch": 0.688, |
| "grad_norm": 1.2179865851315288, |
| "kl": 0.00423431396484375, |
| "learning_rate": 3.8241864843284964e-07, |
| "loss": 0.0, |
| "reward": 0.8101656138896942, |
| "reward_std": 0.4996851086616516, |
| "rewards/correct_code_reward_func": 0.5416666716337204, |
| "rewards/len_reward_func": 0.2684989869594574, |
| "step": 43 |
| }, |
| { |
| "completion_length": 91.54167175292969, |
| "epoch": 0.704, |
| "grad_norm": 1.240439347309207, |
| "kl": 0.005615234375, |
| "learning_rate": 3.768690550116639e-07, |
| "loss": 0.0, |
| "reward": 0.7196292579174042, |
| "reward_std": 0.5478431880474091, |
| "rewards/correct_code_reward_func": 0.4166666865348816, |
| "rewards/len_reward_func": 0.3029625713825226, |
| "step": 44 |
| }, |
| { |
| "completion_length": 121.97916793823242, |
| "epoch": 0.72, |
| "grad_norm": 0.9464336788144472, |
| "kl": 0.0048980712890625, |
| "learning_rate": 3.712339430435792e-07, |
| "loss": 0.0, |
| "reward": 0.8251023292541504, |
| "reward_std": 0.5142593383789062, |
| "rewards/correct_code_reward_func": 0.4791666716337204, |
| "rewards/len_reward_func": 0.34593565762043, |
| "step": 45 |
| }, |
| { |
| "completion_length": 88.20833587646484, |
| "epoch": 0.736, |
| "grad_norm": 0.944700095686709, |
| "kl": 0.0039215087890625, |
| "learning_rate": 3.65517110985099e-07, |
| "loss": 0.0, |
| "reward": 0.6876890659332275, |
| "reward_std": 0.4308091402053833, |
| "rewards/correct_code_reward_func": 0.4166666716337204, |
| "rewards/len_reward_func": 0.27102240175008774, |
| "step": 46 |
| }, |
| { |
| "completion_length": 69.75000381469727, |
| "epoch": 0.752, |
| "grad_norm": 1.2034725464007063, |
| "kl": 0.0064544677734375, |
| "learning_rate": 3.597224123777389e-07, |
| "loss": 0.0, |
| "reward": 0.9030305743217468, |
| "reward_std": 0.5125944316387177, |
| "rewards/correct_code_reward_func": 0.6458333730697632, |
| "rewards/len_reward_func": 0.25719721615314484, |
| "step": 47 |
| }, |
| { |
| "completion_length": 84.00000381469727, |
| "epoch": 0.768, |
| "grad_norm": 1.188785130607309, |
| "kl": 0.0072174072265625, |
| "learning_rate": 3.5385375325047163e-07, |
| "loss": 0.0, |
| "reward": 0.6972790956497192, |
| "reward_std": 0.6147326231002808, |
| "rewards/correct_code_reward_func": 0.3750000149011612, |
| "rewards/len_reward_func": 0.32227905094623566, |
| "step": 48 |
| }, |
| { |
| "completion_length": 54.35416793823242, |
| "epoch": 0.784, |
| "grad_norm": 1.5392691471218216, |
| "kl": 0.011932373046875, |
| "learning_rate": 3.479150894867926e-07, |
| "loss": 0.0, |
| "reward": 0.7476979494094849, |
| "reward_std": 0.5232938826084137, |
| "rewards/correct_code_reward_func": 0.520833358168602, |
| "rewards/len_reward_func": 0.22686457633972168, |
| "step": 49 |
| }, |
| { |
| "completion_length": 105.375, |
| "epoch": 0.8, |
| "grad_norm": 0.9867137272930855, |
| "kl": 0.0070037841796875, |
| "learning_rate": 3.4191042415818e-07, |
| "loss": 0.0, |
| "reward": 0.7631438374519348, |
| "reward_std": 0.5120529979467392, |
| "rewards/correct_code_reward_func": 0.4375, |
| "rewards/len_reward_func": 0.3256438076496124, |
| "step": 50 |
| }, |
| { |
| "completion_length": 79.79166793823242, |
| "epoch": 0.816, |
| "grad_norm": 1.2765736460552886, |
| "kl": 0.009490966796875, |
| "learning_rate": 3.3584380482574717e-07, |
| "loss": 0.0, |
| "reward": 0.8190419375896454, |
| "reward_std": 0.5837846994400024, |
| "rewards/correct_code_reward_func": 0.583333358168602, |
| "rewards/len_reward_func": 0.23570860922336578, |
| "step": 51 |
| }, |
| { |
| "completion_length": 66.62500381469727, |
| "epoch": 0.832, |
| "grad_norm": 1.2357424740774945, |
| "kl": 0.012359619140625, |
| "learning_rate": 3.297193208119047e-07, |
| "loss": 0.0, |
| "reward": 0.9992968440055847, |
| "reward_std": 0.38213877379894257, |
| "rewards/correct_code_reward_func": 0.7500000298023224, |
| "rewards/len_reward_func": 0.24929680675268173, |
| "step": 52 |
| }, |
| { |
| "completion_length": 99.83333587646484, |
| "epoch": 0.848, |
| "grad_norm": 1.1189426445940507, |
| "kl": 0.01153564453125, |
| "learning_rate": 3.235411004438741e-07, |
| "loss": 0.0, |
| "reward": 0.8771131932735443, |
| "reward_std": 0.5585960447788239, |
| "rewards/correct_code_reward_func": 0.520833358168602, |
| "rewards/len_reward_func": 0.3562798500061035, |
| "step": 53 |
| }, |
| { |
| "completion_length": 115.89583587646484, |
| "epoch": 0.864, |
| "grad_norm": 1.3030164084383453, |
| "kl": 0.013397216796875, |
| "learning_rate": 3.173133082709086e-07, |
| "loss": 0.0, |
| "reward": 0.9209796786308289, |
| "reward_std": 0.5127619951963425, |
| "rewards/correct_code_reward_func": 0.5416666865348816, |
| "rewards/len_reward_func": 0.37931299209594727, |
| "step": 54 |
| }, |
| { |
| "completion_length": 45.81250190734863, |
| "epoch": 0.88, |
| "grad_norm": 1.2758138396906251, |
| "kl": 0.02093505859375, |
| "learning_rate": 3.1104014225709784e-07, |
| "loss": 0.0, |
| "reward": 1.014316976070404, |
| "reward_std": 0.5054647028446198, |
| "rewards/correct_code_reward_func": 0.6666666865348816, |
| "rewards/len_reward_func": 0.34765030443668365, |
| "step": 55 |
| }, |
| { |
| "completion_length": 177.89584350585938, |
| "epoch": 0.896, |
| "grad_norm": 0.9790523295873852, |
| "kl": 0.0159149169921875, |
| "learning_rate": 3.0472583095164873e-07, |
| "loss": 0.0, |
| "reward": 0.5877984166145325, |
| "reward_std": 0.4042231887578964, |
| "rewards/correct_code_reward_func": 0.3125000149011612, |
| "rewards/len_reward_func": 0.2752983868122101, |
| "step": 56 |
| }, |
| { |
| "completion_length": 46.97916793823242, |
| "epoch": 0.912, |
| "grad_norm": 2.352715010123481, |
| "kl": 0.0218505859375, |
| "learning_rate": 2.983746306385499e-07, |
| "loss": 0.0, |
| "reward": 0.9101540148258209, |
| "reward_std": 0.5364184379577637, |
| "rewards/correct_code_reward_func": 0.5625, |
| "rewards/len_reward_func": 0.3476540297269821, |
| "step": 57 |
| }, |
| { |
| "completion_length": 86.5, |
| "epoch": 0.928, |
| "grad_norm": 1.3590745378537943, |
| "kl": 0.018341064453125, |
| "learning_rate": 2.919908224675412e-07, |
| "loss": 0.0, |
| "reward": 0.7466456890106201, |
| "reward_std": 0.4038194566965103, |
| "rewards/correct_code_reward_func": 0.458333358168602, |
| "rewards/len_reward_func": 0.2883123308420181, |
| "step": 58 |
| }, |
| { |
| "completion_length": 93.35416793823242, |
| "epoch": 0.944, |
| "grad_norm": 1.0657115005336795, |
| "kl": 0.0169677734375, |
| "learning_rate": 2.8557870956832133e-07, |
| "loss": 0.0, |
| "reward": 0.8307068645954132, |
| "reward_std": 0.41568903625011444, |
| "rewards/correct_code_reward_func": 0.458333358168602, |
| "rewards/len_reward_func": 0.37237347662448883, |
| "step": 59 |
| }, |
| { |
| "completion_length": 96.04166793823242, |
| "epoch": 0.96, |
| "grad_norm": 1.1894982274411716, |
| "kl": 0.0167236328125, |
| "learning_rate": 2.7914261414993976e-07, |
| "loss": 0.0, |
| "reward": 0.9781838655471802, |
| "reward_std": 0.5707030892372131, |
| "rewards/correct_code_reward_func": 0.6458333730697632, |
| "rewards/len_reward_func": 0.33235056698322296, |
| "step": 60 |
| }, |
| { |
| "completion_length": 57.041669845581055, |
| "epoch": 0.976, |
| "grad_norm": 1.6373435351232113, |
| "kl": 0.02020263671875, |
| "learning_rate": 2.726868745873286e-07, |
| "loss": 0.0, |
| "reward": 0.7698030173778534, |
| "reward_std": 0.5640588998794556, |
| "rewards/correct_code_reward_func": 0.5208333432674408, |
| "rewards/len_reward_func": 0.2489696592092514, |
| "step": 61 |
| }, |
| { |
| "completion_length": 84.89583587646484, |
| "epoch": 0.992, |
| "grad_norm": 1.223004703854309, |
| "kl": 0.017547607421875, |
| "learning_rate": 2.662158424969357e-07, |
| "loss": 0.0, |
| "reward": 0.8090977072715759, |
| "reward_std": 0.6418364942073822, |
| "rewards/correct_code_reward_func": 0.4791666716337204, |
| "rewards/len_reward_func": 0.32993103563785553, |
| "step": 62 |
| }, |
| { |
| "completion_length": 52.958335876464844, |
| "epoch": 1.0, |
| "grad_norm": 1.223004703854309, |
| "kl": 0.03564453125, |
| "learning_rate": 2.597338798034344e-07, |
| "loss": 0.0, |
| "reward": 0.9149335622787476, |
| "reward_std": 0.3469616770744324, |
| "rewards/correct_code_reward_func": 0.5416666865348816, |
| "rewards/len_reward_func": 0.3732668161392212, |
| "step": 63 |
| }, |
| { |
| "completion_length": 76.31250381469727, |
| "epoch": 1.016, |
| "grad_norm": 1.234231990060427, |
| "kl": 0.0224609375, |
| "learning_rate": 2.532453557994827e-07, |
| "loss": 0.0, |
| "reward": 0.7154572010040283, |
| "reward_std": 0.5053917020559311, |
| "rewards/correct_code_reward_func": 0.3958333432674408, |
| "rewards/len_reward_func": 0.31962384283542633, |
| "step": 64 |
| }, |
| { |
| "completion_length": 116.52083587646484, |
| "epoch": 1.032, |
| "grad_norm": 1.0750099724368811, |
| "kl": 0.01617431640625, |
| "learning_rate": 2.467546442005173e-07, |
| "loss": 0.0, |
| "reward": 0.7226243913173676, |
| "reward_std": 0.5376951545476913, |
| "rewards/correct_code_reward_func": 0.3750000149011612, |
| "rewards/len_reward_func": 0.34762439131736755, |
| "step": 65 |
| }, |
| { |
| "completion_length": 116.10416793823242, |
| "epoch": 1.048, |
| "grad_norm": 1.0133791335296118, |
| "kl": 0.01983642578125, |
| "learning_rate": 2.4026612019656556e-07, |
| "loss": 0.0, |
| "reward": 0.9590997993946075, |
| "reward_std": 0.4536292999982834, |
| "rewards/correct_code_reward_func": 0.6041666865348816, |
| "rewards/len_reward_func": 0.35493315756320953, |
| "step": 66 |
| }, |
| { |
| "completion_length": 66.625, |
| "epoch": 1.064, |
| "grad_norm": 1.2625208133395418, |
| "kl": 0.030517578125, |
| "learning_rate": 2.337841575030642e-07, |
| "loss": 0.0, |
| "reward": 0.9440672397613525, |
| "reward_std": 0.42680656909942627, |
| "rewards/correct_code_reward_func": 0.583333358168602, |
| "rewards/len_reward_func": 0.36073388159275055, |
| "step": 67 |
| }, |
| { |
| "completion_length": 60.458335876464844, |
| "epoch": 1.08, |
| "grad_norm": 1.174665706452754, |
| "kl": 0.02490234375, |
| "learning_rate": 2.2731312541267143e-07, |
| "loss": 0.0, |
| "reward": 0.5735488831996918, |
| "reward_std": 0.48749370872974396, |
| "rewards/correct_code_reward_func": 0.2500000149011612, |
| "rewards/len_reward_func": 0.3235488831996918, |
| "step": 68 |
| }, |
| { |
| "completion_length": 74.10416984558105, |
| "epoch": 1.096, |
| "grad_norm": 1.5260410634207344, |
| "kl": 0.029052734375, |
| "learning_rate": 2.2085738585006021e-07, |
| "loss": 0.0, |
| "reward": 1.0659485459327698, |
| "reward_std": 0.3796464204788208, |
| "rewards/correct_code_reward_func": 0.6666666865348816, |
| "rewards/len_reward_func": 0.3992818146944046, |
| "step": 69 |
| }, |
| { |
| "completion_length": 71.89583396911621, |
| "epoch": 1.112, |
| "grad_norm": 1.059976697715823, |
| "kl": 0.0418701171875, |
| "learning_rate": 2.1442129043167873e-07, |
| "loss": 0.0, |
| "reward": 0.7276312112808228, |
| "reward_std": 0.5470257103443146, |
| "rewards/correct_code_reward_func": 0.3958333432674408, |
| "rewards/len_reward_func": 0.33179786801338196, |
| "step": 70 |
| }, |
| { |
| "completion_length": 96.29166793823242, |
| "epoch": 1.1280000000000001, |
| "grad_norm": 0.9278761590759935, |
| "kl": 0.02056884765625, |
| "learning_rate": 2.0800917753245875e-07, |
| "loss": 0.0, |
| "reward": 0.8413068056106567, |
| "reward_std": 0.3271617591381073, |
| "rewards/correct_code_reward_func": 0.4375000298023224, |
| "rewards/len_reward_func": 0.40380676090717316, |
| "step": 71 |
| }, |
| { |
| "completion_length": 79.33333587646484, |
| "epoch": 1.144, |
| "grad_norm": 0.9890378359951959, |
| "kl": 0.025390625, |
| "learning_rate": 2.0162536936145008e-07, |
| "loss": 0.0, |
| "reward": 0.7865794003009796, |
| "reward_std": 0.3915296047925949, |
| "rewards/correct_code_reward_func": 0.4166666716337204, |
| "rewards/len_reward_func": 0.3699127733707428, |
| "step": 72 |
| }, |
| { |
| "completion_length": 72.95833587646484, |
| "epoch": 1.16, |
| "grad_norm": 1.3690756427464768, |
| "kl": 0.0413818359375, |
| "learning_rate": 1.9527416904835132e-07, |
| "loss": 0.0, |
| "reward": 0.9756404757499695, |
| "reward_std": 0.3930533230304718, |
| "rewards/correct_code_reward_func": 0.5416666865348816, |
| "rewards/len_reward_func": 0.4339737892150879, |
| "step": 73 |
| }, |
| { |
| "completion_length": 79.89583396911621, |
| "epoch": 1.176, |
| "grad_norm": 1.1734737313856272, |
| "kl": 0.02801513671875, |
| "learning_rate": 1.889598577429022e-07, |
| "loss": 0.0, |
| "reward": 0.7878330945968628, |
| "reward_std": 0.4340344965457916, |
| "rewards/correct_code_reward_func": 0.4166666716337204, |
| "rewards/len_reward_func": 0.3711664080619812, |
| "step": 74 |
| }, |
| { |
| "completion_length": 58.270835876464844, |
| "epoch": 1.192, |
| "grad_norm": 1.2871892634157702, |
| "kl": 0.0377197265625, |
| "learning_rate": 1.8268669172909136e-07, |
| "loss": 0.0, |
| "reward": 1.1113883256912231, |
| "reward_std": 0.5605219900608063, |
| "rewards/correct_code_reward_func": 0.7291666865348816, |
| "rewards/len_reward_func": 0.3822215795516968, |
| "step": 75 |
| }, |
| { |
| "completion_length": 97.25, |
| "epoch": 1.208, |
| "grad_norm": 2.007411904832845, |
| "kl": 0.04949951171875, |
| "learning_rate": 1.7645889955612592e-07, |
| "loss": 0.0, |
| "reward": 1.0362713038921356, |
| "reward_std": 0.3777136504650116, |
| "rewards/correct_code_reward_func": 0.6666666865348816, |
| "rewards/len_reward_func": 0.3696046322584152, |
| "step": 76 |
| }, |
| { |
| "completion_length": 42.60416793823242, |
| "epoch": 1.224, |
| "grad_norm": 1.196008727333628, |
| "kl": 0.0433349609375, |
| "learning_rate": 1.7028067918809535e-07, |
| "loss": 0.0, |
| "reward": 0.8968790173530579, |
| "reward_std": 0.47050511837005615, |
| "rewards/correct_code_reward_func": 0.520833358168602, |
| "rewards/len_reward_func": 0.3760456293821335, |
| "step": 77 |
| }, |
| { |
| "completion_length": 44.97916793823242, |
| "epoch": 1.24, |
| "grad_norm": 1.4303810146633773, |
| "kl": 0.0574951171875, |
| "learning_rate": 1.6415619517425294e-07, |
| "loss": 0.0001, |
| "reward": 0.9696877598762512, |
| "reward_std": 0.33823370933532715, |
| "rewards/correct_code_reward_func": 0.5208333432674408, |
| "rewards/len_reward_func": 0.4488544166088104, |
| "step": 78 |
| }, |
| { |
| "completion_length": 66.0625, |
| "epoch": 1.256, |
| "grad_norm": 1.5543919802377633, |
| "kl": 0.03875732421875, |
| "learning_rate": 1.5808957584181994e-07, |
| "loss": 0.0, |
| "reward": 0.9006170630455017, |
| "reward_std": 0.37997904419898987, |
| "rewards/correct_code_reward_func": 0.4791666716337204, |
| "rewards/len_reward_func": 0.4214503914117813, |
| "step": 79 |
| }, |
| { |
| "completion_length": 76.06250381469727, |
| "epoch": 1.272, |
| "grad_norm": 1.8800678172876588, |
| "kl": 0.047607421875, |
| "learning_rate": 1.5208491051320744e-07, |
| "loss": 0.0, |
| "reward": 0.9962214827537537, |
| "reward_std": 0.3772743344306946, |
| "rewards/correct_code_reward_func": 0.5625000149011612, |
| "rewards/len_reward_func": 0.4337214529514313, |
| "step": 80 |
| }, |
| { |
| "completion_length": 43.27083396911621, |
| "epoch": 1.288, |
| "grad_norm": 1.3749781365688138, |
| "kl": 0.055908203125, |
| "learning_rate": 1.461462467495284e-07, |
| "loss": 0.0001, |
| "reward": 1.1482934355735779, |
| "reward_std": 0.4654877036809921, |
| "rewards/correct_code_reward_func": 0.7291666865348816, |
| "rewards/len_reward_func": 0.41912680864334106, |
| "step": 81 |
| }, |
| { |
| "completion_length": 64.25000381469727, |
| "epoch": 1.304, |
| "grad_norm": 1.2603902864707686, |
| "kl": 0.041259765625, |
| "learning_rate": 1.4027758762226107e-07, |
| "loss": 0.0, |
| "reward": 0.8633248507976532, |
| "reward_std": 0.44106219708919525, |
| "rewards/correct_code_reward_func": 0.4583333432674408, |
| "rewards/len_reward_func": 0.40499147772789, |
| "step": 82 |
| }, |
| { |
| "completion_length": 84.87500190734863, |
| "epoch": 1.32, |
| "grad_norm": 1.3553050400485256, |
| "kl": 0.037109375, |
| "learning_rate": 1.3448288901490092e-07, |
| "loss": 0.0, |
| "reward": 0.930885374546051, |
| "reward_std": 0.28970160335302353, |
| "rewards/correct_code_reward_func": 0.4791666865348816, |
| "rewards/len_reward_func": 0.45171867311000824, |
| "step": 83 |
| }, |
| { |
| "completion_length": 44.12500190734863, |
| "epoch": 1.336, |
| "grad_norm": 1.3471283821112554, |
| "kl": 0.05126953125, |
| "learning_rate": 1.2876605695642084e-07, |
| "loss": 0.0001, |
| "reward": 0.8797213435173035, |
| "reward_std": 0.4040851444005966, |
| "rewards/correct_code_reward_func": 0.4375000149011612, |
| "rewards/len_reward_func": 0.44222137331962585, |
| "step": 84 |
| }, |
| { |
| "completion_length": 77.27083587646484, |
| "epoch": 1.3519999999999999, |
| "grad_norm": 1.3204773489251114, |
| "kl": 0.0401611328125, |
| "learning_rate": 1.231309449883361e-07, |
| "loss": 0.0, |
| "reward": 0.9848110675811768, |
| "reward_std": 0.5164197236299515, |
| "rewards/correct_code_reward_func": 0.5833333432674408, |
| "rewards/len_reward_func": 0.40147776901721954, |
| "step": 85 |
| }, |
| { |
| "completion_length": 58.729169845581055, |
| "epoch": 1.3679999999999999, |
| "grad_norm": 1.6764054808630413, |
| "kl": 0.0513916015625, |
| "learning_rate": 1.1758135156715041e-07, |
| "loss": 0.0001, |
| "reward": 1.1368677616119385, |
| "reward_std": 0.3856023848056793, |
| "rewards/correct_code_reward_func": 0.7708333432674408, |
| "rewards/len_reward_func": 0.3660343587398529, |
| "step": 86 |
| }, |
| { |
| "completion_length": 90.02083396911621, |
| "epoch": 1.384, |
| "grad_norm": 1.1184416966735806, |
| "kl": 0.03411865234375, |
| "learning_rate": 1.1212101750393235e-07, |
| "loss": 0.0, |
| "reward": 1.00173819065094, |
| "reward_std": 0.49202577769756317, |
| "rewards/correct_code_reward_func": 0.5625000298023224, |
| "rewards/len_reward_func": 0.43923819065093994, |
| "step": 87 |
| }, |
| { |
| "completion_length": 46.60416793823242, |
| "epoch": 1.4, |
| "grad_norm": 1.1504631747132115, |
| "kl": 0.0496826171875, |
| "learning_rate": 1.0675362344274952e-07, |
| "loss": 0.0, |
| "reward": 0.9823802709579468, |
| "reward_std": 0.46341855823993683, |
| "rewards/correct_code_reward_func": 0.6041666865348816, |
| "rewards/len_reward_func": 0.3782135844230652, |
| "step": 88 |
| }, |
| { |
| "completion_length": 59.937503814697266, |
| "epoch": 1.416, |
| "grad_norm": 1.3546823329176554, |
| "kl": 0.0450439453125, |
| "learning_rate": 1.0148278737965844e-07, |
| "loss": 0.0, |
| "reward": 1.116898536682129, |
| "reward_std": 0.3632017821073532, |
| "rewards/correct_code_reward_func": 0.6250000298023224, |
| "rewards/len_reward_func": 0.4918985068798065, |
| "step": 89 |
| }, |
| { |
| "completion_length": 38.08333396911621, |
| "epoch": 1.432, |
| "grad_norm": 1.4742606396398155, |
| "kl": 0.06396484375, |
| "learning_rate": 9.631206222392479e-08, |
| "loss": 0.0001, |
| "reward": 1.079964131116867, |
| "reward_std": 0.47177985310554504, |
| "rewards/correct_code_reward_func": 0.6666666865348816, |
| "rewards/len_reward_func": 0.4132973849773407, |
| "step": 90 |
| }, |
| { |
| "completion_length": 72.18750190734863, |
| "epoch": 1.448, |
| "grad_norm": 1.4885040482346985, |
| "kl": 0.0491943359375, |
| "learning_rate": 9.124493340311537e-08, |
| "loss": 0.0, |
| "reward": 1.0042789578437805, |
| "reward_std": 0.3533863425254822, |
| "rewards/correct_code_reward_func": 0.5625000298023224, |
| "rewards/len_reward_func": 0.44177892804145813, |
| "step": 91 |
| }, |
| { |
| "completion_length": 69.35416984558105, |
| "epoch": 1.464, |
| "grad_norm": 1.1925308190836468, |
| "kl": 0.0478515625, |
| "learning_rate": 8.628481651367875e-08, |
| "loss": 0.0, |
| "reward": 1.0887993574142456, |
| "reward_std": 0.5487662255764008, |
| "rewards/correct_code_reward_func": 0.6666666865348816, |
| "rewards/len_reward_func": 0.422132670879364, |
| "step": 92 |
| }, |
| { |
| "completion_length": 48.41666793823242, |
| "epoch": 1.48, |
| "grad_norm": 1.6549689653260358, |
| "kl": 0.051025390625, |
| "learning_rate": 8.143505501859551e-08, |
| "loss": 0.0001, |
| "reward": 0.920165479183197, |
| "reward_std": 0.415915310382843, |
| "rewards/correct_code_reward_func": 0.5208333432674408, |
| "rewards/len_reward_func": 0.39933212101459503, |
| "step": 93 |
| }, |
| { |
| "completion_length": 75.87500381469727, |
| "epoch": 1.496, |
| "grad_norm": 1.586423514577547, |
| "kl": 0.053466796875, |
| "learning_rate": 7.669891799365282e-08, |
| "loss": 0.0001, |
| "reward": 0.8828703761100769, |
| "reward_std": 0.49832169711589813, |
| "rewards/correct_code_reward_func": 0.4375000149011612, |
| "rewards/len_reward_func": 0.4453703910112381, |
| "step": 94 |
| }, |
| { |
| "completion_length": 79.06250190734863, |
| "epoch": 1.512, |
| "grad_norm": 1.7029251090199986, |
| "kl": 0.0489501953125, |
| "learning_rate": 7.207959792385998e-08, |
| "loss": 0.0, |
| "reward": 1.0712128281593323, |
| "reward_std": 0.39892764389514923, |
| "rewards/correct_code_reward_func": 0.6041666865348816, |
| "rewards/len_reward_func": 0.4670460820198059, |
| "step": 95 |
| }, |
| { |
| "completion_length": 77.66666984558105, |
| "epoch": 1.528, |
| "grad_norm": 1.068625601865938, |
| "kl": 0.03729248046875, |
| "learning_rate": 6.758020855149249e-08, |
| "loss": 0.0, |
| "reward": 0.839818924665451, |
| "reward_std": 0.4449944496154785, |
| "rewards/correct_code_reward_func": 0.3958333432674408, |
| "rewards/len_reward_func": 0.44398559629917145, |
| "step": 96 |
| }, |
| { |
| "completion_length": 46.00000190734863, |
| "epoch": 1.544, |
| "grad_norm": 1.6059040022360391, |
| "kl": 0.0543212890625, |
| "learning_rate": 6.320378277721342e-08, |
| "loss": 0.0001, |
| "reward": 0.8210516273975372, |
| "reward_std": 0.34706588089466095, |
| "rewards/correct_code_reward_func": 0.3958333432674408, |
| "rewards/len_reward_func": 0.42521825432777405, |
| "step": 97 |
| }, |
| { |
| "completion_length": 47.35416793823242, |
| "epoch": 1.56, |
| "grad_norm": 1.0536706533475677, |
| "kl": 0.06640625, |
| "learning_rate": 5.895327061568775e-08, |
| "loss": 0.0001, |
| "reward": 0.9913617670536041, |
| "reward_std": 0.24820256233215332, |
| "rewards/correct_code_reward_func": 0.5208333432674408, |
| "rewards/len_reward_func": 0.47052840888500214, |
| "step": 98 |
| }, |
| { |
| "completion_length": 34.83333396911621, |
| "epoch": 1.576, |
| "grad_norm": 1.7119720585841143, |
| "kl": 0.0626220703125, |
| "learning_rate": 5.483153720706798e-08, |
| "loss": 0.0001, |
| "reward": 0.8738410770893097, |
| "reward_std": 0.5085435211658478, |
| "rewards/correct_code_reward_func": 0.4375000149011612, |
| "rewards/len_reward_func": 0.4363410472869873, |
| "step": 99 |
| }, |
| { |
| "completion_length": 57.04166793823242, |
| "epoch": 1.592, |
| "grad_norm": 1.3955252614043234, |
| "kl": 0.036865234375, |
| "learning_rate": 5.0841360885690996e-08, |
| "loss": 0.0, |
| "reward": 1.1610032320022583, |
| "reward_std": 0.3681405633687973, |
| "rewards/correct_code_reward_func": 0.6666666865348816, |
| "rewards/len_reward_func": 0.4943365752696991, |
| "step": 100 |
| }, |
| { |
| "completion_length": 71.41666984558105, |
| "epoch": 1.608, |
| "grad_norm": 0.9360155945219467, |
| "kl": 0.055419921875, |
| "learning_rate": 4.698543130728755e-08, |
| "loss": 0.0001, |
| "reward": 1.013959676027298, |
| "reward_std": 0.39147183299064636, |
| "rewards/correct_code_reward_func": 0.625, |
| "rewards/len_reward_func": 0.3889596611261368, |
| "step": 101 |
| }, |
| { |
| "completion_length": 43.5625, |
| "epoch": 1.624, |
| "grad_norm": 1.620965651402336, |
| "kl": 0.054443359375, |
| "learning_rate": 4.326634763596784e-08, |
| "loss": 0.0001, |
| "reward": 0.8748385310173035, |
| "reward_std": 0.3259390592575073, |
| "rewards/correct_code_reward_func": 0.4166666679084301, |
| "rewards/len_reward_func": 0.45817187428474426, |
| "step": 102 |
| }, |
| { |
| "completion_length": 54.125003814697266, |
| "epoch": 1.6400000000000001, |
| "grad_norm": 2.035180505859693, |
| "kl": 0.056640625, |
| "learning_rate": 3.968661679220467e-08, |
| "loss": 0.0001, |
| "reward": 1.2046028971672058, |
| "reward_std": 0.4274601340293884, |
| "rewards/correct_code_reward_func": 0.7708333730697632, |
| "rewards/len_reward_func": 0.433769553899765, |
| "step": 103 |
| }, |
| { |
| "completion_length": 37.72916793823242, |
| "epoch": 1.6560000000000001, |
| "grad_norm": 1.3110502429273858, |
| "kl": 0.07470703125, |
| "learning_rate": 3.624865176299499e-08, |
| "loss": 0.0001, |
| "reward": 1.3125, |
| "reward_std": 0.2893980145454407, |
| "rewards/correct_code_reward_func": 0.8333333432674408, |
| "rewards/len_reward_func": 0.4791666716337204, |
| "step": 104 |
| }, |
| { |
| "completion_length": 85.20833587646484, |
| "epoch": 1.6720000000000002, |
| "grad_norm": 0.7548512888732658, |
| "kl": 0.0379638671875, |
| "learning_rate": 3.295476997533905e-08, |
| "loss": 0.0, |
| "reward": 0.8693651854991913, |
| "reward_std": 0.467289537191391, |
| "rewards/correct_code_reward_func": 0.4166666865348816, |
| "rewards/len_reward_func": 0.4526985138654709, |
| "step": 105 |
| }, |
| { |
| "completion_length": 60.72916793823242, |
| "epoch": 1.688, |
| "grad_norm": 0.7605372623815415, |
| "kl": 0.03662109375, |
| "learning_rate": 2.980719173413396e-08, |
| "loss": 0.0, |
| "reward": 1.058800995349884, |
| "reward_std": 0.4301101863384247, |
| "rewards/correct_code_reward_func": 0.5625000298023224, |
| "rewards/len_reward_func": 0.49630098044872284, |
| "step": 106 |
| }, |
| { |
| "completion_length": 49.08333396911621, |
| "epoch": 1.704, |
| "grad_norm": 1.0207996418513696, |
| "kl": 0.07421875, |
| "learning_rate": 2.680803872553408e-08, |
| "loss": 0.0001, |
| "reward": 0.9703012108802795, |
| "reward_std": 0.23036788403987885, |
| "rewards/correct_code_reward_func": 0.5208333432674408, |
| "rewards/len_reward_func": 0.44946780800819397, |
| "step": 107 |
| }, |
| { |
| "completion_length": 57.62500190734863, |
| "epoch": 1.72, |
| "grad_norm": 1.5035291383359601, |
| "kl": 0.0594482421875, |
| "learning_rate": 2.395933258678745e-08, |
| "loss": 0.0001, |
| "reward": 1.1501469016075134, |
| "reward_std": 0.34404293447732925, |
| "rewards/correct_code_reward_func": 0.7083333432674408, |
| "rewards/len_reward_func": 0.44181351363658905, |
| "step": 108 |
| }, |
| { |
| "completion_length": 31.312501907348633, |
| "epoch": 1.736, |
| "grad_norm": 1.6167712080198857, |
| "kl": 0.0712890625, |
| "learning_rate": 2.1262993543511715e-08, |
| "loss": 0.0001, |
| "reward": 1.3109871745109558, |
| "reward_std": 0.3160089999437332, |
| "rewards/correct_code_reward_func": 0.875, |
| "rewards/len_reward_func": 0.4359871447086334, |
| "step": 109 |
| }, |
| { |
| "completion_length": 87.22916793823242, |
| "epoch": 1.752, |
| "grad_norm": 1.2313006391246961, |
| "kl": 0.0474853515625, |
| "learning_rate": 1.872083911532907e-08, |
| "loss": 0.0, |
| "reward": 0.8871810734272003, |
| "reward_std": 0.43447698652744293, |
| "rewards/correct_code_reward_func": 0.4166666865348816, |
| "rewards/len_reward_func": 0.4705143868923187, |
| "step": 110 |
| }, |
| { |
| "completion_length": 38.52083396911621, |
| "epoch": 1.768, |
| "grad_norm": 1.1333291307805236, |
| "kl": 0.0531005859375, |
| "learning_rate": 1.6334582890731697e-08, |
| "loss": 0.0001, |
| "reward": 1.3089049458503723, |
| "reward_std": 0.39449335634708405, |
| "rewards/correct_code_reward_func": 0.8541666865348816, |
| "rewards/len_reward_func": 0.45473821461200714, |
| "step": 111 |
| }, |
| { |
| "completion_length": 99.85416984558105, |
| "epoch": 1.784, |
| "grad_norm": 1.1649171723265364, |
| "kl": 0.04095458984375, |
| "learning_rate": 1.4105833372004523e-08, |
| "loss": 0.0, |
| "reward": 0.8114411234855652, |
| "reward_std": 0.3455437570810318, |
| "rewards/correct_code_reward_func": 0.4166666716337204, |
| "rewards/len_reward_func": 0.3947744071483612, |
| "step": 112 |
| }, |
| { |
| "completion_length": 62.97916793823242, |
| "epoch": 1.8, |
| "grad_norm": 1.3270422235967783, |
| "kl": 0.042236328125, |
| "learning_rate": 1.2036092890982619e-08, |
| "loss": 0.0, |
| "reward": 0.8704836964607239, |
| "reward_std": 0.37642528116703033, |
| "rewards/correct_code_reward_func": 0.3958333432674408, |
| "rewards/len_reward_func": 0.4746503531932831, |
| "step": 113 |
| }, |
| { |
| "completion_length": 61.854169845581055, |
| "epoch": 1.8159999999999998, |
| "grad_norm": 1.47111563325612, |
| "kl": 0.0511474609375, |
| "learning_rate": 1.0126756596375685e-08, |
| "loss": 0.0001, |
| "reward": 1.0473325848579407, |
| "reward_std": 0.5612048506736755, |
| "rewards/correct_code_reward_func": 0.5833333730697632, |
| "rewards/len_reward_func": 0.4639992117881775, |
| "step": 114 |
| }, |
| { |
| "completion_length": 44.22916793823242, |
| "epoch": 1.8319999999999999, |
| "grad_norm": 1.3437447624267829, |
| "kl": 0.0599365234375, |
| "learning_rate": 8.379111513340753e-09, |
| "loss": 0.0001, |
| "reward": 0.9941486120223999, |
| "reward_std": 0.5003164112567902, |
| "rewards/correct_code_reward_func": 0.5625000149011612, |
| "rewards/len_reward_func": 0.4316485822200775, |
| "step": 115 |
| }, |
| { |
| "completion_length": 51.45833396911621, |
| "epoch": 1.8479999999999999, |
| "grad_norm": 1.3897204537445644, |
| "kl": 0.0416259765625, |
| "learning_rate": 6.7943356759381785e-09, |
| "loss": 0.0, |
| "reward": 1.0625000596046448, |
| "reward_std": 0.4191845655441284, |
| "rewards/correct_code_reward_func": 0.5833333730697632, |
| "rewards/len_reward_func": 0.4791666716337204, |
| "step": 116 |
| }, |
| { |
| "completion_length": 64.45833587646484, |
| "epoch": 1.8639999999999999, |
| "grad_norm": 2.0280580958010224, |
| "kl": 0.07666015625, |
| "learning_rate": 5.373497333054616e-09, |
| "loss": 0.0001, |
| "reward": 0.9957386255264282, |
| "reward_std": 0.49953845143318176, |
| "rewards/correct_code_reward_func": 0.5, |
| "rewards/len_reward_func": 0.4957386255264282, |
| "step": 117 |
| }, |
| { |
| "completion_length": 61.833335876464844, |
| "epoch": 1.88, |
| "grad_norm": 1.4575505392586763, |
| "kl": 0.06689453125, |
| "learning_rate": 4.117554228329406e-09, |
| "loss": 0.0001, |
| "reward": 1.2341400384902954, |
| "reward_std": 0.472938671708107, |
| "rewards/correct_code_reward_func": 0.8125000298023224, |
| "rewards/len_reward_func": 0.421640083193779, |
| "step": 118 |
| }, |
| { |
| "completion_length": 54.43750286102295, |
| "epoch": 1.896, |
| "grad_norm": 1.6449474004844165, |
| "kl": 0.0567626953125, |
| "learning_rate": 3.0273529545687125e-09, |
| "loss": 0.0001, |
| "reward": 1.0164108276367188, |
| "reward_std": 0.4117661267518997, |
| "rewards/correct_code_reward_func": 0.5208333432674408, |
| "rewards/len_reward_func": 0.49557754397392273, |
| "step": 119 |
| }, |
| { |
| "completion_length": 59.8125, |
| "epoch": 1.912, |
| "grad_norm": 1.2181441627902125, |
| "kl": 0.052978515625, |
| "learning_rate": 2.1036283830834224e-09, |
| "loss": 0.0001, |
| "reward": 1.1736499071121216, |
| "reward_std": 0.41473129391670227, |
| "rewards/correct_code_reward_func": 0.7083333432674408, |
| "rewards/len_reward_func": 0.465316578745842, |
| "step": 120 |
| }, |
| { |
| "completion_length": 29.125001907348633, |
| "epoch": 1.928, |
| "grad_norm": 1.6515660045888947, |
| "kl": 0.0712890625, |
| "learning_rate": 1.347003168334665e-09, |
| "loss": 0.0001, |
| "reward": 1.2279411554336548, |
| "reward_std": 0.212213896214962, |
| "rewards/correct_code_reward_func": 0.7500000298023224, |
| "rewards/len_reward_func": 0.477941170334816, |
| "step": 121 |
| }, |
| { |
| "completion_length": 40.29166793823242, |
| "epoch": 1.944, |
| "grad_norm": 2.3395660695095883, |
| "kl": 0.0679931640625, |
| "learning_rate": 7.579873282216598e-10, |
| "loss": 0.0001, |
| "reward": 0.9375000298023224, |
| "reward_std": 0.37034808099269867, |
| "rewards/correct_code_reward_func": 0.4583333432674408, |
| "rewards/len_reward_func": 0.4791666716337204, |
| "step": 122 |
| }, |
| { |
| "completion_length": 70.83333396911621, |
| "epoch": 1.96, |
| "grad_norm": 1.119825823829922, |
| "kl": 0.057861328125, |
| "learning_rate": 3.3697790029424413e-10, |
| "loss": 0.0001, |
| "reward": 1.2083333730697632, |
| "reward_std": 0.39485183358192444, |
| "rewards/correct_code_reward_func": 0.7083333730697632, |
| "rewards/len_reward_func": 0.5, |
| "step": 123 |
| }, |
| { |
| "completion_length": 52.750003814697266, |
| "epoch": 1.976, |
| "grad_norm": 2.4270573554119026, |
| "kl": 0.0513916015625, |
| "learning_rate": 8.425867412190091e-11, |
| "loss": 0.0001, |
| "reward": 1.1584753692150116, |
| "reward_std": 0.37459391355514526, |
| "rewards/correct_code_reward_func": 0.7083333432674408, |
| "rewards/len_reward_func": 0.4501419961452484, |
| "step": 124 |
| }, |
| { |
| "completion_length": 95.9375, |
| "epoch": 1.992, |
| "grad_norm": 1.1107259696791534, |
| "kl": 0.038330078125, |
| "learning_rate": 0.0, |
| "loss": 0.0, |
| "reward": 1.0544261932373047, |
| "reward_std": 0.4110799580812454, |
| "rewards/correct_code_reward_func": 0.6041666865348816, |
| "rewards/len_reward_func": 0.4502594470977783, |
| "step": 125 |
| }, |
| { |
| "completion_length": 70.72916984558105, |
| "epoch": 2.032, |
| "grad_norm": 0.921072382478298, |
| "kl": 0.067626953125, |
| "learning_rate": 8.425867412190091e-11, |
| "loss": 0.0001, |
| "reward": 0.8045242130756378, |
| "reward_std": 0.41052111983299255, |
| "rewards/correct_code_reward_func": 0.3750000149011612, |
| "rewards/len_reward_func": 0.42952418327331543, |
| "step": 126 |
| }, |
| { |
| "epoch": 2.032, |
| "step": 126, |
| "total_flos": 0.0, |
| "train_loss": 5.371543848591428e-07, |
| "train_runtime": 142.6302, |
| "train_samples_per_second": 5.258, |
| "train_steps_per_second": 0.876 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 125, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 25, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|