| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.03131411294676601, |
| "eval_steps": 500, |
| "global_step": 290, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 149.0, |
| "epoch": 0.00010797969981643452, |
| "grad_norm": 3.35890793800354, |
| "kl": 0.0, |
| "learning_rate": 0.0, |
| "loss": 0.0, |
| "reward": 3.875, |
| "reward_std": 0.3944375813007355, |
| "rewards/gpt4o_reward_model": 3.875, |
| "step": 1 |
| }, |
| { |
| "completion_length": 87.25, |
| "epoch": 0.00021595939963286903, |
| "grad_norm": 10.705058097839355, |
| "kl": 0.0, |
| "learning_rate": 2e-07, |
| "loss": -0.0, |
| "reward": 4.5625, |
| "reward_std": 0.3751000165939331, |
| "rewards/gpt4o_reward_model": 4.5625, |
| "step": 2 |
| }, |
| { |
| "completion_length": 125.0, |
| "epoch": 0.00032393909944930353, |
| "grad_norm": 7.079329013824463, |
| "kl": 0.00016307830810546875, |
| "learning_rate": 4e-07, |
| "loss": 0.0, |
| "reward": 4.5, |
| "reward_std": 0.758794367313385, |
| "rewards/gpt4o_reward_model": 4.5, |
| "step": 3 |
| }, |
| { |
| "completion_length": 99.5, |
| "epoch": 0.00043191879926573806, |
| "grad_norm": 4.656938552856445, |
| "kl": 0.0004425048828125, |
| "learning_rate": 6e-07, |
| "loss": 0.0, |
| "reward": 3.6875, |
| "reward_std": 0.6178992986679077, |
| "rewards/gpt4o_reward_model": 3.6875, |
| "step": 4 |
| }, |
| { |
| "completion_length": 175.75, |
| "epoch": 0.0005398984990821725, |
| "grad_norm": 4.849682807922363, |
| "kl": 7.62939453125e-05, |
| "learning_rate": 8e-07, |
| "loss": 0.0, |
| "reward": 4.0625, |
| "reward_std": 0.579224169254303, |
| "rewards/gpt4o_reward_model": 4.0625, |
| "step": 5 |
| }, |
| { |
| "completion_length": 667.0, |
| "epoch": 0.0006478781988986071, |
| "grad_norm": 4.602669715881348, |
| "kl": 5.936622619628906e-05, |
| "learning_rate": 1e-06, |
| "loss": 0.0, |
| "reward": 3.125, |
| "reward_std": 0.9418070316314697, |
| "rewards/gpt4o_reward_model": 3.125, |
| "step": 6 |
| }, |
| { |
| "completion_length": 220.5, |
| "epoch": 0.0007558578987150416, |
| "grad_norm": 4.340234756469727, |
| "kl": 0.00010204315185546875, |
| "learning_rate": 9.999999396664822e-07, |
| "loss": 0.0, |
| "reward": 4.0, |
| "reward_std": 0.7619017362594604, |
| "rewards/gpt4o_reward_model": 4.0, |
| "step": 7 |
| }, |
| { |
| "completion_length": 89.75, |
| "epoch": 0.0008638375985314761, |
| "grad_norm": 3.9225738048553467, |
| "kl": 0.0001468658447265625, |
| "learning_rate": 9.999997586659434e-07, |
| "loss": 0.0, |
| "reward": 4.1875, |
| "reward_std": 0.8765935897827148, |
| "rewards/gpt4o_reward_model": 4.1875, |
| "step": 8 |
| }, |
| { |
| "completion_length": 135.0, |
| "epoch": 0.0009718172983479105, |
| "grad_norm": 4.508596897125244, |
| "kl": 0.00015354156494140625, |
| "learning_rate": 9.999994569984275e-07, |
| "loss": 0.0, |
| "reward": 4.5625, |
| "reward_std": 0.5194375514984131, |
| "rewards/gpt4o_reward_model": 4.5625, |
| "step": 9 |
| }, |
| { |
| "completion_length": 199.25, |
| "epoch": 0.001079796998164345, |
| "grad_norm": 5.958744525909424, |
| "kl": 0.0003070831298828125, |
| "learning_rate": 9.99999034664007e-07, |
| "loss": 0.0, |
| "reward": 3.9375, |
| "reward_std": 0.81220543384552, |
| "rewards/gpt4o_reward_model": 3.9375, |
| "step": 10 |
| }, |
| { |
| "completion_length": 111.5, |
| "epoch": 0.0011877766979807797, |
| "grad_norm": 5.480228900909424, |
| "kl": 0.0001983642578125, |
| "learning_rate": 9.999984916627839e-07, |
| "loss": 0.0, |
| "reward": 4.5625, |
| "reward_std": 0.3751000165939331, |
| "rewards/gpt4o_reward_model": 4.5625, |
| "step": 11 |
| }, |
| { |
| "completion_length": 185.25, |
| "epoch": 0.0012957563977972141, |
| "grad_norm": 3.4712541103363037, |
| "kl": 0.000213623046875, |
| "learning_rate": 9.999978279948895e-07, |
| "loss": 0.0, |
| "reward": 4.3125, |
| "reward_std": 0.3751000165939331, |
| "rewards/gpt4o_reward_model": 4.3125, |
| "step": 12 |
| }, |
| { |
| "completion_length": 112.75, |
| "epoch": 0.0014037360976136485, |
| "grad_norm": 3.926478147506714, |
| "kl": 0.0001087188720703125, |
| "learning_rate": 9.999970436604836e-07, |
| "loss": 0.0, |
| "reward": 4.3125, |
| "reward_std": 0.7286534309387207, |
| "rewards/gpt4o_reward_model": 4.3125, |
| "step": 13 |
| }, |
| { |
| "completion_length": 147.75, |
| "epoch": 0.0015117157974300832, |
| "grad_norm": 3.4515721797943115, |
| "kl": 0.0002498626708984375, |
| "learning_rate": 9.999961386597556e-07, |
| "loss": 0.0, |
| "reward": 4.3125, |
| "reward_std": 0.2694375813007355, |
| "rewards/gpt4o_reward_model": 4.3125, |
| "step": 14 |
| }, |
| { |
| "completion_length": 117.5, |
| "epoch": 0.0016196954972465176, |
| "grad_norm": 6.8424906730651855, |
| "kl": 0.000476837158203125, |
| "learning_rate": 9.99995112992924e-07, |
| "loss": 0.0, |
| "reward": 3.75, |
| "reward_std": 0.6144567728042603, |
| "rewards/gpt4o_reward_model": 3.75, |
| "step": 15 |
| }, |
| { |
| "completion_length": 159.5, |
| "epoch": 0.0017276751970629522, |
| "grad_norm": 3.0959248542785645, |
| "kl": 0.000247955322265625, |
| "learning_rate": 9.999939666602364e-07, |
| "loss": 0.0, |
| "reward": 4.6875, |
| "reward_std": 0.2694375813007355, |
| "rewards/gpt4o_reward_model": 4.6875, |
| "step": 16 |
| }, |
| { |
| "completion_length": 534.0, |
| "epoch": 0.0018356548968793867, |
| "grad_norm": 5.143649101257324, |
| "kl": 0.000560760498046875, |
| "learning_rate": 9.999926996619692e-07, |
| "loss": 0.0, |
| "reward": 3.5625, |
| "reward_std": 0.8751000165939331, |
| "rewards/gpt4o_reward_model": 3.5625, |
| "step": 17 |
| }, |
| { |
| "completion_length": 181.25, |
| "epoch": 0.001943634596695821, |
| "grad_norm": 3.3201894760131836, |
| "kl": 0.0004825592041015625, |
| "learning_rate": 9.999913119984283e-07, |
| "loss": 0.0, |
| "reward": 4.25, |
| "reward_std": 0.3944375813007355, |
| "rewards/gpt4o_reward_model": 4.25, |
| "step": 18 |
| }, |
| { |
| "completion_length": 107.0, |
| "epoch": 0.0020516142965122555, |
| "grad_norm": 4.407830238342285, |
| "kl": 0.000476837158203125, |
| "learning_rate": 9.999898036699488e-07, |
| "loss": 0.0, |
| "reward": 4.0, |
| "reward_std": 0.3944375813007355, |
| "rewards/gpt4o_reward_model": 4.0, |
| "step": 19 |
| }, |
| { |
| "completion_length": 106.5, |
| "epoch": 0.00215959399632869, |
| "grad_norm": 5.270094394683838, |
| "kl": 0.0008392333984375, |
| "learning_rate": 9.999881746768941e-07, |
| "loss": 0.0, |
| "reward": 3.6875, |
| "reward_std": 0.633794367313385, |
| "rewards/gpt4o_reward_model": 3.6875, |
| "step": 20 |
| }, |
| { |
| "completion_length": 67.0, |
| "epoch": 0.0022675736961451248, |
| "grad_norm": 3.9482433795928955, |
| "kl": 0.000579833984375, |
| "learning_rate": 9.99986425019658e-07, |
| "loss": 0.0, |
| "reward": 3.75, |
| "reward_std": 0.758794367313385, |
| "rewards/gpt4o_reward_model": 3.75, |
| "step": 21 |
| }, |
| { |
| "completion_length": 163.5, |
| "epoch": 0.0023755533959615594, |
| "grad_norm": 3.5114428997039795, |
| "kl": 0.000675201416015625, |
| "learning_rate": 9.999845546986625e-07, |
| "loss": 0.0, |
| "reward": 4.4375, |
| "reward_std": 0.5194375514984131, |
| "rewards/gpt4o_reward_model": 4.4375, |
| "step": 22 |
| }, |
| { |
| "completion_length": 80.5, |
| "epoch": 0.0024835330957779936, |
| "grad_norm": 2.4469258785247803, |
| "kl": 0.00099945068359375, |
| "learning_rate": 9.99982563714359e-07, |
| "loss": 0.0, |
| "reward": 3.1875, |
| "reward_std": 0.23945678770542145, |
| "rewards/gpt4o_reward_model": 3.1875, |
| "step": 23 |
| }, |
| { |
| "completion_length": 136.25, |
| "epoch": 0.0025915127955944283, |
| "grad_norm": 2.949350357055664, |
| "kl": 0.000576019287109375, |
| "learning_rate": 9.999804520672277e-07, |
| "loss": 0.0, |
| "reward": 4.0, |
| "reward_std": 0.28877514600753784, |
| "rewards/gpt4o_reward_model": 4.0, |
| "step": 24 |
| }, |
| { |
| "completion_length": 56.75, |
| "epoch": 0.002699492495410863, |
| "grad_norm": 4.914743423461914, |
| "kl": 0.00127410888671875, |
| "learning_rate": 9.999782197577788e-07, |
| "loss": 0.0, |
| "reward": 4.0, |
| "reward_std": 0.5387751460075378, |
| "rewards/gpt4o_reward_model": 4.0, |
| "step": 25 |
| }, |
| { |
| "completion_length": 83.0, |
| "epoch": 0.002807472195227297, |
| "grad_norm": 4.330339431762695, |
| "kl": 0.00090789794921875, |
| "learning_rate": 9.999758667865504e-07, |
| "loss": 0.0, |
| "reward": 4.3125, |
| "reward_std": 0.3751000165939331, |
| "rewards/gpt4o_reward_model": 4.3125, |
| "step": 26 |
| }, |
| { |
| "completion_length": 111.25, |
| "epoch": 0.0029154518950437317, |
| "grad_norm": 7.491921901702881, |
| "kl": 0.00148773193359375, |
| "learning_rate": 9.999733931541108e-07, |
| "loss": 0.0, |
| "reward": 3.6875, |
| "reward_std": 0.796310305595398, |
| "rewards/gpt4o_reward_model": 3.6875, |
| "step": 27 |
| }, |
| { |
| "completion_length": 99.5, |
| "epoch": 0.0030234315948601664, |
| "grad_norm": 1.0942319631576538, |
| "kl": 0.00092315673828125, |
| "learning_rate": 9.999707988610568e-07, |
| "loss": 0.0, |
| "reward": 4.625, |
| "reward_std": 0.2501000165939331, |
| "rewards/gpt4o_reward_model": 4.625, |
| "step": 28 |
| }, |
| { |
| "completion_length": 107.5, |
| "epoch": 0.003131411294676601, |
| "grad_norm": 4.034415245056152, |
| "kl": 0.001800537109375, |
| "learning_rate": 9.999680839080146e-07, |
| "loss": 0.0, |
| "reward": 3.9375, |
| "reward_std": 0.579224169254303, |
| "rewards/gpt4o_reward_model": 3.9375, |
| "step": 29 |
| }, |
| { |
| "completion_length": 183.5, |
| "epoch": 0.003239390994493035, |
| "grad_norm": 5.061659336090088, |
| "kl": 0.0013885498046875, |
| "learning_rate": 9.999652482956392e-07, |
| "loss": 0.0, |
| "reward": 4.4375, |
| "reward_std": 0.5194375514984131, |
| "rewards/gpt4o_reward_model": 4.4375, |
| "step": 30 |
| }, |
| { |
| "completion_length": 116.75, |
| "epoch": 0.00334737069430947, |
| "grad_norm": 4.587920188903809, |
| "kl": 0.002685546875, |
| "learning_rate": 9.99962292024615e-07, |
| "loss": 0.0, |
| "reward": 3.8125, |
| "reward_std": 0.7394567728042603, |
| "rewards/gpt4o_reward_model": 3.8125, |
| "step": 31 |
| }, |
| { |
| "completion_length": 92.0, |
| "epoch": 0.0034553503941259045, |
| "grad_norm": 4.07750129699707, |
| "kl": 0.001800537109375, |
| "learning_rate": 9.999592150956556e-07, |
| "loss": 0.0, |
| "reward": 3.6875, |
| "reward_std": 0.47356173396110535, |
| "rewards/gpt4o_reward_model": 3.6875, |
| "step": 32 |
| }, |
| { |
| "completion_length": 92.0, |
| "epoch": 0.0035633300939423387, |
| "grad_norm": 4.351490020751953, |
| "kl": 0.0020294189453125, |
| "learning_rate": 9.999560175095034e-07, |
| "loss": 0.0, |
| "reward": 4.125, |
| "reward_std": 0.6115237474441528, |
| "rewards/gpt4o_reward_model": 4.125, |
| "step": 33 |
| }, |
| { |
| "completion_length": 92.5, |
| "epoch": 0.0036713097937587733, |
| "grad_norm": 5.623327255249023, |
| "kl": 0.0019378662109375, |
| "learning_rate": 9.9995269926693e-07, |
| "loss": 0.0, |
| "reward": 4.625, |
| "reward_std": 0.34856173396110535, |
| "rewards/gpt4o_reward_model": 4.625, |
| "step": 34 |
| }, |
| { |
| "completion_length": 90.5, |
| "epoch": 0.003779289493575208, |
| "grad_norm": 7.087028503417969, |
| "kl": 0.00164031982421875, |
| "learning_rate": 9.999492603687366e-07, |
| "loss": 0.0, |
| "reward": 4.8125, |
| "reward_std": 0.2694375813007355, |
| "rewards/gpt4o_reward_model": 4.8125, |
| "step": 35 |
| }, |
| { |
| "completion_length": 117.5, |
| "epoch": 0.003887269193391642, |
| "grad_norm": 3.4546611309051514, |
| "kl": 0.00173187255859375, |
| "learning_rate": 9.999457008157528e-07, |
| "loss": 0.0, |
| "reward": 4.0, |
| "reward_std": 0.508794367313385, |
| "rewards/gpt4o_reward_model": 4.0, |
| "step": 36 |
| }, |
| { |
| "completion_length": 164.25, |
| "epoch": 0.003995248893208077, |
| "grad_norm": 3.795534610748291, |
| "kl": 0.00201416015625, |
| "learning_rate": 9.999420206088379e-07, |
| "loss": 0.0, |
| "reward": 3.875, |
| "reward_std": 0.6144567728042603, |
| "rewards/gpt4o_reward_model": 3.875, |
| "step": 37 |
| }, |
| { |
| "completion_length": 168.75, |
| "epoch": 0.004103228593024511, |
| "grad_norm": 4.190282344818115, |
| "kl": 0.00183868408203125, |
| "learning_rate": 9.999382197488796e-07, |
| "loss": 0.0, |
| "reward": 4.3125, |
| "reward_std": 0.41377514600753784, |
| "rewards/gpt4o_reward_model": 4.3125, |
| "step": 38 |
| }, |
| { |
| "completion_length": 140.0, |
| "epoch": 0.004211208292840946, |
| "grad_norm": 2.999417781829834, |
| "kl": 0.0020904541015625, |
| "learning_rate": 9.999342982367957e-07, |
| "loss": 0.0, |
| "reward": 4.0, |
| "reward_std": 0.36445680260658264, |
| "rewards/gpt4o_reward_model": 4.0, |
| "step": 39 |
| }, |
| { |
| "completion_length": 102.5, |
| "epoch": 0.00431918799265738, |
| "grad_norm": 5.018375873565674, |
| "kl": 0.0027313232421875, |
| "learning_rate": 9.99930256073532e-07, |
| "loss": 0.0, |
| "reward": 3.75, |
| "reward_std": 0.3944375813007355, |
| "rewards/gpt4o_reward_model": 3.75, |
| "step": 40 |
| }, |
| { |
| "completion_length": 94.5, |
| "epoch": 0.004427167692473815, |
| "grad_norm": 5.892095565795898, |
| "kl": 0.0028839111328125, |
| "learning_rate": 9.999260932600648e-07, |
| "loss": 0.0, |
| "reward": 3.8125, |
| "reward_std": 1.1827775239944458, |
| "rewards/gpt4o_reward_model": 3.8125, |
| "step": 41 |
| }, |
| { |
| "completion_length": 88.0, |
| "epoch": 0.0045351473922902496, |
| "grad_norm": 3.4567017555236816, |
| "kl": 0.0014190673828125, |
| "learning_rate": 9.99921809797398e-07, |
| "loss": 0.0, |
| "reward": 4.1875, |
| "reward_std": 0.47356173396110535, |
| "rewards/gpt4o_reward_model": 4.1875, |
| "step": 42 |
| }, |
| { |
| "completion_length": 148.75, |
| "epoch": 0.004643127092106684, |
| "grad_norm": 4.687886714935303, |
| "kl": 0.0025177001953125, |
| "learning_rate": 9.999174056865658e-07, |
| "loss": 0.0, |
| "reward": 4.1875, |
| "reward_std": 0.5581127405166626, |
| "rewards/gpt4o_reward_model": 4.1875, |
| "step": 43 |
| }, |
| { |
| "completion_length": 188.0, |
| "epoch": 0.004751106791923119, |
| "grad_norm": 3.5290260314941406, |
| "kl": 0.00238037109375, |
| "learning_rate": 9.999128809286309e-07, |
| "loss": 0.0, |
| "reward": 4.25, |
| "reward_std": 0.5001000165939331, |
| "rewards/gpt4o_reward_model": 4.25, |
| "step": 44 |
| }, |
| { |
| "completion_length": 86.5, |
| "epoch": 0.004859086491739553, |
| "grad_norm": 4.073955535888672, |
| "kl": 0.002655029296875, |
| "learning_rate": 9.99908235524685e-07, |
| "loss": 0.0, |
| "reward": 4.0625, |
| "reward_std": 0.48945680260658264, |
| "rewards/gpt4o_reward_model": 4.0625, |
| "step": 45 |
| }, |
| { |
| "completion_length": 87.25, |
| "epoch": 0.004967066191555987, |
| "grad_norm": 4.2338104248046875, |
| "kl": 0.0042724609375, |
| "learning_rate": 9.9990346947585e-07, |
| "loss": 0.0, |
| "reward": 4.5, |
| "reward_std": 0.508794367313385, |
| "rewards/gpt4o_reward_model": 4.5, |
| "step": 46 |
| }, |
| { |
| "completion_length": 127.25, |
| "epoch": 0.005075045891372422, |
| "grad_norm": 18.224328994750977, |
| "kl": 0.01458740234375, |
| "learning_rate": 9.998985827832752e-07, |
| "loss": 0.0, |
| "reward": 4.1875, |
| "reward_std": 0.7365237474441528, |
| "rewards/gpt4o_reward_model": 4.1875, |
| "step": 47 |
| }, |
| { |
| "completion_length": 224.75, |
| "epoch": 0.0051830255911888565, |
| "grad_norm": 3.0918076038360596, |
| "kl": 0.0031585693359375, |
| "learning_rate": 9.998935754481404e-07, |
| "loss": 0.0, |
| "reward": 4.0625, |
| "reward_std": 0.4435809552669525, |
| "rewards/gpt4o_reward_model": 4.0625, |
| "step": 48 |
| }, |
| { |
| "completion_length": 98.25, |
| "epoch": 0.005291005291005291, |
| "grad_norm": 6.054563522338867, |
| "kl": 0.0042724609375, |
| "learning_rate": 9.998884474716539e-07, |
| "loss": 0.0, |
| "reward": 2.875, |
| "reward_std": 0.3944375813007355, |
| "rewards/gpt4o_reward_model": 2.875, |
| "step": 49 |
| }, |
| { |
| "completion_length": 103.75, |
| "epoch": 0.005398984990821726, |
| "grad_norm": 5.9451069831848145, |
| "kl": 0.0028533935546875, |
| "learning_rate": 9.998831988550533e-07, |
| "loss": 0.0, |
| "reward": 4.5625, |
| "reward_std": 0.48945680260658264, |
| "rewards/gpt4o_reward_model": 4.5625, |
| "step": 50 |
| }, |
| { |
| "completion_length": 133.75, |
| "epoch": 0.00550696469063816, |
| "grad_norm": 6.4553728103637695, |
| "kl": 0.006317138671875, |
| "learning_rate": 9.998778295996054e-07, |
| "loss": 0.0, |
| "reward": 3.5625, |
| "reward_std": 0.9518133401870728, |
| "rewards/gpt4o_reward_model": 3.5625, |
| "step": 51 |
| }, |
| { |
| "completion_length": 81.5, |
| "epoch": 0.005614944390454594, |
| "grad_norm": 6.15158748626709, |
| "kl": 0.002899169921875, |
| "learning_rate": 9.998723397066058e-07, |
| "loss": 0.0, |
| "reward": 3.8125, |
| "reward_std": 1.060096263885498, |
| "rewards/gpt4o_reward_model": 3.8125, |
| "step": 52 |
| }, |
| { |
| "completion_length": 134.0, |
| "epoch": 0.005722924090271029, |
| "grad_norm": 5.659409999847412, |
| "kl": 0.0040283203125, |
| "learning_rate": 9.998667291773794e-07, |
| "loss": 0.0, |
| "reward": 3.5625, |
| "reward_std": 0.8831573724746704, |
| "rewards/gpt4o_reward_model": 3.5625, |
| "step": 53 |
| }, |
| { |
| "completion_length": 100.0, |
| "epoch": 0.0058309037900874635, |
| "grad_norm": 5.585843563079834, |
| "kl": 0.004150390625, |
| "learning_rate": 9.998609980132803e-07, |
| "loss": 0.0, |
| "reward": 3.75, |
| "reward_std": 0.3944375813007355, |
| "rewards/gpt4o_reward_model": 3.75, |
| "step": 54 |
| }, |
| { |
| "completion_length": 158.0, |
| "epoch": 0.005938883489903898, |
| "grad_norm": 4.5382161140441895, |
| "kl": 0.004302978515625, |
| "learning_rate": 9.998551462156917e-07, |
| "loss": 0.0, |
| "reward": 4.0, |
| "reward_std": 0.5001000165939331, |
| "rewards/gpt4o_reward_model": 4.0, |
| "step": 55 |
| }, |
| { |
| "completion_length": 83.75, |
| "epoch": 0.006046863189720333, |
| "grad_norm": 3.957919120788574, |
| "kl": 0.0040283203125, |
| "learning_rate": 9.998491737860255e-07, |
| "loss": 0.0, |
| "reward": 4.375, |
| "reward_std": 0.5840140581130981, |
| "rewards/gpt4o_reward_model": 4.375, |
| "step": 56 |
| }, |
| { |
| "completion_length": 175.5, |
| "epoch": 0.006154842889536767, |
| "grad_norm": 4.4697747230529785, |
| "kl": 0.004302978515625, |
| "learning_rate": 9.998430807257234e-07, |
| "loss": 0.0, |
| "reward": 4.375, |
| "reward_std": 0.5001000165939331, |
| "rewards/gpt4o_reward_model": 4.375, |
| "step": 57 |
| }, |
| { |
| "completion_length": 145.75, |
| "epoch": 0.006262822589353202, |
| "grad_norm": 4.669873237609863, |
| "kl": 0.004364013671875, |
| "learning_rate": 9.998368670362557e-07, |
| "loss": 0.0, |
| "reward": 3.9375, |
| "reward_std": 0.48945680260658264, |
| "rewards/gpt4o_reward_model": 3.9375, |
| "step": 58 |
| }, |
| { |
| "completion_length": 172.25, |
| "epoch": 0.006370802289169636, |
| "grad_norm": 4.383346080780029, |
| "kl": 0.003509521484375, |
| "learning_rate": 9.998305327191222e-07, |
| "loss": 0.0, |
| "reward": 4.125, |
| "reward_std": 0.971612811088562, |
| "rewards/gpt4o_reward_model": 4.125, |
| "step": 59 |
| }, |
| { |
| "completion_length": 125.5, |
| "epoch": 0.00647878198898607, |
| "grad_norm": 4.461704254150391, |
| "kl": 0.0030670166015625, |
| "learning_rate": 9.998240777758514e-07, |
| "loss": 0.0, |
| "reward": 4.25, |
| "reward_std": 0.3944375813007355, |
| "rewards/gpt4o_reward_model": 4.25, |
| "step": 60 |
| }, |
| { |
| "completion_length": 220.25, |
| "epoch": 0.0065867616888025055, |
| "grad_norm": 4.459292888641357, |
| "kl": 0.00457763671875, |
| "learning_rate": 9.99817502208001e-07, |
| "loss": 0.0, |
| "reward": 4.0625, |
| "reward_std": 0.883794367313385, |
| "rewards/gpt4o_reward_model": 4.0625, |
| "step": 61 |
| }, |
| { |
| "completion_length": 124.0, |
| "epoch": 0.00669474138861894, |
| "grad_norm": 4.459859848022461, |
| "kl": 0.00347900390625, |
| "learning_rate": 9.998108060171579e-07, |
| "loss": 0.0, |
| "reward": 4.125, |
| "reward_std": 0.7283515930175781, |
| "rewards/gpt4o_reward_model": 4.125, |
| "step": 62 |
| }, |
| { |
| "completion_length": 202.0, |
| "epoch": 0.006802721088435374, |
| "grad_norm": 4.133177757263184, |
| "kl": 0.0035858154296875, |
| "learning_rate": 9.998039892049383e-07, |
| "loss": 0.0, |
| "reward": 4.25, |
| "reward_std": 0.5001000165939331, |
| "rewards/gpt4o_reward_model": 4.25, |
| "step": 63 |
| }, |
| { |
| "completion_length": 71.5, |
| "epoch": 0.006910700788251809, |
| "grad_norm": 5.423212051391602, |
| "kl": 0.0069580078125, |
| "learning_rate": 9.997970517729874e-07, |
| "loss": 0.0, |
| "reward": 3.9375, |
| "reward_std": 0.8220234513282776, |
| "rewards/gpt4o_reward_model": 3.9375, |
| "step": 64 |
| }, |
| { |
| "completion_length": 171.25, |
| "epoch": 0.007018680488068243, |
| "grad_norm": 3.992265462875366, |
| "kl": 0.0023040771484375, |
| "learning_rate": 9.997899937229792e-07, |
| "loss": 0.0, |
| "reward": 3.875, |
| "reward_std": 0.5387751460075378, |
| "rewards/gpt4o_reward_model": 3.875, |
| "step": 65 |
| }, |
| { |
| "completion_length": 118.25, |
| "epoch": 0.007126660187884677, |
| "grad_norm": 4.649938583374023, |
| "kl": 0.00506591796875, |
| "learning_rate": 9.997828150566171e-07, |
| "loss": 0.0, |
| "reward": 4.125, |
| "reward_std": 0.704224169254303, |
| "rewards/gpt4o_reward_model": 4.125, |
| "step": 66 |
| }, |
| { |
| "completion_length": 112.5, |
| "epoch": 0.0072346398877011124, |
| "grad_norm": 4.159398078918457, |
| "kl": 0.005767822265625, |
| "learning_rate": 9.997755157756337e-07, |
| "loss": 0.0, |
| "reward": 3.6875, |
| "reward_std": 0.5188006162643433, |
| "rewards/gpt4o_reward_model": 3.6875, |
| "step": 67 |
| }, |
| { |
| "completion_length": 66.5, |
| "epoch": 0.007342619587517547, |
| "grad_norm": 3.303025484085083, |
| "kl": 0.00494384765625, |
| "learning_rate": 9.997680958817907e-07, |
| "loss": 0.0, |
| "reward": 4.125, |
| "reward_std": 0.454224169254303, |
| "rewards/gpt4o_reward_model": 4.125, |
| "step": 68 |
| }, |
| { |
| "completion_length": 151.0, |
| "epoch": 0.007450599287333981, |
| "grad_norm": 3.7500741481781006, |
| "kl": 0.004150390625, |
| "learning_rate": 9.99760555376878e-07, |
| "loss": 0.0, |
| "reward": 4.5, |
| "reward_std": 0.5685809850692749, |
| "rewards/gpt4o_reward_model": 4.5, |
| "step": 69 |
| }, |
| { |
| "completion_length": 131.5, |
| "epoch": 0.007558578987150416, |
| "grad_norm": 5.136856555938721, |
| "kl": 0.00537109375, |
| "learning_rate": 9.997528942627165e-07, |
| "loss": 0.0, |
| "reward": 3.8125, |
| "reward_std": 0.8678992986679077, |
| "rewards/gpt4o_reward_model": 3.8125, |
| "step": 70 |
| }, |
| { |
| "completion_length": 249.25, |
| "epoch": 0.00766655868696685, |
| "grad_norm": 3.3416953086853027, |
| "kl": 0.00469970703125, |
| "learning_rate": 9.997451125411542e-07, |
| "loss": 0.0, |
| "reward": 4.3125, |
| "reward_std": 0.5646764636039734, |
| "rewards/gpt4o_reward_model": 4.3125, |
| "step": 71 |
| }, |
| { |
| "completion_length": 134.0, |
| "epoch": 0.007774538386783284, |
| "grad_norm": 3.785247564315796, |
| "kl": 0.0054931640625, |
| "learning_rate": 9.997372102140694e-07, |
| "loss": 0.0, |
| "reward": 4.5, |
| "reward_std": 0.2501000165939331, |
| "rewards/gpt4o_reward_model": 4.5, |
| "step": 72 |
| }, |
| { |
| "completion_length": 82.0, |
| "epoch": 0.007882518086599719, |
| "grad_norm": 5.987561225891113, |
| "kl": 0.00604248046875, |
| "learning_rate": 9.997291872833694e-07, |
| "loss": 0.0, |
| "reward": 4.375, |
| "reward_std": 0.5001000165939331, |
| "rewards/gpt4o_reward_model": 4.375, |
| "step": 73 |
| }, |
| { |
| "completion_length": 80.75, |
| "epoch": 0.007990497786416154, |
| "grad_norm": 4.2266130447387695, |
| "kl": 0.00677490234375, |
| "learning_rate": 9.9972104375099e-07, |
| "loss": 0.0, |
| "reward": 4.75, |
| "reward_std": 0.3944375813007355, |
| "rewards/gpt4o_reward_model": 4.75, |
| "step": 74 |
| }, |
| { |
| "completion_length": 120.5, |
| "epoch": 0.008098477486232589, |
| "grad_norm": 4.478511333465576, |
| "kl": 0.005523681640625, |
| "learning_rate": 9.997127796188967e-07, |
| "loss": 0.0, |
| "reward": 4.0, |
| "reward_std": 0.6831127405166626, |
| "rewards/gpt4o_reward_model": 4.0, |
| "step": 75 |
| }, |
| { |
| "completion_length": 238.75, |
| "epoch": 0.008206457186049022, |
| "grad_norm": 3.8930044174194336, |
| "kl": 0.005279541015625, |
| "learning_rate": 9.997043948890839e-07, |
| "loss": 0.0, |
| "reward": 4.625, |
| "reward_std": 0.3944375813007355, |
| "rewards/gpt4o_reward_model": 4.625, |
| "step": 76 |
| }, |
| { |
| "completion_length": 357.0, |
| "epoch": 0.008314436885865457, |
| "grad_norm": 4.40590763092041, |
| "kl": 0.00555419921875, |
| "learning_rate": 9.996958895635754e-07, |
| "loss": 0.0, |
| "reward": 4.125, |
| "reward_std": 0.9470234513282776, |
| "rewards/gpt4o_reward_model": 4.125, |
| "step": 77 |
| }, |
| { |
| "completion_length": 127.25, |
| "epoch": 0.008422416585681892, |
| "grad_norm": 4.700560092926025, |
| "kl": 0.0064697265625, |
| "learning_rate": 9.996872636444235e-07, |
| "loss": 0.0, |
| "reward": 4.5, |
| "reward_std": 0.5985617637634277, |
| "rewards/gpt4o_reward_model": 4.5, |
| "step": 78 |
| }, |
| { |
| "completion_length": 231.25, |
| "epoch": 0.008530396285498325, |
| "grad_norm": 3.062976360321045, |
| "kl": 0.00421142578125, |
| "learning_rate": 9.996785171337101e-07, |
| "loss": 0.0, |
| "reward": 3.75, |
| "reward_std": 0.2501000165939331, |
| "rewards/gpt4o_reward_model": 3.75, |
| "step": 79 |
| }, |
| { |
| "completion_length": 102.5, |
| "epoch": 0.00863837598531476, |
| "grad_norm": 4.180944919586182, |
| "kl": 0.00518798828125, |
| "learning_rate": 9.996696500335458e-07, |
| "loss": 0.0, |
| "reward": 4.75, |
| "reward_std": 0.5001000165939331, |
| "rewards/gpt4o_reward_model": 4.75, |
| "step": 80 |
| }, |
| { |
| "completion_length": 111.75, |
| "epoch": 0.008746355685131196, |
| "grad_norm": 4.435964107513428, |
| "kl": 0.00726318359375, |
| "learning_rate": 9.996606623460707e-07, |
| "loss": 0.0, |
| "reward": 4.375, |
| "reward_std": 0.7887752056121826, |
| "rewards/gpt4o_reward_model": 4.375, |
| "step": 81 |
| }, |
| { |
| "completion_length": 77.0, |
| "epoch": 0.00885433538494763, |
| "grad_norm": 3.1188766956329346, |
| "kl": 0.007171630859375, |
| "learning_rate": 9.99651554073454e-07, |
| "loss": 0.0, |
| "reward": 4.5625, |
| "reward_std": 0.2694375813007355, |
| "rewards/gpt4o_reward_model": 4.5625, |
| "step": 82 |
| }, |
| { |
| "completion_length": 104.75, |
| "epoch": 0.008962315084764064, |
| "grad_norm": 5.255317687988281, |
| "kl": 0.0072021484375, |
| "learning_rate": 9.996423252178933e-07, |
| "loss": 0.0, |
| "reward": 4.4375, |
| "reward_std": 0.6935809850692749, |
| "rewards/gpt4o_reward_model": 4.4375, |
| "step": 83 |
| }, |
| { |
| "completion_length": 106.75, |
| "epoch": 0.009070294784580499, |
| "grad_norm": 4.521153926849365, |
| "kl": 0.00860595703125, |
| "learning_rate": 9.996329757816166e-07, |
| "loss": 0.0, |
| "reward": 4.0625, |
| "reward_std": 0.6790332794189453, |
| "rewards/gpt4o_reward_model": 4.0625, |
| "step": 84 |
| }, |
| { |
| "completion_length": 177.75, |
| "epoch": 0.009178274484396934, |
| "grad_norm": 4.019938945770264, |
| "kl": 0.00653076171875, |
| "learning_rate": 9.996235057668797e-07, |
| "loss": 0.0, |
| "reward": 4.75, |
| "reward_std": 0.5001000165939331, |
| "rewards/gpt4o_reward_model": 4.75, |
| "step": 85 |
| }, |
| { |
| "completion_length": 116.5, |
| "epoch": 0.009286254184213368, |
| "grad_norm": 4.606100559234619, |
| "kl": 0.0078125, |
| "learning_rate": 9.99613915175968e-07, |
| "loss": 0.0, |
| "reward": 4.0625, |
| "reward_std": 0.8233708143234253, |
| "rewards/gpt4o_reward_model": 4.0625, |
| "step": 86 |
| }, |
| { |
| "completion_length": 310.25, |
| "epoch": 0.009394233884029803, |
| "grad_norm": 5.037557601928711, |
| "kl": 0.00836181640625, |
| "learning_rate": 9.996042040111962e-07, |
| "loss": 0.0, |
| "reward": 3.9375, |
| "reward_std": 0.7481511831283569, |
| "rewards/gpt4o_reward_model": 3.9375, |
| "step": 87 |
| }, |
| { |
| "completion_length": 135.25, |
| "epoch": 0.009502213583846238, |
| "grad_norm": 5.936065196990967, |
| "kl": 0.014404296875, |
| "learning_rate": 9.99594372274908e-07, |
| "loss": 0.0, |
| "reward": 3.9375, |
| "reward_std": 1.006845474243164, |
| "rewards/gpt4o_reward_model": 3.9375, |
| "step": 88 |
| }, |
| { |
| "completion_length": 165.25, |
| "epoch": 0.009610193283662671, |
| "grad_norm": 4.330586910247803, |
| "kl": 0.007537841796875, |
| "learning_rate": 9.995844199694763e-07, |
| "loss": 0.0, |
| "reward": 3.1875, |
| "reward_std": 0.796310305595398, |
| "rewards/gpt4o_reward_model": 3.1875, |
| "step": 89 |
| }, |
| { |
| "completion_length": 87.25, |
| "epoch": 0.009718172983479106, |
| "grad_norm": 3.3461477756500244, |
| "kl": 0.0103759765625, |
| "learning_rate": 9.995743470973024e-07, |
| "loss": 0.0, |
| "reward": 4.75, |
| "reward_std": 0.28877514600753784, |
| "rewards/gpt4o_reward_model": 4.75, |
| "step": 90 |
| }, |
| { |
| "completion_length": 64.5, |
| "epoch": 0.009826152683295541, |
| "grad_norm": 3.6653449535369873, |
| "kl": 0.0084228515625, |
| "learning_rate": 9.995641536608176e-07, |
| "loss": 0.0, |
| "reward": 4.4375, |
| "reward_std": 0.579224169254303, |
| "rewards/gpt4o_reward_model": 4.4375, |
| "step": 91 |
| }, |
| { |
| "completion_length": 134.5, |
| "epoch": 0.009934132383111974, |
| "grad_norm": 3.128530979156494, |
| "kl": 0.00848388671875, |
| "learning_rate": 9.99553839662482e-07, |
| "loss": 0.0, |
| "reward": 4.8125, |
| "reward_std": 0.2694375813007355, |
| "rewards/gpt4o_reward_model": 4.8125, |
| "step": 92 |
| }, |
| { |
| "completion_length": 121.25, |
| "epoch": 0.01004211208292841, |
| "grad_norm": 4.361353397369385, |
| "kl": 0.0054931640625, |
| "learning_rate": 9.995434051047845e-07, |
| "loss": 0.0, |
| "reward": 3.625, |
| "reward_std": 0.6831127405166626, |
| "rewards/gpt4o_reward_model": 3.625, |
| "step": 93 |
| }, |
| { |
| "completion_length": 109.25, |
| "epoch": 0.010150091782744845, |
| "grad_norm": 4.838723182678223, |
| "kl": 0.013916015625, |
| "learning_rate": 9.995328499902433e-07, |
| "loss": 0.0, |
| "reward": 4.0, |
| "reward_std": 0.8485617637634277, |
| "rewards/gpt4o_reward_model": 4.0, |
| "step": 94 |
| }, |
| { |
| "completion_length": 179.5, |
| "epoch": 0.010258071482561278, |
| "grad_norm": 3.451291084289551, |
| "kl": 0.01019287109375, |
| "learning_rate": 9.99522174321406e-07, |
| "loss": 0.0, |
| "reward": 3.875, |
| "reward_std": 0.454224169254303, |
| "rewards/gpt4o_reward_model": 3.875, |
| "step": 95 |
| }, |
| { |
| "completion_length": 99.5, |
| "epoch": 0.010366051182377713, |
| "grad_norm": 2.002145528793335, |
| "kl": 0.01031494140625, |
| "learning_rate": 9.995113781008485e-07, |
| "loss": 0.0, |
| "reward": 4.6875, |
| "reward_std": 0.1251000016927719, |
| "rewards/gpt4o_reward_model": 4.6875, |
| "step": 96 |
| }, |
| { |
| "completion_length": 127.0, |
| "epoch": 0.010474030882194148, |
| "grad_norm": 4.460968494415283, |
| "kl": 0.01141357421875, |
| "learning_rate": 9.995004613311768e-07, |
| "loss": 0.0, |
| "reward": 4.0, |
| "reward_std": 0.4928992986679077, |
| "rewards/gpt4o_reward_model": 4.0, |
| "step": 97 |
| }, |
| { |
| "completion_length": 193.5, |
| "epoch": 0.010582010582010581, |
| "grad_norm": 5.521016597747803, |
| "kl": 0.01336669921875, |
| "learning_rate": 9.994894240150252e-07, |
| "loss": 0.0, |
| "reward": 2.375, |
| "reward_std": 0.5001000165939331, |
| "rewards/gpt4o_reward_model": 2.375, |
| "step": 98 |
| }, |
| { |
| "completion_length": 126.5, |
| "epoch": 0.010689990281827016, |
| "grad_norm": 7.110630989074707, |
| "kl": 0.010009765625, |
| "learning_rate": 9.994782661550573e-07, |
| "loss": 0.0, |
| "reward": 4.6875, |
| "reward_std": 0.5194375514984131, |
| "rewards/gpt4o_reward_model": 4.6875, |
| "step": 99 |
| }, |
| { |
| "completion_length": 253.75, |
| "epoch": 0.010797969981643452, |
| "grad_norm": 3.6743593215942383, |
| "kl": 0.00982666015625, |
| "learning_rate": 9.994669877539664e-07, |
| "loss": 0.0, |
| "reward": 4.1875, |
| "reward_std": 0.5879185199737549, |
| "rewards/gpt4o_reward_model": 4.1875, |
| "step": 100 |
| }, |
| { |
| "completion_length": 68.5, |
| "epoch": 0.010905949681459885, |
| "grad_norm": 4.462601661682129, |
| "kl": 0.01251220703125, |
| "learning_rate": 9.994555888144736e-07, |
| "loss": 0.0, |
| "reward": 4.0625, |
| "reward_std": 0.6827775835990906, |
| "rewards/gpt4o_reward_model": 4.0625, |
| "step": 101 |
| }, |
| { |
| "completion_length": 123.0, |
| "epoch": 0.01101392938127632, |
| "grad_norm": 4.228979110717773, |
| "kl": 0.009033203125, |
| "learning_rate": 9.994440693393305e-07, |
| "loss": 0.0, |
| "reward": 4.6875, |
| "reward_std": 0.6251000165939331, |
| "rewards/gpt4o_reward_model": 4.6875, |
| "step": 102 |
| }, |
| { |
| "completion_length": 95.5, |
| "epoch": 0.011121909081092755, |
| "grad_norm": 4.379354476928711, |
| "kl": 0.01068115234375, |
| "learning_rate": 9.994324293313169e-07, |
| "loss": 0.0, |
| "reward": 3.875, |
| "reward_std": 0.5728486180305481, |
| "rewards/gpt4o_reward_model": 3.875, |
| "step": 103 |
| }, |
| { |
| "completion_length": 198.5, |
| "epoch": 0.011229888780909188, |
| "grad_norm": 4.31371545791626, |
| "kl": 0.01171875, |
| "learning_rate": 9.994206687932418e-07, |
| "loss": 0.0, |
| "reward": 4.5, |
| "reward_std": 0.454224169254303, |
| "rewards/gpt4o_reward_model": 4.5, |
| "step": 104 |
| }, |
| { |
| "completion_length": 210.0, |
| "epoch": 0.011337868480725623, |
| "grad_norm": 3.053598642349243, |
| "kl": 0.00994873046875, |
| "learning_rate": 9.994087877279436e-07, |
| "loss": 0.0, |
| "reward": 4.25, |
| "reward_std": 0.7887751460075378, |
| "rewards/gpt4o_reward_model": 4.25, |
| "step": 105 |
| }, |
| { |
| "completion_length": 144.75, |
| "epoch": 0.011445848180542059, |
| "grad_norm": 4.309823036193848, |
| "kl": 0.0159912109375, |
| "learning_rate": 9.993967861382895e-07, |
| "loss": 0.0, |
| "reward": 3.75, |
| "reward_std": 0.454224169254303, |
| "rewards/gpt4o_reward_model": 3.75, |
| "step": 106 |
| }, |
| { |
| "completion_length": 170.0, |
| "epoch": 0.011553827880358492, |
| "grad_norm": 4.035120487213135, |
| "kl": 0.00762939453125, |
| "learning_rate": 9.99384664027176e-07, |
| "loss": 0.0, |
| "reward": 4.3125, |
| "reward_std": 0.6790332198143005, |
| "rewards/gpt4o_reward_model": 4.3125, |
| "step": 107 |
| }, |
| { |
| "completion_length": 219.0, |
| "epoch": 0.011661807580174927, |
| "grad_norm": 3.480762004852295, |
| "kl": 0.0137939453125, |
| "learning_rate": 9.993724213975286e-07, |
| "loss": 0.0, |
| "reward": 4.3125, |
| "reward_std": 0.329224169254303, |
| "rewards/gpt4o_reward_model": 4.3125, |
| "step": 108 |
| }, |
| { |
| "completion_length": 95.75, |
| "epoch": 0.011769787279991362, |
| "grad_norm": 5.104645252227783, |
| "kl": 0.017578125, |
| "learning_rate": 9.993600582523015e-07, |
| "loss": 0.0, |
| "reward": 3.8125, |
| "reward_std": 0.8625079393386841, |
| "rewards/gpt4o_reward_model": 3.8125, |
| "step": 109 |
| }, |
| { |
| "completion_length": 198.0, |
| "epoch": 0.011877766979807795, |
| "grad_norm": 5.45714807510376, |
| "kl": 0.0157470703125, |
| "learning_rate": 9.993475745944787e-07, |
| "loss": 0.0, |
| "reward": 3.8125, |
| "reward_std": 0.7694376111030579, |
| "rewards/gpt4o_reward_model": 3.8125, |
| "step": 110 |
| }, |
| { |
| "completion_length": 242.25, |
| "epoch": 0.01198574667962423, |
| "grad_norm": 3.7047228813171387, |
| "kl": 0.0146484375, |
| "learning_rate": 9.99334970427073e-07, |
| "loss": 0.0, |
| "reward": 3.9375, |
| "reward_std": 0.8751000165939331, |
| "rewards/gpt4o_reward_model": 3.9375, |
| "step": 111 |
| }, |
| { |
| "completion_length": 163.75, |
| "epoch": 0.012093726379440665, |
| "grad_norm": 4.073879718780518, |
| "kl": 0.0111083984375, |
| "learning_rate": 9.993222457531262e-07, |
| "loss": 0.0, |
| "reward": 4.1875, |
| "reward_std": 0.6724694967269897, |
| "rewards/gpt4o_reward_model": 4.1875, |
| "step": 112 |
| }, |
| { |
| "completion_length": 113.5, |
| "epoch": 0.012201706079257099, |
| "grad_norm": 4.12031888961792, |
| "kl": 0.01385498046875, |
| "learning_rate": 9.99309400575709e-07, |
| "loss": 0.0, |
| "reward": 3.9375, |
| "reward_std": 0.6178992986679077, |
| "rewards/gpt4o_reward_model": 3.9375, |
| "step": 113 |
| }, |
| { |
| "completion_length": 43.5, |
| "epoch": 0.012309685779073534, |
| "grad_norm": 0.003000754164531827, |
| "kl": 0.0172119140625, |
| "learning_rate": 9.992964348979213e-07, |
| "loss": 0.0, |
| "reward": 4.25, |
| "reward_std": 9.999999747378752e-05, |
| "rewards/gpt4o_reward_model": 4.25, |
| "step": 114 |
| }, |
| { |
| "completion_length": 270.0, |
| "epoch": 0.012417665478889969, |
| "grad_norm": 3.4506990909576416, |
| "kl": 0.0166015625, |
| "learning_rate": 9.992833487228923e-07, |
| "loss": 0.0, |
| "reward": 4.125, |
| "reward_std": 0.5840140581130981, |
| "rewards/gpt4o_reward_model": 4.125, |
| "step": 115 |
| }, |
| { |
| "completion_length": 232.25, |
| "epoch": 0.012525645178706404, |
| "grad_norm": 3.1857428550720215, |
| "kl": 0.0169677734375, |
| "learning_rate": 9.992701420537803e-07, |
| "loss": 0.0, |
| "reward": 4.0, |
| "reward_std": 0.3944375813007355, |
| "rewards/gpt4o_reward_model": 4.0, |
| "step": 116 |
| }, |
| { |
| "completion_length": 128.0, |
| "epoch": 0.012633624878522837, |
| "grad_norm": 3.1388204097747803, |
| "kl": 0.01611328125, |
| "learning_rate": 9.992568148937722e-07, |
| "loss": 0.0, |
| "reward": 4.4375, |
| "reward_std": 0.41377514600753784, |
| "rewards/gpt4o_reward_model": 4.4375, |
| "step": 117 |
| }, |
| { |
| "completion_length": 125.75, |
| "epoch": 0.012741604578339272, |
| "grad_norm": 3.442460298538208, |
| "kl": 0.01611328125, |
| "learning_rate": 9.992433672460844e-07, |
| "loss": 0.0, |
| "reward": 4.4375, |
| "reward_std": 0.4435809552669525, |
| "rewards/gpt4o_reward_model": 4.4375, |
| "step": 118 |
| }, |
| { |
| "completion_length": 143.75, |
| "epoch": 0.012849584278155708, |
| "grad_norm": 4.008980751037598, |
| "kl": 0.0235595703125, |
| "learning_rate": 9.992297991139627e-07, |
| "loss": 0.0, |
| "reward": 4.625, |
| "reward_std": 0.6144567728042603, |
| "rewards/gpt4o_reward_model": 4.625, |
| "step": 119 |
| }, |
| { |
| "completion_length": 100.5, |
| "epoch": 0.01295756397797214, |
| "grad_norm": 3.958385705947876, |
| "kl": 0.01470947265625, |
| "learning_rate": 9.992161105006809e-07, |
| "loss": 0.0, |
| "reward": 4.1875, |
| "reward_std": 0.5581127405166626, |
| "rewards/gpt4o_reward_model": 4.1875, |
| "step": 120 |
| }, |
| { |
| "completion_length": 175.0, |
| "epoch": 0.013065543677788576, |
| "grad_norm": 3.9238104820251465, |
| "kl": 0.017333984375, |
| "learning_rate": 9.99202301409543e-07, |
| "loss": 0.0, |
| "reward": 4.5625, |
| "reward_std": 0.6637751460075378, |
| "rewards/gpt4o_reward_model": 4.5625, |
| "step": 121 |
| }, |
| { |
| "completion_length": 64.0, |
| "epoch": 0.013173523377605011, |
| "grad_norm": 5.058772087097168, |
| "kl": 0.023681640625, |
| "learning_rate": 9.991883718438813e-07, |
| "loss": 0.0, |
| "reward": 4.625, |
| "reward_std": 0.3944375813007355, |
| "rewards/gpt4o_reward_model": 4.625, |
| "step": 122 |
| }, |
| { |
| "completion_length": 92.5, |
| "epoch": 0.013281503077421444, |
| "grad_norm": 6.274599552154541, |
| "kl": 0.026123046875, |
| "learning_rate": 9.991743218070577e-07, |
| "loss": 0.0, |
| "reward": 3.5, |
| "reward_std": 0.6144567728042603, |
| "rewards/gpt4o_reward_model": 3.5, |
| "step": 123 |
| }, |
| { |
| "completion_length": 108.0, |
| "epoch": 0.01338948277723788, |
| "grad_norm": 4.530334949493408, |
| "kl": 0.022705078125, |
| "learning_rate": 9.991601513024628e-07, |
| "loss": 0.0, |
| "reward": 3.875, |
| "reward_std": 0.7581573724746704, |
| "rewards/gpt4o_reward_model": 3.875, |
| "step": 124 |
| }, |
| { |
| "completion_length": 103.25, |
| "epoch": 0.013497462477054314, |
| "grad_norm": 2.2850399017333984, |
| "kl": 0.01556396484375, |
| "learning_rate": 9.991458603335165e-07, |
| "loss": 0.0, |
| "reward": 4.625, |
| "reward_std": 0.4331127107143402, |
| "rewards/gpt4o_reward_model": 4.625, |
| "step": 125 |
| }, |
| { |
| "completion_length": 114.25, |
| "epoch": 0.013605442176870748, |
| "grad_norm": 4.146212100982666, |
| "kl": 0.02392578125, |
| "learning_rate": 9.991314489036677e-07, |
| "loss": 0.0, |
| "reward": 4.1875, |
| "reward_std": 0.48945680260658264, |
| "rewards/gpt4o_reward_model": 4.1875, |
| "step": 126 |
| }, |
| { |
| "completion_length": 229.0, |
| "epoch": 0.013713421876687183, |
| "grad_norm": 4.601395130157471, |
| "kl": 0.0205078125, |
| "learning_rate": 9.991169170163943e-07, |
| "loss": 0.0, |
| "reward": 4.0625, |
| "reward_std": 0.41377514600753784, |
| "rewards/gpt4o_reward_model": 4.0625, |
| "step": 127 |
| }, |
| { |
| "completion_length": 310.5, |
| "epoch": 0.013821401576503618, |
| "grad_norm": 3.042806625366211, |
| "kl": 0.015869140625, |
| "learning_rate": 9.991022646752035e-07, |
| "loss": 0.0, |
| "reward": 4.75, |
| "reward_std": 0.5001000165939331, |
| "rewards/gpt4o_reward_model": 4.75, |
| "step": 128 |
| }, |
| { |
| "completion_length": 145.0, |
| "epoch": 0.013929381276320051, |
| "grad_norm": 3.71138858795166, |
| "kl": 0.0240478515625, |
| "learning_rate": 9.990874918836313e-07, |
| "loss": 0.0, |
| "reward": 4.0, |
| "reward_std": 0.7974694967269897, |
| "rewards/gpt4o_reward_model": 4.0, |
| "step": 129 |
| }, |
| { |
| "completion_length": 269.5, |
| "epoch": 0.014037360976136486, |
| "grad_norm": 4.026693820953369, |
| "kl": 0.0189208984375, |
| "learning_rate": 9.990725986452426e-07, |
| "loss": 0.0, |
| "reward": 4.375, |
| "reward_std": 0.3944375813007355, |
| "rewards/gpt4o_reward_model": 4.375, |
| "step": 130 |
| }, |
| { |
| "completion_length": 186.75, |
| "epoch": 0.014145340675952921, |
| "grad_norm": 3.600174903869629, |
| "kl": 0.0284423828125, |
| "learning_rate": 9.99057584963632e-07, |
| "loss": 0.0, |
| "reward": 3.625, |
| "reward_std": 0.454224169254303, |
| "rewards/gpt4o_reward_model": 3.625, |
| "step": 131 |
| }, |
| { |
| "completion_length": 86.5, |
| "epoch": 0.014253320375769355, |
| "grad_norm": 4.374396324157715, |
| "kl": 0.0257568359375, |
| "learning_rate": 9.99042450842423e-07, |
| "loss": 0.0, |
| "reward": 4.25, |
| "reward_std": 0.7288135886192322, |
| "rewards/gpt4o_reward_model": 4.25, |
| "step": 132 |
| }, |
| { |
| "completion_length": 208.0, |
| "epoch": 0.01436130007558579, |
| "grad_norm": 5.3308610916137695, |
| "kl": 0.0211181640625, |
| "learning_rate": 9.990271962852676e-07, |
| "loss": 0.0, |
| "reward": 4.3125, |
| "reward_std": 0.8081126809120178, |
| "rewards/gpt4o_reward_model": 4.3125, |
| "step": 133 |
| }, |
| { |
| "completion_length": 98.5, |
| "epoch": 0.014469279775402225, |
| "grad_norm": 3.5027971267700195, |
| "kl": 0.02197265625, |
| "learning_rate": 9.990118212958473e-07, |
| "loss": 0.0, |
| "reward": 4.4375, |
| "reward_std": 0.41377514600753784, |
| "rewards/gpt4o_reward_model": 4.4375, |
| "step": 134 |
| }, |
| { |
| "completion_length": 41.5, |
| "epoch": 0.014577259475218658, |
| "grad_norm": 8.935701370239258, |
| "kl": 0.02490234375, |
| "learning_rate": 9.989963258778728e-07, |
| "loss": 0.0, |
| "reward": 4.0, |
| "reward_std": 0.6896764636039734, |
| "rewards/gpt4o_reward_model": 4.0, |
| "step": 135 |
| }, |
| { |
| "completion_length": 85.25, |
| "epoch": 0.014685239175035093, |
| "grad_norm": 4.740475177764893, |
| "kl": 0.047119140625, |
| "learning_rate": 9.989807100350833e-07, |
| "loss": 0.0, |
| "reward": 4.4375, |
| "reward_std": 0.7090140581130981, |
| "rewards/gpt4o_reward_model": 4.4375, |
| "step": 136 |
| }, |
| { |
| "completion_length": 109.5, |
| "epoch": 0.014793218874851528, |
| "grad_norm": 4.925447463989258, |
| "kl": 0.0284423828125, |
| "learning_rate": 9.989649737712478e-07, |
| "loss": 0.0, |
| "reward": 4.0, |
| "reward_std": 0.6144567728042603, |
| "rewards/gpt4o_reward_model": 4.0, |
| "step": 137 |
| }, |
| { |
| "completion_length": 82.75, |
| "epoch": 0.014901198574667962, |
| "grad_norm": 2.8169710636138916, |
| "kl": 0.026611328125, |
| "learning_rate": 9.98949117090164e-07, |
| "loss": 0.0, |
| "reward": 4.9375, |
| "reward_std": 0.1251000016927719, |
| "rewards/gpt4o_reward_model": 4.9375, |
| "step": 138 |
| }, |
| { |
| "completion_length": 147.5, |
| "epoch": 0.015009178274484397, |
| "grad_norm": 4.605223178863525, |
| "kl": 0.032470703125, |
| "learning_rate": 9.989331399956583e-07, |
| "loss": 0.0, |
| "reward": 4.5, |
| "reward_std": 0.508794367313385, |
| "rewards/gpt4o_reward_model": 4.5, |
| "step": 139 |
| }, |
| { |
| "completion_length": 239.75, |
| "epoch": 0.015117157974300832, |
| "grad_norm": 3.3013551235198975, |
| "kl": 0.025634765625, |
| "learning_rate": 9.98917042491587e-07, |
| "loss": 0.0, |
| "reward": 4.375, |
| "reward_std": 0.6231511831283569, |
| "rewards/gpt4o_reward_model": 4.375, |
| "step": 140 |
| }, |
| { |
| "completion_length": 129.75, |
| "epoch": 0.015225137674117265, |
| "grad_norm": 2.192405939102173, |
| "kl": 0.0244140625, |
| "learning_rate": 9.989008245818347e-07, |
| "loss": 0.0, |
| "reward": 4.875, |
| "reward_std": 0.2501000165939331, |
| "rewards/gpt4o_reward_model": 4.875, |
| "step": 141 |
| }, |
| { |
| "completion_length": 253.75, |
| "epoch": 0.0153331173739337, |
| "grad_norm": 3.278887987136841, |
| "kl": 0.02197265625, |
| "learning_rate": 9.988844862703152e-07, |
| "loss": 0.0, |
| "reward": 4.25, |
| "reward_std": 0.508794367313385, |
| "rewards/gpt4o_reward_model": 4.25, |
| "step": 142 |
| }, |
| { |
| "completion_length": 199.0, |
| "epoch": 0.015441097073750135, |
| "grad_norm": 3.9546308517456055, |
| "kl": 0.02001953125, |
| "learning_rate": 9.988680275609717e-07, |
| "loss": 0.0, |
| "reward": 4.0625, |
| "reward_std": 0.3751000165939331, |
| "rewards/gpt4o_reward_model": 4.0625, |
| "step": 143 |
| }, |
| { |
| "completion_length": 179.0, |
| "epoch": 0.015549076773566569, |
| "grad_norm": 5.07952880859375, |
| "kl": 0.032470703125, |
| "learning_rate": 9.988514484577761e-07, |
| "loss": 0.0, |
| "reward": 4.25, |
| "reward_std": 0.8644567728042603, |
| "rewards/gpt4o_reward_model": 4.25, |
| "step": 144 |
| }, |
| { |
| "completion_length": 75.5, |
| "epoch": 0.015657056473383005, |
| "grad_norm": 5.506921291351318, |
| "kl": 0.0361328125, |
| "learning_rate": 9.988347489647298e-07, |
| "loss": 0.0, |
| "reward": 4.1875, |
| "reward_std": 0.5333483219146729, |
| "rewards/gpt4o_reward_model": 4.1875, |
| "step": 145 |
| }, |
| { |
| "completion_length": 209.25, |
| "epoch": 0.015765036173199437, |
| "grad_norm": 4.1503119468688965, |
| "kl": 0.030029296875, |
| "learning_rate": 9.988179290858627e-07, |
| "loss": 0.0, |
| "reward": 4.5, |
| "reward_std": 0.508794367313385, |
| "rewards/gpt4o_reward_model": 4.5, |
| "step": 146 |
| }, |
| { |
| "completion_length": 156.25, |
| "epoch": 0.015873015873015872, |
| "grad_norm": 3.442382335662842, |
| "kl": 0.041015625, |
| "learning_rate": 9.98800988825234e-07, |
| "loss": 0.0, |
| "reward": 4.875, |
| "reward_std": 0.2501000165939331, |
| "rewards/gpt4o_reward_model": 4.875, |
| "step": 147 |
| }, |
| { |
| "completion_length": 103.5, |
| "epoch": 0.015980995572832307, |
| "grad_norm": 3.5556161403656006, |
| "kl": 0.025390625, |
| "learning_rate": 9.987839281869321e-07, |
| "loss": 0.0, |
| "reward": 4.8125, |
| "reward_std": 0.2694375813007355, |
| "rewards/gpt4o_reward_model": 4.8125, |
| "step": 148 |
| }, |
| { |
| "completion_length": 94.5, |
| "epoch": 0.016088975272648742, |
| "grad_norm": 5.813422203063965, |
| "kl": 0.0341796875, |
| "learning_rate": 9.987667471750743e-07, |
| "loss": 0.0, |
| "reward": 4.125, |
| "reward_std": 0.508794367313385, |
| "rewards/gpt4o_reward_model": 4.125, |
| "step": 149 |
| }, |
| { |
| "completion_length": 114.75, |
| "epoch": 0.016196954972465177, |
| "grad_norm": 2.6386795043945312, |
| "kl": 0.026123046875, |
| "learning_rate": 9.987494457938066e-07, |
| "loss": 0.0, |
| "reward": 4.5, |
| "reward_std": 0.28877514600753784, |
| "rewards/gpt4o_reward_model": 4.5, |
| "step": 150 |
| }, |
| { |
| "completion_length": 237.5, |
| "epoch": 0.016304934672281612, |
| "grad_norm": 5.137205123901367, |
| "kl": 0.033447265625, |
| "learning_rate": 9.987320240473049e-07, |
| "loss": 0.0, |
| "reward": 3.5625, |
| "reward_std": 0.6770563125610352, |
| "rewards/gpt4o_reward_model": 3.5625, |
| "step": 151 |
| }, |
| { |
| "completion_length": 61.25, |
| "epoch": 0.016412914372098044, |
| "grad_norm": 2.3743293285369873, |
| "kl": 0.00823974609375, |
| "learning_rate": 9.987144819397735e-07, |
| "loss": 0.0, |
| "reward": 4.1875, |
| "reward_std": 0.6827775835990906, |
| "rewards/gpt4o_reward_model": 4.1875, |
| "step": 152 |
| }, |
| { |
| "completion_length": 130.75, |
| "epoch": 0.01652089407191448, |
| "grad_norm": 3.382657289505005, |
| "kl": 0.032958984375, |
| "learning_rate": 9.98696819475446e-07, |
| "loss": 0.0, |
| "reward": 4.1875, |
| "reward_std": 0.48945680260658264, |
| "rewards/gpt4o_reward_model": 4.1875, |
| "step": 153 |
| }, |
| { |
| "completion_length": 98.75, |
| "epoch": 0.016628873771730914, |
| "grad_norm": 5.266931533813477, |
| "kl": 0.025146484375, |
| "learning_rate": 9.986790366585847e-07, |
| "loss": 0.0, |
| "reward": 4.625, |
| "reward_std": 0.2501000165939331, |
| "rewards/gpt4o_reward_model": 4.625, |
| "step": 154 |
| }, |
| { |
| "completion_length": 170.25, |
| "epoch": 0.01673685347154735, |
| "grad_norm": 4.579905033111572, |
| "kl": 0.03369140625, |
| "learning_rate": 9.986611334934814e-07, |
| "loss": 0.0, |
| "reward": 4.1875, |
| "reward_std": 0.8146764636039734, |
| "rewards/gpt4o_reward_model": 4.1875, |
| "step": 155 |
| }, |
| { |
| "completion_length": 258.25, |
| "epoch": 0.016844833171363784, |
| "grad_norm": 2.3568973541259766, |
| "kl": 0.029052734375, |
| "learning_rate": 9.986431099844567e-07, |
| "loss": 0.0, |
| "reward": 4.6875, |
| "reward_std": 0.2694375813007355, |
| "rewards/gpt4o_reward_model": 4.6875, |
| "step": 156 |
| }, |
| { |
| "completion_length": 205.75, |
| "epoch": 0.01695281287118022, |
| "grad_norm": 4.3519158363342285, |
| "kl": 0.02392578125, |
| "learning_rate": 9.9862496613586e-07, |
| "loss": 0.0, |
| "reward": 3.9375, |
| "reward_std": 0.8421862125396729, |
| "rewards/gpt4o_reward_model": 3.9375, |
| "step": 157 |
| }, |
| { |
| "completion_length": 168.75, |
| "epoch": 0.01706079257099665, |
| "grad_norm": 3.8407411575317383, |
| "kl": 0.0308837890625, |
| "learning_rate": 9.986067019520707e-07, |
| "loss": 0.0, |
| "reward": 3.4375, |
| "reward_std": 0.6251000165939331, |
| "rewards/gpt4o_reward_model": 3.4375, |
| "step": 158 |
| }, |
| { |
| "completion_length": 143.75, |
| "epoch": 0.017168772270813086, |
| "grad_norm": 0.003290753811597824, |
| "kl": 0.0281982421875, |
| "learning_rate": 9.98588317437496e-07, |
| "loss": 0.0, |
| "reward": 2.5, |
| "reward_std": 9.999999747378752e-05, |
| "rewards/gpt4o_reward_model": 2.5, |
| "step": 159 |
| }, |
| { |
| "completion_length": 68.25, |
| "epoch": 0.01727675197062952, |
| "grad_norm": 4.90841817855835, |
| "kl": 0.023193359375, |
| "learning_rate": 9.98569812596573e-07, |
| "loss": 0.0, |
| "reward": 3.875, |
| "reward_std": 0.5001000165939331, |
| "rewards/gpt4o_reward_model": 3.875, |
| "step": 160 |
| }, |
| { |
| "completion_length": 121.75, |
| "epoch": 0.017384731670445956, |
| "grad_norm": 3.6256470680236816, |
| "kl": 0.033935546875, |
| "learning_rate": 9.985511874337672e-07, |
| "loss": 0.0, |
| "reward": 4.9375, |
| "reward_std": 0.1251000016927719, |
| "rewards/gpt4o_reward_model": 4.9375, |
| "step": 161 |
| }, |
| { |
| "completion_length": 118.75, |
| "epoch": 0.01749271137026239, |
| "grad_norm": 2.8302996158599854, |
| "kl": 0.03662109375, |
| "learning_rate": 9.98532441953574e-07, |
| "loss": 0.0, |
| "reward": 4.4375, |
| "reward_std": 0.23945678770542145, |
| "rewards/gpt4o_reward_model": 4.4375, |
| "step": 162 |
| }, |
| { |
| "completion_length": 160.75, |
| "epoch": 0.017600691070078826, |
| "grad_norm": 3.4306154251098633, |
| "kl": 0.027099609375, |
| "learning_rate": 9.985135761605167e-07, |
| "loss": 0.0, |
| "reward": 4.4375, |
| "reward_std": 0.41377514600753784, |
| "rewards/gpt4o_reward_model": 4.4375, |
| "step": 163 |
| }, |
| { |
| "completion_length": 160.5, |
| "epoch": 0.01770867076989526, |
| "grad_norm": 3.66644287109375, |
| "kl": 0.0283203125, |
| "learning_rate": 9.984945900591486e-07, |
| "loss": 0.0, |
| "reward": 4.5625, |
| "reward_std": 0.47356173396110535, |
| "rewards/gpt4o_reward_model": 4.5625, |
| "step": 164 |
| }, |
| { |
| "completion_length": 166.75, |
| "epoch": 0.017816650469711693, |
| "grad_norm": 3.7970166206359863, |
| "kl": 0.0267333984375, |
| "learning_rate": 9.98475483654052e-07, |
| "loss": 0.0, |
| "reward": 4.25, |
| "reward_std": 0.5001000165939331, |
| "rewards/gpt4o_reward_model": 4.25, |
| "step": 165 |
| }, |
| { |
| "completion_length": 272.75, |
| "epoch": 0.017924630169528128, |
| "grad_norm": 4.074111461639404, |
| "kl": 0.033935546875, |
| "learning_rate": 9.984562569498373e-07, |
| "loss": 0.0, |
| "reward": 4.0625, |
| "reward_std": 0.6637752056121826, |
| "rewards/gpt4o_reward_model": 4.0625, |
| "step": 166 |
| }, |
| { |
| "completion_length": 107.5, |
| "epoch": 0.018032609869344563, |
| "grad_norm": 5.892201900482178, |
| "kl": 0.040283203125, |
| "learning_rate": 9.984369099511452e-07, |
| "loss": 0.0, |
| "reward": 3.375, |
| "reward_std": 0.8623477220535278, |
| "rewards/gpt4o_reward_model": 3.375, |
| "step": 167 |
| }, |
| { |
| "completion_length": 93.5, |
| "epoch": 0.018140589569160998, |
| "grad_norm": 3.144777297973633, |
| "kl": 0.02099609375, |
| "learning_rate": 9.984174426626443e-07, |
| "loss": 0.0, |
| "reward": 3.9375, |
| "reward_std": 0.2694375813007355, |
| "rewards/gpt4o_reward_model": 3.9375, |
| "step": 168 |
| }, |
| { |
| "completion_length": 184.5, |
| "epoch": 0.018248569268977433, |
| "grad_norm": 4.250265598297119, |
| "kl": 0.036376953125, |
| "learning_rate": 9.98397855089033e-07, |
| "loss": 0.0, |
| "reward": 4.625, |
| "reward_std": 0.5001000165939331, |
| "rewards/gpt4o_reward_model": 4.625, |
| "step": 169 |
| }, |
| { |
| "completion_length": 25.5, |
| "epoch": 0.01835654896879387, |
| "grad_norm": 4.1492600440979, |
| "kl": 0.033447265625, |
| "learning_rate": 9.983781472350382e-07, |
| "loss": 0.0, |
| "reward": 4.6875, |
| "reward_std": 0.383794367313385, |
| "rewards/gpt4o_reward_model": 4.6875, |
| "step": 170 |
| }, |
| { |
| "completion_length": 219.5, |
| "epoch": 0.0184645286686103, |
| "grad_norm": 3.171717405319214, |
| "kl": 0.0380859375, |
| "learning_rate": 9.983583191054162e-07, |
| "loss": 0.0, |
| "reward": 4.0625, |
| "reward_std": 0.5281319618225098, |
| "rewards/gpt4o_reward_model": 4.0625, |
| "step": 171 |
| }, |
| { |
| "completion_length": 121.75, |
| "epoch": 0.018572508368426735, |
| "grad_norm": 4.934228897094727, |
| "kl": 0.048828125, |
| "learning_rate": 9.983383707049522e-07, |
| "loss": 0.0, |
| "reward": 4.0625, |
| "reward_std": 0.5879185199737549, |
| "rewards/gpt4o_reward_model": 4.0625, |
| "step": 172 |
| }, |
| { |
| "completion_length": 191.0, |
| "epoch": 0.01868048806824317, |
| "grad_norm": 2.89996600151062, |
| "kl": 0.02490234375, |
| "learning_rate": 9.983183020384605e-07, |
| "loss": 0.0, |
| "reward": 3.875, |
| "reward_std": 0.34856173396110535, |
| "rewards/gpt4o_reward_model": 3.875, |
| "step": 173 |
| }, |
| { |
| "completion_length": 215.75, |
| "epoch": 0.018788467768059605, |
| "grad_norm": 3.5805468559265137, |
| "kl": 0.038330078125, |
| "learning_rate": 9.982981131107842e-07, |
| "loss": 0.0, |
| "reward": 3.8125, |
| "reward_std": 0.6229909658432007, |
| "rewards/gpt4o_reward_model": 3.8125, |
| "step": 174 |
| }, |
| { |
| "completion_length": 178.25, |
| "epoch": 0.01889644746787604, |
| "grad_norm": 2.7104833126068115, |
| "kl": 0.04150390625, |
| "learning_rate": 9.982778039267958e-07, |
| "loss": 0.0, |
| "reward": 4.9375, |
| "reward_std": 0.1251000016927719, |
| "rewards/gpt4o_reward_model": 4.9375, |
| "step": 175 |
| }, |
| { |
| "completion_length": 76.5, |
| "epoch": 0.019004427167692475, |
| "grad_norm": 4.9249267578125, |
| "kl": 0.02392578125, |
| "learning_rate": 9.982573744913964e-07, |
| "loss": 0.0, |
| "reward": 4.5625, |
| "reward_std": 0.7694375514984131, |
| "rewards/gpt4o_reward_model": 4.5625, |
| "step": 176 |
| }, |
| { |
| "completion_length": 385.75, |
| "epoch": 0.019112406867508907, |
| "grad_norm": 4.761181354522705, |
| "kl": 0.0703125, |
| "learning_rate": 9.982368248095164e-07, |
| "loss": 0.0001, |
| "reward": 4.6875, |
| "reward_std": 0.5194375514984131, |
| "rewards/gpt4o_reward_model": 4.6875, |
| "step": 177 |
| }, |
| { |
| "completion_length": 101.0, |
| "epoch": 0.019220386567325342, |
| "grad_norm": 3.50384259223938, |
| "kl": 0.042236328125, |
| "learning_rate": 9.982161548861152e-07, |
| "loss": 0.0, |
| "reward": 4.625, |
| "reward_std": 0.5001000165939331, |
| "rewards/gpt4o_reward_model": 4.625, |
| "step": 178 |
| }, |
| { |
| "completion_length": 103.25, |
| "epoch": 0.019328366267141777, |
| "grad_norm": 2.0653398036956787, |
| "kl": 0.0439453125, |
| "learning_rate": 9.98195364726181e-07, |
| "loss": 0.0, |
| "reward": 4.3125, |
| "reward_std": 0.1251000016927719, |
| "rewards/gpt4o_reward_model": 4.3125, |
| "step": 179 |
| }, |
| { |
| "completion_length": 63.75, |
| "epoch": 0.019436345966958212, |
| "grad_norm": 5.203299522399902, |
| "kl": 0.044677734375, |
| "learning_rate": 9.981744543347312e-07, |
| "loss": 0.0, |
| "reward": 3.9375, |
| "reward_std": 0.6251000165939331, |
| "rewards/gpt4o_reward_model": 3.9375, |
| "step": 180 |
| }, |
| { |
| "completion_length": 100.25, |
| "epoch": 0.019544325666774647, |
| "grad_norm": 2.8786332607269287, |
| "kl": 0.04052734375, |
| "learning_rate": 9.981534237168124e-07, |
| "loss": 0.0, |
| "reward": 4.5625, |
| "reward_std": 0.6250999569892883, |
| "rewards/gpt4o_reward_model": 4.5625, |
| "step": 181 |
| }, |
| { |
| "completion_length": 231.5, |
| "epoch": 0.019652305366591082, |
| "grad_norm": 2.2581024169921875, |
| "kl": 0.0284423828125, |
| "learning_rate": 9.981322728774997e-07, |
| "loss": 0.0, |
| "reward": 4.25, |
| "reward_std": 0.776972770690918, |
| "rewards/gpt4o_reward_model": 4.25, |
| "step": 182 |
| }, |
| { |
| "completion_length": 106.5, |
| "epoch": 0.019760285066407514, |
| "grad_norm": 4.273443698883057, |
| "kl": 0.03173828125, |
| "learning_rate": 9.981110018218977e-07, |
| "loss": 0.0, |
| "reward": 4.1875, |
| "reward_std": 0.41377514600753784, |
| "rewards/gpt4o_reward_model": 4.1875, |
| "step": 183 |
| }, |
| { |
| "completion_length": 94.5, |
| "epoch": 0.01986826476622395, |
| "grad_norm": 4.629092693328857, |
| "kl": 0.07568359375, |
| "learning_rate": 9.9808961055514e-07, |
| "loss": 0.0001, |
| "reward": 4.6875, |
| "reward_std": 0.329224169254303, |
| "rewards/gpt4o_reward_model": 4.6875, |
| "step": 184 |
| }, |
| { |
| "completion_length": 148.0, |
| "epoch": 0.019976244466040384, |
| "grad_norm": 3.932042360305786, |
| "kl": 0.03173828125, |
| "learning_rate": 9.980680990823886e-07, |
| "loss": 0.0, |
| "reward": 4.3125, |
| "reward_std": 0.5581127405166626, |
| "rewards/gpt4o_reward_model": 4.3125, |
| "step": 185 |
| }, |
| { |
| "completion_length": 236.25, |
| "epoch": 0.02008422416585682, |
| "grad_norm": 3.403113842010498, |
| "kl": 0.0380859375, |
| "learning_rate": 9.980464674088354e-07, |
| "loss": 0.0, |
| "reward": 4.6875, |
| "reward_std": 0.41377514600753784, |
| "rewards/gpt4o_reward_model": 4.6875, |
| "step": 186 |
| }, |
| { |
| "completion_length": 155.25, |
| "epoch": 0.020192203865673254, |
| "grad_norm": 3.3250508308410645, |
| "kl": 0.05419921875, |
| "learning_rate": 9.980247155397004e-07, |
| "loss": 0.0001, |
| "reward": 4.5, |
| "reward_std": 0.36445680260658264, |
| "rewards/gpt4o_reward_model": 4.5, |
| "step": 187 |
| }, |
| { |
| "completion_length": 336.0, |
| "epoch": 0.02030018356548969, |
| "grad_norm": 2.428285837173462, |
| "kl": 0.03515625, |
| "learning_rate": 9.980028434802334e-07, |
| "loss": 0.0, |
| "reward": 4.75, |
| "reward_std": 0.2501000165939331, |
| "rewards/gpt4o_reward_model": 4.75, |
| "step": 188 |
| }, |
| { |
| "completion_length": 106.75, |
| "epoch": 0.02040816326530612, |
| "grad_norm": 4.417919635772705, |
| "kl": 0.028076171875, |
| "learning_rate": 9.979808512357129e-07, |
| "loss": 0.0, |
| "reward": 4.3125, |
| "reward_std": 0.633794367313385, |
| "rewards/gpt4o_reward_model": 4.3125, |
| "step": 189 |
| }, |
| { |
| "completion_length": 94.75, |
| "epoch": 0.020516142965122556, |
| "grad_norm": 3.8756754398345947, |
| "kl": 0.042724609375, |
| "learning_rate": 9.979587388114464e-07, |
| "loss": 0.0, |
| "reward": 4.75, |
| "reward_std": 0.3944375813007355, |
| "rewards/gpt4o_reward_model": 4.75, |
| "step": 190 |
| }, |
| { |
| "completion_length": 119.75, |
| "epoch": 0.02062412266493899, |
| "grad_norm": 4.661324501037598, |
| "kl": 0.0517578125, |
| "learning_rate": 9.9793650621277e-07, |
| "loss": 0.0001, |
| "reward": 3.8125, |
| "reward_std": 0.829224169254303, |
| "rewards/gpt4o_reward_model": 3.8125, |
| "step": 191 |
| }, |
| { |
| "completion_length": 417.0, |
| "epoch": 0.020732102364755426, |
| "grad_norm": 2.5745935440063477, |
| "kl": 0.0291748046875, |
| "learning_rate": 9.979141534450495e-07, |
| "loss": 0.0, |
| "reward": 4.25, |
| "reward_std": 0.5001000165939331, |
| "rewards/gpt4o_reward_model": 4.25, |
| "step": 192 |
| }, |
| { |
| "completion_length": 178.5, |
| "epoch": 0.02084008206457186, |
| "grad_norm": 1.9205126762390137, |
| "kl": 0.041259765625, |
| "learning_rate": 9.978916805136794e-07, |
| "loss": 0.0, |
| "reward": 4.9375, |
| "reward_std": 0.1251000016927719, |
| "rewards/gpt4o_reward_model": 4.9375, |
| "step": 193 |
| }, |
| { |
| "completion_length": 223.5, |
| "epoch": 0.020948061764388296, |
| "grad_norm": 3.269942283630371, |
| "kl": 0.045166015625, |
| "learning_rate": 9.97869087424083e-07, |
| "loss": 0.0, |
| "reward": 4.1875, |
| "reward_std": 0.6229909658432007, |
| "rewards/gpt4o_reward_model": 4.1875, |
| "step": 194 |
| }, |
| { |
| "completion_length": 101.25, |
| "epoch": 0.02105604146420473, |
| "grad_norm": 4.042669296264648, |
| "kl": 0.0498046875, |
| "learning_rate": 9.97846374181713e-07, |
| "loss": 0.0, |
| "reward": 4.5, |
| "reward_std": 0.7479909658432007, |
| "rewards/gpt4o_reward_model": 4.5, |
| "step": 195 |
| }, |
| { |
| "completion_length": 124.0, |
| "epoch": 0.021164021164021163, |
| "grad_norm": 4.541233539581299, |
| "kl": 0.0478515625, |
| "learning_rate": 9.978235407920506e-07, |
| "loss": 0.0, |
| "reward": 4.0, |
| "reward_std": 0.5985617637634277, |
| "rewards/gpt4o_reward_model": 4.0, |
| "step": 196 |
| }, |
| { |
| "completion_length": 218.25, |
| "epoch": 0.021272000863837598, |
| "grad_norm": 3.685286521911621, |
| "kl": 0.03369140625, |
| "learning_rate": 9.978005872606065e-07, |
| "loss": 0.0, |
| "reward": 4.5, |
| "reward_std": 0.6231511831283569, |
| "rewards/gpt4o_reward_model": 4.5, |
| "step": 197 |
| }, |
| { |
| "completion_length": 301.5, |
| "epoch": 0.021379980563654033, |
| "grad_norm": 2.64717173576355, |
| "kl": 0.0478515625, |
| "learning_rate": 9.977775135929202e-07, |
| "loss": 0.0, |
| "reward": 4.8125, |
| "reward_std": 0.2694375813007355, |
| "rewards/gpt4o_reward_model": 4.8125, |
| "step": 198 |
| }, |
| { |
| "completion_length": 101.5, |
| "epoch": 0.021487960263470468, |
| "grad_norm": 3.8811116218566895, |
| "kl": 0.04248046875, |
| "learning_rate": 9.977543197945599e-07, |
| "loss": 0.0, |
| "reward": 4.4375, |
| "reward_std": 0.6251000165939331, |
| "rewards/gpt4o_reward_model": 4.4375, |
| "step": 199 |
| }, |
| { |
| "completion_length": 110.25, |
| "epoch": 0.021595939963286903, |
| "grad_norm": 4.5807342529296875, |
| "kl": 0.037109375, |
| "learning_rate": 9.977310058711235e-07, |
| "loss": 0.0, |
| "reward": 3.75, |
| "reward_std": 0.704224169254303, |
| "rewards/gpt4o_reward_model": 3.75, |
| "step": 200 |
| }, |
| { |
| "completion_length": 207.5, |
| "epoch": 0.021703919663103338, |
| "grad_norm": 5.462955951690674, |
| "kl": 0.09619140625, |
| "learning_rate": 9.97707571828237e-07, |
| "loss": 0.0001, |
| "reward": 4.5625, |
| "reward_std": 0.48945680260658264, |
| "rewards/gpt4o_reward_model": 4.5625, |
| "step": 201 |
| }, |
| { |
| "completion_length": 138.5, |
| "epoch": 0.02181189936291977, |
| "grad_norm": 6.441577911376953, |
| "kl": 0.0478515625, |
| "learning_rate": 9.97684017671556e-07, |
| "loss": 0.0, |
| "reward": 3.4375, |
| "reward_std": 0.9414719343185425, |
| "rewards/gpt4o_reward_model": 3.4375, |
| "step": 202 |
| }, |
| { |
| "completion_length": 140.0, |
| "epoch": 0.021919879062736205, |
| "grad_norm": 3.3736348152160645, |
| "kl": 0.037841796875, |
| "learning_rate": 9.97660343406765e-07, |
| "loss": 0.0, |
| "reward": 4.0, |
| "reward_std": 0.8872368931770325, |
| "rewards/gpt4o_reward_model": 4.0, |
| "step": 203 |
| }, |
| { |
| "completion_length": 172.25, |
| "epoch": 0.02202785876255264, |
| "grad_norm": 3.9529149532318115, |
| "kl": 0.044921875, |
| "learning_rate": 9.97636549039577e-07, |
| "loss": 0.0, |
| "reward": 4.625, |
| "reward_std": 0.3944375813007355, |
| "rewards/gpt4o_reward_model": 4.625, |
| "step": 204 |
| }, |
| { |
| "completion_length": 667.25, |
| "epoch": 0.022135838462369075, |
| "grad_norm": 4.095589637756348, |
| "kl": 0.046630859375, |
| "learning_rate": 9.976126345757351e-07, |
| "loss": 0.0, |
| "reward": 3.625, |
| "reward_std": 0.454224169254303, |
| "rewards/gpt4o_reward_model": 3.625, |
| "step": 205 |
| }, |
| { |
| "completion_length": 187.0, |
| "epoch": 0.02224381816218551, |
| "grad_norm": 5.007861137390137, |
| "kl": 0.04150390625, |
| "learning_rate": 9.975886000210102e-07, |
| "loss": 0.0, |
| "reward": 4.4375, |
| "reward_std": 0.7286534309387207, |
| "rewards/gpt4o_reward_model": 4.4375, |
| "step": 206 |
| }, |
| { |
| "completion_length": 37.0, |
| "epoch": 0.022351797862001945, |
| "grad_norm": 5.55353307723999, |
| "kl": 0.0673828125, |
| "learning_rate": 9.975644453812028e-07, |
| "loss": 0.0001, |
| "reward": 4.0625, |
| "reward_std": 0.633794367313385, |
| "rewards/gpt4o_reward_model": 4.0625, |
| "step": 207 |
| }, |
| { |
| "completion_length": 210.75, |
| "epoch": 0.022459777561818377, |
| "grad_norm": 3.5840399265289307, |
| "kl": 0.06494140625, |
| "learning_rate": 9.97540170662142e-07, |
| "loss": 0.0001, |
| "reward": 4.5625, |
| "reward_std": 0.48945680260658264, |
| "rewards/gpt4o_reward_model": 4.5625, |
| "step": 208 |
| }, |
| { |
| "completion_length": 75.0, |
| "epoch": 0.022567757261634812, |
| "grad_norm": 4.6481828689575195, |
| "kl": 0.048095703125, |
| "learning_rate": 9.975157758696866e-07, |
| "loss": 0.0, |
| "reward": 3.8125, |
| "reward_std": 0.9524502754211426, |
| "rewards/gpt4o_reward_model": 3.8125, |
| "step": 209 |
| }, |
| { |
| "completion_length": 179.75, |
| "epoch": 0.022675736961451247, |
| "grad_norm": 3.486043691635132, |
| "kl": 0.036376953125, |
| "learning_rate": 9.974912610097235e-07, |
| "loss": 0.0, |
| "reward": 4.125, |
| "reward_std": 0.6036534309387207, |
| "rewards/gpt4o_reward_model": 4.125, |
| "step": 210 |
| }, |
| { |
| "completion_length": 146.25, |
| "epoch": 0.022783716661267682, |
| "grad_norm": 2.9825496673583984, |
| "kl": 0.0673828125, |
| "learning_rate": 9.97466626088169e-07, |
| "loss": 0.0001, |
| "reward": 4.625, |
| "reward_std": 0.36445680260658264, |
| "rewards/gpt4o_reward_model": 4.625, |
| "step": 211 |
| }, |
| { |
| "completion_length": 188.0, |
| "epoch": 0.022891696361084117, |
| "grad_norm": 5.593983173370361, |
| "kl": 0.05615234375, |
| "learning_rate": 9.974418711109684e-07, |
| "loss": 0.0001, |
| "reward": 4.125, |
| "reward_std": 0.7974694967269897, |
| "rewards/gpt4o_reward_model": 4.125, |
| "step": 212 |
| }, |
| { |
| "completion_length": 265.25, |
| "epoch": 0.022999676060900552, |
| "grad_norm": 4.792067527770996, |
| "kl": 0.038330078125, |
| "learning_rate": 9.97416996084096e-07, |
| "loss": 0.0, |
| "reward": 4.125, |
| "reward_std": 0.6144567728042603, |
| "rewards/gpt4o_reward_model": 4.125, |
| "step": 213 |
| }, |
| { |
| "completion_length": 131.25, |
| "epoch": 0.023107655760716984, |
| "grad_norm": 5.125412940979004, |
| "kl": 0.041259765625, |
| "learning_rate": 9.973920010135547e-07, |
| "loss": 0.0, |
| "reward": 3.75, |
| "reward_std": 0.508794367313385, |
| "rewards/gpt4o_reward_model": 3.75, |
| "step": 214 |
| }, |
| { |
| "completion_length": 397.5, |
| "epoch": 0.02321563546053342, |
| "grad_norm": 4.1176910400390625, |
| "kl": 0.0517578125, |
| "learning_rate": 9.973668859053772e-07, |
| "loss": 0.0001, |
| "reward": 4.5625, |
| "reward_std": 0.5194375514984131, |
| "rewards/gpt4o_reward_model": 4.5625, |
| "step": 215 |
| }, |
| { |
| "completion_length": 145.25, |
| "epoch": 0.023323615160349854, |
| "grad_norm": 6.796933650970459, |
| "kl": 0.134765625, |
| "learning_rate": 9.973416507656243e-07, |
| "loss": 0.0001, |
| "reward": 4.4375, |
| "reward_std": 0.6637752056121826, |
| "rewards/gpt4o_reward_model": 4.4375, |
| "step": 216 |
| }, |
| { |
| "completion_length": 204.75, |
| "epoch": 0.02343159486016629, |
| "grad_norm": 2.643352746963501, |
| "kl": 0.049072265625, |
| "learning_rate": 9.97316295600386e-07, |
| "loss": 0.0, |
| "reward": 4.0, |
| "reward_std": 0.28877514600753784, |
| "rewards/gpt4o_reward_model": 4.0, |
| "step": 217 |
| }, |
| { |
| "completion_length": 73.0, |
| "epoch": 0.023539574559982724, |
| "grad_norm": 1.9597053527832031, |
| "kl": 0.0390625, |
| "learning_rate": 9.972908204157815e-07, |
| "loss": 0.0, |
| "reward": 4.4375, |
| "reward_std": 0.3751000165939331, |
| "rewards/gpt4o_reward_model": 4.4375, |
| "step": 218 |
| }, |
| { |
| "completion_length": 317.5, |
| "epoch": 0.02364755425979916, |
| "grad_norm": 3.215393304824829, |
| "kl": 0.033447265625, |
| "learning_rate": 9.972652252179589e-07, |
| "loss": 0.0, |
| "reward": 3.625, |
| "reward_std": 0.3944375813007355, |
| "rewards/gpt4o_reward_model": 3.625, |
| "step": 219 |
| }, |
| { |
| "completion_length": 168.75, |
| "epoch": 0.02375553395961559, |
| "grad_norm": 2.8032920360565186, |
| "kl": 0.052001953125, |
| "learning_rate": 9.97239510013095e-07, |
| "loss": 0.0001, |
| "reward": 4.875, |
| "reward_std": 0.2501000165939331, |
| "rewards/gpt4o_reward_model": 4.875, |
| "step": 220 |
| }, |
| { |
| "completion_length": 118.75, |
| "epoch": 0.023863513659432026, |
| "grad_norm": 3.416057586669922, |
| "kl": 0.0537109375, |
| "learning_rate": 9.97213674807396e-07, |
| "loss": 0.0001, |
| "reward": 4.875, |
| "reward_std": 0.2501000165939331, |
| "rewards/gpt4o_reward_model": 4.875, |
| "step": 221 |
| }, |
| { |
| "completion_length": 88.5, |
| "epoch": 0.02397149335924846, |
| "grad_norm": 3.6612114906311035, |
| "kl": 0.05908203125, |
| "learning_rate": 9.971877196070967e-07, |
| "loss": 0.0001, |
| "reward": 4.5, |
| "reward_std": 0.6144567728042603, |
| "rewards/gpt4o_reward_model": 4.5, |
| "step": 222 |
| }, |
| { |
| "completion_length": 117.5, |
| "epoch": 0.024079473059064896, |
| "grad_norm": 2.947577714920044, |
| "kl": 0.04345703125, |
| "learning_rate": 9.971616444184607e-07, |
| "loss": 0.0, |
| "reward": 4.25, |
| "reward_std": 0.6036534309387207, |
| "rewards/gpt4o_reward_model": 4.25, |
| "step": 223 |
| }, |
| { |
| "completion_length": 155.0, |
| "epoch": 0.02418745275888133, |
| "grad_norm": 2.8114304542541504, |
| "kl": 0.048583984375, |
| "learning_rate": 9.971354492477812e-07, |
| "loss": 0.0, |
| "reward": 4.625, |
| "reward_std": 0.36445680260658264, |
| "rewards/gpt4o_reward_model": 4.625, |
| "step": 224 |
| }, |
| { |
| "completion_length": 105.75, |
| "epoch": 0.024295432458697766, |
| "grad_norm": 2.7532522678375244, |
| "kl": 0.05224609375, |
| "learning_rate": 9.9710913410138e-07, |
| "loss": 0.0001, |
| "reward": 4.5625, |
| "reward_std": 0.3751000165939331, |
| "rewards/gpt4o_reward_model": 4.5625, |
| "step": 225 |
| }, |
| { |
| "completion_length": 251.25, |
| "epoch": 0.024403412158514198, |
| "grad_norm": 3.266697883605957, |
| "kl": 0.052001953125, |
| "learning_rate": 9.970826989856076e-07, |
| "loss": 0.0001, |
| "reward": 4.6875, |
| "reward_std": 0.41377514600753784, |
| "rewards/gpt4o_reward_model": 4.6875, |
| "step": 226 |
| }, |
| { |
| "completion_length": 199.5, |
| "epoch": 0.024511391858330633, |
| "grad_norm": 3.5519814491271973, |
| "kl": 0.05908203125, |
| "learning_rate": 9.970561439068438e-07, |
| "loss": 0.0001, |
| "reward": 3.9375, |
| "reward_std": 0.48945680260658264, |
| "rewards/gpt4o_reward_model": 3.9375, |
| "step": 227 |
| }, |
| { |
| "completion_length": 61.25, |
| "epoch": 0.024619371558147068, |
| "grad_norm": 3.1293420791625977, |
| "kl": 0.07080078125, |
| "learning_rate": 9.970294688714975e-07, |
| "loss": 0.0001, |
| "reward": 4.6875, |
| "reward_std": 0.1251000016927719, |
| "rewards/gpt4o_reward_model": 4.6875, |
| "step": 228 |
| }, |
| { |
| "completion_length": 207.25, |
| "epoch": 0.024727351257963503, |
| "grad_norm": 4.59749174118042, |
| "kl": 0.04931640625, |
| "learning_rate": 9.970026738860058e-07, |
| "loss": 0.0, |
| "reward": 3.96875, |
| "reward_std": 0.8234953880310059, |
| "rewards/gpt4o_reward_model": 3.96875, |
| "step": 229 |
| }, |
| { |
| "completion_length": 137.0, |
| "epoch": 0.024835330957779938, |
| "grad_norm": 3.6132426261901855, |
| "kl": 0.04833984375, |
| "learning_rate": 9.969757589568354e-07, |
| "loss": 0.0, |
| "reward": 4.25, |
| "reward_std": 0.3944375813007355, |
| "rewards/gpt4o_reward_model": 4.25, |
| "step": 230 |
| }, |
| { |
| "completion_length": 115.75, |
| "epoch": 0.024943310657596373, |
| "grad_norm": 4.572238922119141, |
| "kl": 0.06201171875, |
| "learning_rate": 9.96948724090482e-07, |
| "loss": 0.0001, |
| "reward": 4.5, |
| "reward_std": 0.5774502754211426, |
| "rewards/gpt4o_reward_model": 4.5, |
| "step": 231 |
| }, |
| { |
| "completion_length": 49.5, |
| "epoch": 0.025051290357412808, |
| "grad_norm": 3.109471559524536, |
| "kl": 0.0751953125, |
| "learning_rate": 9.969215692934702e-07, |
| "loss": 0.0001, |
| "reward": 4.625, |
| "reward_std": 0.4331127107143402, |
| "rewards/gpt4o_reward_model": 4.625, |
| "step": 232 |
| }, |
| { |
| "completion_length": 336.0, |
| "epoch": 0.02515927005722924, |
| "grad_norm": 4.591429710388184, |
| "kl": 0.05322265625, |
| "learning_rate": 9.968942945723529e-07, |
| "loss": 0.0001, |
| "reward": 4.375, |
| "reward_std": 0.8185809850692749, |
| "rewards/gpt4o_reward_model": 4.375, |
| "step": 233 |
| }, |
| { |
| "completion_length": 174.25, |
| "epoch": 0.025267249757045675, |
| "grad_norm": 3.161344289779663, |
| "kl": 0.06103515625, |
| "learning_rate": 9.968668999337124e-07, |
| "loss": 0.0001, |
| "reward": 4.75, |
| "reward_std": 0.3944375813007355, |
| "rewards/gpt4o_reward_model": 4.75, |
| "step": 234 |
| }, |
| { |
| "completion_length": 117.75, |
| "epoch": 0.02537522945686211, |
| "grad_norm": 3.8266708850860596, |
| "kl": 0.05615234375, |
| "learning_rate": 9.968393853841605e-07, |
| "loss": 0.0001, |
| "reward": 4.0625, |
| "reward_std": 0.23945678770542145, |
| "rewards/gpt4o_reward_model": 4.0625, |
| "step": 235 |
| }, |
| { |
| "completion_length": 144.75, |
| "epoch": 0.025483209156678545, |
| "grad_norm": 3.3600192070007324, |
| "kl": 0.05712890625, |
| "learning_rate": 9.96811750930337e-07, |
| "loss": 0.0001, |
| "reward": 4.125, |
| "reward_std": 0.9463939070701599, |
| "rewards/gpt4o_reward_model": 4.125, |
| "step": 236 |
| }, |
| { |
| "completion_length": 260.5, |
| "epoch": 0.02559118885649498, |
| "grad_norm": 4.064725875854492, |
| "kl": 0.05859375, |
| "learning_rate": 9.96783996578911e-07, |
| "loss": 0.0001, |
| "reward": 4.75, |
| "reward_std": 0.3944375813007355, |
| "rewards/gpt4o_reward_model": 4.75, |
| "step": 237 |
| }, |
| { |
| "completion_length": 151.5, |
| "epoch": 0.025699168556311415, |
| "grad_norm": 2.2150022983551025, |
| "kl": 0.04931640625, |
| "learning_rate": 9.967561223365806e-07, |
| "loss": 0.0, |
| "reward": 4.875, |
| "reward_std": 0.14443756639957428, |
| "rewards/gpt4o_reward_model": 4.875, |
| "step": 238 |
| }, |
| { |
| "completion_length": 140.5, |
| "epoch": 0.025807148256127847, |
| "grad_norm": 3.777392625808716, |
| "kl": 0.09326171875, |
| "learning_rate": 9.96728128210073e-07, |
| "loss": 0.0001, |
| "reward": 4.625, |
| "reward_std": 0.5387751460075378, |
| "rewards/gpt4o_reward_model": 4.625, |
| "step": 239 |
| }, |
| { |
| "completion_length": 179.5, |
| "epoch": 0.02591512795594428, |
| "grad_norm": 5.373637676239014, |
| "kl": 0.0849609375, |
| "learning_rate": 9.967000142061439e-07, |
| "loss": 0.0001, |
| "reward": 4.3125, |
| "reward_std": 0.5194375514984131, |
| "rewards/gpt4o_reward_model": 4.3125, |
| "step": 240 |
| }, |
| { |
| "completion_length": 133.0, |
| "epoch": 0.026023107655760717, |
| "grad_norm": 1.9456915855407715, |
| "kl": 0.061767578125, |
| "learning_rate": 9.966717803315785e-07, |
| "loss": 0.0001, |
| "reward": 4.3125, |
| "reward_std": 0.1251000016927719, |
| "rewards/gpt4o_reward_model": 4.3125, |
| "step": 241 |
| }, |
| { |
| "completion_length": 335.0, |
| "epoch": 0.026131087355577152, |
| "grad_norm": 3.1238064765930176, |
| "kl": 0.04443359375, |
| "learning_rate": 9.966434265931902e-07, |
| "loss": 0.0, |
| "reward": 4.8125, |
| "reward_std": 0.3751000165939331, |
| "rewards/gpt4o_reward_model": 4.8125, |
| "step": 242 |
| }, |
| { |
| "completion_length": 186.5, |
| "epoch": 0.026239067055393587, |
| "grad_norm": 4.6672492027282715, |
| "kl": 0.0595703125, |
| "learning_rate": 9.966149529978221e-07, |
| "loss": 0.0001, |
| "reward": 4.5625, |
| "reward_std": 0.5194375514984131, |
| "rewards/gpt4o_reward_model": 4.5625, |
| "step": 243 |
| }, |
| { |
| "completion_length": 192.25, |
| "epoch": 0.026347046755210022, |
| "grad_norm": 3.2580738067626953, |
| "kl": 0.052001953125, |
| "learning_rate": 9.965863595523454e-07, |
| "loss": 0.0001, |
| "reward": 4.6875, |
| "reward_std": 0.41377514600753784, |
| "rewards/gpt4o_reward_model": 4.6875, |
| "step": 244 |
| }, |
| { |
| "completion_length": 152.75, |
| "epoch": 0.026455026455026454, |
| "grad_norm": 4.204739570617676, |
| "kl": 0.0771484375, |
| "learning_rate": 9.96557646263661e-07, |
| "loss": 0.0001, |
| "reward": 4.1875, |
| "reward_std": 0.41377514600753784, |
| "rewards/gpt4o_reward_model": 4.1875, |
| "step": 245 |
| }, |
| { |
| "completion_length": 120.0, |
| "epoch": 0.02656300615484289, |
| "grad_norm": 4.965406894683838, |
| "kl": 0.055419921875, |
| "learning_rate": 9.965288131386984e-07, |
| "loss": 0.0001, |
| "reward": 4.3125, |
| "reward_std": 1.0930101871490479, |
| "rewards/gpt4o_reward_model": 4.3125, |
| "step": 246 |
| }, |
| { |
| "completion_length": 67.25, |
| "epoch": 0.026670985854659324, |
| "grad_norm": 3.763925552368164, |
| "kl": 0.07470703125, |
| "learning_rate": 9.964998601844158e-07, |
| "loss": 0.0001, |
| "reward": 4.4375, |
| "reward_std": 0.6251000165939331, |
| "rewards/gpt4o_reward_model": 4.4375, |
| "step": 247 |
| }, |
| { |
| "completion_length": 81.5, |
| "epoch": 0.02677896555447576, |
| "grad_norm": 9.182212829589844, |
| "kl": 0.055908203125, |
| "learning_rate": 9.96470787407801e-07, |
| "loss": 0.0001, |
| "reward": 4.0625, |
| "reward_std": 0.2694375813007355, |
| "rewards/gpt4o_reward_model": 4.0625, |
| "step": 248 |
| }, |
| { |
| "completion_length": 97.0, |
| "epoch": 0.026886945254292194, |
| "grad_norm": 2.3663017749786377, |
| "kl": 0.04931640625, |
| "learning_rate": 9.964415948158696e-07, |
| "loss": 0.0, |
| "reward": 4.875, |
| "reward_std": 0.2501000165939331, |
| "rewards/gpt4o_reward_model": 4.875, |
| "step": 249 |
| }, |
| { |
| "completion_length": 100.75, |
| "epoch": 0.02699492495410863, |
| "grad_norm": 3.542591094970703, |
| "kl": 0.08837890625, |
| "learning_rate": 9.964122824156672e-07, |
| "loss": 0.0001, |
| "reward": 4.5625, |
| "reward_std": 0.383794367313385, |
| "rewards/gpt4o_reward_model": 4.5625, |
| "step": 250 |
| }, |
| { |
| "completion_length": 178.75, |
| "epoch": 0.02710290465392506, |
| "grad_norm": 3.1234817504882812, |
| "kl": 0.061279296875, |
| "learning_rate": 9.963828502142677e-07, |
| "loss": 0.0001, |
| "reward": 4.8125, |
| "reward_std": 0.3751000165939331, |
| "rewards/gpt4o_reward_model": 4.8125, |
| "step": 251 |
| }, |
| { |
| "completion_length": 146.0, |
| "epoch": 0.027210884353741496, |
| "grad_norm": 3.096550226211548, |
| "kl": 0.060546875, |
| "learning_rate": 9.963532982187743e-07, |
| "loss": 0.0001, |
| "reward": 4.5, |
| "reward_std": 0.3944375813007355, |
| "rewards/gpt4o_reward_model": 4.5, |
| "step": 252 |
| }, |
| { |
| "completion_length": 339.75, |
| "epoch": 0.02731886405355793, |
| "grad_norm": 4.885226726531982, |
| "kl": 0.050048828125, |
| "learning_rate": 9.963236264363188e-07, |
| "loss": 0.0001, |
| "reward": 3.875, |
| "reward_std": 0.5985617637634277, |
| "rewards/gpt4o_reward_model": 3.875, |
| "step": 253 |
| }, |
| { |
| "completion_length": 141.75, |
| "epoch": 0.027426843753374366, |
| "grad_norm": 4.643560886383057, |
| "kl": 0.09033203125, |
| "learning_rate": 9.962938348740617e-07, |
| "loss": 0.0001, |
| "reward": 4.3125, |
| "reward_std": 0.633794367313385, |
| "rewards/gpt4o_reward_model": 4.3125, |
| "step": 254 |
| }, |
| { |
| "completion_length": 150.75, |
| "epoch": 0.0275348234531908, |
| "grad_norm": 8.297577857971191, |
| "kl": 0.1337890625, |
| "learning_rate": 9.962639235391932e-07, |
| "loss": 0.0001, |
| "reward": 4.1875, |
| "reward_std": 0.2694375813007355, |
| "rewards/gpt4o_reward_model": 4.1875, |
| "step": 255 |
| }, |
| { |
| "completion_length": 122.5, |
| "epoch": 0.027642803153007236, |
| "grad_norm": 3.3994829654693604, |
| "kl": 0.055419921875, |
| "learning_rate": 9.962338924389318e-07, |
| "loss": 0.0001, |
| "reward": 4.4375, |
| "reward_std": 0.48945680260658264, |
| "rewards/gpt4o_reward_model": 4.4375, |
| "step": 256 |
| }, |
| { |
| "completion_length": 166.0, |
| "epoch": 0.027750782852823667, |
| "grad_norm": 4.4432244300842285, |
| "kl": 0.0712890625, |
| "learning_rate": 9.962037415805248e-07, |
| "loss": 0.0001, |
| "reward": 4.625, |
| "reward_std": 0.6444375514984131, |
| "rewards/gpt4o_reward_model": 4.625, |
| "step": 257 |
| }, |
| { |
| "completion_length": 104.0, |
| "epoch": 0.027858762552640103, |
| "grad_norm": 4.164874076843262, |
| "kl": 0.0546875, |
| "learning_rate": 9.961734709712488e-07, |
| "loss": 0.0001, |
| "reward": 3.625, |
| "reward_std": 0.9396764636039734, |
| "rewards/gpt4o_reward_model": 3.625, |
| "step": 258 |
| }, |
| { |
| "completion_length": 334.25, |
| "epoch": 0.027966742252456538, |
| "grad_norm": 5.587090015411377, |
| "kl": 0.08642578125, |
| "learning_rate": 9.961430806184093e-07, |
| "loss": 0.0001, |
| "reward": 4.4375, |
| "reward_std": 0.5921862125396729, |
| "rewards/gpt4o_reward_model": 4.4375, |
| "step": 259 |
| }, |
| { |
| "completion_length": 50.25, |
| "epoch": 0.028074721952272973, |
| "grad_norm": 3.80084490776062, |
| "kl": 0.0908203125, |
| "learning_rate": 9.9611257052934e-07, |
| "loss": 0.0001, |
| "reward": 4.625, |
| "reward_std": 0.4331127107143402, |
| "rewards/gpt4o_reward_model": 4.625, |
| "step": 260 |
| }, |
| { |
| "completion_length": 216.5, |
| "epoch": 0.028182701652089408, |
| "grad_norm": 5.016978740692139, |
| "kl": 0.05322265625, |
| "learning_rate": 9.960819407114046e-07, |
| "loss": 0.0001, |
| "reward": 4.375, |
| "reward_std": 0.6896764636039734, |
| "rewards/gpt4o_reward_model": 4.375, |
| "step": 261 |
| }, |
| { |
| "completion_length": 45.0, |
| "epoch": 0.028290681351905843, |
| "grad_norm": 9.00546932220459, |
| "kl": 0.0830078125, |
| "learning_rate": 9.960511911719949e-07, |
| "loss": 0.0001, |
| "reward": 3.3125, |
| "reward_std": 0.9137751460075378, |
| "rewards/gpt4o_reward_model": 3.3125, |
| "step": 262 |
| }, |
| { |
| "completion_length": 113.5, |
| "epoch": 0.028398661051722278, |
| "grad_norm": 2.5115816593170166, |
| "kl": 0.0673828125, |
| "learning_rate": 9.960203219185314e-07, |
| "loss": 0.0001, |
| "reward": 4.8125, |
| "reward_std": 0.1251000016927719, |
| "rewards/gpt4o_reward_model": 4.8125, |
| "step": 263 |
| }, |
| { |
| "completion_length": 466.75, |
| "epoch": 0.02850664075153871, |
| "grad_norm": 4.739973545074463, |
| "kl": 0.07861328125, |
| "learning_rate": 9.959893329584647e-07, |
| "loss": 0.0001, |
| "reward": 4.1875, |
| "reward_std": 0.633794367313385, |
| "rewards/gpt4o_reward_model": 4.1875, |
| "step": 264 |
| }, |
| { |
| "completion_length": 188.25, |
| "epoch": 0.028614620451355145, |
| "grad_norm": 3.5349810123443604, |
| "kl": 0.083984375, |
| "learning_rate": 9.95958224299273e-07, |
| "loss": 0.0001, |
| "reward": 4.75, |
| "reward_std": 0.5001000165939331, |
| "rewards/gpt4o_reward_model": 4.75, |
| "step": 265 |
| }, |
| { |
| "completion_length": 79.0, |
| "epoch": 0.02872260015117158, |
| "grad_norm": 4.118009567260742, |
| "kl": 0.055908203125, |
| "learning_rate": 9.95926995948464e-07, |
| "loss": 0.0001, |
| "reward": 4.4375, |
| "reward_std": 0.7286534309387207, |
| "rewards/gpt4o_reward_model": 4.4375, |
| "step": 266 |
| }, |
| { |
| "completion_length": 63.25, |
| "epoch": 0.028830579850988015, |
| "grad_norm": 4.015948295593262, |
| "kl": 0.111328125, |
| "learning_rate": 9.95895647913574e-07, |
| "loss": 0.0001, |
| "reward": 4.8125, |
| "reward_std": 0.2694375813007355, |
| "rewards/gpt4o_reward_model": 4.8125, |
| "step": 267 |
| }, |
| { |
| "completion_length": 85.75, |
| "epoch": 0.02893855955080445, |
| "grad_norm": 4.853457927703857, |
| "kl": 0.07763671875, |
| "learning_rate": 9.958641802021685e-07, |
| "loss": 0.0001, |
| "reward": 4.4375, |
| "reward_std": 0.6038135886192322, |
| "rewards/gpt4o_reward_model": 4.4375, |
| "step": 268 |
| }, |
| { |
| "completion_length": 103.0, |
| "epoch": 0.029046539250620885, |
| "grad_norm": 2.9576172828674316, |
| "kl": 0.05126953125, |
| "learning_rate": 9.958325928218419e-07, |
| "loss": 0.0001, |
| "reward": 4.125, |
| "reward_std": 0.4788135886192322, |
| "rewards/gpt4o_reward_model": 4.125, |
| "step": 269 |
| }, |
| { |
| "completion_length": 192.0, |
| "epoch": 0.029154518950437316, |
| "grad_norm": 3.628898859024048, |
| "kl": 0.0869140625, |
| "learning_rate": 9.958008857802169e-07, |
| "loss": 0.0001, |
| "reward": 4.8125, |
| "reward_std": 0.3751000165939331, |
| "rewards/gpt4o_reward_model": 4.8125, |
| "step": 270 |
| }, |
| { |
| "completion_length": 321.0, |
| "epoch": 0.02926249865025375, |
| "grad_norm": 3.3045108318328857, |
| "kl": 0.06982421875, |
| "learning_rate": 9.957690590849457e-07, |
| "loss": 0.0001, |
| "reward": 4.6875, |
| "reward_std": 0.5194375514984131, |
| "rewards/gpt4o_reward_model": 4.6875, |
| "step": 271 |
| }, |
| { |
| "completion_length": 123.5, |
| "epoch": 0.029370478350070187, |
| "grad_norm": 14.728436470031738, |
| "kl": 6.5, |
| "learning_rate": 9.957371127437093e-07, |
| "loss": 0.0065, |
| "reward": 3.75, |
| "reward_std": 0.8185809850692749, |
| "rewards/gpt4o_reward_model": 3.75, |
| "step": 272 |
| }, |
| { |
| "completion_length": 169.5, |
| "epoch": 0.02947845804988662, |
| "grad_norm": 2.554886817932129, |
| "kl": 0.1025390625, |
| "learning_rate": 9.957050467642172e-07, |
| "loss": 0.0001, |
| "reward": 4.875, |
| "reward_std": 0.2501000165939331, |
| "rewards/gpt4o_reward_model": 4.875, |
| "step": 273 |
| }, |
| { |
| "completion_length": 119.25, |
| "epoch": 0.029586437749703057, |
| "grad_norm": 2.4691953659057617, |
| "kl": 0.0673828125, |
| "learning_rate": 9.956728611542082e-07, |
| "loss": 0.0001, |
| "reward": 4.9375, |
| "reward_std": 0.1251000016927719, |
| "rewards/gpt4o_reward_model": 4.9375, |
| "step": 274 |
| }, |
| { |
| "completion_length": 181.0, |
| "epoch": 0.029694417449519492, |
| "grad_norm": 4.251060485839844, |
| "kl": 0.0791015625, |
| "learning_rate": 9.956405559214498e-07, |
| "loss": 0.0001, |
| "reward": 4.5, |
| "reward_std": 0.6144567728042603, |
| "rewards/gpt4o_reward_model": 4.5, |
| "step": 275 |
| }, |
| { |
| "completion_length": 181.25, |
| "epoch": 0.029802397149335923, |
| "grad_norm": 3.3034920692443848, |
| "kl": 0.07958984375, |
| "learning_rate": 9.956081310737382e-07, |
| "loss": 0.0001, |
| "reward": 4.5625, |
| "reward_std": 0.47356173396110535, |
| "rewards/gpt4o_reward_model": 4.5625, |
| "step": 276 |
| }, |
| { |
| "completion_length": 105.75, |
| "epoch": 0.02991037684915236, |
| "grad_norm": 4.519341945648193, |
| "kl": 0.08642578125, |
| "learning_rate": 9.955755866188986e-07, |
| "loss": 0.0001, |
| "reward": 4.625, |
| "reward_std": 0.5001000165939331, |
| "rewards/gpt4o_reward_model": 4.625, |
| "step": 277 |
| }, |
| { |
| "completion_length": 115.0, |
| "epoch": 0.030018356548968794, |
| "grad_norm": 3.830291509628296, |
| "kl": 0.0966796875, |
| "learning_rate": 9.955429225647854e-07, |
| "loss": 0.0001, |
| "reward": 3.6875, |
| "reward_std": 0.5281319618225098, |
| "rewards/gpt4o_reward_model": 3.6875, |
| "step": 278 |
| }, |
| { |
| "completion_length": 342.75, |
| "epoch": 0.03012633624878523, |
| "grad_norm": 3.2819526195526123, |
| "kl": 0.0712890625, |
| "learning_rate": 9.95510138919281e-07, |
| "loss": 0.0001, |
| "reward": 4.625, |
| "reward_std": 0.508794367313385, |
| "rewards/gpt4o_reward_model": 4.625, |
| "step": 279 |
| }, |
| { |
| "completion_length": 107.75, |
| "epoch": 0.030234315948601664, |
| "grad_norm": 4.989029884338379, |
| "kl": 0.0771484375, |
| "learning_rate": 9.95477235690298e-07, |
| "loss": 0.0001, |
| "reward": 4.0, |
| "reward_std": 0.7501000165939331, |
| "rewards/gpt4o_reward_model": 4.0, |
| "step": 280 |
| }, |
| { |
| "completion_length": 128.0, |
| "epoch": 0.0303422956484181, |
| "grad_norm": 2.953835964202881, |
| "kl": 0.09375, |
| "learning_rate": 9.954442128857761e-07, |
| "loss": 0.0001, |
| "reward": 4.75, |
| "reward_std": 0.28877514600753784, |
| "rewards/gpt4o_reward_model": 4.75, |
| "step": 281 |
| }, |
| { |
| "completion_length": 142.75, |
| "epoch": 0.03045027534823453, |
| "grad_norm": 4.569122314453125, |
| "kl": 0.08544921875, |
| "learning_rate": 9.954110705136856e-07, |
| "loss": 0.0001, |
| "reward": 4.375, |
| "reward_std": 0.5387751460075378, |
| "rewards/gpt4o_reward_model": 4.375, |
| "step": 282 |
| }, |
| { |
| "completion_length": 291.25, |
| "epoch": 0.030558255048050965, |
| "grad_norm": 1.5806266069412231, |
| "kl": 0.09521484375, |
| "learning_rate": 9.953778085820245e-07, |
| "loss": 0.0001, |
| "reward": 4.625, |
| "reward_std": 0.2501000165939331, |
| "rewards/gpt4o_reward_model": 4.625, |
| "step": 283 |
| }, |
| { |
| "completion_length": 35.75, |
| "epoch": 0.0306662347478674, |
| "grad_norm": 5.810234069824219, |
| "kl": 0.11572265625, |
| "learning_rate": 9.953444270988203e-07, |
| "loss": 0.0001, |
| "reward": 4.5625, |
| "reward_std": 0.48945680260658264, |
| "rewards/gpt4o_reward_model": 4.5625, |
| "step": 284 |
| }, |
| { |
| "completion_length": 128.0, |
| "epoch": 0.030774214447683836, |
| "grad_norm": 0.006735849194228649, |
| "kl": 0.09912109375, |
| "learning_rate": 9.953109260721287e-07, |
| "loss": 0.0001, |
| "reward": 5.0, |
| "reward_std": 9.999999747378752e-05, |
| "rewards/gpt4o_reward_model": 5.0, |
| "step": 285 |
| }, |
| { |
| "completion_length": 188.5, |
| "epoch": 0.03088219414750027, |
| "grad_norm": 3.38196063041687, |
| "kl": 0.078125, |
| "learning_rate": 9.952773055100351e-07, |
| "loss": 0.0001, |
| "reward": 4.375, |
| "reward_std": 0.2501000165939331, |
| "rewards/gpt4o_reward_model": 4.375, |
| "step": 286 |
| }, |
| { |
| "completion_length": 241.75, |
| "epoch": 0.030990173847316706, |
| "grad_norm": 3.892089605331421, |
| "kl": 0.0888671875, |
| "learning_rate": 9.95243565420653e-07, |
| "loss": 0.0001, |
| "reward": 4.0625, |
| "reward_std": 0.633794367313385, |
| "rewards/gpt4o_reward_model": 4.0625, |
| "step": 287 |
| }, |
| { |
| "completion_length": 141.5, |
| "epoch": 0.031098153547133137, |
| "grad_norm": 3.2392618656158447, |
| "kl": 0.109375, |
| "learning_rate": 9.95209705812125e-07, |
| "loss": 0.0001, |
| "reward": 4.5, |
| "reward_std": 0.508794367313385, |
| "rewards/gpt4o_reward_model": 4.5, |
| "step": 288 |
| }, |
| { |
| "completion_length": 232.25, |
| "epoch": 0.031206133246949572, |
| "grad_norm": 2.5510284900665283, |
| "kl": 0.06640625, |
| "learning_rate": 9.95175726692623e-07, |
| "loss": 0.0001, |
| "reward": 4.0625, |
| "reward_std": 0.4435809552669525, |
| "rewards/gpt4o_reward_model": 4.0625, |
| "step": 289 |
| }, |
| { |
| "completion_length": 53.25, |
| "epoch": 0.03131411294676601, |
| "grad_norm": 6.491455078125, |
| "kl": 0.1220703125, |
| "learning_rate": 9.951416280703465e-07, |
| "loss": 0.0001, |
| "reward": 4.25, |
| "reward_std": 0.3944375813007355, |
| "rewards/gpt4o_reward_model": 4.25, |
| "step": 290 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 6400, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 5, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|