{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03131411294676601, "eval_steps": 500, "global_step": 290, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 149.0, "epoch": 0.00010797969981643452, "grad_norm": 3.35890793800354, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": 3.875, "reward_std": 0.3944375813007355, "rewards/gpt4o_reward_model": 3.875, "step": 1 }, { "completion_length": 87.25, "epoch": 0.00021595939963286903, "grad_norm": 10.705058097839355, "kl": 0.0, "learning_rate": 2e-07, "loss": -0.0, "reward": 4.5625, "reward_std": 0.3751000165939331, "rewards/gpt4o_reward_model": 4.5625, "step": 2 }, { "completion_length": 125.0, "epoch": 0.00032393909944930353, "grad_norm": 7.079329013824463, "kl": 0.00016307830810546875, "learning_rate": 4e-07, "loss": 0.0, "reward": 4.5, "reward_std": 0.758794367313385, "rewards/gpt4o_reward_model": 4.5, "step": 3 }, { "completion_length": 99.5, "epoch": 0.00043191879926573806, "grad_norm": 4.656938552856445, "kl": 0.0004425048828125, "learning_rate": 6e-07, "loss": 0.0, "reward": 3.6875, "reward_std": 0.6178992986679077, "rewards/gpt4o_reward_model": 3.6875, "step": 4 }, { "completion_length": 175.75, "epoch": 0.0005398984990821725, "grad_norm": 4.849682807922363, "kl": 7.62939453125e-05, "learning_rate": 8e-07, "loss": 0.0, "reward": 4.0625, "reward_std": 0.579224169254303, "rewards/gpt4o_reward_model": 4.0625, "step": 5 }, { "completion_length": 667.0, "epoch": 0.0006478781988986071, "grad_norm": 4.602669715881348, "kl": 5.936622619628906e-05, "learning_rate": 1e-06, "loss": 0.0, "reward": 3.125, "reward_std": 0.9418070316314697, "rewards/gpt4o_reward_model": 3.125, "step": 6 }, { "completion_length": 220.5, "epoch": 0.0007558578987150416, "grad_norm": 4.340234756469727, "kl": 0.00010204315185546875, "learning_rate": 9.999999396664822e-07, "loss": 0.0, "reward": 4.0, "reward_std": 0.7619017362594604, "rewards/gpt4o_reward_model": 4.0, "step": 7 }, { "completion_length": 89.75, "epoch": 0.0008638375985314761, "grad_norm": 3.9225738048553467, "kl": 0.0001468658447265625, "learning_rate": 9.999997586659434e-07, "loss": 0.0, "reward": 4.1875, "reward_std": 0.8765935897827148, "rewards/gpt4o_reward_model": 4.1875, "step": 8 }, { "completion_length": 135.0, "epoch": 0.0009718172983479105, "grad_norm": 4.508596897125244, "kl": 0.00015354156494140625, "learning_rate": 9.999994569984275e-07, "loss": 0.0, "reward": 4.5625, "reward_std": 0.5194375514984131, "rewards/gpt4o_reward_model": 4.5625, "step": 9 }, { "completion_length": 199.25, "epoch": 0.001079796998164345, "grad_norm": 5.958744525909424, "kl": 0.0003070831298828125, "learning_rate": 9.99999034664007e-07, "loss": 0.0, "reward": 3.9375, "reward_std": 0.81220543384552, "rewards/gpt4o_reward_model": 3.9375, "step": 10 }, { "completion_length": 111.5, "epoch": 0.0011877766979807797, "grad_norm": 5.480228900909424, "kl": 0.0001983642578125, "learning_rate": 9.999984916627839e-07, "loss": 0.0, "reward": 4.5625, "reward_std": 0.3751000165939331, "rewards/gpt4o_reward_model": 4.5625, "step": 11 }, { "completion_length": 185.25, "epoch": 0.0012957563977972141, "grad_norm": 3.4712541103363037, "kl": 0.000213623046875, "learning_rate": 9.999978279948895e-07, "loss": 0.0, "reward": 4.3125, "reward_std": 0.3751000165939331, "rewards/gpt4o_reward_model": 4.3125, "step": 12 }, { "completion_length": 112.75, "epoch": 0.0014037360976136485, "grad_norm": 3.926478147506714, "kl": 0.0001087188720703125, "learning_rate": 9.999970436604836e-07, "loss": 0.0, "reward": 4.3125, "reward_std": 0.7286534309387207, "rewards/gpt4o_reward_model": 4.3125, "step": 13 }, { "completion_length": 147.75, "epoch": 0.0015117157974300832, "grad_norm": 3.4515721797943115, "kl": 0.0002498626708984375, "learning_rate": 9.999961386597556e-07, "loss": 0.0, "reward": 4.3125, "reward_std": 0.2694375813007355, "rewards/gpt4o_reward_model": 4.3125, "step": 14 }, { "completion_length": 117.5, "epoch": 0.0016196954972465176, "grad_norm": 6.8424906730651855, "kl": 0.000476837158203125, "learning_rate": 9.99995112992924e-07, "loss": 0.0, "reward": 3.75, "reward_std": 0.6144567728042603, "rewards/gpt4o_reward_model": 3.75, "step": 15 }, { "completion_length": 159.5, "epoch": 0.0017276751970629522, "grad_norm": 3.0959248542785645, "kl": 0.000247955322265625, "learning_rate": 9.999939666602364e-07, "loss": 0.0, "reward": 4.6875, "reward_std": 0.2694375813007355, "rewards/gpt4o_reward_model": 4.6875, "step": 16 }, { "completion_length": 534.0, "epoch": 0.0018356548968793867, "grad_norm": 5.143649101257324, "kl": 0.000560760498046875, "learning_rate": 9.999926996619692e-07, "loss": 0.0, "reward": 3.5625, "reward_std": 0.8751000165939331, "rewards/gpt4o_reward_model": 3.5625, "step": 17 }, { "completion_length": 181.25, "epoch": 0.001943634596695821, "grad_norm": 3.3201894760131836, "kl": 0.0004825592041015625, "learning_rate": 9.999913119984283e-07, "loss": 0.0, "reward": 4.25, "reward_std": 0.3944375813007355, "rewards/gpt4o_reward_model": 4.25, "step": 18 }, { "completion_length": 107.0, "epoch": 0.0020516142965122555, "grad_norm": 4.407830238342285, "kl": 0.000476837158203125, "learning_rate": 9.999898036699488e-07, "loss": 0.0, "reward": 4.0, "reward_std": 0.3944375813007355, "rewards/gpt4o_reward_model": 4.0, "step": 19 }, { "completion_length": 106.5, "epoch": 0.00215959399632869, "grad_norm": 5.270094394683838, "kl": 0.0008392333984375, "learning_rate": 9.999881746768941e-07, "loss": 0.0, "reward": 3.6875, "reward_std": 0.633794367313385, "rewards/gpt4o_reward_model": 3.6875, "step": 20 }, { "completion_length": 67.0, "epoch": 0.0022675736961451248, "grad_norm": 3.9482433795928955, "kl": 0.000579833984375, "learning_rate": 9.99986425019658e-07, "loss": 0.0, "reward": 3.75, "reward_std": 0.758794367313385, "rewards/gpt4o_reward_model": 3.75, "step": 21 }, { "completion_length": 163.5, "epoch": 0.0023755533959615594, "grad_norm": 3.5114428997039795, "kl": 0.000675201416015625, "learning_rate": 9.999845546986625e-07, "loss": 0.0, "reward": 4.4375, "reward_std": 0.5194375514984131, "rewards/gpt4o_reward_model": 4.4375, "step": 22 }, { "completion_length": 80.5, "epoch": 0.0024835330957779936, "grad_norm": 2.4469258785247803, "kl": 0.00099945068359375, "learning_rate": 9.99982563714359e-07, "loss": 0.0, "reward": 3.1875, "reward_std": 0.23945678770542145, "rewards/gpt4o_reward_model": 3.1875, "step": 23 }, { "completion_length": 136.25, "epoch": 0.0025915127955944283, "grad_norm": 2.949350357055664, "kl": 0.000576019287109375, "learning_rate": 9.999804520672277e-07, "loss": 0.0, "reward": 4.0, "reward_std": 0.28877514600753784, "rewards/gpt4o_reward_model": 4.0, "step": 24 }, { "completion_length": 56.75, "epoch": 0.002699492495410863, "grad_norm": 4.914743423461914, "kl": 0.00127410888671875, "learning_rate": 9.999782197577788e-07, "loss": 0.0, "reward": 4.0, "reward_std": 0.5387751460075378, "rewards/gpt4o_reward_model": 4.0, "step": 25 }, { "completion_length": 83.0, "epoch": 0.002807472195227297, "grad_norm": 4.330339431762695, "kl": 0.00090789794921875, "learning_rate": 9.999758667865504e-07, "loss": 0.0, "reward": 4.3125, "reward_std": 0.3751000165939331, "rewards/gpt4o_reward_model": 4.3125, "step": 26 }, { "completion_length": 111.25, "epoch": 0.0029154518950437317, "grad_norm": 7.491921901702881, "kl": 0.00148773193359375, "learning_rate": 9.999733931541108e-07, "loss": 0.0, "reward": 3.6875, "reward_std": 0.796310305595398, "rewards/gpt4o_reward_model": 3.6875, "step": 27 }, { "completion_length": 99.5, "epoch": 0.0030234315948601664, "grad_norm": 1.0942319631576538, "kl": 0.00092315673828125, "learning_rate": 9.999707988610568e-07, "loss": 0.0, "reward": 4.625, "reward_std": 0.2501000165939331, "rewards/gpt4o_reward_model": 4.625, "step": 28 }, { "completion_length": 107.5, "epoch": 0.003131411294676601, "grad_norm": 4.034415245056152, "kl": 0.001800537109375, "learning_rate": 9.999680839080146e-07, "loss": 0.0, "reward": 3.9375, "reward_std": 0.579224169254303, "rewards/gpt4o_reward_model": 3.9375, "step": 29 }, { "completion_length": 183.5, "epoch": 0.003239390994493035, "grad_norm": 5.061659336090088, "kl": 0.0013885498046875, "learning_rate": 9.999652482956392e-07, "loss": 0.0, "reward": 4.4375, "reward_std": 0.5194375514984131, "rewards/gpt4o_reward_model": 4.4375, "step": 30 }, { "completion_length": 116.75, "epoch": 0.00334737069430947, "grad_norm": 4.587920188903809, "kl": 0.002685546875, "learning_rate": 9.99962292024615e-07, "loss": 0.0, "reward": 3.8125, "reward_std": 0.7394567728042603, "rewards/gpt4o_reward_model": 3.8125, "step": 31 }, { "completion_length": 92.0, "epoch": 0.0034553503941259045, "grad_norm": 4.07750129699707, "kl": 0.001800537109375, "learning_rate": 9.999592150956556e-07, "loss": 0.0, "reward": 3.6875, "reward_std": 0.47356173396110535, "rewards/gpt4o_reward_model": 3.6875, "step": 32 }, { "completion_length": 92.0, "epoch": 0.0035633300939423387, "grad_norm": 4.351490020751953, "kl": 0.0020294189453125, "learning_rate": 9.999560175095034e-07, "loss": 0.0, "reward": 4.125, "reward_std": 0.6115237474441528, "rewards/gpt4o_reward_model": 4.125, "step": 33 }, { "completion_length": 92.5, "epoch": 0.0036713097937587733, "grad_norm": 5.623327255249023, "kl": 0.0019378662109375, "learning_rate": 9.9995269926693e-07, "loss": 0.0, "reward": 4.625, "reward_std": 0.34856173396110535, "rewards/gpt4o_reward_model": 4.625, "step": 34 }, { "completion_length": 90.5, "epoch": 0.003779289493575208, "grad_norm": 7.087028503417969, "kl": 0.00164031982421875, "learning_rate": 9.999492603687366e-07, "loss": 0.0, "reward": 4.8125, "reward_std": 0.2694375813007355, "rewards/gpt4o_reward_model": 4.8125, "step": 35 }, { "completion_length": 117.5, "epoch": 0.003887269193391642, "grad_norm": 3.4546611309051514, "kl": 0.00173187255859375, "learning_rate": 9.999457008157528e-07, "loss": 0.0, "reward": 4.0, "reward_std": 0.508794367313385, "rewards/gpt4o_reward_model": 4.0, "step": 36 }, { "completion_length": 164.25, "epoch": 0.003995248893208077, "grad_norm": 3.795534610748291, "kl": 0.00201416015625, "learning_rate": 9.999420206088379e-07, "loss": 0.0, "reward": 3.875, "reward_std": 0.6144567728042603, "rewards/gpt4o_reward_model": 3.875, "step": 37 }, { "completion_length": 168.75, "epoch": 0.004103228593024511, "grad_norm": 4.190282344818115, "kl": 0.00183868408203125, "learning_rate": 9.999382197488796e-07, "loss": 0.0, "reward": 4.3125, "reward_std": 0.41377514600753784, "rewards/gpt4o_reward_model": 4.3125, "step": 38 }, { "completion_length": 140.0, "epoch": 0.004211208292840946, "grad_norm": 2.999417781829834, "kl": 0.0020904541015625, "learning_rate": 9.999342982367957e-07, "loss": 0.0, "reward": 4.0, "reward_std": 0.36445680260658264, "rewards/gpt4o_reward_model": 4.0, "step": 39 }, { "completion_length": 102.5, "epoch": 0.00431918799265738, "grad_norm": 5.018375873565674, "kl": 0.0027313232421875, "learning_rate": 9.99930256073532e-07, "loss": 0.0, "reward": 3.75, "reward_std": 0.3944375813007355, "rewards/gpt4o_reward_model": 3.75, "step": 40 }, { "completion_length": 94.5, "epoch": 0.004427167692473815, "grad_norm": 5.892095565795898, "kl": 0.0028839111328125, "learning_rate": 9.999260932600648e-07, "loss": 0.0, "reward": 3.8125, "reward_std": 1.1827775239944458, "rewards/gpt4o_reward_model": 3.8125, "step": 41 }, { "completion_length": 88.0, "epoch": 0.0045351473922902496, "grad_norm": 3.4567017555236816, "kl": 0.0014190673828125, "learning_rate": 9.99921809797398e-07, "loss": 0.0, "reward": 4.1875, "reward_std": 0.47356173396110535, "rewards/gpt4o_reward_model": 4.1875, "step": 42 }, { "completion_length": 148.75, "epoch": 0.004643127092106684, "grad_norm": 4.687886714935303, "kl": 0.0025177001953125, "learning_rate": 9.999174056865658e-07, "loss": 0.0, "reward": 4.1875, "reward_std": 0.5581127405166626, "rewards/gpt4o_reward_model": 4.1875, "step": 43 }, { "completion_length": 188.0, "epoch": 0.004751106791923119, "grad_norm": 3.5290260314941406, "kl": 0.00238037109375, "learning_rate": 9.999128809286309e-07, "loss": 0.0, "reward": 4.25, "reward_std": 0.5001000165939331, "rewards/gpt4o_reward_model": 4.25, "step": 44 }, { "completion_length": 86.5, "epoch": 0.004859086491739553, "grad_norm": 4.073955535888672, "kl": 0.002655029296875, "learning_rate": 9.99908235524685e-07, "loss": 0.0, "reward": 4.0625, "reward_std": 0.48945680260658264, "rewards/gpt4o_reward_model": 4.0625, "step": 45 }, { "completion_length": 87.25, "epoch": 0.004967066191555987, "grad_norm": 4.2338104248046875, "kl": 0.0042724609375, "learning_rate": 9.9990346947585e-07, "loss": 0.0, "reward": 4.5, "reward_std": 0.508794367313385, "rewards/gpt4o_reward_model": 4.5, "step": 46 }, { "completion_length": 127.25, "epoch": 0.005075045891372422, "grad_norm": 18.224328994750977, "kl": 0.01458740234375, "learning_rate": 9.998985827832752e-07, "loss": 0.0, "reward": 4.1875, "reward_std": 0.7365237474441528, "rewards/gpt4o_reward_model": 4.1875, "step": 47 }, { "completion_length": 224.75, "epoch": 0.0051830255911888565, "grad_norm": 3.0918076038360596, "kl": 0.0031585693359375, "learning_rate": 9.998935754481404e-07, "loss": 0.0, "reward": 4.0625, "reward_std": 0.4435809552669525, "rewards/gpt4o_reward_model": 4.0625, "step": 48 }, { "completion_length": 98.25, "epoch": 0.005291005291005291, "grad_norm": 6.054563522338867, "kl": 0.0042724609375, "learning_rate": 9.998884474716539e-07, "loss": 0.0, "reward": 2.875, "reward_std": 0.3944375813007355, "rewards/gpt4o_reward_model": 2.875, "step": 49 }, { "completion_length": 103.75, "epoch": 0.005398984990821726, "grad_norm": 5.9451069831848145, "kl": 0.0028533935546875, "learning_rate": 9.998831988550533e-07, "loss": 0.0, "reward": 4.5625, "reward_std": 0.48945680260658264, "rewards/gpt4o_reward_model": 4.5625, "step": 50 }, { "completion_length": 133.75, "epoch": 0.00550696469063816, "grad_norm": 6.4553728103637695, "kl": 0.006317138671875, "learning_rate": 9.998778295996054e-07, "loss": 0.0, "reward": 3.5625, "reward_std": 0.9518133401870728, "rewards/gpt4o_reward_model": 3.5625, "step": 51 }, { "completion_length": 81.5, "epoch": 0.005614944390454594, "grad_norm": 6.15158748626709, "kl": 0.002899169921875, "learning_rate": 9.998723397066058e-07, "loss": 0.0, "reward": 3.8125, "reward_std": 1.060096263885498, "rewards/gpt4o_reward_model": 3.8125, "step": 52 }, { "completion_length": 134.0, "epoch": 0.005722924090271029, "grad_norm": 5.659409999847412, "kl": 0.0040283203125, "learning_rate": 9.998667291773794e-07, "loss": 0.0, "reward": 3.5625, "reward_std": 0.8831573724746704, "rewards/gpt4o_reward_model": 3.5625, "step": 53 }, { "completion_length": 100.0, "epoch": 0.0058309037900874635, "grad_norm": 5.585843563079834, "kl": 0.004150390625, "learning_rate": 9.998609980132803e-07, "loss": 0.0, "reward": 3.75, "reward_std": 0.3944375813007355, "rewards/gpt4o_reward_model": 3.75, "step": 54 }, { "completion_length": 158.0, "epoch": 0.005938883489903898, "grad_norm": 4.5382161140441895, "kl": 0.004302978515625, "learning_rate": 9.998551462156917e-07, "loss": 0.0, "reward": 4.0, "reward_std": 0.5001000165939331, "rewards/gpt4o_reward_model": 4.0, "step": 55 }, { "completion_length": 83.75, "epoch": 0.006046863189720333, "grad_norm": 3.957919120788574, "kl": 0.0040283203125, "learning_rate": 9.998491737860255e-07, "loss": 0.0, "reward": 4.375, "reward_std": 0.5840140581130981, "rewards/gpt4o_reward_model": 4.375, "step": 56 }, { "completion_length": 175.5, "epoch": 0.006154842889536767, "grad_norm": 4.4697747230529785, "kl": 0.004302978515625, "learning_rate": 9.998430807257234e-07, "loss": 0.0, "reward": 4.375, "reward_std": 0.5001000165939331, "rewards/gpt4o_reward_model": 4.375, "step": 57 }, { "completion_length": 145.75, "epoch": 0.006262822589353202, "grad_norm": 4.669873237609863, "kl": 0.004364013671875, "learning_rate": 9.998368670362557e-07, "loss": 0.0, "reward": 3.9375, "reward_std": 0.48945680260658264, "rewards/gpt4o_reward_model": 3.9375, "step": 58 }, { "completion_length": 172.25, "epoch": 0.006370802289169636, "grad_norm": 4.383346080780029, "kl": 0.003509521484375, "learning_rate": 9.998305327191222e-07, "loss": 0.0, "reward": 4.125, "reward_std": 0.971612811088562, "rewards/gpt4o_reward_model": 4.125, "step": 59 }, { "completion_length": 125.5, "epoch": 0.00647878198898607, "grad_norm": 4.461704254150391, "kl": 0.0030670166015625, "learning_rate": 9.998240777758514e-07, "loss": 0.0, "reward": 4.25, "reward_std": 0.3944375813007355, "rewards/gpt4o_reward_model": 4.25, "step": 60 }, { "completion_length": 220.25, "epoch": 0.0065867616888025055, "grad_norm": 4.459292888641357, "kl": 0.00457763671875, "learning_rate": 9.99817502208001e-07, "loss": 0.0, "reward": 4.0625, "reward_std": 0.883794367313385, "rewards/gpt4o_reward_model": 4.0625, "step": 61 }, { "completion_length": 124.0, "epoch": 0.00669474138861894, "grad_norm": 4.459859848022461, "kl": 0.00347900390625, "learning_rate": 9.998108060171579e-07, "loss": 0.0, "reward": 4.125, "reward_std": 0.7283515930175781, "rewards/gpt4o_reward_model": 4.125, "step": 62 }, { "completion_length": 202.0, "epoch": 0.006802721088435374, "grad_norm": 4.133177757263184, "kl": 0.0035858154296875, "learning_rate": 9.998039892049383e-07, "loss": 0.0, "reward": 4.25, "reward_std": 0.5001000165939331, "rewards/gpt4o_reward_model": 4.25, "step": 63 }, { "completion_length": 71.5, "epoch": 0.006910700788251809, "grad_norm": 5.423212051391602, "kl": 0.0069580078125, "learning_rate": 9.997970517729874e-07, "loss": 0.0, "reward": 3.9375, "reward_std": 0.8220234513282776, "rewards/gpt4o_reward_model": 3.9375, "step": 64 }, { "completion_length": 171.25, "epoch": 0.007018680488068243, "grad_norm": 3.992265462875366, "kl": 0.0023040771484375, "learning_rate": 9.997899937229792e-07, "loss": 0.0, "reward": 3.875, "reward_std": 0.5387751460075378, "rewards/gpt4o_reward_model": 3.875, "step": 65 }, { "completion_length": 118.25, "epoch": 0.007126660187884677, "grad_norm": 4.649938583374023, "kl": 0.00506591796875, "learning_rate": 9.997828150566171e-07, "loss": 0.0, "reward": 4.125, "reward_std": 0.704224169254303, "rewards/gpt4o_reward_model": 4.125, "step": 66 }, { "completion_length": 112.5, "epoch": 0.0072346398877011124, "grad_norm": 4.159398078918457, "kl": 0.005767822265625, "learning_rate": 9.997755157756337e-07, "loss": 0.0, "reward": 3.6875, "reward_std": 0.5188006162643433, "rewards/gpt4o_reward_model": 3.6875, "step": 67 }, { "completion_length": 66.5, "epoch": 0.007342619587517547, "grad_norm": 3.303025484085083, "kl": 0.00494384765625, "learning_rate": 9.997680958817907e-07, "loss": 0.0, "reward": 4.125, "reward_std": 0.454224169254303, "rewards/gpt4o_reward_model": 4.125, "step": 68 }, { "completion_length": 151.0, "epoch": 0.007450599287333981, "grad_norm": 3.7500741481781006, "kl": 0.004150390625, "learning_rate": 9.99760555376878e-07, "loss": 0.0, "reward": 4.5, "reward_std": 0.5685809850692749, "rewards/gpt4o_reward_model": 4.5, "step": 69 }, { "completion_length": 131.5, "epoch": 0.007558578987150416, "grad_norm": 5.136856555938721, "kl": 0.00537109375, "learning_rate": 9.997528942627165e-07, "loss": 0.0, "reward": 3.8125, "reward_std": 0.8678992986679077, "rewards/gpt4o_reward_model": 3.8125, "step": 70 }, { "completion_length": 249.25, "epoch": 0.00766655868696685, "grad_norm": 3.3416953086853027, "kl": 0.00469970703125, "learning_rate": 9.997451125411542e-07, "loss": 0.0, "reward": 4.3125, "reward_std": 0.5646764636039734, "rewards/gpt4o_reward_model": 4.3125, "step": 71 }, { "completion_length": 134.0, "epoch": 0.007774538386783284, "grad_norm": 3.785247564315796, "kl": 0.0054931640625, "learning_rate": 9.997372102140694e-07, "loss": 0.0, "reward": 4.5, "reward_std": 0.2501000165939331, "rewards/gpt4o_reward_model": 4.5, "step": 72 }, { "completion_length": 82.0, "epoch": 0.007882518086599719, "grad_norm": 5.987561225891113, "kl": 0.00604248046875, "learning_rate": 9.997291872833694e-07, "loss": 0.0, "reward": 4.375, "reward_std": 0.5001000165939331, "rewards/gpt4o_reward_model": 4.375, "step": 73 }, { "completion_length": 80.75, "epoch": 0.007990497786416154, "grad_norm": 4.2266130447387695, "kl": 0.00677490234375, "learning_rate": 9.9972104375099e-07, "loss": 0.0, "reward": 4.75, "reward_std": 0.3944375813007355, "rewards/gpt4o_reward_model": 4.75, "step": 74 }, { "completion_length": 120.5, "epoch": 0.008098477486232589, "grad_norm": 4.478511333465576, "kl": 0.005523681640625, "learning_rate": 9.997127796188967e-07, "loss": 0.0, "reward": 4.0, "reward_std": 0.6831127405166626, "rewards/gpt4o_reward_model": 4.0, "step": 75 }, { "completion_length": 238.75, "epoch": 0.008206457186049022, "grad_norm": 3.8930044174194336, "kl": 0.005279541015625, "learning_rate": 9.997043948890839e-07, "loss": 0.0, "reward": 4.625, "reward_std": 0.3944375813007355, "rewards/gpt4o_reward_model": 4.625, "step": 76 }, { "completion_length": 357.0, "epoch": 0.008314436885865457, "grad_norm": 4.40590763092041, "kl": 0.00555419921875, "learning_rate": 9.996958895635754e-07, "loss": 0.0, "reward": 4.125, "reward_std": 0.9470234513282776, "rewards/gpt4o_reward_model": 4.125, "step": 77 }, { "completion_length": 127.25, "epoch": 0.008422416585681892, "grad_norm": 4.700560092926025, "kl": 0.0064697265625, "learning_rate": 9.996872636444235e-07, "loss": 0.0, "reward": 4.5, "reward_std": 0.5985617637634277, "rewards/gpt4o_reward_model": 4.5, "step": 78 }, { "completion_length": 231.25, "epoch": 0.008530396285498325, "grad_norm": 3.062976360321045, "kl": 0.00421142578125, "learning_rate": 9.996785171337101e-07, "loss": 0.0, "reward": 3.75, "reward_std": 0.2501000165939331, "rewards/gpt4o_reward_model": 3.75, "step": 79 }, { "completion_length": 102.5, "epoch": 0.00863837598531476, "grad_norm": 4.180944919586182, "kl": 0.00518798828125, "learning_rate": 9.996696500335458e-07, "loss": 0.0, "reward": 4.75, "reward_std": 0.5001000165939331, "rewards/gpt4o_reward_model": 4.75, "step": 80 }, { "completion_length": 111.75, "epoch": 0.008746355685131196, "grad_norm": 4.435964107513428, "kl": 0.00726318359375, "learning_rate": 9.996606623460707e-07, "loss": 0.0, "reward": 4.375, "reward_std": 0.7887752056121826, "rewards/gpt4o_reward_model": 4.375, "step": 81 }, { "completion_length": 77.0, "epoch": 0.00885433538494763, "grad_norm": 3.1188766956329346, "kl": 0.007171630859375, "learning_rate": 9.99651554073454e-07, "loss": 0.0, "reward": 4.5625, "reward_std": 0.2694375813007355, "rewards/gpt4o_reward_model": 4.5625, "step": 82 }, { "completion_length": 104.75, "epoch": 0.008962315084764064, "grad_norm": 5.255317687988281, "kl": 0.0072021484375, "learning_rate": 9.996423252178933e-07, "loss": 0.0, "reward": 4.4375, "reward_std": 0.6935809850692749, "rewards/gpt4o_reward_model": 4.4375, "step": 83 }, { "completion_length": 106.75, "epoch": 0.009070294784580499, "grad_norm": 4.521153926849365, "kl": 0.00860595703125, "learning_rate": 9.996329757816166e-07, "loss": 0.0, "reward": 4.0625, "reward_std": 0.6790332794189453, "rewards/gpt4o_reward_model": 4.0625, "step": 84 }, { "completion_length": 177.75, "epoch": 0.009178274484396934, "grad_norm": 4.019938945770264, "kl": 0.00653076171875, "learning_rate": 9.996235057668797e-07, "loss": 0.0, "reward": 4.75, "reward_std": 0.5001000165939331, "rewards/gpt4o_reward_model": 4.75, "step": 85 }, { "completion_length": 116.5, "epoch": 0.009286254184213368, "grad_norm": 4.606100559234619, "kl": 0.0078125, "learning_rate": 9.99613915175968e-07, "loss": 0.0, "reward": 4.0625, "reward_std": 0.8233708143234253, "rewards/gpt4o_reward_model": 4.0625, "step": 86 }, { "completion_length": 310.25, "epoch": 0.009394233884029803, "grad_norm": 5.037557601928711, "kl": 0.00836181640625, "learning_rate": 9.996042040111962e-07, "loss": 0.0, "reward": 3.9375, "reward_std": 0.7481511831283569, "rewards/gpt4o_reward_model": 3.9375, "step": 87 }, { "completion_length": 135.25, "epoch": 0.009502213583846238, "grad_norm": 5.936065196990967, "kl": 0.014404296875, "learning_rate": 9.99594372274908e-07, "loss": 0.0, "reward": 3.9375, "reward_std": 1.006845474243164, "rewards/gpt4o_reward_model": 3.9375, "step": 88 }, { "completion_length": 165.25, "epoch": 0.009610193283662671, "grad_norm": 4.330586910247803, "kl": 0.007537841796875, "learning_rate": 9.995844199694763e-07, "loss": 0.0, "reward": 3.1875, "reward_std": 0.796310305595398, "rewards/gpt4o_reward_model": 3.1875, "step": 89 }, { "completion_length": 87.25, "epoch": 0.009718172983479106, "grad_norm": 3.3461477756500244, "kl": 0.0103759765625, "learning_rate": 9.995743470973024e-07, "loss": 0.0, "reward": 4.75, "reward_std": 0.28877514600753784, "rewards/gpt4o_reward_model": 4.75, "step": 90 }, { "completion_length": 64.5, "epoch": 0.009826152683295541, "grad_norm": 3.6653449535369873, "kl": 0.0084228515625, "learning_rate": 9.995641536608176e-07, "loss": 0.0, "reward": 4.4375, "reward_std": 0.579224169254303, "rewards/gpt4o_reward_model": 4.4375, "step": 91 }, { "completion_length": 134.5, "epoch": 0.009934132383111974, "grad_norm": 3.128530979156494, "kl": 0.00848388671875, "learning_rate": 9.99553839662482e-07, "loss": 0.0, "reward": 4.8125, "reward_std": 0.2694375813007355, "rewards/gpt4o_reward_model": 4.8125, "step": 92 }, { "completion_length": 121.25, "epoch": 0.01004211208292841, "grad_norm": 4.361353397369385, "kl": 0.0054931640625, "learning_rate": 9.995434051047845e-07, "loss": 0.0, "reward": 3.625, "reward_std": 0.6831127405166626, "rewards/gpt4o_reward_model": 3.625, "step": 93 }, { "completion_length": 109.25, "epoch": 0.010150091782744845, "grad_norm": 4.838723182678223, "kl": 0.013916015625, "learning_rate": 9.995328499902433e-07, "loss": 0.0, "reward": 4.0, "reward_std": 0.8485617637634277, "rewards/gpt4o_reward_model": 4.0, "step": 94 }, { "completion_length": 179.5, "epoch": 0.010258071482561278, "grad_norm": 3.451291084289551, "kl": 0.01019287109375, "learning_rate": 9.99522174321406e-07, "loss": 0.0, "reward": 3.875, "reward_std": 0.454224169254303, "rewards/gpt4o_reward_model": 3.875, "step": 95 }, { "completion_length": 99.5, "epoch": 0.010366051182377713, "grad_norm": 2.002145528793335, "kl": 0.01031494140625, "learning_rate": 9.995113781008485e-07, "loss": 0.0, "reward": 4.6875, "reward_std": 0.1251000016927719, "rewards/gpt4o_reward_model": 4.6875, "step": 96 }, { "completion_length": 127.0, "epoch": 0.010474030882194148, "grad_norm": 4.460968494415283, "kl": 0.01141357421875, "learning_rate": 9.995004613311768e-07, "loss": 0.0, "reward": 4.0, "reward_std": 0.4928992986679077, "rewards/gpt4o_reward_model": 4.0, "step": 97 }, { "completion_length": 193.5, "epoch": 0.010582010582010581, "grad_norm": 5.521016597747803, "kl": 0.01336669921875, "learning_rate": 9.994894240150252e-07, "loss": 0.0, "reward": 2.375, "reward_std": 0.5001000165939331, "rewards/gpt4o_reward_model": 2.375, "step": 98 }, { "completion_length": 126.5, "epoch": 0.010689990281827016, "grad_norm": 7.110630989074707, "kl": 0.010009765625, "learning_rate": 9.994782661550573e-07, "loss": 0.0, "reward": 4.6875, "reward_std": 0.5194375514984131, "rewards/gpt4o_reward_model": 4.6875, "step": 99 }, { "completion_length": 253.75, "epoch": 0.010797969981643452, "grad_norm": 3.6743593215942383, "kl": 0.00982666015625, "learning_rate": 9.994669877539664e-07, "loss": 0.0, "reward": 4.1875, "reward_std": 0.5879185199737549, "rewards/gpt4o_reward_model": 4.1875, "step": 100 }, { "completion_length": 68.5, "epoch": 0.010905949681459885, "grad_norm": 4.462601661682129, "kl": 0.01251220703125, "learning_rate": 9.994555888144736e-07, "loss": 0.0, "reward": 4.0625, "reward_std": 0.6827775835990906, "rewards/gpt4o_reward_model": 4.0625, "step": 101 }, { "completion_length": 123.0, "epoch": 0.01101392938127632, "grad_norm": 4.228979110717773, "kl": 0.009033203125, "learning_rate": 9.994440693393305e-07, "loss": 0.0, "reward": 4.6875, "reward_std": 0.6251000165939331, "rewards/gpt4o_reward_model": 4.6875, "step": 102 }, { "completion_length": 95.5, "epoch": 0.011121909081092755, "grad_norm": 4.379354476928711, "kl": 0.01068115234375, "learning_rate": 9.994324293313169e-07, "loss": 0.0, "reward": 3.875, "reward_std": 0.5728486180305481, "rewards/gpt4o_reward_model": 3.875, "step": 103 }, { "completion_length": 198.5, "epoch": 0.011229888780909188, "grad_norm": 4.31371545791626, "kl": 0.01171875, "learning_rate": 9.994206687932418e-07, "loss": 0.0, "reward": 4.5, "reward_std": 0.454224169254303, "rewards/gpt4o_reward_model": 4.5, "step": 104 }, { "completion_length": 210.0, "epoch": 0.011337868480725623, "grad_norm": 3.053598642349243, "kl": 0.00994873046875, "learning_rate": 9.994087877279436e-07, "loss": 0.0, "reward": 4.25, "reward_std": 0.7887751460075378, "rewards/gpt4o_reward_model": 4.25, "step": 105 }, { "completion_length": 144.75, "epoch": 0.011445848180542059, "grad_norm": 4.309823036193848, "kl": 0.0159912109375, "learning_rate": 9.993967861382895e-07, "loss": 0.0, "reward": 3.75, "reward_std": 0.454224169254303, "rewards/gpt4o_reward_model": 3.75, "step": 106 }, { "completion_length": 170.0, "epoch": 0.011553827880358492, "grad_norm": 4.035120487213135, "kl": 0.00762939453125, "learning_rate": 9.99384664027176e-07, "loss": 0.0, "reward": 4.3125, "reward_std": 0.6790332198143005, "rewards/gpt4o_reward_model": 4.3125, "step": 107 }, { "completion_length": 219.0, "epoch": 0.011661807580174927, "grad_norm": 3.480762004852295, "kl": 0.0137939453125, "learning_rate": 9.993724213975286e-07, "loss": 0.0, "reward": 4.3125, "reward_std": 0.329224169254303, "rewards/gpt4o_reward_model": 4.3125, "step": 108 }, { "completion_length": 95.75, "epoch": 0.011769787279991362, "grad_norm": 5.104645252227783, "kl": 0.017578125, "learning_rate": 9.993600582523015e-07, "loss": 0.0, "reward": 3.8125, "reward_std": 0.8625079393386841, "rewards/gpt4o_reward_model": 3.8125, "step": 109 }, { "completion_length": 198.0, "epoch": 0.011877766979807795, "grad_norm": 5.45714807510376, "kl": 0.0157470703125, "learning_rate": 9.993475745944787e-07, "loss": 0.0, "reward": 3.8125, "reward_std": 0.7694376111030579, "rewards/gpt4o_reward_model": 3.8125, "step": 110 }, { "completion_length": 242.25, "epoch": 0.01198574667962423, "grad_norm": 3.7047228813171387, "kl": 0.0146484375, "learning_rate": 9.99334970427073e-07, "loss": 0.0, "reward": 3.9375, "reward_std": 0.8751000165939331, "rewards/gpt4o_reward_model": 3.9375, "step": 111 }, { "completion_length": 163.75, "epoch": 0.012093726379440665, "grad_norm": 4.073879718780518, "kl": 0.0111083984375, "learning_rate": 9.993222457531262e-07, "loss": 0.0, "reward": 4.1875, "reward_std": 0.6724694967269897, "rewards/gpt4o_reward_model": 4.1875, "step": 112 }, { "completion_length": 113.5, "epoch": 0.012201706079257099, "grad_norm": 4.12031888961792, "kl": 0.01385498046875, "learning_rate": 9.99309400575709e-07, "loss": 0.0, "reward": 3.9375, "reward_std": 0.6178992986679077, "rewards/gpt4o_reward_model": 3.9375, "step": 113 }, { "completion_length": 43.5, "epoch": 0.012309685779073534, "grad_norm": 0.003000754164531827, "kl": 0.0172119140625, "learning_rate": 9.992964348979213e-07, "loss": 0.0, "reward": 4.25, "reward_std": 9.999999747378752e-05, "rewards/gpt4o_reward_model": 4.25, "step": 114 }, { "completion_length": 270.0, "epoch": 0.012417665478889969, "grad_norm": 3.4506990909576416, "kl": 0.0166015625, "learning_rate": 9.992833487228923e-07, "loss": 0.0, "reward": 4.125, "reward_std": 0.5840140581130981, "rewards/gpt4o_reward_model": 4.125, "step": 115 }, { "completion_length": 232.25, "epoch": 0.012525645178706404, "grad_norm": 3.1857428550720215, "kl": 0.0169677734375, "learning_rate": 9.992701420537803e-07, "loss": 0.0, "reward": 4.0, "reward_std": 0.3944375813007355, "rewards/gpt4o_reward_model": 4.0, "step": 116 }, { "completion_length": 128.0, "epoch": 0.012633624878522837, "grad_norm": 3.1388204097747803, "kl": 0.01611328125, "learning_rate": 9.992568148937722e-07, "loss": 0.0, "reward": 4.4375, "reward_std": 0.41377514600753784, "rewards/gpt4o_reward_model": 4.4375, "step": 117 }, { "completion_length": 125.75, "epoch": 0.012741604578339272, "grad_norm": 3.442460298538208, "kl": 0.01611328125, "learning_rate": 9.992433672460844e-07, "loss": 0.0, "reward": 4.4375, "reward_std": 0.4435809552669525, "rewards/gpt4o_reward_model": 4.4375, "step": 118 }, { "completion_length": 143.75, "epoch": 0.012849584278155708, "grad_norm": 4.008980751037598, "kl": 0.0235595703125, "learning_rate": 9.992297991139627e-07, "loss": 0.0, "reward": 4.625, "reward_std": 0.6144567728042603, "rewards/gpt4o_reward_model": 4.625, "step": 119 }, { "completion_length": 100.5, "epoch": 0.01295756397797214, "grad_norm": 3.958385705947876, "kl": 0.01470947265625, "learning_rate": 9.992161105006809e-07, "loss": 0.0, "reward": 4.1875, "reward_std": 0.5581127405166626, "rewards/gpt4o_reward_model": 4.1875, "step": 120 }, { "completion_length": 175.0, "epoch": 0.013065543677788576, "grad_norm": 3.9238104820251465, "kl": 0.017333984375, "learning_rate": 9.99202301409543e-07, "loss": 0.0, "reward": 4.5625, "reward_std": 0.6637751460075378, "rewards/gpt4o_reward_model": 4.5625, "step": 121 }, { "completion_length": 64.0, "epoch": 0.013173523377605011, "grad_norm": 5.058772087097168, "kl": 0.023681640625, "learning_rate": 9.991883718438813e-07, "loss": 0.0, "reward": 4.625, "reward_std": 0.3944375813007355, "rewards/gpt4o_reward_model": 4.625, "step": 122 }, { "completion_length": 92.5, "epoch": 0.013281503077421444, "grad_norm": 6.274599552154541, "kl": 0.026123046875, "learning_rate": 9.991743218070577e-07, "loss": 0.0, "reward": 3.5, "reward_std": 0.6144567728042603, "rewards/gpt4o_reward_model": 3.5, "step": 123 }, { "completion_length": 108.0, "epoch": 0.01338948277723788, "grad_norm": 4.530334949493408, "kl": 0.022705078125, "learning_rate": 9.991601513024628e-07, "loss": 0.0, "reward": 3.875, "reward_std": 0.7581573724746704, "rewards/gpt4o_reward_model": 3.875, "step": 124 }, { "completion_length": 103.25, "epoch": 0.013497462477054314, "grad_norm": 2.2850399017333984, "kl": 0.01556396484375, "learning_rate": 9.991458603335165e-07, "loss": 0.0, "reward": 4.625, "reward_std": 0.4331127107143402, "rewards/gpt4o_reward_model": 4.625, "step": 125 }, { "completion_length": 114.25, "epoch": 0.013605442176870748, "grad_norm": 4.146212100982666, "kl": 0.02392578125, "learning_rate": 9.991314489036677e-07, "loss": 0.0, "reward": 4.1875, "reward_std": 0.48945680260658264, "rewards/gpt4o_reward_model": 4.1875, "step": 126 }, { "completion_length": 229.0, "epoch": 0.013713421876687183, "grad_norm": 4.601395130157471, "kl": 0.0205078125, "learning_rate": 9.991169170163943e-07, "loss": 0.0, "reward": 4.0625, "reward_std": 0.41377514600753784, "rewards/gpt4o_reward_model": 4.0625, "step": 127 }, { "completion_length": 310.5, "epoch": 0.013821401576503618, "grad_norm": 3.042806625366211, "kl": 0.015869140625, "learning_rate": 9.991022646752035e-07, "loss": 0.0, "reward": 4.75, "reward_std": 0.5001000165939331, "rewards/gpt4o_reward_model": 4.75, "step": 128 }, { "completion_length": 145.0, "epoch": 0.013929381276320051, "grad_norm": 3.71138858795166, "kl": 0.0240478515625, "learning_rate": 9.990874918836313e-07, "loss": 0.0, "reward": 4.0, "reward_std": 0.7974694967269897, "rewards/gpt4o_reward_model": 4.0, "step": 129 }, { "completion_length": 269.5, "epoch": 0.014037360976136486, "grad_norm": 4.026693820953369, "kl": 0.0189208984375, "learning_rate": 9.990725986452426e-07, "loss": 0.0, "reward": 4.375, "reward_std": 0.3944375813007355, "rewards/gpt4o_reward_model": 4.375, "step": 130 }, { "completion_length": 186.75, "epoch": 0.014145340675952921, "grad_norm": 3.600174903869629, "kl": 0.0284423828125, "learning_rate": 9.99057584963632e-07, "loss": 0.0, "reward": 3.625, "reward_std": 0.454224169254303, "rewards/gpt4o_reward_model": 3.625, "step": 131 }, { "completion_length": 86.5, "epoch": 0.014253320375769355, "grad_norm": 4.374396324157715, "kl": 0.0257568359375, "learning_rate": 9.99042450842423e-07, "loss": 0.0, "reward": 4.25, "reward_std": 0.7288135886192322, "rewards/gpt4o_reward_model": 4.25, "step": 132 }, { "completion_length": 208.0, "epoch": 0.01436130007558579, "grad_norm": 5.3308610916137695, "kl": 0.0211181640625, "learning_rate": 9.990271962852676e-07, "loss": 0.0, "reward": 4.3125, "reward_std": 0.8081126809120178, "rewards/gpt4o_reward_model": 4.3125, "step": 133 }, { "completion_length": 98.5, "epoch": 0.014469279775402225, "grad_norm": 3.5027971267700195, "kl": 0.02197265625, "learning_rate": 9.990118212958473e-07, "loss": 0.0, "reward": 4.4375, "reward_std": 0.41377514600753784, "rewards/gpt4o_reward_model": 4.4375, "step": 134 }, { "completion_length": 41.5, "epoch": 0.014577259475218658, "grad_norm": 8.935701370239258, "kl": 0.02490234375, "learning_rate": 9.989963258778728e-07, "loss": 0.0, "reward": 4.0, "reward_std": 0.6896764636039734, "rewards/gpt4o_reward_model": 4.0, "step": 135 }, { "completion_length": 85.25, "epoch": 0.014685239175035093, "grad_norm": 4.740475177764893, "kl": 0.047119140625, "learning_rate": 9.989807100350833e-07, "loss": 0.0, "reward": 4.4375, "reward_std": 0.7090140581130981, "rewards/gpt4o_reward_model": 4.4375, "step": 136 }, { "completion_length": 109.5, "epoch": 0.014793218874851528, "grad_norm": 4.925447463989258, "kl": 0.0284423828125, "learning_rate": 9.989649737712478e-07, "loss": 0.0, "reward": 4.0, "reward_std": 0.6144567728042603, "rewards/gpt4o_reward_model": 4.0, "step": 137 }, { "completion_length": 82.75, "epoch": 0.014901198574667962, "grad_norm": 2.8169710636138916, "kl": 0.026611328125, "learning_rate": 9.98949117090164e-07, "loss": 0.0, "reward": 4.9375, "reward_std": 0.1251000016927719, "rewards/gpt4o_reward_model": 4.9375, "step": 138 }, { "completion_length": 147.5, "epoch": 0.015009178274484397, "grad_norm": 4.605223178863525, "kl": 0.032470703125, "learning_rate": 9.989331399956583e-07, "loss": 0.0, "reward": 4.5, "reward_std": 0.508794367313385, "rewards/gpt4o_reward_model": 4.5, "step": 139 }, { "completion_length": 239.75, "epoch": 0.015117157974300832, "grad_norm": 3.3013551235198975, "kl": 0.025634765625, "learning_rate": 9.98917042491587e-07, "loss": 0.0, "reward": 4.375, "reward_std": 0.6231511831283569, "rewards/gpt4o_reward_model": 4.375, "step": 140 }, { "completion_length": 129.75, "epoch": 0.015225137674117265, "grad_norm": 2.192405939102173, "kl": 0.0244140625, "learning_rate": 9.989008245818347e-07, "loss": 0.0, "reward": 4.875, "reward_std": 0.2501000165939331, "rewards/gpt4o_reward_model": 4.875, "step": 141 }, { "completion_length": 253.75, "epoch": 0.0153331173739337, "grad_norm": 3.278887987136841, "kl": 0.02197265625, "learning_rate": 9.988844862703152e-07, "loss": 0.0, "reward": 4.25, "reward_std": 0.508794367313385, "rewards/gpt4o_reward_model": 4.25, "step": 142 }, { "completion_length": 199.0, "epoch": 0.015441097073750135, "grad_norm": 3.9546308517456055, "kl": 0.02001953125, "learning_rate": 9.988680275609717e-07, "loss": 0.0, "reward": 4.0625, "reward_std": 0.3751000165939331, "rewards/gpt4o_reward_model": 4.0625, "step": 143 }, { "completion_length": 179.0, "epoch": 0.015549076773566569, "grad_norm": 5.07952880859375, "kl": 0.032470703125, "learning_rate": 9.988514484577761e-07, "loss": 0.0, "reward": 4.25, "reward_std": 0.8644567728042603, "rewards/gpt4o_reward_model": 4.25, "step": 144 }, { "completion_length": 75.5, "epoch": 0.015657056473383005, "grad_norm": 5.506921291351318, "kl": 0.0361328125, "learning_rate": 9.988347489647298e-07, "loss": 0.0, "reward": 4.1875, "reward_std": 0.5333483219146729, "rewards/gpt4o_reward_model": 4.1875, "step": 145 }, { "completion_length": 209.25, "epoch": 0.015765036173199437, "grad_norm": 4.1503119468688965, "kl": 0.030029296875, "learning_rate": 9.988179290858627e-07, "loss": 0.0, "reward": 4.5, "reward_std": 0.508794367313385, "rewards/gpt4o_reward_model": 4.5, "step": 146 }, { "completion_length": 156.25, "epoch": 0.015873015873015872, "grad_norm": 3.442382335662842, "kl": 0.041015625, "learning_rate": 9.98800988825234e-07, "loss": 0.0, "reward": 4.875, "reward_std": 0.2501000165939331, "rewards/gpt4o_reward_model": 4.875, "step": 147 }, { "completion_length": 103.5, "epoch": 0.015980995572832307, "grad_norm": 3.5556161403656006, "kl": 0.025390625, "learning_rate": 9.987839281869321e-07, "loss": 0.0, "reward": 4.8125, "reward_std": 0.2694375813007355, "rewards/gpt4o_reward_model": 4.8125, "step": 148 }, { "completion_length": 94.5, "epoch": 0.016088975272648742, "grad_norm": 5.813422203063965, "kl": 0.0341796875, "learning_rate": 9.987667471750743e-07, "loss": 0.0, "reward": 4.125, "reward_std": 0.508794367313385, "rewards/gpt4o_reward_model": 4.125, "step": 149 }, { "completion_length": 114.75, "epoch": 0.016196954972465177, "grad_norm": 2.6386795043945312, "kl": 0.026123046875, "learning_rate": 9.987494457938066e-07, "loss": 0.0, "reward": 4.5, "reward_std": 0.28877514600753784, "rewards/gpt4o_reward_model": 4.5, "step": 150 }, { "completion_length": 237.5, "epoch": 0.016304934672281612, "grad_norm": 5.137205123901367, "kl": 0.033447265625, "learning_rate": 9.987320240473049e-07, "loss": 0.0, "reward": 3.5625, "reward_std": 0.6770563125610352, "rewards/gpt4o_reward_model": 3.5625, "step": 151 }, { "completion_length": 61.25, "epoch": 0.016412914372098044, "grad_norm": 2.3743293285369873, "kl": 0.00823974609375, "learning_rate": 9.987144819397735e-07, "loss": 0.0, "reward": 4.1875, "reward_std": 0.6827775835990906, "rewards/gpt4o_reward_model": 4.1875, "step": 152 }, { "completion_length": 130.75, "epoch": 0.01652089407191448, "grad_norm": 3.382657289505005, "kl": 0.032958984375, "learning_rate": 9.98696819475446e-07, "loss": 0.0, "reward": 4.1875, "reward_std": 0.48945680260658264, "rewards/gpt4o_reward_model": 4.1875, "step": 153 }, { "completion_length": 98.75, "epoch": 0.016628873771730914, "grad_norm": 5.266931533813477, "kl": 0.025146484375, "learning_rate": 9.986790366585847e-07, "loss": 0.0, "reward": 4.625, "reward_std": 0.2501000165939331, "rewards/gpt4o_reward_model": 4.625, "step": 154 }, { "completion_length": 170.25, "epoch": 0.01673685347154735, "grad_norm": 4.579905033111572, "kl": 0.03369140625, "learning_rate": 9.986611334934814e-07, "loss": 0.0, "reward": 4.1875, "reward_std": 0.8146764636039734, "rewards/gpt4o_reward_model": 4.1875, "step": 155 }, { "completion_length": 258.25, "epoch": 0.016844833171363784, "grad_norm": 2.3568973541259766, "kl": 0.029052734375, "learning_rate": 9.986431099844567e-07, "loss": 0.0, "reward": 4.6875, "reward_std": 0.2694375813007355, "rewards/gpt4o_reward_model": 4.6875, "step": 156 }, { "completion_length": 205.75, "epoch": 0.01695281287118022, "grad_norm": 4.3519158363342285, "kl": 0.02392578125, "learning_rate": 9.9862496613586e-07, "loss": 0.0, "reward": 3.9375, "reward_std": 0.8421862125396729, "rewards/gpt4o_reward_model": 3.9375, "step": 157 }, { "completion_length": 168.75, "epoch": 0.01706079257099665, "grad_norm": 3.8407411575317383, "kl": 0.0308837890625, "learning_rate": 9.986067019520707e-07, "loss": 0.0, "reward": 3.4375, "reward_std": 0.6251000165939331, "rewards/gpt4o_reward_model": 3.4375, "step": 158 }, { "completion_length": 143.75, "epoch": 0.017168772270813086, "grad_norm": 0.003290753811597824, "kl": 0.0281982421875, "learning_rate": 9.98588317437496e-07, "loss": 0.0, "reward": 2.5, "reward_std": 9.999999747378752e-05, "rewards/gpt4o_reward_model": 2.5, "step": 159 }, { "completion_length": 68.25, "epoch": 0.01727675197062952, "grad_norm": 4.90841817855835, "kl": 0.023193359375, "learning_rate": 9.98569812596573e-07, "loss": 0.0, "reward": 3.875, "reward_std": 0.5001000165939331, "rewards/gpt4o_reward_model": 3.875, "step": 160 }, { "completion_length": 121.75, "epoch": 0.017384731670445956, "grad_norm": 3.6256470680236816, "kl": 0.033935546875, "learning_rate": 9.985511874337672e-07, "loss": 0.0, "reward": 4.9375, "reward_std": 0.1251000016927719, "rewards/gpt4o_reward_model": 4.9375, "step": 161 }, { "completion_length": 118.75, "epoch": 0.01749271137026239, "grad_norm": 2.8302996158599854, "kl": 0.03662109375, "learning_rate": 9.98532441953574e-07, "loss": 0.0, "reward": 4.4375, "reward_std": 0.23945678770542145, "rewards/gpt4o_reward_model": 4.4375, "step": 162 }, { "completion_length": 160.75, "epoch": 0.017600691070078826, "grad_norm": 3.4306154251098633, "kl": 0.027099609375, "learning_rate": 9.985135761605167e-07, "loss": 0.0, "reward": 4.4375, "reward_std": 0.41377514600753784, "rewards/gpt4o_reward_model": 4.4375, "step": 163 }, { "completion_length": 160.5, "epoch": 0.01770867076989526, "grad_norm": 3.66644287109375, "kl": 0.0283203125, "learning_rate": 9.984945900591486e-07, "loss": 0.0, "reward": 4.5625, "reward_std": 0.47356173396110535, "rewards/gpt4o_reward_model": 4.5625, "step": 164 }, { "completion_length": 166.75, "epoch": 0.017816650469711693, "grad_norm": 3.7970166206359863, "kl": 0.0267333984375, "learning_rate": 9.98475483654052e-07, "loss": 0.0, "reward": 4.25, "reward_std": 0.5001000165939331, "rewards/gpt4o_reward_model": 4.25, "step": 165 }, { "completion_length": 272.75, "epoch": 0.017924630169528128, "grad_norm": 4.074111461639404, "kl": 0.033935546875, "learning_rate": 9.984562569498373e-07, "loss": 0.0, "reward": 4.0625, "reward_std": 0.6637752056121826, "rewards/gpt4o_reward_model": 4.0625, "step": 166 }, { "completion_length": 107.5, "epoch": 0.018032609869344563, "grad_norm": 5.892201900482178, "kl": 0.040283203125, "learning_rate": 9.984369099511452e-07, "loss": 0.0, "reward": 3.375, "reward_std": 0.8623477220535278, "rewards/gpt4o_reward_model": 3.375, "step": 167 }, { "completion_length": 93.5, "epoch": 0.018140589569160998, "grad_norm": 3.144777297973633, "kl": 0.02099609375, "learning_rate": 9.984174426626443e-07, "loss": 0.0, "reward": 3.9375, "reward_std": 0.2694375813007355, "rewards/gpt4o_reward_model": 3.9375, "step": 168 }, { "completion_length": 184.5, "epoch": 0.018248569268977433, "grad_norm": 4.250265598297119, "kl": 0.036376953125, "learning_rate": 9.98397855089033e-07, "loss": 0.0, "reward": 4.625, "reward_std": 0.5001000165939331, "rewards/gpt4o_reward_model": 4.625, "step": 169 }, { "completion_length": 25.5, "epoch": 0.01835654896879387, "grad_norm": 4.1492600440979, "kl": 0.033447265625, "learning_rate": 9.983781472350382e-07, "loss": 0.0, "reward": 4.6875, "reward_std": 0.383794367313385, "rewards/gpt4o_reward_model": 4.6875, "step": 170 }, { "completion_length": 219.5, "epoch": 0.0184645286686103, "grad_norm": 3.171717405319214, "kl": 0.0380859375, "learning_rate": 9.983583191054162e-07, "loss": 0.0, "reward": 4.0625, "reward_std": 0.5281319618225098, "rewards/gpt4o_reward_model": 4.0625, "step": 171 }, { "completion_length": 121.75, "epoch": 0.018572508368426735, "grad_norm": 4.934228897094727, "kl": 0.048828125, "learning_rate": 9.983383707049522e-07, "loss": 0.0, "reward": 4.0625, "reward_std": 0.5879185199737549, "rewards/gpt4o_reward_model": 4.0625, "step": 172 }, { "completion_length": 191.0, "epoch": 0.01868048806824317, "grad_norm": 2.89996600151062, "kl": 0.02490234375, "learning_rate": 9.983183020384605e-07, "loss": 0.0, "reward": 3.875, "reward_std": 0.34856173396110535, "rewards/gpt4o_reward_model": 3.875, "step": 173 }, { "completion_length": 215.75, "epoch": 0.018788467768059605, "grad_norm": 3.5805468559265137, "kl": 0.038330078125, "learning_rate": 9.982981131107842e-07, "loss": 0.0, "reward": 3.8125, "reward_std": 0.6229909658432007, "rewards/gpt4o_reward_model": 3.8125, "step": 174 }, { "completion_length": 178.25, "epoch": 0.01889644746787604, "grad_norm": 2.7104833126068115, "kl": 0.04150390625, "learning_rate": 9.982778039267958e-07, "loss": 0.0, "reward": 4.9375, "reward_std": 0.1251000016927719, "rewards/gpt4o_reward_model": 4.9375, "step": 175 }, { "completion_length": 76.5, "epoch": 0.019004427167692475, "grad_norm": 4.9249267578125, "kl": 0.02392578125, "learning_rate": 9.982573744913964e-07, "loss": 0.0, "reward": 4.5625, "reward_std": 0.7694375514984131, "rewards/gpt4o_reward_model": 4.5625, "step": 176 }, { "completion_length": 385.75, "epoch": 0.019112406867508907, "grad_norm": 4.761181354522705, "kl": 0.0703125, "learning_rate": 9.982368248095164e-07, "loss": 0.0001, "reward": 4.6875, "reward_std": 0.5194375514984131, "rewards/gpt4o_reward_model": 4.6875, "step": 177 }, { "completion_length": 101.0, "epoch": 0.019220386567325342, "grad_norm": 3.50384259223938, "kl": 0.042236328125, "learning_rate": 9.982161548861152e-07, "loss": 0.0, "reward": 4.625, "reward_std": 0.5001000165939331, "rewards/gpt4o_reward_model": 4.625, "step": 178 }, { "completion_length": 103.25, "epoch": 0.019328366267141777, "grad_norm": 2.0653398036956787, "kl": 0.0439453125, "learning_rate": 9.98195364726181e-07, "loss": 0.0, "reward": 4.3125, "reward_std": 0.1251000016927719, "rewards/gpt4o_reward_model": 4.3125, "step": 179 }, { "completion_length": 63.75, "epoch": 0.019436345966958212, "grad_norm": 5.203299522399902, "kl": 0.044677734375, "learning_rate": 9.981744543347312e-07, "loss": 0.0, "reward": 3.9375, "reward_std": 0.6251000165939331, "rewards/gpt4o_reward_model": 3.9375, "step": 180 }, { "completion_length": 100.25, "epoch": 0.019544325666774647, "grad_norm": 2.8786332607269287, "kl": 0.04052734375, "learning_rate": 9.981534237168124e-07, "loss": 0.0, "reward": 4.5625, "reward_std": 0.6250999569892883, "rewards/gpt4o_reward_model": 4.5625, "step": 181 }, { "completion_length": 231.5, "epoch": 0.019652305366591082, "grad_norm": 2.2581024169921875, "kl": 0.0284423828125, "learning_rate": 9.981322728774997e-07, "loss": 0.0, "reward": 4.25, "reward_std": 0.776972770690918, "rewards/gpt4o_reward_model": 4.25, "step": 182 }, { "completion_length": 106.5, "epoch": 0.019760285066407514, "grad_norm": 4.273443698883057, "kl": 0.03173828125, "learning_rate": 9.981110018218977e-07, "loss": 0.0, "reward": 4.1875, "reward_std": 0.41377514600753784, "rewards/gpt4o_reward_model": 4.1875, "step": 183 }, { "completion_length": 94.5, "epoch": 0.01986826476622395, "grad_norm": 4.629092693328857, "kl": 0.07568359375, "learning_rate": 9.9808961055514e-07, "loss": 0.0001, "reward": 4.6875, "reward_std": 0.329224169254303, "rewards/gpt4o_reward_model": 4.6875, "step": 184 }, { "completion_length": 148.0, "epoch": 0.019976244466040384, "grad_norm": 3.932042360305786, "kl": 0.03173828125, "learning_rate": 9.980680990823886e-07, "loss": 0.0, "reward": 4.3125, "reward_std": 0.5581127405166626, "rewards/gpt4o_reward_model": 4.3125, "step": 185 }, { "completion_length": 236.25, "epoch": 0.02008422416585682, "grad_norm": 3.403113842010498, "kl": 0.0380859375, "learning_rate": 9.980464674088354e-07, "loss": 0.0, "reward": 4.6875, "reward_std": 0.41377514600753784, "rewards/gpt4o_reward_model": 4.6875, "step": 186 }, { "completion_length": 155.25, "epoch": 0.020192203865673254, "grad_norm": 3.3250508308410645, "kl": 0.05419921875, "learning_rate": 9.980247155397004e-07, "loss": 0.0001, "reward": 4.5, "reward_std": 0.36445680260658264, "rewards/gpt4o_reward_model": 4.5, "step": 187 }, { "completion_length": 336.0, "epoch": 0.02030018356548969, "grad_norm": 2.428285837173462, "kl": 0.03515625, "learning_rate": 9.980028434802334e-07, "loss": 0.0, "reward": 4.75, "reward_std": 0.2501000165939331, "rewards/gpt4o_reward_model": 4.75, "step": 188 }, { "completion_length": 106.75, "epoch": 0.02040816326530612, "grad_norm": 4.417919635772705, "kl": 0.028076171875, "learning_rate": 9.979808512357129e-07, "loss": 0.0, "reward": 4.3125, "reward_std": 0.633794367313385, "rewards/gpt4o_reward_model": 4.3125, "step": 189 }, { "completion_length": 94.75, "epoch": 0.020516142965122556, "grad_norm": 3.8756754398345947, "kl": 0.042724609375, "learning_rate": 9.979587388114464e-07, "loss": 0.0, "reward": 4.75, "reward_std": 0.3944375813007355, "rewards/gpt4o_reward_model": 4.75, "step": 190 }, { "completion_length": 119.75, "epoch": 0.02062412266493899, "grad_norm": 4.661324501037598, "kl": 0.0517578125, "learning_rate": 9.9793650621277e-07, "loss": 0.0001, "reward": 3.8125, "reward_std": 0.829224169254303, "rewards/gpt4o_reward_model": 3.8125, "step": 191 }, { "completion_length": 417.0, "epoch": 0.020732102364755426, "grad_norm": 2.5745935440063477, "kl": 0.0291748046875, "learning_rate": 9.979141534450495e-07, "loss": 0.0, "reward": 4.25, "reward_std": 0.5001000165939331, "rewards/gpt4o_reward_model": 4.25, "step": 192 }, { "completion_length": 178.5, "epoch": 0.02084008206457186, "grad_norm": 1.9205126762390137, "kl": 0.041259765625, "learning_rate": 9.978916805136794e-07, "loss": 0.0, "reward": 4.9375, "reward_std": 0.1251000016927719, "rewards/gpt4o_reward_model": 4.9375, "step": 193 }, { "completion_length": 223.5, "epoch": 0.020948061764388296, "grad_norm": 3.269942283630371, "kl": 0.045166015625, "learning_rate": 9.97869087424083e-07, "loss": 0.0, "reward": 4.1875, "reward_std": 0.6229909658432007, "rewards/gpt4o_reward_model": 4.1875, "step": 194 }, { "completion_length": 101.25, "epoch": 0.02105604146420473, "grad_norm": 4.042669296264648, "kl": 0.0498046875, "learning_rate": 9.97846374181713e-07, "loss": 0.0, "reward": 4.5, "reward_std": 0.7479909658432007, "rewards/gpt4o_reward_model": 4.5, "step": 195 }, { "completion_length": 124.0, "epoch": 0.021164021164021163, "grad_norm": 4.541233539581299, "kl": 0.0478515625, "learning_rate": 9.978235407920506e-07, "loss": 0.0, "reward": 4.0, "reward_std": 0.5985617637634277, "rewards/gpt4o_reward_model": 4.0, "step": 196 }, { "completion_length": 218.25, "epoch": 0.021272000863837598, "grad_norm": 3.685286521911621, "kl": 0.03369140625, "learning_rate": 9.978005872606065e-07, "loss": 0.0, "reward": 4.5, "reward_std": 0.6231511831283569, "rewards/gpt4o_reward_model": 4.5, "step": 197 }, { "completion_length": 301.5, "epoch": 0.021379980563654033, "grad_norm": 2.64717173576355, "kl": 0.0478515625, "learning_rate": 9.977775135929202e-07, "loss": 0.0, "reward": 4.8125, "reward_std": 0.2694375813007355, "rewards/gpt4o_reward_model": 4.8125, "step": 198 }, { "completion_length": 101.5, "epoch": 0.021487960263470468, "grad_norm": 3.8811116218566895, "kl": 0.04248046875, "learning_rate": 9.977543197945599e-07, "loss": 0.0, "reward": 4.4375, "reward_std": 0.6251000165939331, "rewards/gpt4o_reward_model": 4.4375, "step": 199 }, { "completion_length": 110.25, "epoch": 0.021595939963286903, "grad_norm": 4.5807342529296875, "kl": 0.037109375, "learning_rate": 9.977310058711235e-07, "loss": 0.0, "reward": 3.75, "reward_std": 0.704224169254303, "rewards/gpt4o_reward_model": 3.75, "step": 200 }, { "completion_length": 207.5, "epoch": 0.021703919663103338, "grad_norm": 5.462955951690674, "kl": 0.09619140625, "learning_rate": 9.97707571828237e-07, "loss": 0.0001, "reward": 4.5625, "reward_std": 0.48945680260658264, "rewards/gpt4o_reward_model": 4.5625, "step": 201 }, { "completion_length": 138.5, "epoch": 0.02181189936291977, "grad_norm": 6.441577911376953, "kl": 0.0478515625, "learning_rate": 9.97684017671556e-07, "loss": 0.0, "reward": 3.4375, "reward_std": 0.9414719343185425, "rewards/gpt4o_reward_model": 3.4375, "step": 202 }, { "completion_length": 140.0, "epoch": 0.021919879062736205, "grad_norm": 3.3736348152160645, "kl": 0.037841796875, "learning_rate": 9.97660343406765e-07, "loss": 0.0, "reward": 4.0, "reward_std": 0.8872368931770325, "rewards/gpt4o_reward_model": 4.0, "step": 203 }, { "completion_length": 172.25, "epoch": 0.02202785876255264, "grad_norm": 3.9529149532318115, "kl": 0.044921875, "learning_rate": 9.97636549039577e-07, "loss": 0.0, "reward": 4.625, "reward_std": 0.3944375813007355, "rewards/gpt4o_reward_model": 4.625, "step": 204 }, { "completion_length": 667.25, "epoch": 0.022135838462369075, "grad_norm": 4.095589637756348, "kl": 0.046630859375, "learning_rate": 9.976126345757351e-07, "loss": 0.0, "reward": 3.625, "reward_std": 0.454224169254303, "rewards/gpt4o_reward_model": 3.625, "step": 205 }, { "completion_length": 187.0, "epoch": 0.02224381816218551, "grad_norm": 5.007861137390137, "kl": 0.04150390625, "learning_rate": 9.975886000210102e-07, "loss": 0.0, "reward": 4.4375, "reward_std": 0.7286534309387207, "rewards/gpt4o_reward_model": 4.4375, "step": 206 }, { "completion_length": 37.0, "epoch": 0.022351797862001945, "grad_norm": 5.55353307723999, "kl": 0.0673828125, "learning_rate": 9.975644453812028e-07, "loss": 0.0001, "reward": 4.0625, "reward_std": 0.633794367313385, "rewards/gpt4o_reward_model": 4.0625, "step": 207 }, { "completion_length": 210.75, "epoch": 0.022459777561818377, "grad_norm": 3.5840399265289307, "kl": 0.06494140625, "learning_rate": 9.97540170662142e-07, "loss": 0.0001, "reward": 4.5625, "reward_std": 0.48945680260658264, "rewards/gpt4o_reward_model": 4.5625, "step": 208 }, { "completion_length": 75.0, "epoch": 0.022567757261634812, "grad_norm": 4.6481828689575195, "kl": 0.048095703125, "learning_rate": 9.975157758696866e-07, "loss": 0.0, "reward": 3.8125, "reward_std": 0.9524502754211426, "rewards/gpt4o_reward_model": 3.8125, "step": 209 }, { "completion_length": 179.75, "epoch": 0.022675736961451247, "grad_norm": 3.486043691635132, "kl": 0.036376953125, "learning_rate": 9.974912610097235e-07, "loss": 0.0, "reward": 4.125, "reward_std": 0.6036534309387207, "rewards/gpt4o_reward_model": 4.125, "step": 210 }, { "completion_length": 146.25, "epoch": 0.022783716661267682, "grad_norm": 2.9825496673583984, "kl": 0.0673828125, "learning_rate": 9.97466626088169e-07, "loss": 0.0001, "reward": 4.625, "reward_std": 0.36445680260658264, "rewards/gpt4o_reward_model": 4.625, "step": 211 }, { "completion_length": 188.0, "epoch": 0.022891696361084117, "grad_norm": 5.593983173370361, "kl": 0.05615234375, "learning_rate": 9.974418711109684e-07, "loss": 0.0001, "reward": 4.125, "reward_std": 0.7974694967269897, "rewards/gpt4o_reward_model": 4.125, "step": 212 }, { "completion_length": 265.25, "epoch": 0.022999676060900552, "grad_norm": 4.792067527770996, "kl": 0.038330078125, "learning_rate": 9.97416996084096e-07, "loss": 0.0, "reward": 4.125, "reward_std": 0.6144567728042603, "rewards/gpt4o_reward_model": 4.125, "step": 213 }, { "completion_length": 131.25, "epoch": 0.023107655760716984, "grad_norm": 5.125412940979004, "kl": 0.041259765625, "learning_rate": 9.973920010135547e-07, "loss": 0.0, "reward": 3.75, "reward_std": 0.508794367313385, "rewards/gpt4o_reward_model": 3.75, "step": 214 }, { "completion_length": 397.5, "epoch": 0.02321563546053342, "grad_norm": 4.1176910400390625, "kl": 0.0517578125, "learning_rate": 9.973668859053772e-07, "loss": 0.0001, "reward": 4.5625, "reward_std": 0.5194375514984131, "rewards/gpt4o_reward_model": 4.5625, "step": 215 }, { "completion_length": 145.25, "epoch": 0.023323615160349854, "grad_norm": 6.796933650970459, "kl": 0.134765625, "learning_rate": 9.973416507656243e-07, "loss": 0.0001, "reward": 4.4375, "reward_std": 0.6637752056121826, "rewards/gpt4o_reward_model": 4.4375, "step": 216 }, { "completion_length": 204.75, "epoch": 0.02343159486016629, "grad_norm": 2.643352746963501, "kl": 0.049072265625, "learning_rate": 9.97316295600386e-07, "loss": 0.0, "reward": 4.0, "reward_std": 0.28877514600753784, "rewards/gpt4o_reward_model": 4.0, "step": 217 }, { "completion_length": 73.0, "epoch": 0.023539574559982724, "grad_norm": 1.9597053527832031, "kl": 0.0390625, "learning_rate": 9.972908204157815e-07, "loss": 0.0, "reward": 4.4375, "reward_std": 0.3751000165939331, "rewards/gpt4o_reward_model": 4.4375, "step": 218 }, { "completion_length": 317.5, "epoch": 0.02364755425979916, "grad_norm": 3.215393304824829, "kl": 0.033447265625, "learning_rate": 9.972652252179589e-07, "loss": 0.0, "reward": 3.625, "reward_std": 0.3944375813007355, "rewards/gpt4o_reward_model": 3.625, "step": 219 }, { "completion_length": 168.75, "epoch": 0.02375553395961559, "grad_norm": 2.8032920360565186, "kl": 0.052001953125, "learning_rate": 9.97239510013095e-07, "loss": 0.0001, "reward": 4.875, "reward_std": 0.2501000165939331, "rewards/gpt4o_reward_model": 4.875, "step": 220 }, { "completion_length": 118.75, "epoch": 0.023863513659432026, "grad_norm": 3.416057586669922, "kl": 0.0537109375, "learning_rate": 9.97213674807396e-07, "loss": 0.0001, "reward": 4.875, "reward_std": 0.2501000165939331, "rewards/gpt4o_reward_model": 4.875, "step": 221 }, { "completion_length": 88.5, "epoch": 0.02397149335924846, "grad_norm": 3.6612114906311035, "kl": 0.05908203125, "learning_rate": 9.971877196070967e-07, "loss": 0.0001, "reward": 4.5, "reward_std": 0.6144567728042603, "rewards/gpt4o_reward_model": 4.5, "step": 222 }, { "completion_length": 117.5, "epoch": 0.024079473059064896, "grad_norm": 2.947577714920044, "kl": 0.04345703125, "learning_rate": 9.971616444184607e-07, "loss": 0.0, "reward": 4.25, "reward_std": 0.6036534309387207, "rewards/gpt4o_reward_model": 4.25, "step": 223 }, { "completion_length": 155.0, "epoch": 0.02418745275888133, "grad_norm": 2.8114304542541504, "kl": 0.048583984375, "learning_rate": 9.971354492477812e-07, "loss": 0.0, "reward": 4.625, "reward_std": 0.36445680260658264, "rewards/gpt4o_reward_model": 4.625, "step": 224 }, { "completion_length": 105.75, "epoch": 0.024295432458697766, "grad_norm": 2.7532522678375244, "kl": 0.05224609375, "learning_rate": 9.9710913410138e-07, "loss": 0.0001, "reward": 4.5625, "reward_std": 0.3751000165939331, "rewards/gpt4o_reward_model": 4.5625, "step": 225 }, { "completion_length": 251.25, "epoch": 0.024403412158514198, "grad_norm": 3.266697883605957, "kl": 0.052001953125, "learning_rate": 9.970826989856076e-07, "loss": 0.0001, "reward": 4.6875, "reward_std": 0.41377514600753784, "rewards/gpt4o_reward_model": 4.6875, "step": 226 }, { "completion_length": 199.5, "epoch": 0.024511391858330633, "grad_norm": 3.5519814491271973, "kl": 0.05908203125, "learning_rate": 9.970561439068438e-07, "loss": 0.0001, "reward": 3.9375, "reward_std": 0.48945680260658264, "rewards/gpt4o_reward_model": 3.9375, "step": 227 }, { "completion_length": 61.25, "epoch": 0.024619371558147068, "grad_norm": 3.1293420791625977, "kl": 0.07080078125, "learning_rate": 9.970294688714975e-07, "loss": 0.0001, "reward": 4.6875, "reward_std": 0.1251000016927719, "rewards/gpt4o_reward_model": 4.6875, "step": 228 }, { "completion_length": 207.25, "epoch": 0.024727351257963503, "grad_norm": 4.59749174118042, "kl": 0.04931640625, "learning_rate": 9.970026738860058e-07, "loss": 0.0, "reward": 3.96875, "reward_std": 0.8234953880310059, "rewards/gpt4o_reward_model": 3.96875, "step": 229 }, { "completion_length": 137.0, "epoch": 0.024835330957779938, "grad_norm": 3.6132426261901855, "kl": 0.04833984375, "learning_rate": 9.969757589568354e-07, "loss": 0.0, "reward": 4.25, "reward_std": 0.3944375813007355, "rewards/gpt4o_reward_model": 4.25, "step": 230 }, { "completion_length": 115.75, "epoch": 0.024943310657596373, "grad_norm": 4.572238922119141, "kl": 0.06201171875, "learning_rate": 9.96948724090482e-07, "loss": 0.0001, "reward": 4.5, "reward_std": 0.5774502754211426, "rewards/gpt4o_reward_model": 4.5, "step": 231 }, { "completion_length": 49.5, "epoch": 0.025051290357412808, "grad_norm": 3.109471559524536, "kl": 0.0751953125, "learning_rate": 9.969215692934702e-07, "loss": 0.0001, "reward": 4.625, "reward_std": 0.4331127107143402, "rewards/gpt4o_reward_model": 4.625, "step": 232 }, { "completion_length": 336.0, "epoch": 0.02515927005722924, "grad_norm": 4.591429710388184, "kl": 0.05322265625, "learning_rate": 9.968942945723529e-07, "loss": 0.0001, "reward": 4.375, "reward_std": 0.8185809850692749, "rewards/gpt4o_reward_model": 4.375, "step": 233 }, { "completion_length": 174.25, "epoch": 0.025267249757045675, "grad_norm": 3.161344289779663, "kl": 0.06103515625, "learning_rate": 9.968668999337124e-07, "loss": 0.0001, "reward": 4.75, "reward_std": 0.3944375813007355, "rewards/gpt4o_reward_model": 4.75, "step": 234 }, { "completion_length": 117.75, "epoch": 0.02537522945686211, "grad_norm": 3.8266708850860596, "kl": 0.05615234375, "learning_rate": 9.968393853841605e-07, "loss": 0.0001, "reward": 4.0625, "reward_std": 0.23945678770542145, "rewards/gpt4o_reward_model": 4.0625, "step": 235 }, { "completion_length": 144.75, "epoch": 0.025483209156678545, "grad_norm": 3.3600192070007324, "kl": 0.05712890625, "learning_rate": 9.96811750930337e-07, "loss": 0.0001, "reward": 4.125, "reward_std": 0.9463939070701599, "rewards/gpt4o_reward_model": 4.125, "step": 236 }, { "completion_length": 260.5, "epoch": 0.02559118885649498, "grad_norm": 4.064725875854492, "kl": 0.05859375, "learning_rate": 9.96783996578911e-07, "loss": 0.0001, "reward": 4.75, "reward_std": 0.3944375813007355, "rewards/gpt4o_reward_model": 4.75, "step": 237 }, { "completion_length": 151.5, "epoch": 0.025699168556311415, "grad_norm": 2.2150022983551025, "kl": 0.04931640625, "learning_rate": 9.967561223365806e-07, "loss": 0.0, "reward": 4.875, "reward_std": 0.14443756639957428, "rewards/gpt4o_reward_model": 4.875, "step": 238 }, { "completion_length": 140.5, "epoch": 0.025807148256127847, "grad_norm": 3.777392625808716, "kl": 0.09326171875, "learning_rate": 9.96728128210073e-07, "loss": 0.0001, "reward": 4.625, "reward_std": 0.5387751460075378, "rewards/gpt4o_reward_model": 4.625, "step": 239 }, { "completion_length": 179.5, "epoch": 0.02591512795594428, "grad_norm": 5.373637676239014, "kl": 0.0849609375, "learning_rate": 9.967000142061439e-07, "loss": 0.0001, "reward": 4.3125, "reward_std": 0.5194375514984131, "rewards/gpt4o_reward_model": 4.3125, "step": 240 }, { "completion_length": 133.0, "epoch": 0.026023107655760717, "grad_norm": 1.9456915855407715, "kl": 0.061767578125, "learning_rate": 9.966717803315785e-07, "loss": 0.0001, "reward": 4.3125, "reward_std": 0.1251000016927719, "rewards/gpt4o_reward_model": 4.3125, "step": 241 }, { "completion_length": 335.0, "epoch": 0.026131087355577152, "grad_norm": 3.1238064765930176, "kl": 0.04443359375, "learning_rate": 9.966434265931902e-07, "loss": 0.0, "reward": 4.8125, "reward_std": 0.3751000165939331, "rewards/gpt4o_reward_model": 4.8125, "step": 242 }, { "completion_length": 186.5, "epoch": 0.026239067055393587, "grad_norm": 4.6672492027282715, "kl": 0.0595703125, "learning_rate": 9.966149529978221e-07, "loss": 0.0001, "reward": 4.5625, "reward_std": 0.5194375514984131, "rewards/gpt4o_reward_model": 4.5625, "step": 243 }, { "completion_length": 192.25, "epoch": 0.026347046755210022, "grad_norm": 3.2580738067626953, "kl": 0.052001953125, "learning_rate": 9.965863595523454e-07, "loss": 0.0001, "reward": 4.6875, "reward_std": 0.41377514600753784, "rewards/gpt4o_reward_model": 4.6875, "step": 244 }, { "completion_length": 152.75, "epoch": 0.026455026455026454, "grad_norm": 4.204739570617676, "kl": 0.0771484375, "learning_rate": 9.96557646263661e-07, "loss": 0.0001, "reward": 4.1875, "reward_std": 0.41377514600753784, "rewards/gpt4o_reward_model": 4.1875, "step": 245 }, { "completion_length": 120.0, "epoch": 0.02656300615484289, "grad_norm": 4.965406894683838, "kl": 0.055419921875, "learning_rate": 9.965288131386984e-07, "loss": 0.0001, "reward": 4.3125, "reward_std": 1.0930101871490479, "rewards/gpt4o_reward_model": 4.3125, "step": 246 }, { "completion_length": 67.25, "epoch": 0.026670985854659324, "grad_norm": 3.763925552368164, "kl": 0.07470703125, "learning_rate": 9.964998601844158e-07, "loss": 0.0001, "reward": 4.4375, "reward_std": 0.6251000165939331, "rewards/gpt4o_reward_model": 4.4375, "step": 247 }, { "completion_length": 81.5, "epoch": 0.02677896555447576, "grad_norm": 9.182212829589844, "kl": 0.055908203125, "learning_rate": 9.96470787407801e-07, "loss": 0.0001, "reward": 4.0625, "reward_std": 0.2694375813007355, "rewards/gpt4o_reward_model": 4.0625, "step": 248 }, { "completion_length": 97.0, "epoch": 0.026886945254292194, "grad_norm": 2.3663017749786377, "kl": 0.04931640625, "learning_rate": 9.964415948158696e-07, "loss": 0.0, "reward": 4.875, "reward_std": 0.2501000165939331, "rewards/gpt4o_reward_model": 4.875, "step": 249 }, { "completion_length": 100.75, "epoch": 0.02699492495410863, "grad_norm": 3.542591094970703, "kl": 0.08837890625, "learning_rate": 9.964122824156672e-07, "loss": 0.0001, "reward": 4.5625, "reward_std": 0.383794367313385, "rewards/gpt4o_reward_model": 4.5625, "step": 250 }, { "completion_length": 178.75, "epoch": 0.02710290465392506, "grad_norm": 3.1234817504882812, "kl": 0.061279296875, "learning_rate": 9.963828502142677e-07, "loss": 0.0001, "reward": 4.8125, "reward_std": 0.3751000165939331, "rewards/gpt4o_reward_model": 4.8125, "step": 251 }, { "completion_length": 146.0, "epoch": 0.027210884353741496, "grad_norm": 3.096550226211548, "kl": 0.060546875, "learning_rate": 9.963532982187743e-07, "loss": 0.0001, "reward": 4.5, "reward_std": 0.3944375813007355, "rewards/gpt4o_reward_model": 4.5, "step": 252 }, { "completion_length": 339.75, "epoch": 0.02731886405355793, "grad_norm": 4.885226726531982, "kl": 0.050048828125, "learning_rate": 9.963236264363188e-07, "loss": 0.0001, "reward": 3.875, "reward_std": 0.5985617637634277, "rewards/gpt4o_reward_model": 3.875, "step": 253 }, { "completion_length": 141.75, "epoch": 0.027426843753374366, "grad_norm": 4.643560886383057, "kl": 0.09033203125, "learning_rate": 9.962938348740617e-07, "loss": 0.0001, "reward": 4.3125, "reward_std": 0.633794367313385, "rewards/gpt4o_reward_model": 4.3125, "step": 254 }, { "completion_length": 150.75, "epoch": 0.0275348234531908, "grad_norm": 8.297577857971191, "kl": 0.1337890625, "learning_rate": 9.962639235391932e-07, "loss": 0.0001, "reward": 4.1875, "reward_std": 0.2694375813007355, "rewards/gpt4o_reward_model": 4.1875, "step": 255 }, { "completion_length": 122.5, "epoch": 0.027642803153007236, "grad_norm": 3.3994829654693604, "kl": 0.055419921875, "learning_rate": 9.962338924389318e-07, "loss": 0.0001, "reward": 4.4375, "reward_std": 0.48945680260658264, "rewards/gpt4o_reward_model": 4.4375, "step": 256 }, { "completion_length": 166.0, "epoch": 0.027750782852823667, "grad_norm": 4.4432244300842285, "kl": 0.0712890625, "learning_rate": 9.962037415805248e-07, "loss": 0.0001, "reward": 4.625, "reward_std": 0.6444375514984131, "rewards/gpt4o_reward_model": 4.625, "step": 257 }, { "completion_length": 104.0, "epoch": 0.027858762552640103, "grad_norm": 4.164874076843262, "kl": 0.0546875, "learning_rate": 9.961734709712488e-07, "loss": 0.0001, "reward": 3.625, "reward_std": 0.9396764636039734, "rewards/gpt4o_reward_model": 3.625, "step": 258 }, { "completion_length": 334.25, "epoch": 0.027966742252456538, "grad_norm": 5.587090015411377, "kl": 0.08642578125, "learning_rate": 9.961430806184093e-07, "loss": 0.0001, "reward": 4.4375, "reward_std": 0.5921862125396729, "rewards/gpt4o_reward_model": 4.4375, "step": 259 }, { "completion_length": 50.25, "epoch": 0.028074721952272973, "grad_norm": 3.80084490776062, "kl": 0.0908203125, "learning_rate": 9.9611257052934e-07, "loss": 0.0001, "reward": 4.625, "reward_std": 0.4331127107143402, "rewards/gpt4o_reward_model": 4.625, "step": 260 }, { "completion_length": 216.5, "epoch": 0.028182701652089408, "grad_norm": 5.016978740692139, "kl": 0.05322265625, "learning_rate": 9.960819407114046e-07, "loss": 0.0001, "reward": 4.375, "reward_std": 0.6896764636039734, "rewards/gpt4o_reward_model": 4.375, "step": 261 }, { "completion_length": 45.0, "epoch": 0.028290681351905843, "grad_norm": 9.00546932220459, "kl": 0.0830078125, "learning_rate": 9.960511911719949e-07, "loss": 0.0001, "reward": 3.3125, "reward_std": 0.9137751460075378, "rewards/gpt4o_reward_model": 3.3125, "step": 262 }, { "completion_length": 113.5, "epoch": 0.028398661051722278, "grad_norm": 2.5115816593170166, "kl": 0.0673828125, "learning_rate": 9.960203219185314e-07, "loss": 0.0001, "reward": 4.8125, "reward_std": 0.1251000016927719, "rewards/gpt4o_reward_model": 4.8125, "step": 263 }, { "completion_length": 466.75, "epoch": 0.02850664075153871, "grad_norm": 4.739973545074463, "kl": 0.07861328125, "learning_rate": 9.959893329584647e-07, "loss": 0.0001, "reward": 4.1875, "reward_std": 0.633794367313385, "rewards/gpt4o_reward_model": 4.1875, "step": 264 }, { "completion_length": 188.25, "epoch": 0.028614620451355145, "grad_norm": 3.5349810123443604, "kl": 0.083984375, "learning_rate": 9.95958224299273e-07, "loss": 0.0001, "reward": 4.75, "reward_std": 0.5001000165939331, "rewards/gpt4o_reward_model": 4.75, "step": 265 }, { "completion_length": 79.0, "epoch": 0.02872260015117158, "grad_norm": 4.118009567260742, "kl": 0.055908203125, "learning_rate": 9.95926995948464e-07, "loss": 0.0001, "reward": 4.4375, "reward_std": 0.7286534309387207, "rewards/gpt4o_reward_model": 4.4375, "step": 266 }, { "completion_length": 63.25, "epoch": 0.028830579850988015, "grad_norm": 4.015948295593262, "kl": 0.111328125, "learning_rate": 9.95895647913574e-07, "loss": 0.0001, "reward": 4.8125, "reward_std": 0.2694375813007355, "rewards/gpt4o_reward_model": 4.8125, "step": 267 }, { "completion_length": 85.75, "epoch": 0.02893855955080445, "grad_norm": 4.853457927703857, "kl": 0.07763671875, "learning_rate": 9.958641802021685e-07, "loss": 0.0001, "reward": 4.4375, "reward_std": 0.6038135886192322, "rewards/gpt4o_reward_model": 4.4375, "step": 268 }, { "completion_length": 103.0, "epoch": 0.029046539250620885, "grad_norm": 2.9576172828674316, "kl": 0.05126953125, "learning_rate": 9.958325928218419e-07, "loss": 0.0001, "reward": 4.125, "reward_std": 0.4788135886192322, "rewards/gpt4o_reward_model": 4.125, "step": 269 }, { "completion_length": 192.0, "epoch": 0.029154518950437316, "grad_norm": 3.628898859024048, "kl": 0.0869140625, "learning_rate": 9.958008857802169e-07, "loss": 0.0001, "reward": 4.8125, "reward_std": 0.3751000165939331, "rewards/gpt4o_reward_model": 4.8125, "step": 270 }, { "completion_length": 321.0, "epoch": 0.02926249865025375, "grad_norm": 3.3045108318328857, "kl": 0.06982421875, "learning_rate": 9.957690590849457e-07, "loss": 0.0001, "reward": 4.6875, "reward_std": 0.5194375514984131, "rewards/gpt4o_reward_model": 4.6875, "step": 271 }, { "completion_length": 123.5, "epoch": 0.029370478350070187, "grad_norm": 14.728436470031738, "kl": 6.5, "learning_rate": 9.957371127437093e-07, "loss": 0.0065, "reward": 3.75, "reward_std": 0.8185809850692749, "rewards/gpt4o_reward_model": 3.75, "step": 272 }, { "completion_length": 169.5, "epoch": 0.02947845804988662, "grad_norm": 2.554886817932129, "kl": 0.1025390625, "learning_rate": 9.957050467642172e-07, "loss": 0.0001, "reward": 4.875, "reward_std": 0.2501000165939331, "rewards/gpt4o_reward_model": 4.875, "step": 273 }, { "completion_length": 119.25, "epoch": 0.029586437749703057, "grad_norm": 2.4691953659057617, "kl": 0.0673828125, "learning_rate": 9.956728611542082e-07, "loss": 0.0001, "reward": 4.9375, "reward_std": 0.1251000016927719, "rewards/gpt4o_reward_model": 4.9375, "step": 274 }, { "completion_length": 181.0, "epoch": 0.029694417449519492, "grad_norm": 4.251060485839844, "kl": 0.0791015625, "learning_rate": 9.956405559214498e-07, "loss": 0.0001, "reward": 4.5, "reward_std": 0.6144567728042603, "rewards/gpt4o_reward_model": 4.5, "step": 275 }, { "completion_length": 181.25, "epoch": 0.029802397149335923, "grad_norm": 3.3034920692443848, "kl": 0.07958984375, "learning_rate": 9.956081310737382e-07, "loss": 0.0001, "reward": 4.5625, "reward_std": 0.47356173396110535, "rewards/gpt4o_reward_model": 4.5625, "step": 276 }, { "completion_length": 105.75, "epoch": 0.02991037684915236, "grad_norm": 4.519341945648193, "kl": 0.08642578125, "learning_rate": 9.955755866188986e-07, "loss": 0.0001, "reward": 4.625, "reward_std": 0.5001000165939331, "rewards/gpt4o_reward_model": 4.625, "step": 277 }, { "completion_length": 115.0, "epoch": 0.030018356548968794, "grad_norm": 3.830291509628296, "kl": 0.0966796875, "learning_rate": 9.955429225647854e-07, "loss": 0.0001, "reward": 3.6875, "reward_std": 0.5281319618225098, "rewards/gpt4o_reward_model": 3.6875, "step": 278 }, { "completion_length": 342.75, "epoch": 0.03012633624878523, "grad_norm": 3.2819526195526123, "kl": 0.0712890625, "learning_rate": 9.95510138919281e-07, "loss": 0.0001, "reward": 4.625, "reward_std": 0.508794367313385, "rewards/gpt4o_reward_model": 4.625, "step": 279 }, { "completion_length": 107.75, "epoch": 0.030234315948601664, "grad_norm": 4.989029884338379, "kl": 0.0771484375, "learning_rate": 9.95477235690298e-07, "loss": 0.0001, "reward": 4.0, "reward_std": 0.7501000165939331, "rewards/gpt4o_reward_model": 4.0, "step": 280 }, { "completion_length": 128.0, "epoch": 0.0303422956484181, "grad_norm": 2.953835964202881, "kl": 0.09375, "learning_rate": 9.954442128857761e-07, "loss": 0.0001, "reward": 4.75, "reward_std": 0.28877514600753784, "rewards/gpt4o_reward_model": 4.75, "step": 281 }, { "completion_length": 142.75, "epoch": 0.03045027534823453, "grad_norm": 4.569122314453125, "kl": 0.08544921875, "learning_rate": 9.954110705136856e-07, "loss": 0.0001, "reward": 4.375, "reward_std": 0.5387751460075378, "rewards/gpt4o_reward_model": 4.375, "step": 282 }, { "completion_length": 291.25, "epoch": 0.030558255048050965, "grad_norm": 1.5806266069412231, "kl": 0.09521484375, "learning_rate": 9.953778085820245e-07, "loss": 0.0001, "reward": 4.625, "reward_std": 0.2501000165939331, "rewards/gpt4o_reward_model": 4.625, "step": 283 }, { "completion_length": 35.75, "epoch": 0.0306662347478674, "grad_norm": 5.810234069824219, "kl": 0.11572265625, "learning_rate": 9.953444270988203e-07, "loss": 0.0001, "reward": 4.5625, "reward_std": 0.48945680260658264, "rewards/gpt4o_reward_model": 4.5625, "step": 284 }, { "completion_length": 128.0, "epoch": 0.030774214447683836, "grad_norm": 0.006735849194228649, "kl": 0.09912109375, "learning_rate": 9.953109260721287e-07, "loss": 0.0001, "reward": 5.0, "reward_std": 9.999999747378752e-05, "rewards/gpt4o_reward_model": 5.0, "step": 285 }, { "completion_length": 188.5, "epoch": 0.03088219414750027, "grad_norm": 3.38196063041687, "kl": 0.078125, "learning_rate": 9.952773055100351e-07, "loss": 0.0001, "reward": 4.375, "reward_std": 0.2501000165939331, "rewards/gpt4o_reward_model": 4.375, "step": 286 }, { "completion_length": 241.75, "epoch": 0.030990173847316706, "grad_norm": 3.892089605331421, "kl": 0.0888671875, "learning_rate": 9.95243565420653e-07, "loss": 0.0001, "reward": 4.0625, "reward_std": 0.633794367313385, "rewards/gpt4o_reward_model": 4.0625, "step": 287 }, { "completion_length": 141.5, "epoch": 0.031098153547133137, "grad_norm": 3.2392618656158447, "kl": 0.109375, "learning_rate": 9.95209705812125e-07, "loss": 0.0001, "reward": 4.5, "reward_std": 0.508794367313385, "rewards/gpt4o_reward_model": 4.5, "step": 288 }, { "completion_length": 232.25, "epoch": 0.031206133246949572, "grad_norm": 2.5510284900665283, "kl": 0.06640625, "learning_rate": 9.95175726692623e-07, "loss": 0.0001, "reward": 4.0625, "reward_std": 0.4435809552669525, "rewards/gpt4o_reward_model": 4.0625, "step": 289 }, { "completion_length": 53.25, "epoch": 0.03131411294676601, "grad_norm": 6.491455078125, "kl": 0.1220703125, "learning_rate": 9.951416280703465e-07, "loss": 0.0001, "reward": 4.25, "reward_std": 0.3944375813007355, "rewards/gpt4o_reward_model": 4.25, "step": 290 } ], "logging_steps": 1, "max_steps": 6400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }