{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.03131411294676601,
"eval_steps": 500,
"global_step": 290,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 149.0,
"epoch": 0.00010797969981643452,
"grad_norm": 3.35890793800354,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0,
"reward": 3.875,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_reward_model": 3.875,
"step": 1
},
{
"completion_length": 87.25,
"epoch": 0.00021595939963286903,
"grad_norm": 10.705058097839355,
"kl": 0.0,
"learning_rate": 2e-07,
"loss": -0.0,
"reward": 4.5625,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_reward_model": 4.5625,
"step": 2
},
{
"completion_length": 125.0,
"epoch": 0.00032393909944930353,
"grad_norm": 7.079329013824463,
"kl": 0.00016307830810546875,
"learning_rate": 4e-07,
"loss": 0.0,
"reward": 4.5,
"reward_std": 0.758794367313385,
"rewards/gpt4o_reward_model": 4.5,
"step": 3
},
{
"completion_length": 99.5,
"epoch": 0.00043191879926573806,
"grad_norm": 4.656938552856445,
"kl": 0.0004425048828125,
"learning_rate": 6e-07,
"loss": 0.0,
"reward": 3.6875,
"reward_std": 0.6178992986679077,
"rewards/gpt4o_reward_model": 3.6875,
"step": 4
},
{
"completion_length": 175.75,
"epoch": 0.0005398984990821725,
"grad_norm": 4.849682807922363,
"kl": 7.62939453125e-05,
"learning_rate": 8e-07,
"loss": 0.0,
"reward": 4.0625,
"reward_std": 0.579224169254303,
"rewards/gpt4o_reward_model": 4.0625,
"step": 5
},
{
"completion_length": 667.0,
"epoch": 0.0006478781988986071,
"grad_norm": 4.602669715881348,
"kl": 5.936622619628906e-05,
"learning_rate": 1e-06,
"loss": 0.0,
"reward": 3.125,
"reward_std": 0.9418070316314697,
"rewards/gpt4o_reward_model": 3.125,
"step": 6
},
{
"completion_length": 220.5,
"epoch": 0.0007558578987150416,
"grad_norm": 4.340234756469727,
"kl": 0.00010204315185546875,
"learning_rate": 9.999999396664822e-07,
"loss": 0.0,
"reward": 4.0,
"reward_std": 0.7619017362594604,
"rewards/gpt4o_reward_model": 4.0,
"step": 7
},
{
"completion_length": 89.75,
"epoch": 0.0008638375985314761,
"grad_norm": 3.9225738048553467,
"kl": 0.0001468658447265625,
"learning_rate": 9.999997586659434e-07,
"loss": 0.0,
"reward": 4.1875,
"reward_std": 0.8765935897827148,
"rewards/gpt4o_reward_model": 4.1875,
"step": 8
},
{
"completion_length": 135.0,
"epoch": 0.0009718172983479105,
"grad_norm": 4.508596897125244,
"kl": 0.00015354156494140625,
"learning_rate": 9.999994569984275e-07,
"loss": 0.0,
"reward": 4.5625,
"reward_std": 0.5194375514984131,
"rewards/gpt4o_reward_model": 4.5625,
"step": 9
},
{
"completion_length": 199.25,
"epoch": 0.001079796998164345,
"grad_norm": 5.958744525909424,
"kl": 0.0003070831298828125,
"learning_rate": 9.99999034664007e-07,
"loss": 0.0,
"reward": 3.9375,
"reward_std": 0.81220543384552,
"rewards/gpt4o_reward_model": 3.9375,
"step": 10
},
{
"completion_length": 111.5,
"epoch": 0.0011877766979807797,
"grad_norm": 5.480228900909424,
"kl": 0.0001983642578125,
"learning_rate": 9.999984916627839e-07,
"loss": 0.0,
"reward": 4.5625,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_reward_model": 4.5625,
"step": 11
},
{
"completion_length": 185.25,
"epoch": 0.0012957563977972141,
"grad_norm": 3.4712541103363037,
"kl": 0.000213623046875,
"learning_rate": 9.999978279948895e-07,
"loss": 0.0,
"reward": 4.3125,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_reward_model": 4.3125,
"step": 12
},
{
"completion_length": 112.75,
"epoch": 0.0014037360976136485,
"grad_norm": 3.926478147506714,
"kl": 0.0001087188720703125,
"learning_rate": 9.999970436604836e-07,
"loss": 0.0,
"reward": 4.3125,
"reward_std": 0.7286534309387207,
"rewards/gpt4o_reward_model": 4.3125,
"step": 13
},
{
"completion_length": 147.75,
"epoch": 0.0015117157974300832,
"grad_norm": 3.4515721797943115,
"kl": 0.0002498626708984375,
"learning_rate": 9.999961386597556e-07,
"loss": 0.0,
"reward": 4.3125,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_reward_model": 4.3125,
"step": 14
},
{
"completion_length": 117.5,
"epoch": 0.0016196954972465176,
"grad_norm": 6.8424906730651855,
"kl": 0.000476837158203125,
"learning_rate": 9.99995112992924e-07,
"loss": 0.0,
"reward": 3.75,
"reward_std": 0.6144567728042603,
"rewards/gpt4o_reward_model": 3.75,
"step": 15
},
{
"completion_length": 159.5,
"epoch": 0.0017276751970629522,
"grad_norm": 3.0959248542785645,
"kl": 0.000247955322265625,
"learning_rate": 9.999939666602364e-07,
"loss": 0.0,
"reward": 4.6875,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_reward_model": 4.6875,
"step": 16
},
{
"completion_length": 534.0,
"epoch": 0.0018356548968793867,
"grad_norm": 5.143649101257324,
"kl": 0.000560760498046875,
"learning_rate": 9.999926996619692e-07,
"loss": 0.0,
"reward": 3.5625,
"reward_std": 0.8751000165939331,
"rewards/gpt4o_reward_model": 3.5625,
"step": 17
},
{
"completion_length": 181.25,
"epoch": 0.001943634596695821,
"grad_norm": 3.3201894760131836,
"kl": 0.0004825592041015625,
"learning_rate": 9.999913119984283e-07,
"loss": 0.0,
"reward": 4.25,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_reward_model": 4.25,
"step": 18
},
{
"completion_length": 107.0,
"epoch": 0.0020516142965122555,
"grad_norm": 4.407830238342285,
"kl": 0.000476837158203125,
"learning_rate": 9.999898036699488e-07,
"loss": 0.0,
"reward": 4.0,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_reward_model": 4.0,
"step": 19
},
{
"completion_length": 106.5,
"epoch": 0.00215959399632869,
"grad_norm": 5.270094394683838,
"kl": 0.0008392333984375,
"learning_rate": 9.999881746768941e-07,
"loss": 0.0,
"reward": 3.6875,
"reward_std": 0.633794367313385,
"rewards/gpt4o_reward_model": 3.6875,
"step": 20
},
{
"completion_length": 67.0,
"epoch": 0.0022675736961451248,
"grad_norm": 3.9482433795928955,
"kl": 0.000579833984375,
"learning_rate": 9.99986425019658e-07,
"loss": 0.0,
"reward": 3.75,
"reward_std": 0.758794367313385,
"rewards/gpt4o_reward_model": 3.75,
"step": 21
},
{
"completion_length": 163.5,
"epoch": 0.0023755533959615594,
"grad_norm": 3.5114428997039795,
"kl": 0.000675201416015625,
"learning_rate": 9.999845546986625e-07,
"loss": 0.0,
"reward": 4.4375,
"reward_std": 0.5194375514984131,
"rewards/gpt4o_reward_model": 4.4375,
"step": 22
},
{
"completion_length": 80.5,
"epoch": 0.0024835330957779936,
"grad_norm": 2.4469258785247803,
"kl": 0.00099945068359375,
"learning_rate": 9.99982563714359e-07,
"loss": 0.0,
"reward": 3.1875,
"reward_std": 0.23945678770542145,
"rewards/gpt4o_reward_model": 3.1875,
"step": 23
},
{
"completion_length": 136.25,
"epoch": 0.0025915127955944283,
"grad_norm": 2.949350357055664,
"kl": 0.000576019287109375,
"learning_rate": 9.999804520672277e-07,
"loss": 0.0,
"reward": 4.0,
"reward_std": 0.28877514600753784,
"rewards/gpt4o_reward_model": 4.0,
"step": 24
},
{
"completion_length": 56.75,
"epoch": 0.002699492495410863,
"grad_norm": 4.914743423461914,
"kl": 0.00127410888671875,
"learning_rate": 9.999782197577788e-07,
"loss": 0.0,
"reward": 4.0,
"reward_std": 0.5387751460075378,
"rewards/gpt4o_reward_model": 4.0,
"step": 25
},
{
"completion_length": 83.0,
"epoch": 0.002807472195227297,
"grad_norm": 4.330339431762695,
"kl": 0.00090789794921875,
"learning_rate": 9.999758667865504e-07,
"loss": 0.0,
"reward": 4.3125,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_reward_model": 4.3125,
"step": 26
},
{
"completion_length": 111.25,
"epoch": 0.0029154518950437317,
"grad_norm": 7.491921901702881,
"kl": 0.00148773193359375,
"learning_rate": 9.999733931541108e-07,
"loss": 0.0,
"reward": 3.6875,
"reward_std": 0.796310305595398,
"rewards/gpt4o_reward_model": 3.6875,
"step": 27
},
{
"completion_length": 99.5,
"epoch": 0.0030234315948601664,
"grad_norm": 1.0942319631576538,
"kl": 0.00092315673828125,
"learning_rate": 9.999707988610568e-07,
"loss": 0.0,
"reward": 4.625,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_reward_model": 4.625,
"step": 28
},
{
"completion_length": 107.5,
"epoch": 0.003131411294676601,
"grad_norm": 4.034415245056152,
"kl": 0.001800537109375,
"learning_rate": 9.999680839080146e-07,
"loss": 0.0,
"reward": 3.9375,
"reward_std": 0.579224169254303,
"rewards/gpt4o_reward_model": 3.9375,
"step": 29
},
{
"completion_length": 183.5,
"epoch": 0.003239390994493035,
"grad_norm": 5.061659336090088,
"kl": 0.0013885498046875,
"learning_rate": 9.999652482956392e-07,
"loss": 0.0,
"reward": 4.4375,
"reward_std": 0.5194375514984131,
"rewards/gpt4o_reward_model": 4.4375,
"step": 30
},
{
"completion_length": 116.75,
"epoch": 0.00334737069430947,
"grad_norm": 4.587920188903809,
"kl": 0.002685546875,
"learning_rate": 9.99962292024615e-07,
"loss": 0.0,
"reward": 3.8125,
"reward_std": 0.7394567728042603,
"rewards/gpt4o_reward_model": 3.8125,
"step": 31
},
{
"completion_length": 92.0,
"epoch": 0.0034553503941259045,
"grad_norm": 4.07750129699707,
"kl": 0.001800537109375,
"learning_rate": 9.999592150956556e-07,
"loss": 0.0,
"reward": 3.6875,
"reward_std": 0.47356173396110535,
"rewards/gpt4o_reward_model": 3.6875,
"step": 32
},
{
"completion_length": 92.0,
"epoch": 0.0035633300939423387,
"grad_norm": 4.351490020751953,
"kl": 0.0020294189453125,
"learning_rate": 9.999560175095034e-07,
"loss": 0.0,
"reward": 4.125,
"reward_std": 0.6115237474441528,
"rewards/gpt4o_reward_model": 4.125,
"step": 33
},
{
"completion_length": 92.5,
"epoch": 0.0036713097937587733,
"grad_norm": 5.623327255249023,
"kl": 0.0019378662109375,
"learning_rate": 9.9995269926693e-07,
"loss": 0.0,
"reward": 4.625,
"reward_std": 0.34856173396110535,
"rewards/gpt4o_reward_model": 4.625,
"step": 34
},
{
"completion_length": 90.5,
"epoch": 0.003779289493575208,
"grad_norm": 7.087028503417969,
"kl": 0.00164031982421875,
"learning_rate": 9.999492603687366e-07,
"loss": 0.0,
"reward": 4.8125,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_reward_model": 4.8125,
"step": 35
},
{
"completion_length": 117.5,
"epoch": 0.003887269193391642,
"grad_norm": 3.4546611309051514,
"kl": 0.00173187255859375,
"learning_rate": 9.999457008157528e-07,
"loss": 0.0,
"reward": 4.0,
"reward_std": 0.508794367313385,
"rewards/gpt4o_reward_model": 4.0,
"step": 36
},
{
"completion_length": 164.25,
"epoch": 0.003995248893208077,
"grad_norm": 3.795534610748291,
"kl": 0.00201416015625,
"learning_rate": 9.999420206088379e-07,
"loss": 0.0,
"reward": 3.875,
"reward_std": 0.6144567728042603,
"rewards/gpt4o_reward_model": 3.875,
"step": 37
},
{
"completion_length": 168.75,
"epoch": 0.004103228593024511,
"grad_norm": 4.190282344818115,
"kl": 0.00183868408203125,
"learning_rate": 9.999382197488796e-07,
"loss": 0.0,
"reward": 4.3125,
"reward_std": 0.41377514600753784,
"rewards/gpt4o_reward_model": 4.3125,
"step": 38
},
{
"completion_length": 140.0,
"epoch": 0.004211208292840946,
"grad_norm": 2.999417781829834,
"kl": 0.0020904541015625,
"learning_rate": 9.999342982367957e-07,
"loss": 0.0,
"reward": 4.0,
"reward_std": 0.36445680260658264,
"rewards/gpt4o_reward_model": 4.0,
"step": 39
},
{
"completion_length": 102.5,
"epoch": 0.00431918799265738,
"grad_norm": 5.018375873565674,
"kl": 0.0027313232421875,
"learning_rate": 9.99930256073532e-07,
"loss": 0.0,
"reward": 3.75,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_reward_model": 3.75,
"step": 40
},
{
"completion_length": 94.5,
"epoch": 0.004427167692473815,
"grad_norm": 5.892095565795898,
"kl": 0.0028839111328125,
"learning_rate": 9.999260932600648e-07,
"loss": 0.0,
"reward": 3.8125,
"reward_std": 1.1827775239944458,
"rewards/gpt4o_reward_model": 3.8125,
"step": 41
},
{
"completion_length": 88.0,
"epoch": 0.0045351473922902496,
"grad_norm": 3.4567017555236816,
"kl": 0.0014190673828125,
"learning_rate": 9.99921809797398e-07,
"loss": 0.0,
"reward": 4.1875,
"reward_std": 0.47356173396110535,
"rewards/gpt4o_reward_model": 4.1875,
"step": 42
},
{
"completion_length": 148.75,
"epoch": 0.004643127092106684,
"grad_norm": 4.687886714935303,
"kl": 0.0025177001953125,
"learning_rate": 9.999174056865658e-07,
"loss": 0.0,
"reward": 4.1875,
"reward_std": 0.5581127405166626,
"rewards/gpt4o_reward_model": 4.1875,
"step": 43
},
{
"completion_length": 188.0,
"epoch": 0.004751106791923119,
"grad_norm": 3.5290260314941406,
"kl": 0.00238037109375,
"learning_rate": 9.999128809286309e-07,
"loss": 0.0,
"reward": 4.25,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_reward_model": 4.25,
"step": 44
},
{
"completion_length": 86.5,
"epoch": 0.004859086491739553,
"grad_norm": 4.073955535888672,
"kl": 0.002655029296875,
"learning_rate": 9.99908235524685e-07,
"loss": 0.0,
"reward": 4.0625,
"reward_std": 0.48945680260658264,
"rewards/gpt4o_reward_model": 4.0625,
"step": 45
},
{
"completion_length": 87.25,
"epoch": 0.004967066191555987,
"grad_norm": 4.2338104248046875,
"kl": 0.0042724609375,
"learning_rate": 9.9990346947585e-07,
"loss": 0.0,
"reward": 4.5,
"reward_std": 0.508794367313385,
"rewards/gpt4o_reward_model": 4.5,
"step": 46
},
{
"completion_length": 127.25,
"epoch": 0.005075045891372422,
"grad_norm": 18.224328994750977,
"kl": 0.01458740234375,
"learning_rate": 9.998985827832752e-07,
"loss": 0.0,
"reward": 4.1875,
"reward_std": 0.7365237474441528,
"rewards/gpt4o_reward_model": 4.1875,
"step": 47
},
{
"completion_length": 224.75,
"epoch": 0.0051830255911888565,
"grad_norm": 3.0918076038360596,
"kl": 0.0031585693359375,
"learning_rate": 9.998935754481404e-07,
"loss": 0.0,
"reward": 4.0625,
"reward_std": 0.4435809552669525,
"rewards/gpt4o_reward_model": 4.0625,
"step": 48
},
{
"completion_length": 98.25,
"epoch": 0.005291005291005291,
"grad_norm": 6.054563522338867,
"kl": 0.0042724609375,
"learning_rate": 9.998884474716539e-07,
"loss": 0.0,
"reward": 2.875,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_reward_model": 2.875,
"step": 49
},
{
"completion_length": 103.75,
"epoch": 0.005398984990821726,
"grad_norm": 5.9451069831848145,
"kl": 0.0028533935546875,
"learning_rate": 9.998831988550533e-07,
"loss": 0.0,
"reward": 4.5625,
"reward_std": 0.48945680260658264,
"rewards/gpt4o_reward_model": 4.5625,
"step": 50
},
{
"completion_length": 133.75,
"epoch": 0.00550696469063816,
"grad_norm": 6.4553728103637695,
"kl": 0.006317138671875,
"learning_rate": 9.998778295996054e-07,
"loss": 0.0,
"reward": 3.5625,
"reward_std": 0.9518133401870728,
"rewards/gpt4o_reward_model": 3.5625,
"step": 51
},
{
"completion_length": 81.5,
"epoch": 0.005614944390454594,
"grad_norm": 6.15158748626709,
"kl": 0.002899169921875,
"learning_rate": 9.998723397066058e-07,
"loss": 0.0,
"reward": 3.8125,
"reward_std": 1.060096263885498,
"rewards/gpt4o_reward_model": 3.8125,
"step": 52
},
{
"completion_length": 134.0,
"epoch": 0.005722924090271029,
"grad_norm": 5.659409999847412,
"kl": 0.0040283203125,
"learning_rate": 9.998667291773794e-07,
"loss": 0.0,
"reward": 3.5625,
"reward_std": 0.8831573724746704,
"rewards/gpt4o_reward_model": 3.5625,
"step": 53
},
{
"completion_length": 100.0,
"epoch": 0.0058309037900874635,
"grad_norm": 5.585843563079834,
"kl": 0.004150390625,
"learning_rate": 9.998609980132803e-07,
"loss": 0.0,
"reward": 3.75,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_reward_model": 3.75,
"step": 54
},
{
"completion_length": 158.0,
"epoch": 0.005938883489903898,
"grad_norm": 4.5382161140441895,
"kl": 0.004302978515625,
"learning_rate": 9.998551462156917e-07,
"loss": 0.0,
"reward": 4.0,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_reward_model": 4.0,
"step": 55
},
{
"completion_length": 83.75,
"epoch": 0.006046863189720333,
"grad_norm": 3.957919120788574,
"kl": 0.0040283203125,
"learning_rate": 9.998491737860255e-07,
"loss": 0.0,
"reward": 4.375,
"reward_std": 0.5840140581130981,
"rewards/gpt4o_reward_model": 4.375,
"step": 56
},
{
"completion_length": 175.5,
"epoch": 0.006154842889536767,
"grad_norm": 4.4697747230529785,
"kl": 0.004302978515625,
"learning_rate": 9.998430807257234e-07,
"loss": 0.0,
"reward": 4.375,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_reward_model": 4.375,
"step": 57
},
{
"completion_length": 145.75,
"epoch": 0.006262822589353202,
"grad_norm": 4.669873237609863,
"kl": 0.004364013671875,
"learning_rate": 9.998368670362557e-07,
"loss": 0.0,
"reward": 3.9375,
"reward_std": 0.48945680260658264,
"rewards/gpt4o_reward_model": 3.9375,
"step": 58
},
{
"completion_length": 172.25,
"epoch": 0.006370802289169636,
"grad_norm": 4.383346080780029,
"kl": 0.003509521484375,
"learning_rate": 9.998305327191222e-07,
"loss": 0.0,
"reward": 4.125,
"reward_std": 0.971612811088562,
"rewards/gpt4o_reward_model": 4.125,
"step": 59
},
{
"completion_length": 125.5,
"epoch": 0.00647878198898607,
"grad_norm": 4.461704254150391,
"kl": 0.0030670166015625,
"learning_rate": 9.998240777758514e-07,
"loss": 0.0,
"reward": 4.25,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_reward_model": 4.25,
"step": 60
},
{
"completion_length": 220.25,
"epoch": 0.0065867616888025055,
"grad_norm": 4.459292888641357,
"kl": 0.00457763671875,
"learning_rate": 9.99817502208001e-07,
"loss": 0.0,
"reward": 4.0625,
"reward_std": 0.883794367313385,
"rewards/gpt4o_reward_model": 4.0625,
"step": 61
},
{
"completion_length": 124.0,
"epoch": 0.00669474138861894,
"grad_norm": 4.459859848022461,
"kl": 0.00347900390625,
"learning_rate": 9.998108060171579e-07,
"loss": 0.0,
"reward": 4.125,
"reward_std": 0.7283515930175781,
"rewards/gpt4o_reward_model": 4.125,
"step": 62
},
{
"completion_length": 202.0,
"epoch": 0.006802721088435374,
"grad_norm": 4.133177757263184,
"kl": 0.0035858154296875,
"learning_rate": 9.998039892049383e-07,
"loss": 0.0,
"reward": 4.25,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_reward_model": 4.25,
"step": 63
},
{
"completion_length": 71.5,
"epoch": 0.006910700788251809,
"grad_norm": 5.423212051391602,
"kl": 0.0069580078125,
"learning_rate": 9.997970517729874e-07,
"loss": 0.0,
"reward": 3.9375,
"reward_std": 0.8220234513282776,
"rewards/gpt4o_reward_model": 3.9375,
"step": 64
},
{
"completion_length": 171.25,
"epoch": 0.007018680488068243,
"grad_norm": 3.992265462875366,
"kl": 0.0023040771484375,
"learning_rate": 9.997899937229792e-07,
"loss": 0.0,
"reward": 3.875,
"reward_std": 0.5387751460075378,
"rewards/gpt4o_reward_model": 3.875,
"step": 65
},
{
"completion_length": 118.25,
"epoch": 0.007126660187884677,
"grad_norm": 4.649938583374023,
"kl": 0.00506591796875,
"learning_rate": 9.997828150566171e-07,
"loss": 0.0,
"reward": 4.125,
"reward_std": 0.704224169254303,
"rewards/gpt4o_reward_model": 4.125,
"step": 66
},
{
"completion_length": 112.5,
"epoch": 0.0072346398877011124,
"grad_norm": 4.159398078918457,
"kl": 0.005767822265625,
"learning_rate": 9.997755157756337e-07,
"loss": 0.0,
"reward": 3.6875,
"reward_std": 0.5188006162643433,
"rewards/gpt4o_reward_model": 3.6875,
"step": 67
},
{
"completion_length": 66.5,
"epoch": 0.007342619587517547,
"grad_norm": 3.303025484085083,
"kl": 0.00494384765625,
"learning_rate": 9.997680958817907e-07,
"loss": 0.0,
"reward": 4.125,
"reward_std": 0.454224169254303,
"rewards/gpt4o_reward_model": 4.125,
"step": 68
},
{
"completion_length": 151.0,
"epoch": 0.007450599287333981,
"grad_norm": 3.7500741481781006,
"kl": 0.004150390625,
"learning_rate": 9.99760555376878e-07,
"loss": 0.0,
"reward": 4.5,
"reward_std": 0.5685809850692749,
"rewards/gpt4o_reward_model": 4.5,
"step": 69
},
{
"completion_length": 131.5,
"epoch": 0.007558578987150416,
"grad_norm": 5.136856555938721,
"kl": 0.00537109375,
"learning_rate": 9.997528942627165e-07,
"loss": 0.0,
"reward": 3.8125,
"reward_std": 0.8678992986679077,
"rewards/gpt4o_reward_model": 3.8125,
"step": 70
},
{
"completion_length": 249.25,
"epoch": 0.00766655868696685,
"grad_norm": 3.3416953086853027,
"kl": 0.00469970703125,
"learning_rate": 9.997451125411542e-07,
"loss": 0.0,
"reward": 4.3125,
"reward_std": 0.5646764636039734,
"rewards/gpt4o_reward_model": 4.3125,
"step": 71
},
{
"completion_length": 134.0,
"epoch": 0.007774538386783284,
"grad_norm": 3.785247564315796,
"kl": 0.0054931640625,
"learning_rate": 9.997372102140694e-07,
"loss": 0.0,
"reward": 4.5,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_reward_model": 4.5,
"step": 72
},
{
"completion_length": 82.0,
"epoch": 0.007882518086599719,
"grad_norm": 5.987561225891113,
"kl": 0.00604248046875,
"learning_rate": 9.997291872833694e-07,
"loss": 0.0,
"reward": 4.375,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_reward_model": 4.375,
"step": 73
},
{
"completion_length": 80.75,
"epoch": 0.007990497786416154,
"grad_norm": 4.2266130447387695,
"kl": 0.00677490234375,
"learning_rate": 9.9972104375099e-07,
"loss": 0.0,
"reward": 4.75,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_reward_model": 4.75,
"step": 74
},
{
"completion_length": 120.5,
"epoch": 0.008098477486232589,
"grad_norm": 4.478511333465576,
"kl": 0.005523681640625,
"learning_rate": 9.997127796188967e-07,
"loss": 0.0,
"reward": 4.0,
"reward_std": 0.6831127405166626,
"rewards/gpt4o_reward_model": 4.0,
"step": 75
},
{
"completion_length": 238.75,
"epoch": 0.008206457186049022,
"grad_norm": 3.8930044174194336,
"kl": 0.005279541015625,
"learning_rate": 9.997043948890839e-07,
"loss": 0.0,
"reward": 4.625,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_reward_model": 4.625,
"step": 76
},
{
"completion_length": 357.0,
"epoch": 0.008314436885865457,
"grad_norm": 4.40590763092041,
"kl": 0.00555419921875,
"learning_rate": 9.996958895635754e-07,
"loss": 0.0,
"reward": 4.125,
"reward_std": 0.9470234513282776,
"rewards/gpt4o_reward_model": 4.125,
"step": 77
},
{
"completion_length": 127.25,
"epoch": 0.008422416585681892,
"grad_norm": 4.700560092926025,
"kl": 0.0064697265625,
"learning_rate": 9.996872636444235e-07,
"loss": 0.0,
"reward": 4.5,
"reward_std": 0.5985617637634277,
"rewards/gpt4o_reward_model": 4.5,
"step": 78
},
{
"completion_length": 231.25,
"epoch": 0.008530396285498325,
"grad_norm": 3.062976360321045,
"kl": 0.00421142578125,
"learning_rate": 9.996785171337101e-07,
"loss": 0.0,
"reward": 3.75,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_reward_model": 3.75,
"step": 79
},
{
"completion_length": 102.5,
"epoch": 0.00863837598531476,
"grad_norm": 4.180944919586182,
"kl": 0.00518798828125,
"learning_rate": 9.996696500335458e-07,
"loss": 0.0,
"reward": 4.75,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_reward_model": 4.75,
"step": 80
},
{
"completion_length": 111.75,
"epoch": 0.008746355685131196,
"grad_norm": 4.435964107513428,
"kl": 0.00726318359375,
"learning_rate": 9.996606623460707e-07,
"loss": 0.0,
"reward": 4.375,
"reward_std": 0.7887752056121826,
"rewards/gpt4o_reward_model": 4.375,
"step": 81
},
{
"completion_length": 77.0,
"epoch": 0.00885433538494763,
"grad_norm": 3.1188766956329346,
"kl": 0.007171630859375,
"learning_rate": 9.99651554073454e-07,
"loss": 0.0,
"reward": 4.5625,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_reward_model": 4.5625,
"step": 82
},
{
"completion_length": 104.75,
"epoch": 0.008962315084764064,
"grad_norm": 5.255317687988281,
"kl": 0.0072021484375,
"learning_rate": 9.996423252178933e-07,
"loss": 0.0,
"reward": 4.4375,
"reward_std": 0.6935809850692749,
"rewards/gpt4o_reward_model": 4.4375,
"step": 83
},
{
"completion_length": 106.75,
"epoch": 0.009070294784580499,
"grad_norm": 4.521153926849365,
"kl": 0.00860595703125,
"learning_rate": 9.996329757816166e-07,
"loss": 0.0,
"reward": 4.0625,
"reward_std": 0.6790332794189453,
"rewards/gpt4o_reward_model": 4.0625,
"step": 84
},
{
"completion_length": 177.75,
"epoch": 0.009178274484396934,
"grad_norm": 4.019938945770264,
"kl": 0.00653076171875,
"learning_rate": 9.996235057668797e-07,
"loss": 0.0,
"reward": 4.75,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_reward_model": 4.75,
"step": 85
},
{
"completion_length": 116.5,
"epoch": 0.009286254184213368,
"grad_norm": 4.606100559234619,
"kl": 0.0078125,
"learning_rate": 9.99613915175968e-07,
"loss": 0.0,
"reward": 4.0625,
"reward_std": 0.8233708143234253,
"rewards/gpt4o_reward_model": 4.0625,
"step": 86
},
{
"completion_length": 310.25,
"epoch": 0.009394233884029803,
"grad_norm": 5.037557601928711,
"kl": 0.00836181640625,
"learning_rate": 9.996042040111962e-07,
"loss": 0.0,
"reward": 3.9375,
"reward_std": 0.7481511831283569,
"rewards/gpt4o_reward_model": 3.9375,
"step": 87
},
{
"completion_length": 135.25,
"epoch": 0.009502213583846238,
"grad_norm": 5.936065196990967,
"kl": 0.014404296875,
"learning_rate": 9.99594372274908e-07,
"loss": 0.0,
"reward": 3.9375,
"reward_std": 1.006845474243164,
"rewards/gpt4o_reward_model": 3.9375,
"step": 88
},
{
"completion_length": 165.25,
"epoch": 0.009610193283662671,
"grad_norm": 4.330586910247803,
"kl": 0.007537841796875,
"learning_rate": 9.995844199694763e-07,
"loss": 0.0,
"reward": 3.1875,
"reward_std": 0.796310305595398,
"rewards/gpt4o_reward_model": 3.1875,
"step": 89
},
{
"completion_length": 87.25,
"epoch": 0.009718172983479106,
"grad_norm": 3.3461477756500244,
"kl": 0.0103759765625,
"learning_rate": 9.995743470973024e-07,
"loss": 0.0,
"reward": 4.75,
"reward_std": 0.28877514600753784,
"rewards/gpt4o_reward_model": 4.75,
"step": 90
},
{
"completion_length": 64.5,
"epoch": 0.009826152683295541,
"grad_norm": 3.6653449535369873,
"kl": 0.0084228515625,
"learning_rate": 9.995641536608176e-07,
"loss": 0.0,
"reward": 4.4375,
"reward_std": 0.579224169254303,
"rewards/gpt4o_reward_model": 4.4375,
"step": 91
},
{
"completion_length": 134.5,
"epoch": 0.009934132383111974,
"grad_norm": 3.128530979156494,
"kl": 0.00848388671875,
"learning_rate": 9.99553839662482e-07,
"loss": 0.0,
"reward": 4.8125,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_reward_model": 4.8125,
"step": 92
},
{
"completion_length": 121.25,
"epoch": 0.01004211208292841,
"grad_norm": 4.361353397369385,
"kl": 0.0054931640625,
"learning_rate": 9.995434051047845e-07,
"loss": 0.0,
"reward": 3.625,
"reward_std": 0.6831127405166626,
"rewards/gpt4o_reward_model": 3.625,
"step": 93
},
{
"completion_length": 109.25,
"epoch": 0.010150091782744845,
"grad_norm": 4.838723182678223,
"kl": 0.013916015625,
"learning_rate": 9.995328499902433e-07,
"loss": 0.0,
"reward": 4.0,
"reward_std": 0.8485617637634277,
"rewards/gpt4o_reward_model": 4.0,
"step": 94
},
{
"completion_length": 179.5,
"epoch": 0.010258071482561278,
"grad_norm": 3.451291084289551,
"kl": 0.01019287109375,
"learning_rate": 9.99522174321406e-07,
"loss": 0.0,
"reward": 3.875,
"reward_std": 0.454224169254303,
"rewards/gpt4o_reward_model": 3.875,
"step": 95
},
{
"completion_length": 99.5,
"epoch": 0.010366051182377713,
"grad_norm": 2.002145528793335,
"kl": 0.01031494140625,
"learning_rate": 9.995113781008485e-07,
"loss": 0.0,
"reward": 4.6875,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_reward_model": 4.6875,
"step": 96
},
{
"completion_length": 127.0,
"epoch": 0.010474030882194148,
"grad_norm": 4.460968494415283,
"kl": 0.01141357421875,
"learning_rate": 9.995004613311768e-07,
"loss": 0.0,
"reward": 4.0,
"reward_std": 0.4928992986679077,
"rewards/gpt4o_reward_model": 4.0,
"step": 97
},
{
"completion_length": 193.5,
"epoch": 0.010582010582010581,
"grad_norm": 5.521016597747803,
"kl": 0.01336669921875,
"learning_rate": 9.994894240150252e-07,
"loss": 0.0,
"reward": 2.375,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_reward_model": 2.375,
"step": 98
},
{
"completion_length": 126.5,
"epoch": 0.010689990281827016,
"grad_norm": 7.110630989074707,
"kl": 0.010009765625,
"learning_rate": 9.994782661550573e-07,
"loss": 0.0,
"reward": 4.6875,
"reward_std": 0.5194375514984131,
"rewards/gpt4o_reward_model": 4.6875,
"step": 99
},
{
"completion_length": 253.75,
"epoch": 0.010797969981643452,
"grad_norm": 3.6743593215942383,
"kl": 0.00982666015625,
"learning_rate": 9.994669877539664e-07,
"loss": 0.0,
"reward": 4.1875,
"reward_std": 0.5879185199737549,
"rewards/gpt4o_reward_model": 4.1875,
"step": 100
},
{
"completion_length": 68.5,
"epoch": 0.010905949681459885,
"grad_norm": 4.462601661682129,
"kl": 0.01251220703125,
"learning_rate": 9.994555888144736e-07,
"loss": 0.0,
"reward": 4.0625,
"reward_std": 0.6827775835990906,
"rewards/gpt4o_reward_model": 4.0625,
"step": 101
},
{
"completion_length": 123.0,
"epoch": 0.01101392938127632,
"grad_norm": 4.228979110717773,
"kl": 0.009033203125,
"learning_rate": 9.994440693393305e-07,
"loss": 0.0,
"reward": 4.6875,
"reward_std": 0.6251000165939331,
"rewards/gpt4o_reward_model": 4.6875,
"step": 102
},
{
"completion_length": 95.5,
"epoch": 0.011121909081092755,
"grad_norm": 4.379354476928711,
"kl": 0.01068115234375,
"learning_rate": 9.994324293313169e-07,
"loss": 0.0,
"reward": 3.875,
"reward_std": 0.5728486180305481,
"rewards/gpt4o_reward_model": 3.875,
"step": 103
},
{
"completion_length": 198.5,
"epoch": 0.011229888780909188,
"grad_norm": 4.31371545791626,
"kl": 0.01171875,
"learning_rate": 9.994206687932418e-07,
"loss": 0.0,
"reward": 4.5,
"reward_std": 0.454224169254303,
"rewards/gpt4o_reward_model": 4.5,
"step": 104
},
{
"completion_length": 210.0,
"epoch": 0.011337868480725623,
"grad_norm": 3.053598642349243,
"kl": 0.00994873046875,
"learning_rate": 9.994087877279436e-07,
"loss": 0.0,
"reward": 4.25,
"reward_std": 0.7887751460075378,
"rewards/gpt4o_reward_model": 4.25,
"step": 105
},
{
"completion_length": 144.75,
"epoch": 0.011445848180542059,
"grad_norm": 4.309823036193848,
"kl": 0.0159912109375,
"learning_rate": 9.993967861382895e-07,
"loss": 0.0,
"reward": 3.75,
"reward_std": 0.454224169254303,
"rewards/gpt4o_reward_model": 3.75,
"step": 106
},
{
"completion_length": 170.0,
"epoch": 0.011553827880358492,
"grad_norm": 4.035120487213135,
"kl": 0.00762939453125,
"learning_rate": 9.99384664027176e-07,
"loss": 0.0,
"reward": 4.3125,
"reward_std": 0.6790332198143005,
"rewards/gpt4o_reward_model": 4.3125,
"step": 107
},
{
"completion_length": 219.0,
"epoch": 0.011661807580174927,
"grad_norm": 3.480762004852295,
"kl": 0.0137939453125,
"learning_rate": 9.993724213975286e-07,
"loss": 0.0,
"reward": 4.3125,
"reward_std": 0.329224169254303,
"rewards/gpt4o_reward_model": 4.3125,
"step": 108
},
{
"completion_length": 95.75,
"epoch": 0.011769787279991362,
"grad_norm": 5.104645252227783,
"kl": 0.017578125,
"learning_rate": 9.993600582523015e-07,
"loss": 0.0,
"reward": 3.8125,
"reward_std": 0.8625079393386841,
"rewards/gpt4o_reward_model": 3.8125,
"step": 109
},
{
"completion_length": 198.0,
"epoch": 0.011877766979807795,
"grad_norm": 5.45714807510376,
"kl": 0.0157470703125,
"learning_rate": 9.993475745944787e-07,
"loss": 0.0,
"reward": 3.8125,
"reward_std": 0.7694376111030579,
"rewards/gpt4o_reward_model": 3.8125,
"step": 110
},
{
"completion_length": 242.25,
"epoch": 0.01198574667962423,
"grad_norm": 3.7047228813171387,
"kl": 0.0146484375,
"learning_rate": 9.99334970427073e-07,
"loss": 0.0,
"reward": 3.9375,
"reward_std": 0.8751000165939331,
"rewards/gpt4o_reward_model": 3.9375,
"step": 111
},
{
"completion_length": 163.75,
"epoch": 0.012093726379440665,
"grad_norm": 4.073879718780518,
"kl": 0.0111083984375,
"learning_rate": 9.993222457531262e-07,
"loss": 0.0,
"reward": 4.1875,
"reward_std": 0.6724694967269897,
"rewards/gpt4o_reward_model": 4.1875,
"step": 112
},
{
"completion_length": 113.5,
"epoch": 0.012201706079257099,
"grad_norm": 4.12031888961792,
"kl": 0.01385498046875,
"learning_rate": 9.99309400575709e-07,
"loss": 0.0,
"reward": 3.9375,
"reward_std": 0.6178992986679077,
"rewards/gpt4o_reward_model": 3.9375,
"step": 113
},
{
"completion_length": 43.5,
"epoch": 0.012309685779073534,
"grad_norm": 0.003000754164531827,
"kl": 0.0172119140625,
"learning_rate": 9.992964348979213e-07,
"loss": 0.0,
"reward": 4.25,
"reward_std": 9.999999747378752e-05,
"rewards/gpt4o_reward_model": 4.25,
"step": 114
},
{
"completion_length": 270.0,
"epoch": 0.012417665478889969,
"grad_norm": 3.4506990909576416,
"kl": 0.0166015625,
"learning_rate": 9.992833487228923e-07,
"loss": 0.0,
"reward": 4.125,
"reward_std": 0.5840140581130981,
"rewards/gpt4o_reward_model": 4.125,
"step": 115
},
{
"completion_length": 232.25,
"epoch": 0.012525645178706404,
"grad_norm": 3.1857428550720215,
"kl": 0.0169677734375,
"learning_rate": 9.992701420537803e-07,
"loss": 0.0,
"reward": 4.0,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_reward_model": 4.0,
"step": 116
},
{
"completion_length": 128.0,
"epoch": 0.012633624878522837,
"grad_norm": 3.1388204097747803,
"kl": 0.01611328125,
"learning_rate": 9.992568148937722e-07,
"loss": 0.0,
"reward": 4.4375,
"reward_std": 0.41377514600753784,
"rewards/gpt4o_reward_model": 4.4375,
"step": 117
},
{
"completion_length": 125.75,
"epoch": 0.012741604578339272,
"grad_norm": 3.442460298538208,
"kl": 0.01611328125,
"learning_rate": 9.992433672460844e-07,
"loss": 0.0,
"reward": 4.4375,
"reward_std": 0.4435809552669525,
"rewards/gpt4o_reward_model": 4.4375,
"step": 118
},
{
"completion_length": 143.75,
"epoch": 0.012849584278155708,
"grad_norm": 4.008980751037598,
"kl": 0.0235595703125,
"learning_rate": 9.992297991139627e-07,
"loss": 0.0,
"reward": 4.625,
"reward_std": 0.6144567728042603,
"rewards/gpt4o_reward_model": 4.625,
"step": 119
},
{
"completion_length": 100.5,
"epoch": 0.01295756397797214,
"grad_norm": 3.958385705947876,
"kl": 0.01470947265625,
"learning_rate": 9.992161105006809e-07,
"loss": 0.0,
"reward": 4.1875,
"reward_std": 0.5581127405166626,
"rewards/gpt4o_reward_model": 4.1875,
"step": 120
},
{
"completion_length": 175.0,
"epoch": 0.013065543677788576,
"grad_norm": 3.9238104820251465,
"kl": 0.017333984375,
"learning_rate": 9.99202301409543e-07,
"loss": 0.0,
"reward": 4.5625,
"reward_std": 0.6637751460075378,
"rewards/gpt4o_reward_model": 4.5625,
"step": 121
},
{
"completion_length": 64.0,
"epoch": 0.013173523377605011,
"grad_norm": 5.058772087097168,
"kl": 0.023681640625,
"learning_rate": 9.991883718438813e-07,
"loss": 0.0,
"reward": 4.625,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_reward_model": 4.625,
"step": 122
},
{
"completion_length": 92.5,
"epoch": 0.013281503077421444,
"grad_norm": 6.274599552154541,
"kl": 0.026123046875,
"learning_rate": 9.991743218070577e-07,
"loss": 0.0,
"reward": 3.5,
"reward_std": 0.6144567728042603,
"rewards/gpt4o_reward_model": 3.5,
"step": 123
},
{
"completion_length": 108.0,
"epoch": 0.01338948277723788,
"grad_norm": 4.530334949493408,
"kl": 0.022705078125,
"learning_rate": 9.991601513024628e-07,
"loss": 0.0,
"reward": 3.875,
"reward_std": 0.7581573724746704,
"rewards/gpt4o_reward_model": 3.875,
"step": 124
},
{
"completion_length": 103.25,
"epoch": 0.013497462477054314,
"grad_norm": 2.2850399017333984,
"kl": 0.01556396484375,
"learning_rate": 9.991458603335165e-07,
"loss": 0.0,
"reward": 4.625,
"reward_std": 0.4331127107143402,
"rewards/gpt4o_reward_model": 4.625,
"step": 125
},
{
"completion_length": 114.25,
"epoch": 0.013605442176870748,
"grad_norm": 4.146212100982666,
"kl": 0.02392578125,
"learning_rate": 9.991314489036677e-07,
"loss": 0.0,
"reward": 4.1875,
"reward_std": 0.48945680260658264,
"rewards/gpt4o_reward_model": 4.1875,
"step": 126
},
{
"completion_length": 229.0,
"epoch": 0.013713421876687183,
"grad_norm": 4.601395130157471,
"kl": 0.0205078125,
"learning_rate": 9.991169170163943e-07,
"loss": 0.0,
"reward": 4.0625,
"reward_std": 0.41377514600753784,
"rewards/gpt4o_reward_model": 4.0625,
"step": 127
},
{
"completion_length": 310.5,
"epoch": 0.013821401576503618,
"grad_norm": 3.042806625366211,
"kl": 0.015869140625,
"learning_rate": 9.991022646752035e-07,
"loss": 0.0,
"reward": 4.75,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_reward_model": 4.75,
"step": 128
},
{
"completion_length": 145.0,
"epoch": 0.013929381276320051,
"grad_norm": 3.71138858795166,
"kl": 0.0240478515625,
"learning_rate": 9.990874918836313e-07,
"loss": 0.0,
"reward": 4.0,
"reward_std": 0.7974694967269897,
"rewards/gpt4o_reward_model": 4.0,
"step": 129
},
{
"completion_length": 269.5,
"epoch": 0.014037360976136486,
"grad_norm": 4.026693820953369,
"kl": 0.0189208984375,
"learning_rate": 9.990725986452426e-07,
"loss": 0.0,
"reward": 4.375,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_reward_model": 4.375,
"step": 130
},
{
"completion_length": 186.75,
"epoch": 0.014145340675952921,
"grad_norm": 3.600174903869629,
"kl": 0.0284423828125,
"learning_rate": 9.99057584963632e-07,
"loss": 0.0,
"reward": 3.625,
"reward_std": 0.454224169254303,
"rewards/gpt4o_reward_model": 3.625,
"step": 131
},
{
"completion_length": 86.5,
"epoch": 0.014253320375769355,
"grad_norm": 4.374396324157715,
"kl": 0.0257568359375,
"learning_rate": 9.99042450842423e-07,
"loss": 0.0,
"reward": 4.25,
"reward_std": 0.7288135886192322,
"rewards/gpt4o_reward_model": 4.25,
"step": 132
},
{
"completion_length": 208.0,
"epoch": 0.01436130007558579,
"grad_norm": 5.3308610916137695,
"kl": 0.0211181640625,
"learning_rate": 9.990271962852676e-07,
"loss": 0.0,
"reward": 4.3125,
"reward_std": 0.8081126809120178,
"rewards/gpt4o_reward_model": 4.3125,
"step": 133
},
{
"completion_length": 98.5,
"epoch": 0.014469279775402225,
"grad_norm": 3.5027971267700195,
"kl": 0.02197265625,
"learning_rate": 9.990118212958473e-07,
"loss": 0.0,
"reward": 4.4375,
"reward_std": 0.41377514600753784,
"rewards/gpt4o_reward_model": 4.4375,
"step": 134
},
{
"completion_length": 41.5,
"epoch": 0.014577259475218658,
"grad_norm": 8.935701370239258,
"kl": 0.02490234375,
"learning_rate": 9.989963258778728e-07,
"loss": 0.0,
"reward": 4.0,
"reward_std": 0.6896764636039734,
"rewards/gpt4o_reward_model": 4.0,
"step": 135
},
{
"completion_length": 85.25,
"epoch": 0.014685239175035093,
"grad_norm": 4.740475177764893,
"kl": 0.047119140625,
"learning_rate": 9.989807100350833e-07,
"loss": 0.0,
"reward": 4.4375,
"reward_std": 0.7090140581130981,
"rewards/gpt4o_reward_model": 4.4375,
"step": 136
},
{
"completion_length": 109.5,
"epoch": 0.014793218874851528,
"grad_norm": 4.925447463989258,
"kl": 0.0284423828125,
"learning_rate": 9.989649737712478e-07,
"loss": 0.0,
"reward": 4.0,
"reward_std": 0.6144567728042603,
"rewards/gpt4o_reward_model": 4.0,
"step": 137
},
{
"completion_length": 82.75,
"epoch": 0.014901198574667962,
"grad_norm": 2.8169710636138916,
"kl": 0.026611328125,
"learning_rate": 9.98949117090164e-07,
"loss": 0.0,
"reward": 4.9375,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_reward_model": 4.9375,
"step": 138
},
{
"completion_length": 147.5,
"epoch": 0.015009178274484397,
"grad_norm": 4.605223178863525,
"kl": 0.032470703125,
"learning_rate": 9.989331399956583e-07,
"loss": 0.0,
"reward": 4.5,
"reward_std": 0.508794367313385,
"rewards/gpt4o_reward_model": 4.5,
"step": 139
},
{
"completion_length": 239.75,
"epoch": 0.015117157974300832,
"grad_norm": 3.3013551235198975,
"kl": 0.025634765625,
"learning_rate": 9.98917042491587e-07,
"loss": 0.0,
"reward": 4.375,
"reward_std": 0.6231511831283569,
"rewards/gpt4o_reward_model": 4.375,
"step": 140
},
{
"completion_length": 129.75,
"epoch": 0.015225137674117265,
"grad_norm": 2.192405939102173,
"kl": 0.0244140625,
"learning_rate": 9.989008245818347e-07,
"loss": 0.0,
"reward": 4.875,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_reward_model": 4.875,
"step": 141
},
{
"completion_length": 253.75,
"epoch": 0.0153331173739337,
"grad_norm": 3.278887987136841,
"kl": 0.02197265625,
"learning_rate": 9.988844862703152e-07,
"loss": 0.0,
"reward": 4.25,
"reward_std": 0.508794367313385,
"rewards/gpt4o_reward_model": 4.25,
"step": 142
},
{
"completion_length": 199.0,
"epoch": 0.015441097073750135,
"grad_norm": 3.9546308517456055,
"kl": 0.02001953125,
"learning_rate": 9.988680275609717e-07,
"loss": 0.0,
"reward": 4.0625,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_reward_model": 4.0625,
"step": 143
},
{
"completion_length": 179.0,
"epoch": 0.015549076773566569,
"grad_norm": 5.07952880859375,
"kl": 0.032470703125,
"learning_rate": 9.988514484577761e-07,
"loss": 0.0,
"reward": 4.25,
"reward_std": 0.8644567728042603,
"rewards/gpt4o_reward_model": 4.25,
"step": 144
},
{
"completion_length": 75.5,
"epoch": 0.015657056473383005,
"grad_norm": 5.506921291351318,
"kl": 0.0361328125,
"learning_rate": 9.988347489647298e-07,
"loss": 0.0,
"reward": 4.1875,
"reward_std": 0.5333483219146729,
"rewards/gpt4o_reward_model": 4.1875,
"step": 145
},
{
"completion_length": 209.25,
"epoch": 0.015765036173199437,
"grad_norm": 4.1503119468688965,
"kl": 0.030029296875,
"learning_rate": 9.988179290858627e-07,
"loss": 0.0,
"reward": 4.5,
"reward_std": 0.508794367313385,
"rewards/gpt4o_reward_model": 4.5,
"step": 146
},
{
"completion_length": 156.25,
"epoch": 0.015873015873015872,
"grad_norm": 3.442382335662842,
"kl": 0.041015625,
"learning_rate": 9.98800988825234e-07,
"loss": 0.0,
"reward": 4.875,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_reward_model": 4.875,
"step": 147
},
{
"completion_length": 103.5,
"epoch": 0.015980995572832307,
"grad_norm": 3.5556161403656006,
"kl": 0.025390625,
"learning_rate": 9.987839281869321e-07,
"loss": 0.0,
"reward": 4.8125,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_reward_model": 4.8125,
"step": 148
},
{
"completion_length": 94.5,
"epoch": 0.016088975272648742,
"grad_norm": 5.813422203063965,
"kl": 0.0341796875,
"learning_rate": 9.987667471750743e-07,
"loss": 0.0,
"reward": 4.125,
"reward_std": 0.508794367313385,
"rewards/gpt4o_reward_model": 4.125,
"step": 149
},
{
"completion_length": 114.75,
"epoch": 0.016196954972465177,
"grad_norm": 2.6386795043945312,
"kl": 0.026123046875,
"learning_rate": 9.987494457938066e-07,
"loss": 0.0,
"reward": 4.5,
"reward_std": 0.28877514600753784,
"rewards/gpt4o_reward_model": 4.5,
"step": 150
},
{
"completion_length": 237.5,
"epoch": 0.016304934672281612,
"grad_norm": 5.137205123901367,
"kl": 0.033447265625,
"learning_rate": 9.987320240473049e-07,
"loss": 0.0,
"reward": 3.5625,
"reward_std": 0.6770563125610352,
"rewards/gpt4o_reward_model": 3.5625,
"step": 151
},
{
"completion_length": 61.25,
"epoch": 0.016412914372098044,
"grad_norm": 2.3743293285369873,
"kl": 0.00823974609375,
"learning_rate": 9.987144819397735e-07,
"loss": 0.0,
"reward": 4.1875,
"reward_std": 0.6827775835990906,
"rewards/gpt4o_reward_model": 4.1875,
"step": 152
},
{
"completion_length": 130.75,
"epoch": 0.01652089407191448,
"grad_norm": 3.382657289505005,
"kl": 0.032958984375,
"learning_rate": 9.98696819475446e-07,
"loss": 0.0,
"reward": 4.1875,
"reward_std": 0.48945680260658264,
"rewards/gpt4o_reward_model": 4.1875,
"step": 153
},
{
"completion_length": 98.75,
"epoch": 0.016628873771730914,
"grad_norm": 5.266931533813477,
"kl": 0.025146484375,
"learning_rate": 9.986790366585847e-07,
"loss": 0.0,
"reward": 4.625,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_reward_model": 4.625,
"step": 154
},
{
"completion_length": 170.25,
"epoch": 0.01673685347154735,
"grad_norm": 4.579905033111572,
"kl": 0.03369140625,
"learning_rate": 9.986611334934814e-07,
"loss": 0.0,
"reward": 4.1875,
"reward_std": 0.8146764636039734,
"rewards/gpt4o_reward_model": 4.1875,
"step": 155
},
{
"completion_length": 258.25,
"epoch": 0.016844833171363784,
"grad_norm": 2.3568973541259766,
"kl": 0.029052734375,
"learning_rate": 9.986431099844567e-07,
"loss": 0.0,
"reward": 4.6875,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_reward_model": 4.6875,
"step": 156
},
{
"completion_length": 205.75,
"epoch": 0.01695281287118022,
"grad_norm": 4.3519158363342285,
"kl": 0.02392578125,
"learning_rate": 9.9862496613586e-07,
"loss": 0.0,
"reward": 3.9375,
"reward_std": 0.8421862125396729,
"rewards/gpt4o_reward_model": 3.9375,
"step": 157
},
{
"completion_length": 168.75,
"epoch": 0.01706079257099665,
"grad_norm": 3.8407411575317383,
"kl": 0.0308837890625,
"learning_rate": 9.986067019520707e-07,
"loss": 0.0,
"reward": 3.4375,
"reward_std": 0.6251000165939331,
"rewards/gpt4o_reward_model": 3.4375,
"step": 158
},
{
"completion_length": 143.75,
"epoch": 0.017168772270813086,
"grad_norm": 0.003290753811597824,
"kl": 0.0281982421875,
"learning_rate": 9.98588317437496e-07,
"loss": 0.0,
"reward": 2.5,
"reward_std": 9.999999747378752e-05,
"rewards/gpt4o_reward_model": 2.5,
"step": 159
},
{
"completion_length": 68.25,
"epoch": 0.01727675197062952,
"grad_norm": 4.90841817855835,
"kl": 0.023193359375,
"learning_rate": 9.98569812596573e-07,
"loss": 0.0,
"reward": 3.875,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_reward_model": 3.875,
"step": 160
},
{
"completion_length": 121.75,
"epoch": 0.017384731670445956,
"grad_norm": 3.6256470680236816,
"kl": 0.033935546875,
"learning_rate": 9.985511874337672e-07,
"loss": 0.0,
"reward": 4.9375,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_reward_model": 4.9375,
"step": 161
},
{
"completion_length": 118.75,
"epoch": 0.01749271137026239,
"grad_norm": 2.8302996158599854,
"kl": 0.03662109375,
"learning_rate": 9.98532441953574e-07,
"loss": 0.0,
"reward": 4.4375,
"reward_std": 0.23945678770542145,
"rewards/gpt4o_reward_model": 4.4375,
"step": 162
},
{
"completion_length": 160.75,
"epoch": 0.017600691070078826,
"grad_norm": 3.4306154251098633,
"kl": 0.027099609375,
"learning_rate": 9.985135761605167e-07,
"loss": 0.0,
"reward": 4.4375,
"reward_std": 0.41377514600753784,
"rewards/gpt4o_reward_model": 4.4375,
"step": 163
},
{
"completion_length": 160.5,
"epoch": 0.01770867076989526,
"grad_norm": 3.66644287109375,
"kl": 0.0283203125,
"learning_rate": 9.984945900591486e-07,
"loss": 0.0,
"reward": 4.5625,
"reward_std": 0.47356173396110535,
"rewards/gpt4o_reward_model": 4.5625,
"step": 164
},
{
"completion_length": 166.75,
"epoch": 0.017816650469711693,
"grad_norm": 3.7970166206359863,
"kl": 0.0267333984375,
"learning_rate": 9.98475483654052e-07,
"loss": 0.0,
"reward": 4.25,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_reward_model": 4.25,
"step": 165
},
{
"completion_length": 272.75,
"epoch": 0.017924630169528128,
"grad_norm": 4.074111461639404,
"kl": 0.033935546875,
"learning_rate": 9.984562569498373e-07,
"loss": 0.0,
"reward": 4.0625,
"reward_std": 0.6637752056121826,
"rewards/gpt4o_reward_model": 4.0625,
"step": 166
},
{
"completion_length": 107.5,
"epoch": 0.018032609869344563,
"grad_norm": 5.892201900482178,
"kl": 0.040283203125,
"learning_rate": 9.984369099511452e-07,
"loss": 0.0,
"reward": 3.375,
"reward_std": 0.8623477220535278,
"rewards/gpt4o_reward_model": 3.375,
"step": 167
},
{
"completion_length": 93.5,
"epoch": 0.018140589569160998,
"grad_norm": 3.144777297973633,
"kl": 0.02099609375,
"learning_rate": 9.984174426626443e-07,
"loss": 0.0,
"reward": 3.9375,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_reward_model": 3.9375,
"step": 168
},
{
"completion_length": 184.5,
"epoch": 0.018248569268977433,
"grad_norm": 4.250265598297119,
"kl": 0.036376953125,
"learning_rate": 9.98397855089033e-07,
"loss": 0.0,
"reward": 4.625,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_reward_model": 4.625,
"step": 169
},
{
"completion_length": 25.5,
"epoch": 0.01835654896879387,
"grad_norm": 4.1492600440979,
"kl": 0.033447265625,
"learning_rate": 9.983781472350382e-07,
"loss": 0.0,
"reward": 4.6875,
"reward_std": 0.383794367313385,
"rewards/gpt4o_reward_model": 4.6875,
"step": 170
},
{
"completion_length": 219.5,
"epoch": 0.0184645286686103,
"grad_norm": 3.171717405319214,
"kl": 0.0380859375,
"learning_rate": 9.983583191054162e-07,
"loss": 0.0,
"reward": 4.0625,
"reward_std": 0.5281319618225098,
"rewards/gpt4o_reward_model": 4.0625,
"step": 171
},
{
"completion_length": 121.75,
"epoch": 0.018572508368426735,
"grad_norm": 4.934228897094727,
"kl": 0.048828125,
"learning_rate": 9.983383707049522e-07,
"loss": 0.0,
"reward": 4.0625,
"reward_std": 0.5879185199737549,
"rewards/gpt4o_reward_model": 4.0625,
"step": 172
},
{
"completion_length": 191.0,
"epoch": 0.01868048806824317,
"grad_norm": 2.89996600151062,
"kl": 0.02490234375,
"learning_rate": 9.983183020384605e-07,
"loss": 0.0,
"reward": 3.875,
"reward_std": 0.34856173396110535,
"rewards/gpt4o_reward_model": 3.875,
"step": 173
},
{
"completion_length": 215.75,
"epoch": 0.018788467768059605,
"grad_norm": 3.5805468559265137,
"kl": 0.038330078125,
"learning_rate": 9.982981131107842e-07,
"loss": 0.0,
"reward": 3.8125,
"reward_std": 0.6229909658432007,
"rewards/gpt4o_reward_model": 3.8125,
"step": 174
},
{
"completion_length": 178.25,
"epoch": 0.01889644746787604,
"grad_norm": 2.7104833126068115,
"kl": 0.04150390625,
"learning_rate": 9.982778039267958e-07,
"loss": 0.0,
"reward": 4.9375,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_reward_model": 4.9375,
"step": 175
},
{
"completion_length": 76.5,
"epoch": 0.019004427167692475,
"grad_norm": 4.9249267578125,
"kl": 0.02392578125,
"learning_rate": 9.982573744913964e-07,
"loss": 0.0,
"reward": 4.5625,
"reward_std": 0.7694375514984131,
"rewards/gpt4o_reward_model": 4.5625,
"step": 176
},
{
"completion_length": 385.75,
"epoch": 0.019112406867508907,
"grad_norm": 4.761181354522705,
"kl": 0.0703125,
"learning_rate": 9.982368248095164e-07,
"loss": 0.0001,
"reward": 4.6875,
"reward_std": 0.5194375514984131,
"rewards/gpt4o_reward_model": 4.6875,
"step": 177
},
{
"completion_length": 101.0,
"epoch": 0.019220386567325342,
"grad_norm": 3.50384259223938,
"kl": 0.042236328125,
"learning_rate": 9.982161548861152e-07,
"loss": 0.0,
"reward": 4.625,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_reward_model": 4.625,
"step": 178
},
{
"completion_length": 103.25,
"epoch": 0.019328366267141777,
"grad_norm": 2.0653398036956787,
"kl": 0.0439453125,
"learning_rate": 9.98195364726181e-07,
"loss": 0.0,
"reward": 4.3125,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_reward_model": 4.3125,
"step": 179
},
{
"completion_length": 63.75,
"epoch": 0.019436345966958212,
"grad_norm": 5.203299522399902,
"kl": 0.044677734375,
"learning_rate": 9.981744543347312e-07,
"loss": 0.0,
"reward": 3.9375,
"reward_std": 0.6251000165939331,
"rewards/gpt4o_reward_model": 3.9375,
"step": 180
},
{
"completion_length": 100.25,
"epoch": 0.019544325666774647,
"grad_norm": 2.8786332607269287,
"kl": 0.04052734375,
"learning_rate": 9.981534237168124e-07,
"loss": 0.0,
"reward": 4.5625,
"reward_std": 0.6250999569892883,
"rewards/gpt4o_reward_model": 4.5625,
"step": 181
},
{
"completion_length": 231.5,
"epoch": 0.019652305366591082,
"grad_norm": 2.2581024169921875,
"kl": 0.0284423828125,
"learning_rate": 9.981322728774997e-07,
"loss": 0.0,
"reward": 4.25,
"reward_std": 0.776972770690918,
"rewards/gpt4o_reward_model": 4.25,
"step": 182
},
{
"completion_length": 106.5,
"epoch": 0.019760285066407514,
"grad_norm": 4.273443698883057,
"kl": 0.03173828125,
"learning_rate": 9.981110018218977e-07,
"loss": 0.0,
"reward": 4.1875,
"reward_std": 0.41377514600753784,
"rewards/gpt4o_reward_model": 4.1875,
"step": 183
},
{
"completion_length": 94.5,
"epoch": 0.01986826476622395,
"grad_norm": 4.629092693328857,
"kl": 0.07568359375,
"learning_rate": 9.9808961055514e-07,
"loss": 0.0001,
"reward": 4.6875,
"reward_std": 0.329224169254303,
"rewards/gpt4o_reward_model": 4.6875,
"step": 184
},
{
"completion_length": 148.0,
"epoch": 0.019976244466040384,
"grad_norm": 3.932042360305786,
"kl": 0.03173828125,
"learning_rate": 9.980680990823886e-07,
"loss": 0.0,
"reward": 4.3125,
"reward_std": 0.5581127405166626,
"rewards/gpt4o_reward_model": 4.3125,
"step": 185
},
{
"completion_length": 236.25,
"epoch": 0.02008422416585682,
"grad_norm": 3.403113842010498,
"kl": 0.0380859375,
"learning_rate": 9.980464674088354e-07,
"loss": 0.0,
"reward": 4.6875,
"reward_std": 0.41377514600753784,
"rewards/gpt4o_reward_model": 4.6875,
"step": 186
},
{
"completion_length": 155.25,
"epoch": 0.020192203865673254,
"grad_norm": 3.3250508308410645,
"kl": 0.05419921875,
"learning_rate": 9.980247155397004e-07,
"loss": 0.0001,
"reward": 4.5,
"reward_std": 0.36445680260658264,
"rewards/gpt4o_reward_model": 4.5,
"step": 187
},
{
"completion_length": 336.0,
"epoch": 0.02030018356548969,
"grad_norm": 2.428285837173462,
"kl": 0.03515625,
"learning_rate": 9.980028434802334e-07,
"loss": 0.0,
"reward": 4.75,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_reward_model": 4.75,
"step": 188
},
{
"completion_length": 106.75,
"epoch": 0.02040816326530612,
"grad_norm": 4.417919635772705,
"kl": 0.028076171875,
"learning_rate": 9.979808512357129e-07,
"loss": 0.0,
"reward": 4.3125,
"reward_std": 0.633794367313385,
"rewards/gpt4o_reward_model": 4.3125,
"step": 189
},
{
"completion_length": 94.75,
"epoch": 0.020516142965122556,
"grad_norm": 3.8756754398345947,
"kl": 0.042724609375,
"learning_rate": 9.979587388114464e-07,
"loss": 0.0,
"reward": 4.75,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_reward_model": 4.75,
"step": 190
},
{
"completion_length": 119.75,
"epoch": 0.02062412266493899,
"grad_norm": 4.661324501037598,
"kl": 0.0517578125,
"learning_rate": 9.9793650621277e-07,
"loss": 0.0001,
"reward": 3.8125,
"reward_std": 0.829224169254303,
"rewards/gpt4o_reward_model": 3.8125,
"step": 191
},
{
"completion_length": 417.0,
"epoch": 0.020732102364755426,
"grad_norm": 2.5745935440063477,
"kl": 0.0291748046875,
"learning_rate": 9.979141534450495e-07,
"loss": 0.0,
"reward": 4.25,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_reward_model": 4.25,
"step": 192
},
{
"completion_length": 178.5,
"epoch": 0.02084008206457186,
"grad_norm": 1.9205126762390137,
"kl": 0.041259765625,
"learning_rate": 9.978916805136794e-07,
"loss": 0.0,
"reward": 4.9375,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_reward_model": 4.9375,
"step": 193
},
{
"completion_length": 223.5,
"epoch": 0.020948061764388296,
"grad_norm": 3.269942283630371,
"kl": 0.045166015625,
"learning_rate": 9.97869087424083e-07,
"loss": 0.0,
"reward": 4.1875,
"reward_std": 0.6229909658432007,
"rewards/gpt4o_reward_model": 4.1875,
"step": 194
},
{
"completion_length": 101.25,
"epoch": 0.02105604146420473,
"grad_norm": 4.042669296264648,
"kl": 0.0498046875,
"learning_rate": 9.97846374181713e-07,
"loss": 0.0,
"reward": 4.5,
"reward_std": 0.7479909658432007,
"rewards/gpt4o_reward_model": 4.5,
"step": 195
},
{
"completion_length": 124.0,
"epoch": 0.021164021164021163,
"grad_norm": 4.541233539581299,
"kl": 0.0478515625,
"learning_rate": 9.978235407920506e-07,
"loss": 0.0,
"reward": 4.0,
"reward_std": 0.5985617637634277,
"rewards/gpt4o_reward_model": 4.0,
"step": 196
},
{
"completion_length": 218.25,
"epoch": 0.021272000863837598,
"grad_norm": 3.685286521911621,
"kl": 0.03369140625,
"learning_rate": 9.978005872606065e-07,
"loss": 0.0,
"reward": 4.5,
"reward_std": 0.6231511831283569,
"rewards/gpt4o_reward_model": 4.5,
"step": 197
},
{
"completion_length": 301.5,
"epoch": 0.021379980563654033,
"grad_norm": 2.64717173576355,
"kl": 0.0478515625,
"learning_rate": 9.977775135929202e-07,
"loss": 0.0,
"reward": 4.8125,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_reward_model": 4.8125,
"step": 198
},
{
"completion_length": 101.5,
"epoch": 0.021487960263470468,
"grad_norm": 3.8811116218566895,
"kl": 0.04248046875,
"learning_rate": 9.977543197945599e-07,
"loss": 0.0,
"reward": 4.4375,
"reward_std": 0.6251000165939331,
"rewards/gpt4o_reward_model": 4.4375,
"step": 199
},
{
"completion_length": 110.25,
"epoch": 0.021595939963286903,
"grad_norm": 4.5807342529296875,
"kl": 0.037109375,
"learning_rate": 9.977310058711235e-07,
"loss": 0.0,
"reward": 3.75,
"reward_std": 0.704224169254303,
"rewards/gpt4o_reward_model": 3.75,
"step": 200
},
{
"completion_length": 207.5,
"epoch": 0.021703919663103338,
"grad_norm": 5.462955951690674,
"kl": 0.09619140625,
"learning_rate": 9.97707571828237e-07,
"loss": 0.0001,
"reward": 4.5625,
"reward_std": 0.48945680260658264,
"rewards/gpt4o_reward_model": 4.5625,
"step": 201
},
{
"completion_length": 138.5,
"epoch": 0.02181189936291977,
"grad_norm": 6.441577911376953,
"kl": 0.0478515625,
"learning_rate": 9.97684017671556e-07,
"loss": 0.0,
"reward": 3.4375,
"reward_std": 0.9414719343185425,
"rewards/gpt4o_reward_model": 3.4375,
"step": 202
},
{
"completion_length": 140.0,
"epoch": 0.021919879062736205,
"grad_norm": 3.3736348152160645,
"kl": 0.037841796875,
"learning_rate": 9.97660343406765e-07,
"loss": 0.0,
"reward": 4.0,
"reward_std": 0.8872368931770325,
"rewards/gpt4o_reward_model": 4.0,
"step": 203
},
{
"completion_length": 172.25,
"epoch": 0.02202785876255264,
"grad_norm": 3.9529149532318115,
"kl": 0.044921875,
"learning_rate": 9.97636549039577e-07,
"loss": 0.0,
"reward": 4.625,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_reward_model": 4.625,
"step": 204
},
{
"completion_length": 667.25,
"epoch": 0.022135838462369075,
"grad_norm": 4.095589637756348,
"kl": 0.046630859375,
"learning_rate": 9.976126345757351e-07,
"loss": 0.0,
"reward": 3.625,
"reward_std": 0.454224169254303,
"rewards/gpt4o_reward_model": 3.625,
"step": 205
},
{
"completion_length": 187.0,
"epoch": 0.02224381816218551,
"grad_norm": 5.007861137390137,
"kl": 0.04150390625,
"learning_rate": 9.975886000210102e-07,
"loss": 0.0,
"reward": 4.4375,
"reward_std": 0.7286534309387207,
"rewards/gpt4o_reward_model": 4.4375,
"step": 206
},
{
"completion_length": 37.0,
"epoch": 0.022351797862001945,
"grad_norm": 5.55353307723999,
"kl": 0.0673828125,
"learning_rate": 9.975644453812028e-07,
"loss": 0.0001,
"reward": 4.0625,
"reward_std": 0.633794367313385,
"rewards/gpt4o_reward_model": 4.0625,
"step": 207
},
{
"completion_length": 210.75,
"epoch": 0.022459777561818377,
"grad_norm": 3.5840399265289307,
"kl": 0.06494140625,
"learning_rate": 9.97540170662142e-07,
"loss": 0.0001,
"reward": 4.5625,
"reward_std": 0.48945680260658264,
"rewards/gpt4o_reward_model": 4.5625,
"step": 208
},
{
"completion_length": 75.0,
"epoch": 0.022567757261634812,
"grad_norm": 4.6481828689575195,
"kl": 0.048095703125,
"learning_rate": 9.975157758696866e-07,
"loss": 0.0,
"reward": 3.8125,
"reward_std": 0.9524502754211426,
"rewards/gpt4o_reward_model": 3.8125,
"step": 209
},
{
"completion_length": 179.75,
"epoch": 0.022675736961451247,
"grad_norm": 3.486043691635132,
"kl": 0.036376953125,
"learning_rate": 9.974912610097235e-07,
"loss": 0.0,
"reward": 4.125,
"reward_std": 0.6036534309387207,
"rewards/gpt4o_reward_model": 4.125,
"step": 210
},
{
"completion_length": 146.25,
"epoch": 0.022783716661267682,
"grad_norm": 2.9825496673583984,
"kl": 0.0673828125,
"learning_rate": 9.97466626088169e-07,
"loss": 0.0001,
"reward": 4.625,
"reward_std": 0.36445680260658264,
"rewards/gpt4o_reward_model": 4.625,
"step": 211
},
{
"completion_length": 188.0,
"epoch": 0.022891696361084117,
"grad_norm": 5.593983173370361,
"kl": 0.05615234375,
"learning_rate": 9.974418711109684e-07,
"loss": 0.0001,
"reward": 4.125,
"reward_std": 0.7974694967269897,
"rewards/gpt4o_reward_model": 4.125,
"step": 212
},
{
"completion_length": 265.25,
"epoch": 0.022999676060900552,
"grad_norm": 4.792067527770996,
"kl": 0.038330078125,
"learning_rate": 9.97416996084096e-07,
"loss": 0.0,
"reward": 4.125,
"reward_std": 0.6144567728042603,
"rewards/gpt4o_reward_model": 4.125,
"step": 213
},
{
"completion_length": 131.25,
"epoch": 0.023107655760716984,
"grad_norm": 5.125412940979004,
"kl": 0.041259765625,
"learning_rate": 9.973920010135547e-07,
"loss": 0.0,
"reward": 3.75,
"reward_std": 0.508794367313385,
"rewards/gpt4o_reward_model": 3.75,
"step": 214
},
{
"completion_length": 397.5,
"epoch": 0.02321563546053342,
"grad_norm": 4.1176910400390625,
"kl": 0.0517578125,
"learning_rate": 9.973668859053772e-07,
"loss": 0.0001,
"reward": 4.5625,
"reward_std": 0.5194375514984131,
"rewards/gpt4o_reward_model": 4.5625,
"step": 215
},
{
"completion_length": 145.25,
"epoch": 0.023323615160349854,
"grad_norm": 6.796933650970459,
"kl": 0.134765625,
"learning_rate": 9.973416507656243e-07,
"loss": 0.0001,
"reward": 4.4375,
"reward_std": 0.6637752056121826,
"rewards/gpt4o_reward_model": 4.4375,
"step": 216
},
{
"completion_length": 204.75,
"epoch": 0.02343159486016629,
"grad_norm": 2.643352746963501,
"kl": 0.049072265625,
"learning_rate": 9.97316295600386e-07,
"loss": 0.0,
"reward": 4.0,
"reward_std": 0.28877514600753784,
"rewards/gpt4o_reward_model": 4.0,
"step": 217
},
{
"completion_length": 73.0,
"epoch": 0.023539574559982724,
"grad_norm": 1.9597053527832031,
"kl": 0.0390625,
"learning_rate": 9.972908204157815e-07,
"loss": 0.0,
"reward": 4.4375,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_reward_model": 4.4375,
"step": 218
},
{
"completion_length": 317.5,
"epoch": 0.02364755425979916,
"grad_norm": 3.215393304824829,
"kl": 0.033447265625,
"learning_rate": 9.972652252179589e-07,
"loss": 0.0,
"reward": 3.625,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_reward_model": 3.625,
"step": 219
},
{
"completion_length": 168.75,
"epoch": 0.02375553395961559,
"grad_norm": 2.8032920360565186,
"kl": 0.052001953125,
"learning_rate": 9.97239510013095e-07,
"loss": 0.0001,
"reward": 4.875,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_reward_model": 4.875,
"step": 220
},
{
"completion_length": 118.75,
"epoch": 0.023863513659432026,
"grad_norm": 3.416057586669922,
"kl": 0.0537109375,
"learning_rate": 9.97213674807396e-07,
"loss": 0.0001,
"reward": 4.875,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_reward_model": 4.875,
"step": 221
},
{
"completion_length": 88.5,
"epoch": 0.02397149335924846,
"grad_norm": 3.6612114906311035,
"kl": 0.05908203125,
"learning_rate": 9.971877196070967e-07,
"loss": 0.0001,
"reward": 4.5,
"reward_std": 0.6144567728042603,
"rewards/gpt4o_reward_model": 4.5,
"step": 222
},
{
"completion_length": 117.5,
"epoch": 0.024079473059064896,
"grad_norm": 2.947577714920044,
"kl": 0.04345703125,
"learning_rate": 9.971616444184607e-07,
"loss": 0.0,
"reward": 4.25,
"reward_std": 0.6036534309387207,
"rewards/gpt4o_reward_model": 4.25,
"step": 223
},
{
"completion_length": 155.0,
"epoch": 0.02418745275888133,
"grad_norm": 2.8114304542541504,
"kl": 0.048583984375,
"learning_rate": 9.971354492477812e-07,
"loss": 0.0,
"reward": 4.625,
"reward_std": 0.36445680260658264,
"rewards/gpt4o_reward_model": 4.625,
"step": 224
},
{
"completion_length": 105.75,
"epoch": 0.024295432458697766,
"grad_norm": 2.7532522678375244,
"kl": 0.05224609375,
"learning_rate": 9.9710913410138e-07,
"loss": 0.0001,
"reward": 4.5625,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_reward_model": 4.5625,
"step": 225
},
{
"completion_length": 251.25,
"epoch": 0.024403412158514198,
"grad_norm": 3.266697883605957,
"kl": 0.052001953125,
"learning_rate": 9.970826989856076e-07,
"loss": 0.0001,
"reward": 4.6875,
"reward_std": 0.41377514600753784,
"rewards/gpt4o_reward_model": 4.6875,
"step": 226
},
{
"completion_length": 199.5,
"epoch": 0.024511391858330633,
"grad_norm": 3.5519814491271973,
"kl": 0.05908203125,
"learning_rate": 9.970561439068438e-07,
"loss": 0.0001,
"reward": 3.9375,
"reward_std": 0.48945680260658264,
"rewards/gpt4o_reward_model": 3.9375,
"step": 227
},
{
"completion_length": 61.25,
"epoch": 0.024619371558147068,
"grad_norm": 3.1293420791625977,
"kl": 0.07080078125,
"learning_rate": 9.970294688714975e-07,
"loss": 0.0001,
"reward": 4.6875,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_reward_model": 4.6875,
"step": 228
},
{
"completion_length": 207.25,
"epoch": 0.024727351257963503,
"grad_norm": 4.59749174118042,
"kl": 0.04931640625,
"learning_rate": 9.970026738860058e-07,
"loss": 0.0,
"reward": 3.96875,
"reward_std": 0.8234953880310059,
"rewards/gpt4o_reward_model": 3.96875,
"step": 229
},
{
"completion_length": 137.0,
"epoch": 0.024835330957779938,
"grad_norm": 3.6132426261901855,
"kl": 0.04833984375,
"learning_rate": 9.969757589568354e-07,
"loss": 0.0,
"reward": 4.25,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_reward_model": 4.25,
"step": 230
},
{
"completion_length": 115.75,
"epoch": 0.024943310657596373,
"grad_norm": 4.572238922119141,
"kl": 0.06201171875,
"learning_rate": 9.96948724090482e-07,
"loss": 0.0001,
"reward": 4.5,
"reward_std": 0.5774502754211426,
"rewards/gpt4o_reward_model": 4.5,
"step": 231
},
{
"completion_length": 49.5,
"epoch": 0.025051290357412808,
"grad_norm": 3.109471559524536,
"kl": 0.0751953125,
"learning_rate": 9.969215692934702e-07,
"loss": 0.0001,
"reward": 4.625,
"reward_std": 0.4331127107143402,
"rewards/gpt4o_reward_model": 4.625,
"step": 232
},
{
"completion_length": 336.0,
"epoch": 0.02515927005722924,
"grad_norm": 4.591429710388184,
"kl": 0.05322265625,
"learning_rate": 9.968942945723529e-07,
"loss": 0.0001,
"reward": 4.375,
"reward_std": 0.8185809850692749,
"rewards/gpt4o_reward_model": 4.375,
"step": 233
},
{
"completion_length": 174.25,
"epoch": 0.025267249757045675,
"grad_norm": 3.161344289779663,
"kl": 0.06103515625,
"learning_rate": 9.968668999337124e-07,
"loss": 0.0001,
"reward": 4.75,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_reward_model": 4.75,
"step": 234
},
{
"completion_length": 117.75,
"epoch": 0.02537522945686211,
"grad_norm": 3.8266708850860596,
"kl": 0.05615234375,
"learning_rate": 9.968393853841605e-07,
"loss": 0.0001,
"reward": 4.0625,
"reward_std": 0.23945678770542145,
"rewards/gpt4o_reward_model": 4.0625,
"step": 235
},
{
"completion_length": 144.75,
"epoch": 0.025483209156678545,
"grad_norm": 3.3600192070007324,
"kl": 0.05712890625,
"learning_rate": 9.96811750930337e-07,
"loss": 0.0001,
"reward": 4.125,
"reward_std": 0.9463939070701599,
"rewards/gpt4o_reward_model": 4.125,
"step": 236
},
{
"completion_length": 260.5,
"epoch": 0.02559118885649498,
"grad_norm": 4.064725875854492,
"kl": 0.05859375,
"learning_rate": 9.96783996578911e-07,
"loss": 0.0001,
"reward": 4.75,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_reward_model": 4.75,
"step": 237
},
{
"completion_length": 151.5,
"epoch": 0.025699168556311415,
"grad_norm": 2.2150022983551025,
"kl": 0.04931640625,
"learning_rate": 9.967561223365806e-07,
"loss": 0.0,
"reward": 4.875,
"reward_std": 0.14443756639957428,
"rewards/gpt4o_reward_model": 4.875,
"step": 238
},
{
"completion_length": 140.5,
"epoch": 0.025807148256127847,
"grad_norm": 3.777392625808716,
"kl": 0.09326171875,
"learning_rate": 9.96728128210073e-07,
"loss": 0.0001,
"reward": 4.625,
"reward_std": 0.5387751460075378,
"rewards/gpt4o_reward_model": 4.625,
"step": 239
},
{
"completion_length": 179.5,
"epoch": 0.02591512795594428,
"grad_norm": 5.373637676239014,
"kl": 0.0849609375,
"learning_rate": 9.967000142061439e-07,
"loss": 0.0001,
"reward": 4.3125,
"reward_std": 0.5194375514984131,
"rewards/gpt4o_reward_model": 4.3125,
"step": 240
},
{
"completion_length": 133.0,
"epoch": 0.026023107655760717,
"grad_norm": 1.9456915855407715,
"kl": 0.061767578125,
"learning_rate": 9.966717803315785e-07,
"loss": 0.0001,
"reward": 4.3125,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_reward_model": 4.3125,
"step": 241
},
{
"completion_length": 335.0,
"epoch": 0.026131087355577152,
"grad_norm": 3.1238064765930176,
"kl": 0.04443359375,
"learning_rate": 9.966434265931902e-07,
"loss": 0.0,
"reward": 4.8125,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_reward_model": 4.8125,
"step": 242
},
{
"completion_length": 186.5,
"epoch": 0.026239067055393587,
"grad_norm": 4.6672492027282715,
"kl": 0.0595703125,
"learning_rate": 9.966149529978221e-07,
"loss": 0.0001,
"reward": 4.5625,
"reward_std": 0.5194375514984131,
"rewards/gpt4o_reward_model": 4.5625,
"step": 243
},
{
"completion_length": 192.25,
"epoch": 0.026347046755210022,
"grad_norm": 3.2580738067626953,
"kl": 0.052001953125,
"learning_rate": 9.965863595523454e-07,
"loss": 0.0001,
"reward": 4.6875,
"reward_std": 0.41377514600753784,
"rewards/gpt4o_reward_model": 4.6875,
"step": 244
},
{
"completion_length": 152.75,
"epoch": 0.026455026455026454,
"grad_norm": 4.204739570617676,
"kl": 0.0771484375,
"learning_rate": 9.96557646263661e-07,
"loss": 0.0001,
"reward": 4.1875,
"reward_std": 0.41377514600753784,
"rewards/gpt4o_reward_model": 4.1875,
"step": 245
},
{
"completion_length": 120.0,
"epoch": 0.02656300615484289,
"grad_norm": 4.965406894683838,
"kl": 0.055419921875,
"learning_rate": 9.965288131386984e-07,
"loss": 0.0001,
"reward": 4.3125,
"reward_std": 1.0930101871490479,
"rewards/gpt4o_reward_model": 4.3125,
"step": 246
},
{
"completion_length": 67.25,
"epoch": 0.026670985854659324,
"grad_norm": 3.763925552368164,
"kl": 0.07470703125,
"learning_rate": 9.964998601844158e-07,
"loss": 0.0001,
"reward": 4.4375,
"reward_std": 0.6251000165939331,
"rewards/gpt4o_reward_model": 4.4375,
"step": 247
},
{
"completion_length": 81.5,
"epoch": 0.02677896555447576,
"grad_norm": 9.182212829589844,
"kl": 0.055908203125,
"learning_rate": 9.96470787407801e-07,
"loss": 0.0001,
"reward": 4.0625,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_reward_model": 4.0625,
"step": 248
},
{
"completion_length": 97.0,
"epoch": 0.026886945254292194,
"grad_norm": 2.3663017749786377,
"kl": 0.04931640625,
"learning_rate": 9.964415948158696e-07,
"loss": 0.0,
"reward": 4.875,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_reward_model": 4.875,
"step": 249
},
{
"completion_length": 100.75,
"epoch": 0.02699492495410863,
"grad_norm": 3.542591094970703,
"kl": 0.08837890625,
"learning_rate": 9.964122824156672e-07,
"loss": 0.0001,
"reward": 4.5625,
"reward_std": 0.383794367313385,
"rewards/gpt4o_reward_model": 4.5625,
"step": 250
},
{
"completion_length": 178.75,
"epoch": 0.02710290465392506,
"grad_norm": 3.1234817504882812,
"kl": 0.061279296875,
"learning_rate": 9.963828502142677e-07,
"loss": 0.0001,
"reward": 4.8125,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_reward_model": 4.8125,
"step": 251
},
{
"completion_length": 146.0,
"epoch": 0.027210884353741496,
"grad_norm": 3.096550226211548,
"kl": 0.060546875,
"learning_rate": 9.963532982187743e-07,
"loss": 0.0001,
"reward": 4.5,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_reward_model": 4.5,
"step": 252
},
{
"completion_length": 339.75,
"epoch": 0.02731886405355793,
"grad_norm": 4.885226726531982,
"kl": 0.050048828125,
"learning_rate": 9.963236264363188e-07,
"loss": 0.0001,
"reward": 3.875,
"reward_std": 0.5985617637634277,
"rewards/gpt4o_reward_model": 3.875,
"step": 253
},
{
"completion_length": 141.75,
"epoch": 0.027426843753374366,
"grad_norm": 4.643560886383057,
"kl": 0.09033203125,
"learning_rate": 9.962938348740617e-07,
"loss": 0.0001,
"reward": 4.3125,
"reward_std": 0.633794367313385,
"rewards/gpt4o_reward_model": 4.3125,
"step": 254
},
{
"completion_length": 150.75,
"epoch": 0.0275348234531908,
"grad_norm": 8.297577857971191,
"kl": 0.1337890625,
"learning_rate": 9.962639235391932e-07,
"loss": 0.0001,
"reward": 4.1875,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_reward_model": 4.1875,
"step": 255
},
{
"completion_length": 122.5,
"epoch": 0.027642803153007236,
"grad_norm": 3.3994829654693604,
"kl": 0.055419921875,
"learning_rate": 9.962338924389318e-07,
"loss": 0.0001,
"reward": 4.4375,
"reward_std": 0.48945680260658264,
"rewards/gpt4o_reward_model": 4.4375,
"step": 256
},
{
"completion_length": 166.0,
"epoch": 0.027750782852823667,
"grad_norm": 4.4432244300842285,
"kl": 0.0712890625,
"learning_rate": 9.962037415805248e-07,
"loss": 0.0001,
"reward": 4.625,
"reward_std": 0.6444375514984131,
"rewards/gpt4o_reward_model": 4.625,
"step": 257
},
{
"completion_length": 104.0,
"epoch": 0.027858762552640103,
"grad_norm": 4.164874076843262,
"kl": 0.0546875,
"learning_rate": 9.961734709712488e-07,
"loss": 0.0001,
"reward": 3.625,
"reward_std": 0.9396764636039734,
"rewards/gpt4o_reward_model": 3.625,
"step": 258
},
{
"completion_length": 334.25,
"epoch": 0.027966742252456538,
"grad_norm": 5.587090015411377,
"kl": 0.08642578125,
"learning_rate": 9.961430806184093e-07,
"loss": 0.0001,
"reward": 4.4375,
"reward_std": 0.5921862125396729,
"rewards/gpt4o_reward_model": 4.4375,
"step": 259
},
{
"completion_length": 50.25,
"epoch": 0.028074721952272973,
"grad_norm": 3.80084490776062,
"kl": 0.0908203125,
"learning_rate": 9.9611257052934e-07,
"loss": 0.0001,
"reward": 4.625,
"reward_std": 0.4331127107143402,
"rewards/gpt4o_reward_model": 4.625,
"step": 260
},
{
"completion_length": 216.5,
"epoch": 0.028182701652089408,
"grad_norm": 5.016978740692139,
"kl": 0.05322265625,
"learning_rate": 9.960819407114046e-07,
"loss": 0.0001,
"reward": 4.375,
"reward_std": 0.6896764636039734,
"rewards/gpt4o_reward_model": 4.375,
"step": 261
},
{
"completion_length": 45.0,
"epoch": 0.028290681351905843,
"grad_norm": 9.00546932220459,
"kl": 0.0830078125,
"learning_rate": 9.960511911719949e-07,
"loss": 0.0001,
"reward": 3.3125,
"reward_std": 0.9137751460075378,
"rewards/gpt4o_reward_model": 3.3125,
"step": 262
},
{
"completion_length": 113.5,
"epoch": 0.028398661051722278,
"grad_norm": 2.5115816593170166,
"kl": 0.0673828125,
"learning_rate": 9.960203219185314e-07,
"loss": 0.0001,
"reward": 4.8125,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_reward_model": 4.8125,
"step": 263
},
{
"completion_length": 466.75,
"epoch": 0.02850664075153871,
"grad_norm": 4.739973545074463,
"kl": 0.07861328125,
"learning_rate": 9.959893329584647e-07,
"loss": 0.0001,
"reward": 4.1875,
"reward_std": 0.633794367313385,
"rewards/gpt4o_reward_model": 4.1875,
"step": 264
},
{
"completion_length": 188.25,
"epoch": 0.028614620451355145,
"grad_norm": 3.5349810123443604,
"kl": 0.083984375,
"learning_rate": 9.95958224299273e-07,
"loss": 0.0001,
"reward": 4.75,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_reward_model": 4.75,
"step": 265
},
{
"completion_length": 79.0,
"epoch": 0.02872260015117158,
"grad_norm": 4.118009567260742,
"kl": 0.055908203125,
"learning_rate": 9.95926995948464e-07,
"loss": 0.0001,
"reward": 4.4375,
"reward_std": 0.7286534309387207,
"rewards/gpt4o_reward_model": 4.4375,
"step": 266
},
{
"completion_length": 63.25,
"epoch": 0.028830579850988015,
"grad_norm": 4.015948295593262,
"kl": 0.111328125,
"learning_rate": 9.95895647913574e-07,
"loss": 0.0001,
"reward": 4.8125,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_reward_model": 4.8125,
"step": 267
},
{
"completion_length": 85.75,
"epoch": 0.02893855955080445,
"grad_norm": 4.853457927703857,
"kl": 0.07763671875,
"learning_rate": 9.958641802021685e-07,
"loss": 0.0001,
"reward": 4.4375,
"reward_std": 0.6038135886192322,
"rewards/gpt4o_reward_model": 4.4375,
"step": 268
},
{
"completion_length": 103.0,
"epoch": 0.029046539250620885,
"grad_norm": 2.9576172828674316,
"kl": 0.05126953125,
"learning_rate": 9.958325928218419e-07,
"loss": 0.0001,
"reward": 4.125,
"reward_std": 0.4788135886192322,
"rewards/gpt4o_reward_model": 4.125,
"step": 269
},
{
"completion_length": 192.0,
"epoch": 0.029154518950437316,
"grad_norm": 3.628898859024048,
"kl": 0.0869140625,
"learning_rate": 9.958008857802169e-07,
"loss": 0.0001,
"reward": 4.8125,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_reward_model": 4.8125,
"step": 270
},
{
"completion_length": 321.0,
"epoch": 0.02926249865025375,
"grad_norm": 3.3045108318328857,
"kl": 0.06982421875,
"learning_rate": 9.957690590849457e-07,
"loss": 0.0001,
"reward": 4.6875,
"reward_std": 0.5194375514984131,
"rewards/gpt4o_reward_model": 4.6875,
"step": 271
},
{
"completion_length": 123.5,
"epoch": 0.029370478350070187,
"grad_norm": 14.728436470031738,
"kl": 6.5,
"learning_rate": 9.957371127437093e-07,
"loss": 0.0065,
"reward": 3.75,
"reward_std": 0.8185809850692749,
"rewards/gpt4o_reward_model": 3.75,
"step": 272
},
{
"completion_length": 169.5,
"epoch": 0.02947845804988662,
"grad_norm": 2.554886817932129,
"kl": 0.1025390625,
"learning_rate": 9.957050467642172e-07,
"loss": 0.0001,
"reward": 4.875,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_reward_model": 4.875,
"step": 273
},
{
"completion_length": 119.25,
"epoch": 0.029586437749703057,
"grad_norm": 2.4691953659057617,
"kl": 0.0673828125,
"learning_rate": 9.956728611542082e-07,
"loss": 0.0001,
"reward": 4.9375,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_reward_model": 4.9375,
"step": 274
},
{
"completion_length": 181.0,
"epoch": 0.029694417449519492,
"grad_norm": 4.251060485839844,
"kl": 0.0791015625,
"learning_rate": 9.956405559214498e-07,
"loss": 0.0001,
"reward": 4.5,
"reward_std": 0.6144567728042603,
"rewards/gpt4o_reward_model": 4.5,
"step": 275
},
{
"completion_length": 181.25,
"epoch": 0.029802397149335923,
"grad_norm": 3.3034920692443848,
"kl": 0.07958984375,
"learning_rate": 9.956081310737382e-07,
"loss": 0.0001,
"reward": 4.5625,
"reward_std": 0.47356173396110535,
"rewards/gpt4o_reward_model": 4.5625,
"step": 276
},
{
"completion_length": 105.75,
"epoch": 0.02991037684915236,
"grad_norm": 4.519341945648193,
"kl": 0.08642578125,
"learning_rate": 9.955755866188986e-07,
"loss": 0.0001,
"reward": 4.625,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_reward_model": 4.625,
"step": 277
},
{
"completion_length": 115.0,
"epoch": 0.030018356548968794,
"grad_norm": 3.830291509628296,
"kl": 0.0966796875,
"learning_rate": 9.955429225647854e-07,
"loss": 0.0001,
"reward": 3.6875,
"reward_std": 0.5281319618225098,
"rewards/gpt4o_reward_model": 3.6875,
"step": 278
},
{
"completion_length": 342.75,
"epoch": 0.03012633624878523,
"grad_norm": 3.2819526195526123,
"kl": 0.0712890625,
"learning_rate": 9.95510138919281e-07,
"loss": 0.0001,
"reward": 4.625,
"reward_std": 0.508794367313385,
"rewards/gpt4o_reward_model": 4.625,
"step": 279
},
{
"completion_length": 107.75,
"epoch": 0.030234315948601664,
"grad_norm": 4.989029884338379,
"kl": 0.0771484375,
"learning_rate": 9.95477235690298e-07,
"loss": 0.0001,
"reward": 4.0,
"reward_std": 0.7501000165939331,
"rewards/gpt4o_reward_model": 4.0,
"step": 280
},
{
"completion_length": 128.0,
"epoch": 0.0303422956484181,
"grad_norm": 2.953835964202881,
"kl": 0.09375,
"learning_rate": 9.954442128857761e-07,
"loss": 0.0001,
"reward": 4.75,
"reward_std": 0.28877514600753784,
"rewards/gpt4o_reward_model": 4.75,
"step": 281
},
{
"completion_length": 142.75,
"epoch": 0.03045027534823453,
"grad_norm": 4.569122314453125,
"kl": 0.08544921875,
"learning_rate": 9.954110705136856e-07,
"loss": 0.0001,
"reward": 4.375,
"reward_std": 0.5387751460075378,
"rewards/gpt4o_reward_model": 4.375,
"step": 282
},
{
"completion_length": 291.25,
"epoch": 0.030558255048050965,
"grad_norm": 1.5806266069412231,
"kl": 0.09521484375,
"learning_rate": 9.953778085820245e-07,
"loss": 0.0001,
"reward": 4.625,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_reward_model": 4.625,
"step": 283
},
{
"completion_length": 35.75,
"epoch": 0.0306662347478674,
"grad_norm": 5.810234069824219,
"kl": 0.11572265625,
"learning_rate": 9.953444270988203e-07,
"loss": 0.0001,
"reward": 4.5625,
"reward_std": 0.48945680260658264,
"rewards/gpt4o_reward_model": 4.5625,
"step": 284
},
{
"completion_length": 128.0,
"epoch": 0.030774214447683836,
"grad_norm": 0.006735849194228649,
"kl": 0.09912109375,
"learning_rate": 9.953109260721287e-07,
"loss": 0.0001,
"reward": 5.0,
"reward_std": 9.999999747378752e-05,
"rewards/gpt4o_reward_model": 5.0,
"step": 285
},
{
"completion_length": 188.5,
"epoch": 0.03088219414750027,
"grad_norm": 3.38196063041687,
"kl": 0.078125,
"learning_rate": 9.952773055100351e-07,
"loss": 0.0001,
"reward": 4.375,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_reward_model": 4.375,
"step": 286
},
{
"completion_length": 241.75,
"epoch": 0.030990173847316706,
"grad_norm": 3.892089605331421,
"kl": 0.0888671875,
"learning_rate": 9.95243565420653e-07,
"loss": 0.0001,
"reward": 4.0625,
"reward_std": 0.633794367313385,
"rewards/gpt4o_reward_model": 4.0625,
"step": 287
},
{
"completion_length": 141.5,
"epoch": 0.031098153547133137,
"grad_norm": 3.2392618656158447,
"kl": 0.109375,
"learning_rate": 9.95209705812125e-07,
"loss": 0.0001,
"reward": 4.5,
"reward_std": 0.508794367313385,
"rewards/gpt4o_reward_model": 4.5,
"step": 288
},
{
"completion_length": 232.25,
"epoch": 0.031206133246949572,
"grad_norm": 2.5510284900665283,
"kl": 0.06640625,
"learning_rate": 9.95175726692623e-07,
"loss": 0.0001,
"reward": 4.0625,
"reward_std": 0.4435809552669525,
"rewards/gpt4o_reward_model": 4.0625,
"step": 289
},
{
"completion_length": 53.25,
"epoch": 0.03131411294676601,
"grad_norm": 6.491455078125,
"kl": 0.1220703125,
"learning_rate": 9.951416280703465e-07,
"loss": 0.0001,
"reward": 4.25,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_reward_model": 4.25,
"step": 290
}
],
"logging_steps": 1,
"max_steps": 6400,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}