diff --git "a/Qwen_2.5_3B_nothink/VQA_Fundus/checkpoint-537/trainer_state.json" "b/Qwen_2.5_3B_nothink/VQA_Fundus/checkpoint-537/trainer_state.json" new file mode 100644--- /dev/null +++ "b/Qwen_2.5_3B_nothink/VQA_Fundus/checkpoint-537/trainer_state.json" @@ -0,0 +1,7014 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9990697674418605, + "eval_steps": 500, + "global_step": 537, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 6.125, + "epoch": 0.0018604651162790699, + "grad_norm": 26.966157730405964, + "kl": 0.0, + "learning_rate": 9.981378026070763e-07, + "loss": 0.0, + "reward": 1.21875, + "reward_std": 0.6117308735847473, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.625, + "step": 1 + }, + { + "completion_length": 5.3125, + "epoch": 0.0037209302325581397, + "grad_norm": 29.753154899251022, + "kl": 0.00096893310546875, + "learning_rate": 9.962756052141526e-07, + "loss": 0.0, + "reward": 0.9375, + "reward_std": 0.6679348349571228, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.46875, + "step": 2 + }, + { + "completion_length": 5.40625, + "epoch": 0.005581395348837209, + "grad_norm": 70.79771260268588, + "kl": 0.021636962890625, + "learning_rate": 9.94413407821229e-07, + "loss": 0.0009, + "reward": 1.0, + "reward_std": 0.6979155838489532, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.5625, + "step": 3 + }, + { + "completion_length": 5.59375, + "epoch": 0.0074418604651162795, + "grad_norm": 28.67566153705991, + "kl": 0.029510498046875, + "learning_rate": 9.925512104283055e-07, + "loss": 0.0012, + "reward": 0.96875, + "reward_std": 0.5290063321590424, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.53125, + "step": 4 + }, + { + "completion_length": 4.78125, + "epoch": 0.009302325581395349, + "grad_norm": 68.23692151573032, + "kl": 0.091796875, + "learning_rate": 9.906890130353816e-07, + "loss": 0.0037, + "reward": 1.15625, + "reward_std": 0.42685678601264954, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.5, + "step": 5 + }, + { + "completion_length": 6.28125, + "epoch": 0.011162790697674419, + "grad_norm": 25.114720038160037, + "kl": 0.0654296875, + "learning_rate": 9.888268156424581e-07, + "loss": 0.0026, + "reward": 1.3125, + "reward_std": 0.6257468461990356, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8125, + "step": 6 + }, + { + "completion_length": 7.03125, + "epoch": 0.013023255813953489, + "grad_norm": 6.046578831405794, + "kl": 0.0501708984375, + "learning_rate": 9.869646182495344e-07, + "loss": 0.002, + "reward": 1.46875, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.96875, + "step": 7 + }, + { + "completion_length": 6.5625, + "epoch": 0.014883720930232559, + "grad_norm": 20.712363904690356, + "kl": 0.076904296875, + "learning_rate": 9.851024208566108e-07, + "loss": 0.0031, + "reward": 1.375, + "reward_std": 0.36435678601264954, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.90625, + "step": 8 + }, + { + "completion_length": 7.4375, + "epoch": 0.01674418604651163, + "grad_norm": 7.711904143466999, + "kl": 0.063232421875, + "learning_rate": 9.83240223463687e-07, + "loss": 0.0025, + "reward": 1.40625, + "reward_std": 0.27900634706020355, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.90625, + "step": 9 + }, + { + "completion_length": 7.5625, + "epoch": 0.018604651162790697, + "grad_norm": 4.730294607589442, + "kl": 0.0498046875, + "learning_rate": 9.813780260707634e-07, + "loss": 0.002, + "reward": 1.46875, + "reward_std": 0.3125, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 1.0, + "step": 10 + }, + { + "completion_length": 7.03125, + "epoch": 0.020465116279069766, + "grad_norm": 10.43773030239333, + "kl": 0.074951171875, + "learning_rate": 9.795158286778397e-07, + "loss": 0.003, + "reward": 1.5, + "reward_std": 0.45151597261428833, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.96875, + "step": 11 + }, + { + "completion_length": 7.0, + "epoch": 0.022325581395348838, + "grad_norm": 9.791060827366964, + "kl": 0.048583984375, + "learning_rate": 9.776536312849163e-07, + "loss": 0.0019, + "reward": 1.5, + "reward_std": 0.26933756470680237, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 12 + }, + { + "completion_length": 7.0, + "epoch": 0.024186046511627906, + "grad_norm": 7.296485581331431, + "kl": 0.09814453125, + "learning_rate": 9.757914338919924e-07, + "loss": 0.0039, + "reward": 1.53125, + "reward_std": 0.45683756470680237, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 1.0, + "step": 13 + }, + { + "completion_length": 7.0625, + "epoch": 0.026046511627906978, + "grad_norm": 9.266361869168497, + "kl": 0.06634521484375, + "learning_rate": 9.73929236499069e-07, + "loss": 0.0027, + "reward": 1.5625, + "reward_std": 0.32216876745224, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 14 + }, + { + "completion_length": 7.0625, + "epoch": 0.027906976744186046, + "grad_norm": 8.397137188307578, + "kl": 0.071533203125, + "learning_rate": 9.720670391061452e-07, + "loss": 0.0029, + "reward": 1.59375, + "reward_std": 0.38466876745224, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 1.0, + "step": 15 + }, + { + "completion_length": 7.0625, + "epoch": 0.029767441860465118, + "grad_norm": 26.406505025831684, + "kl": 0.0792236328125, + "learning_rate": 9.702048417132216e-07, + "loss": 0.0032, + "reward": 1.6875, + "reward_std": 0.34150634706020355, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 16 + }, + { + "completion_length": 7.0, + "epoch": 0.03162790697674418, + "grad_norm": 13.49164415911862, + "kl": 0.068359375, + "learning_rate": 9.68342644320298e-07, + "loss": 0.0027, + "reward": 1.65625, + "reward_std": 0.3318375498056412, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.96875, + "step": 17 + }, + { + "completion_length": 7.1875, + "epoch": 0.03348837209302326, + "grad_norm": 11.130087889880647, + "kl": 0.078369140625, + "learning_rate": 9.664804469273742e-07, + "loss": 0.0031, + "reward": 1.34375, + "reward_std": 0.4375, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.96875, + "step": 18 + }, + { + "completion_length": 7.03125, + "epoch": 0.035348837209302326, + "grad_norm": 260.5679340729235, + "kl": 12.35986328125, + "learning_rate": 9.646182495344505e-07, + "loss": 0.4957, + "reward": 1.4375, + "reward_std": 0.39433756470680237, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.96875, + "step": 19 + }, + { + "completion_length": 8.78125, + "epoch": 0.037209302325581395, + "grad_norm": 5.974419465509106, + "kl": 0.03326416015625, + "learning_rate": 9.62756052141527e-07, + "loss": 0.0013, + "reward": 1.625, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.96875, + "step": 20 + }, + { + "completion_length": 7.0625, + "epoch": 0.03906976744186046, + "grad_norm": 10.906349907398782, + "kl": 0.0616455078125, + "learning_rate": 9.608938547486032e-07, + "loss": 0.0025, + "reward": 1.78125, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 1.0, + "step": 21 + }, + { + "completion_length": 8.21875, + "epoch": 0.04093023255813953, + "grad_norm": 6.228383157506607, + "kl": 0.10498046875, + "learning_rate": 9.590316573556797e-07, + "loss": 0.0042, + "reward": 1.6875, + "reward_std": 0.30717839300632477, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.96875, + "step": 22 + }, + { + "completion_length": 8.9375, + "epoch": 0.04279069767441861, + "grad_norm": 6.9304100616107265, + "kl": 0.07861328125, + "learning_rate": 9.57169459962756e-07, + "loss": 0.0031, + "reward": 1.40625, + "reward_std": 0.3617308586835861, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.96875, + "step": 23 + }, + { + "completion_length": 7.0, + "epoch": 0.044651162790697675, + "grad_norm": 4.9211033012103815, + "kl": 0.104736328125, + "learning_rate": 9.553072625698324e-07, + "loss": 0.0042, + "reward": 1.5625, + "reward_std": 0.26933756470680237, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 24 + }, + { + "completion_length": 7.0, + "epoch": 0.046511627906976744, + "grad_norm": 5.772075341182843, + "kl": 0.05126953125, + "learning_rate": 9.534450651769087e-07, + "loss": 0.002, + "reward": 1.5625, + "reward_std": 0.21650634706020355, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 25 + }, + { + "completion_length": 7.0, + "epoch": 0.04837209302325581, + "grad_norm": 7.66254273739991, + "kl": 0.1005859375, + "learning_rate": 9.515828677839851e-07, + "loss": 0.004, + "reward": 1.65625, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 1.0, + "step": 26 + }, + { + "completion_length": 7.40625, + "epoch": 0.05023255813953489, + "grad_norm": 9.621866766668685, + "kl": 0.1053466796875, + "learning_rate": 9.497206703910615e-07, + "loss": 0.0042, + "reward": 1.53125, + "reward_std": 0.4189092665910721, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.9375, + "step": 27 + }, + { + "completion_length": 11.5, + "epoch": 0.052093023255813956, + "grad_norm": 5.625053187163431, + "kl": 0.103515625, + "learning_rate": 9.478584729981378e-07, + "loss": 0.0041, + "reward": 1.71875, + "reward_std": 0.3125, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.9375, + "step": 28 + }, + { + "completion_length": 6.96875, + "epoch": 0.053953488372093024, + "grad_norm": 11.988815111153999, + "kl": 0.086181640625, + "learning_rate": 9.459962756052141e-07, + "loss": 0.0034, + "reward": 1.65625, + "reward_std": 0.2666241526603699, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.9375, + "step": 29 + }, + { + "completion_length": 7.0, + "epoch": 0.05581395348837209, + "grad_norm": 11.642120324006815, + "kl": 0.0791015625, + "learning_rate": 9.441340782122904e-07, + "loss": 0.0032, + "reward": 1.75, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 30 + }, + { + "completion_length": 11.78125, + "epoch": 0.05767441860465116, + "grad_norm": 1.7236129893955336, + "kl": 0.0574951171875, + "learning_rate": 9.422718808193669e-07, + "loss": 0.0023, + "reward": 1.78125, + "reward_std": 0.11967839300632477, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9375, + "step": 31 + }, + { + "completion_length": 7.46875, + "epoch": 0.059534883720930236, + "grad_norm": 3.10088790782511, + "kl": 0.111083984375, + "learning_rate": 9.404096834264431e-07, + "loss": 0.0044, + "reward": 1.71875, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.96875, + "step": 32 + }, + { + "completion_length": 7.0, + "epoch": 0.061395348837209304, + "grad_norm": 0.16333211764766373, + "kl": 0.07861328125, + "learning_rate": 9.385474860335195e-07, + "loss": 0.0031, + "reward": 1.875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 33 + }, + { + "completion_length": 7.0, + "epoch": 0.06325581395348837, + "grad_norm": 7.759219919124216, + "kl": 0.11572265625, + "learning_rate": 9.366852886405958e-07, + "loss": 0.0046, + "reward": 1.8125, + "reward_std": 0.26933756470680237, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 34 + }, + { + "completion_length": 7.0, + "epoch": 0.06511627906976744, + "grad_norm": 19.896070435420302, + "kl": 0.0633544921875, + "learning_rate": 9.348230912476723e-07, + "loss": 0.0025, + "reward": 1.71875, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 1.0, + "step": 35 + }, + { + "completion_length": 8.75, + "epoch": 0.06697674418604652, + "grad_norm": 5.861993739500396, + "kl": 0.1209716796875, + "learning_rate": 9.329608938547485e-07, + "loss": 0.0048, + "reward": 1.875, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.96875, + "step": 36 + }, + { + "completion_length": 7.0, + "epoch": 0.06883720930232558, + "grad_norm": 44.12478655041424, + "kl": 1.54833984375, + "learning_rate": 9.310986964618249e-07, + "loss": 0.0623, + "reward": 1.90625, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 37 + }, + { + "completion_length": 7.1875, + "epoch": 0.07069767441860465, + "grad_norm": 4.4710629406601266, + "kl": 0.10791015625, + "learning_rate": 9.292364990689012e-07, + "loss": 0.0043, + "reward": 1.84375, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 38 + }, + { + "completion_length": 6.9375, + "epoch": 0.07255813953488371, + "grad_norm": 10.754443826359896, + "kl": 0.1092529296875, + "learning_rate": 9.273743016759777e-07, + "loss": 0.0044, + "reward": 1.625, + "reward_std": 0.32216876745224, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 39 + }, + { + "completion_length": 8.875, + "epoch": 0.07441860465116279, + "grad_norm": 5.375899632331958, + "kl": 0.07275390625, + "learning_rate": 9.255121042830539e-07, + "loss": 0.0029, + "reward": 1.875, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.96875, + "step": 40 + }, + { + "completion_length": 16.125, + "epoch": 0.07627906976744186, + "grad_norm": 12.878061169602695, + "kl": 0.140380859375, + "learning_rate": 9.236499068901303e-07, + "loss": 0.0056, + "reward": 1.4375, + "reward_std": 0.4092404693365097, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 41 + }, + { + "completion_length": 7.25, + "epoch": 0.07813953488372093, + "grad_norm": 15.663595372886416, + "kl": 0.201416015625, + "learning_rate": 9.217877094972066e-07, + "loss": 0.0081, + "reward": 1.78125, + "reward_std": 0.3846687823534012, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9375, + "step": 42 + }, + { + "completion_length": 7.0, + "epoch": 0.08, + "grad_norm": 10.064352907111846, + "kl": 0.051025390625, + "learning_rate": 9.199255121042831e-07, + "loss": 0.002, + "reward": 1.5, + "reward_std": 0.375, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 43 + }, + { + "completion_length": 7.6875, + "epoch": 0.08186046511627906, + "grad_norm": 12.944119510731715, + "kl": 0.16455078125, + "learning_rate": 9.180633147113593e-07, + "loss": 0.0066, + "reward": 1.75, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 44 + }, + { + "completion_length": 8.6875, + "epoch": 0.08372093023255814, + "grad_norm": 1.9828140995099124, + "kl": 0.14453125, + "learning_rate": 9.162011173184357e-07, + "loss": 0.0058, + "reward": 1.59375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.96875, + "step": 45 + }, + { + "completion_length": 8.03125, + "epoch": 0.08558139534883721, + "grad_norm": 3.5060840347720803, + "kl": 0.0849609375, + "learning_rate": 9.14338919925512e-07, + "loss": 0.0034, + "reward": 1.4375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 46 + }, + { + "completion_length": 7.0, + "epoch": 0.08744186046511628, + "grad_norm": 7.8271406576863125, + "kl": 0.124267578125, + "learning_rate": 9.124767225325885e-07, + "loss": 0.005, + "reward": 1.75, + "reward_std": 0.3221687823534012, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 47 + }, + { + "completion_length": 16.03125, + "epoch": 0.08930232558139535, + "grad_norm": 5.7284603569495305, + "kl": 0.25830078125, + "learning_rate": 9.106145251396647e-07, + "loss": 0.0103, + "reward": 1.8125, + "reward_std": 0.3221687823534012, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.875, + "step": 48 + }, + { + "completion_length": 11.96875, + "epoch": 0.09116279069767441, + "grad_norm": 9.472862517097974, + "kl": 0.12060546875, + "learning_rate": 9.087523277467411e-07, + "loss": 0.0048, + "reward": 1.6875, + "reward_std": 0.46650634706020355, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.90625, + "step": 49 + }, + { + "completion_length": 7.0, + "epoch": 0.09302325581395349, + "grad_norm": 2.7457156258368705, + "kl": 0.041748046875, + "learning_rate": 9.068901303538175e-07, + "loss": 0.0017, + "reward": 1.5625, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 50 + }, + { + "completion_length": 8.8125, + "epoch": 0.09488372093023256, + "grad_norm": 11.986664795920369, + "kl": 0.1484375, + "learning_rate": 9.050279329608939e-07, + "loss": 0.0059, + "reward": 1.59375, + "reward_std": 0.44184717535972595, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.90625, + "step": 51 + }, + { + "completion_length": 8.5625, + "epoch": 0.09674418604651162, + "grad_norm": 4.645111366609586, + "kl": 0.087890625, + "learning_rate": 9.031657355679702e-07, + "loss": 0.0035, + "reward": 1.78125, + "reward_std": 0.24467839300632477, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.96875, + "step": 52 + }, + { + "completion_length": 10.0625, + "epoch": 0.0986046511627907, + "grad_norm": 10.267660761809712, + "kl": 0.09765625, + "learning_rate": 9.013035381750465e-07, + "loss": 0.0039, + "reward": 1.875, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.90625, + "step": 53 + }, + { + "completion_length": 7.0, + "epoch": 0.10046511627906977, + "grad_norm": 10.731605523627294, + "kl": 0.53759765625, + "learning_rate": 8.994413407821229e-07, + "loss": 0.0215, + "reward": 1.65625, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 1.0, + "step": 54 + }, + { + "completion_length": 16.59375, + "epoch": 0.10232558139534884, + "grad_norm": 14.826291272272616, + "kl": 0.60498046875, + "learning_rate": 8.975791433891993e-07, + "loss": 0.0243, + "reward": 1.5625, + "reward_std": 0.375, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 55 + }, + { + "completion_length": 7.0, + "epoch": 0.10418604651162791, + "grad_norm": 8.21675498152898, + "kl": 0.21044921875, + "learning_rate": 8.957169459962756e-07, + "loss": 0.0084, + "reward": 1.625, + "reward_std": 0.44716876745224, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 56 + }, + { + "completion_length": 7.0, + "epoch": 0.10604651162790697, + "grad_norm": 4.391303022188685, + "kl": 0.177734375, + "learning_rate": 8.938547486033518e-07, + "loss": 0.0071, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 57 + }, + { + "completion_length": 8.28125, + "epoch": 0.10790697674418605, + "grad_norm": 7.2174365138125784, + "kl": 0.0948486328125, + "learning_rate": 8.919925512104283e-07, + "loss": 0.0038, + "reward": 1.59375, + "reward_std": 0.3846687823534012, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.96875, + "step": 58 + }, + { + "completion_length": 7.1875, + "epoch": 0.10976744186046512, + "grad_norm": 6.69341308742532, + "kl": 0.101806640625, + "learning_rate": 8.901303538175046e-07, + "loss": 0.0041, + "reward": 1.875, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.96875, + "step": 59 + }, + { + "completion_length": 15.25, + "epoch": 0.11162790697674418, + "grad_norm": 19.781496559171643, + "kl": 1.6363525390625, + "learning_rate": 8.88268156424581e-07, + "loss": 0.0655, + "reward": 1.4375, + "reward_std": 0.45151595771312714, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.84375, + "step": 60 + }, + { + "completion_length": 7.0, + "epoch": 0.11348837209302326, + "grad_norm": 1.5283003356298774, + "kl": 0.2132568359375, + "learning_rate": 8.864059590316572e-07, + "loss": 0.0086, + "reward": 1.78125, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 1.0, + "step": 61 + }, + { + "completion_length": 9.9375, + "epoch": 0.11534883720930232, + "grad_norm": 2.714248771258938, + "kl": 0.119140625, + "learning_rate": 8.845437616387337e-07, + "loss": 0.0048, + "reward": 1.65625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.9375, + "step": 62 + }, + { + "completion_length": 8.21875, + "epoch": 0.1172093023255814, + "grad_norm": 5.423759197973722, + "kl": 0.265625, + "learning_rate": 8.8268156424581e-07, + "loss": 0.0107, + "reward": 1.625, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.96875, + "step": 63 + }, + { + "completion_length": 9.53125, + "epoch": 0.11906976744186047, + "grad_norm": 15.733138874326178, + "kl": 0.15234375, + "learning_rate": 8.808193668528864e-07, + "loss": 0.0061, + "reward": 1.4375, + "reward_std": 0.47706207633018494, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.84375, + "step": 64 + }, + { + "completion_length": 7.125, + "epoch": 0.12093023255813953, + "grad_norm": 9.268032428728139, + "kl": 0.78125, + "learning_rate": 8.789571694599626e-07, + "loss": 0.0314, + "reward": 1.625, + "reward_std": 0.44716876745224, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.90625, + "step": 65 + }, + { + "completion_length": 13.6875, + "epoch": 0.12279069767441861, + "grad_norm": 5.796894017884865, + "kl": 0.4190673828125, + "learning_rate": 8.770949720670391e-07, + "loss": 0.0168, + "reward": 1.59375, + "reward_std": 0.3696783781051636, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.84375, + "step": 66 + }, + { + "completion_length": 9.6875, + "epoch": 0.12465116279069767, + "grad_norm": 5.952556804363693, + "kl": 0.279296875, + "learning_rate": 8.752327746741154e-07, + "loss": 0.0112, + "reward": 1.71875, + "reward_std": 0.3125, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.9375, + "step": 67 + }, + { + "completion_length": 8.9375, + "epoch": 0.12651162790697673, + "grad_norm": 6.912217835391154, + "kl": 0.093994140625, + "learning_rate": 8.733705772811918e-07, + "loss": 0.0038, + "reward": 1.75, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 68 + }, + { + "completion_length": 8.375, + "epoch": 0.1283720930232558, + "grad_norm": 13.684191980024522, + "kl": 0.9251708984375, + "learning_rate": 8.71508379888268e-07, + "loss": 0.0369, + "reward": 1.59375, + "reward_std": 0.3846687823534012, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.96875, + "step": 69 + }, + { + "completion_length": 11.8125, + "epoch": 0.13023255813953488, + "grad_norm": 6.5354458557560156, + "kl": 0.17626953125, + "learning_rate": 8.696461824953445e-07, + "loss": 0.007, + "reward": 1.78125, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.875, + "step": 70 + }, + { + "completion_length": 7.75, + "epoch": 0.13209302325581396, + "grad_norm": 2.2225793527829745, + "kl": 0.07861328125, + "learning_rate": 8.677839851024208e-07, + "loss": 0.0031, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 71 + }, + { + "completion_length": 7.03125, + "epoch": 0.13395348837209303, + "grad_norm": 11.802236302391108, + "kl": 0.0908203125, + "learning_rate": 8.659217877094972e-07, + "loss": 0.0036, + "reward": 1.75, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.96875, + "step": 72 + }, + { + "completion_length": 7.0, + "epoch": 0.13581395348837208, + "grad_norm": 12.87626702979638, + "kl": 0.08935546875, + "learning_rate": 8.640595903165734e-07, + "loss": 0.0036, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 73 + }, + { + "completion_length": 13.78125, + "epoch": 0.13767441860465116, + "grad_norm": 3.6357613255389696, + "kl": 0.197265625, + "learning_rate": 8.621973929236499e-07, + "loss": 0.0079, + "reward": 1.65625, + "reward_std": 0.31684717535972595, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.90625, + "step": 74 + }, + { + "completion_length": 7.0, + "epoch": 0.13953488372093023, + "grad_norm": 7.797079246960243, + "kl": 0.3583984375, + "learning_rate": 8.603351955307262e-07, + "loss": 0.0144, + "reward": 1.8125, + "reward_std": 0.22706207633018494, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.96875, + "step": 75 + }, + { + "completion_length": 9.125, + "epoch": 0.1413953488372093, + "grad_norm": 5.029834706815348, + "kl": 0.1610107421875, + "learning_rate": 8.584729981378026e-07, + "loss": 0.0064, + "reward": 1.59375, + "reward_std": 0.4375, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.875, + "step": 76 + }, + { + "completion_length": 7.1875, + "epoch": 0.14325581395348838, + "grad_norm": 4.856511642475378, + "kl": 0.1014404296875, + "learning_rate": 8.566108007448789e-07, + "loss": 0.0041, + "reward": 1.8125, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.96875, + "step": 77 + }, + { + "completion_length": 7.0625, + "epoch": 0.14511627906976743, + "grad_norm": 12.563304123350113, + "kl": 0.07470703125, + "learning_rate": 8.547486033519553e-07, + "loss": 0.003, + "reward": 1.6875, + "reward_std": 0.22706207633018494, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.96875, + "step": 78 + }, + { + "completion_length": 10.21875, + "epoch": 0.1469767441860465, + "grad_norm": 5.074016458487912, + "kl": 0.300048828125, + "learning_rate": 8.528864059590316e-07, + "loss": 0.0121, + "reward": 1.84375, + "reward_std": 0.3125, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.9375, + "step": 79 + }, + { + "completion_length": 8.65625, + "epoch": 0.14883720930232558, + "grad_norm": 2.1228710160076463, + "kl": 0.28173828125, + "learning_rate": 8.51024208566108e-07, + "loss": 0.0113, + "reward": 1.875, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 80 + }, + { + "completion_length": 10.71875, + "epoch": 0.15069767441860465, + "grad_norm": 7.108974894555497, + "kl": 0.3662109375, + "learning_rate": 8.491620111731844e-07, + "loss": 0.0147, + "reward": 1.46875, + "reward_std": 0.50966876745224, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.875, + "step": 81 + }, + { + "completion_length": 7.125, + "epoch": 0.15255813953488373, + "grad_norm": 2.739240272252859, + "kl": 0.141357421875, + "learning_rate": 8.472998137802607e-07, + "loss": 0.0057, + "reward": 1.71875, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 1.0, + "step": 82 + }, + { + "completion_length": 16.5625, + "epoch": 0.15441860465116278, + "grad_norm": 5.169148665562794, + "kl": 0.18603515625, + "learning_rate": 8.45437616387337e-07, + "loss": 0.0075, + "reward": 1.65625, + "reward_std": 0.31684717535972595, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.875, + "step": 83 + }, + { + "completion_length": 9.40625, + "epoch": 0.15627906976744185, + "grad_norm": 7.46748811393347, + "kl": 0.228515625, + "learning_rate": 8.435754189944134e-07, + "loss": 0.0091, + "reward": 1.71875, + "reward_std": 0.3846687823534012, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.9375, + "step": 84 + }, + { + "completion_length": 7.96875, + "epoch": 0.15813953488372093, + "grad_norm": 2.9180556202850525, + "kl": 0.23046875, + "learning_rate": 8.417132216014898e-07, + "loss": 0.0092, + "reward": 1.78125, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.96875, + "step": 85 + }, + { + "completion_length": 7.0, + "epoch": 0.16, + "grad_norm": 2.7866359351668413, + "kl": 0.14013671875, + "learning_rate": 8.39851024208566e-07, + "loss": 0.0056, + "reward": 1.53125, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 1.0, + "step": 86 + }, + { + "completion_length": 7.40625, + "epoch": 0.16186046511627908, + "grad_norm": 9.63349570214417, + "kl": 0.1533203125, + "learning_rate": 8.379888268156424e-07, + "loss": 0.0061, + "reward": 1.4375, + "reward_std": 0.47706207633018494, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.90625, + "step": 87 + }, + { + "completion_length": 7.53125, + "epoch": 0.16372093023255813, + "grad_norm": 3.898378463636109, + "kl": 0.0394287109375, + "learning_rate": 8.361266294227187e-07, + "loss": 0.0016, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 88 + }, + { + "completion_length": 7.21875, + "epoch": 0.1655813953488372, + "grad_norm": 6.00056725816116, + "kl": 0.128662109375, + "learning_rate": 8.342644320297952e-07, + "loss": 0.0052, + "reward": 1.84375, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.9375, + "step": 89 + }, + { + "completion_length": 7.71875, + "epoch": 0.16744186046511628, + "grad_norm": 2.075411370301875, + "kl": 0.083740234375, + "learning_rate": 8.324022346368714e-07, + "loss": 0.0033, + "reward": 1.5625, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.96875, + "step": 90 + }, + { + "completion_length": 7.0, + "epoch": 0.16930232558139535, + "grad_norm": 0.04302721654145541, + "kl": 0.080810546875, + "learning_rate": 8.305400372439478e-07, + "loss": 0.0032, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 91 + }, + { + "completion_length": 8.375, + "epoch": 0.17116279069767443, + "grad_norm": 12.543658599630556, + "kl": 0.426513671875, + "learning_rate": 8.286778398510241e-07, + "loss": 0.0171, + "reward": 1.6875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 92 + }, + { + "completion_length": 7.15625, + "epoch": 0.17302325581395347, + "grad_norm": 3.95377638356341, + "kl": 0.1934814453125, + "learning_rate": 8.268156424581006e-07, + "loss": 0.0077, + "reward": 1.78125, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.96875, + "step": 93 + }, + { + "completion_length": 7.0, + "epoch": 0.17488372093023255, + "grad_norm": 4.24613523067577, + "kl": 0.066162109375, + "learning_rate": 8.249534450651768e-07, + "loss": 0.0026, + "reward": 1.90625, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 94 + }, + { + "completion_length": 9.0, + "epoch": 0.17674418604651163, + "grad_norm": 1.1772372484783475, + "kl": 0.08056640625, + "learning_rate": 8.230912476722532e-07, + "loss": 0.0032, + "reward": 1.71875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.96875, + "step": 95 + }, + { + "completion_length": 7.0, + "epoch": 0.1786046511627907, + "grad_norm": 0.04508882171019585, + "kl": 0.091796875, + "learning_rate": 8.212290502793295e-07, + "loss": 0.0037, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 96 + }, + { + "completion_length": 7.0, + "epoch": 0.18046511627906978, + "grad_norm": 0.09803865665685438, + "kl": 0.13623046875, + "learning_rate": 8.19366852886406e-07, + "loss": 0.0055, + "reward": 1.75, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 97 + }, + { + "completion_length": 8.34375, + "epoch": 0.18232558139534882, + "grad_norm": 20.940175774741647, + "kl": 0.25537109375, + "learning_rate": 8.175046554934822e-07, + "loss": 0.0102, + "reward": 1.78125, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.96875, + "step": 98 + }, + { + "completion_length": 7.125, + "epoch": 0.1841860465116279, + "grad_norm": 5.771479527371442, + "kl": 0.0906982421875, + "learning_rate": 8.156424581005586e-07, + "loss": 0.0036, + "reward": 1.6875, + "reward_std": 0.26933756470680237, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.96875, + "step": 99 + }, + { + "completion_length": 6.96875, + "epoch": 0.18604651162790697, + "grad_norm": 1.968539510347447, + "kl": 0.0517578125, + "learning_rate": 8.13780260707635e-07, + "loss": 0.0021, + "reward": 1.84375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 1.0, + "step": 100 + }, + { + "completion_length": 9.03125, + "epoch": 0.18790697674418605, + "grad_norm": 5.886458902831998, + "kl": 0.367431640625, + "learning_rate": 8.119180633147114e-07, + "loss": 0.0147, + "reward": 1.875, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 101 + }, + { + "completion_length": 7.0, + "epoch": 0.18976744186046512, + "grad_norm": 0.05286843275035698, + "kl": 0.09765625, + "learning_rate": 8.100558659217876e-07, + "loss": 0.0039, + "reward": 1.875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 102 + }, + { + "completion_length": 7.0, + "epoch": 0.19162790697674417, + "grad_norm": 2.386773323074358, + "kl": 0.189453125, + "learning_rate": 8.08193668528864e-07, + "loss": 0.0076, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 103 + }, + { + "completion_length": 7.0, + "epoch": 0.19348837209302325, + "grad_norm": 1.805176059916673, + "kl": 0.3692626953125, + "learning_rate": 8.063314711359404e-07, + "loss": 0.0149, + "reward": 1.84375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 1.0, + "step": 104 + }, + { + "completion_length": 7.0, + "epoch": 0.19534883720930232, + "grad_norm": 4.273413974692543, + "kl": 0.0966796875, + "learning_rate": 8.044692737430168e-07, + "loss": 0.0039, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 105 + }, + { + "completion_length": 7.0, + "epoch": 0.1972093023255814, + "grad_norm": 14.283801749577972, + "kl": 0.280517578125, + "learning_rate": 8.026070763500931e-07, + "loss": 0.0113, + "reward": 1.59375, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 1.0, + "step": 106 + }, + { + "completion_length": 7.0, + "epoch": 0.19906976744186047, + "grad_norm": 9.343773255345017, + "kl": 0.072265625, + "learning_rate": 8.007448789571694e-07, + "loss": 0.0029, + "reward": 1.84375, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 107 + }, + { + "completion_length": 7.0, + "epoch": 0.20093023255813955, + "grad_norm": 11.348269377150984, + "kl": 0.193359375, + "learning_rate": 7.988826815642458e-07, + "loss": 0.0077, + "reward": 1.875, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 108 + }, + { + "completion_length": 7.5625, + "epoch": 0.2027906976744186, + "grad_norm": 4.688812144024163, + "kl": 0.0997314453125, + "learning_rate": 7.970204841713222e-07, + "loss": 0.004, + "reward": 1.71875, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.96875, + "step": 109 + }, + { + "completion_length": 7.0, + "epoch": 0.20465116279069767, + "grad_norm": 6.841502387847828, + "kl": 0.1376953125, + "learning_rate": 7.951582867783985e-07, + "loss": 0.0055, + "reward": 1.8125, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 110 + }, + { + "completion_length": 7.0, + "epoch": 0.20651162790697675, + "grad_norm": 5.356000978555771, + "kl": 0.11181640625, + "learning_rate": 7.932960893854748e-07, + "loss": 0.0045, + "reward": 1.78125, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 1.0, + "step": 111 + }, + { + "completion_length": 7.0, + "epoch": 0.20837209302325582, + "grad_norm": 11.544487814556163, + "kl": 0.31005859375, + "learning_rate": 7.914338919925512e-07, + "loss": 0.0123, + "reward": 1.90625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 112 + }, + { + "completion_length": 7.125, + "epoch": 0.2102325581395349, + "grad_norm": 2.4257327600000482, + "kl": 0.22509765625, + "learning_rate": 7.895716945996276e-07, + "loss": 0.009, + "reward": 1.8125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.96875, + "step": 113 + }, + { + "completion_length": 6.84375, + "epoch": 0.21209302325581394, + "grad_norm": 15.905008222581353, + "kl": 1.533203125, + "learning_rate": 7.877094972067039e-07, + "loss": 0.0616, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 114 + }, + { + "completion_length": 7.375, + "epoch": 0.21395348837209302, + "grad_norm": 6.1341554632903454, + "kl": 0.100341796875, + "learning_rate": 7.858472998137801e-07, + "loss": 0.004, + "reward": 1.71875, + "reward_std": 0.24467839300632477, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.96875, + "step": 115 + }, + { + "completion_length": 7.0, + "epoch": 0.2158139534883721, + "grad_norm": 4.222312056341739, + "kl": 0.0634765625, + "learning_rate": 7.839851024208566e-07, + "loss": 0.0025, + "reward": 1.8125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 116 + }, + { + "completion_length": 7.0, + "epoch": 0.21767441860465117, + "grad_norm": 2.525977560979308, + "kl": 0.041015625, + "learning_rate": 7.821229050279329e-07, + "loss": 0.0016, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 117 + }, + { + "completion_length": 7.0, + "epoch": 0.21953488372093025, + "grad_norm": 0.0403331039479963, + "kl": 0.15283203125, + "learning_rate": 7.802607076350093e-07, + "loss": 0.0061, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 118 + }, + { + "completion_length": 7.0, + "epoch": 0.2213953488372093, + "grad_norm": 16.575618394741664, + "kl": 0.084228515625, + "learning_rate": 7.783985102420855e-07, + "loss": 0.0034, + "reward": 1.8125, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 119 + }, + { + "completion_length": 7.0, + "epoch": 0.22325581395348837, + "grad_norm": 3.6595174265552126, + "kl": 0.0845947265625, + "learning_rate": 7.76536312849162e-07, + "loss": 0.0034, + "reward": 1.90625, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 120 + }, + { + "completion_length": 6.875, + "epoch": 0.22511627906976744, + "grad_norm": 9.855139862594777, + "kl": 0.888671875, + "learning_rate": 7.746741154562383e-07, + "loss": 0.0355, + "reward": 1.8125, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.96875, + "step": 121 + }, + { + "completion_length": 7.0, + "epoch": 0.22697674418604652, + "grad_norm": 2.7039368634440946, + "kl": 0.09228515625, + "learning_rate": 7.728119180633147e-07, + "loss": 0.0037, + "reward": 1.9375, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 122 + }, + { + "completion_length": 10.5625, + "epoch": 0.2288372093023256, + "grad_norm": 1.8776915351514876, + "kl": 0.2685546875, + "learning_rate": 7.709497206703909e-07, + "loss": 0.0107, + "reward": 1.8125, + "reward_std": 0.375, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.90625, + "step": 123 + }, + { + "completion_length": 7.5625, + "epoch": 0.23069767441860464, + "grad_norm": 4.492967413221305, + "kl": 0.1884765625, + "learning_rate": 7.690875232774674e-07, + "loss": 0.0075, + "reward": 1.84375, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 124 + }, + { + "completion_length": 7.0, + "epoch": 0.23255813953488372, + "grad_norm": 7.599829850539572, + "kl": 0.0567626953125, + "learning_rate": 7.672253258845437e-07, + "loss": 0.0023, + "reward": 1.78125, + "reward_std": 0.20683756470680237, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 1.0, + "step": 125 + }, + { + "completion_length": 6.90625, + "epoch": 0.2344186046511628, + "grad_norm": 12.73041695283084, + "kl": 0.14453125, + "learning_rate": 7.653631284916201e-07, + "loss": 0.0058, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 126 + }, + { + "completion_length": 7.28125, + "epoch": 0.23627906976744187, + "grad_norm": 15.030896276338488, + "kl": 0.462890625, + "learning_rate": 7.635009310986963e-07, + "loss": 0.0185, + "reward": 1.84375, + "reward_std": 0.25966876745224, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 127 + }, + { + "completion_length": 6.90625, + "epoch": 0.23813953488372094, + "grad_norm": 55.96774708425224, + "kl": 1.31640625, + "learning_rate": 7.616387337057728e-07, + "loss": 0.0529, + "reward": 1.75, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 128 + }, + { + "completion_length": 7.0, + "epoch": 0.24, + "grad_norm": 4.424276198101464, + "kl": 0.0538330078125, + "learning_rate": 7.597765363128491e-07, + "loss": 0.0022, + "reward": 1.6875, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 129 + }, + { + "completion_length": 7.0, + "epoch": 0.24186046511627907, + "grad_norm": 6.577787186284137, + "kl": 0.100830078125, + "learning_rate": 7.579143389199255e-07, + "loss": 0.004, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 130 + }, + { + "completion_length": 7.0, + "epoch": 0.24372093023255814, + "grad_norm": 2.91325152595868, + "kl": 0.084228515625, + "learning_rate": 7.560521415270019e-07, + "loss": 0.0034, + "reward": 1.71875, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 1.0, + "step": 131 + }, + { + "completion_length": 7.0, + "epoch": 0.24558139534883722, + "grad_norm": 18.88047653350267, + "kl": 0.119140625, + "learning_rate": 7.541899441340782e-07, + "loss": 0.0048, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 132 + }, + { + "completion_length": 7.09375, + "epoch": 0.2474418604651163, + "grad_norm": 6.436480791401577, + "kl": 0.07861328125, + "learning_rate": 7.523277467411545e-07, + "loss": 0.0031, + "reward": 1.6875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 133 + }, + { + "completion_length": 7.0, + "epoch": 0.24930232558139534, + "grad_norm": 4.38624556725526, + "kl": 0.0831298828125, + "learning_rate": 7.504655493482309e-07, + "loss": 0.0033, + "reward": 1.75, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 134 + }, + { + "completion_length": 7.0, + "epoch": 0.25116279069767444, + "grad_norm": 3.332330583725428, + "kl": 0.25, + "learning_rate": 7.486033519553073e-07, + "loss": 0.01, + "reward": 1.8125, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 135 + }, + { + "completion_length": 6.9375, + "epoch": 0.25302325581395346, + "grad_norm": 3.799977826079449, + "kl": 0.17919921875, + "learning_rate": 7.467411545623836e-07, + "loss": 0.0072, + "reward": 1.90625, + "reward_std": 0.11967839300632477, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.96875, + "step": 136 + }, + { + "completion_length": 7.21875, + "epoch": 0.25488372093023254, + "grad_norm": 5.903960835616988, + "kl": 0.1298828125, + "learning_rate": 7.448789571694599e-07, + "loss": 0.0052, + "reward": 1.78125, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.96875, + "step": 137 + }, + { + "completion_length": 7.0, + "epoch": 0.2567441860465116, + "grad_norm": 0.036587797766214014, + "kl": 0.050537109375, + "learning_rate": 7.430167597765363e-07, + "loss": 0.002, + "reward": 1.875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 138 + }, + { + "completion_length": 7.84375, + "epoch": 0.2586046511627907, + "grad_norm": 2.8701151458795224, + "kl": 0.21044921875, + "learning_rate": 7.411545623836127e-07, + "loss": 0.0084, + "reward": 1.875, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.96875, + "step": 139 + }, + { + "completion_length": 9.75, + "epoch": 0.26046511627906976, + "grad_norm": 3.4807388140192415, + "kl": 0.2265625, + "learning_rate": 7.39292364990689e-07, + "loss": 0.0091, + "reward": 1.8125, + "reward_std": 0.30717839300632477, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.90625, + "step": 140 + }, + { + "completion_length": 7.0, + "epoch": 0.26232558139534884, + "grad_norm": 3.744354326469237, + "kl": 0.13232421875, + "learning_rate": 7.374301675977653e-07, + "loss": 0.0053, + "reward": 1.65625, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 1.0, + "step": 141 + }, + { + "completion_length": 7.0, + "epoch": 0.2641860465116279, + "grad_norm": 5.073989138291937, + "kl": 0.3411865234375, + "learning_rate": 7.355679702048416e-07, + "loss": 0.0137, + "reward": 1.75, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 142 + }, + { + "completion_length": 15.03125, + "epoch": 0.266046511627907, + "grad_norm": 7.192740211646448, + "kl": 0.13232421875, + "learning_rate": 7.337057728119181e-07, + "loss": 0.0053, + "reward": 1.75, + "reward_std": 0.35206207633018494, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.90625, + "step": 143 + }, + { + "completion_length": 7.0, + "epoch": 0.26790697674418606, + "grad_norm": 2.533410435957848, + "kl": 0.1513671875, + "learning_rate": 7.318435754189943e-07, + "loss": 0.006, + "reward": 1.875, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 144 + }, + { + "completion_length": 9.0, + "epoch": 0.26976744186046514, + "grad_norm": 7.070134691758894, + "kl": 0.0546875, + "learning_rate": 7.299813780260707e-07, + "loss": 0.0022, + "reward": 1.75, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.96875, + "step": 145 + }, + { + "completion_length": 11.84375, + "epoch": 0.27162790697674416, + "grad_norm": 1.3980240035289724, + "kl": 0.1552734375, + "learning_rate": 7.28119180633147e-07, + "loss": 0.0062, + "reward": 1.75, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.96875, + "step": 146 + }, + { + "completion_length": 8.90625, + "epoch": 0.27348837209302324, + "grad_norm": 8.560573579151551, + "kl": 0.16162109375, + "learning_rate": 7.262569832402235e-07, + "loss": 0.0065, + "reward": 1.59375, + "reward_std": 0.20683756470680237, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.96875, + "step": 147 + }, + { + "completion_length": 7.0, + "epoch": 0.2753488372093023, + "grad_norm": 15.951408379511143, + "kl": 0.1416015625, + "learning_rate": 7.243947858472997e-07, + "loss": 0.0057, + "reward": 1.90625, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 148 + }, + { + "completion_length": 7.0625, + "epoch": 0.2772093023255814, + "grad_norm": 4.666778387783328, + "kl": 0.2353515625, + "learning_rate": 7.225325884543761e-07, + "loss": 0.0094, + "reward": 1.71875, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 1.0, + "step": 149 + }, + { + "completion_length": 8.46875, + "epoch": 0.27906976744186046, + "grad_norm": 9.522532564702036, + "kl": 0.3134765625, + "learning_rate": 7.206703910614524e-07, + "loss": 0.0126, + "reward": 1.78125, + "reward_std": 0.31684717535972595, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9375, + "step": 150 + }, + { + "completion_length": 7.0, + "epoch": 0.28093023255813954, + "grad_norm": 6.11154436051918, + "kl": 0.0911865234375, + "learning_rate": 7.188081936685289e-07, + "loss": 0.0036, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 151 + }, + { + "completion_length": 9.8125, + "epoch": 0.2827906976744186, + "grad_norm": 3.974004333214621, + "kl": 0.1669921875, + "learning_rate": 7.169459962756051e-07, + "loss": 0.0067, + "reward": 1.90625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.9375, + "step": 152 + }, + { + "completion_length": 7.0, + "epoch": 0.2846511627906977, + "grad_norm": 6.787854974441682, + "kl": 0.11865234375, + "learning_rate": 7.150837988826815e-07, + "loss": 0.0048, + "reward": 1.78125, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.96875, + "step": 153 + }, + { + "completion_length": 9.90625, + "epoch": 0.28651162790697676, + "grad_norm": 3.5960700275559123, + "kl": 0.162109375, + "learning_rate": 7.132216014897579e-07, + "loss": 0.0065, + "reward": 1.6875, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 154 + }, + { + "completion_length": 17.53125, + "epoch": 0.28837209302325584, + "grad_norm": 3.544876642199626, + "kl": 0.0548095703125, + "learning_rate": 7.113594040968343e-07, + "loss": 0.0022, + "reward": 1.78125, + "reward_std": 0.23673085868358612, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9375, + "step": 155 + }, + { + "completion_length": 11.96875, + "epoch": 0.29023255813953486, + "grad_norm": 22.484714065635966, + "kl": 1.08251953125, + "learning_rate": 7.094972067039106e-07, + "loss": 0.0432, + "reward": 1.8125, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 156 + }, + { + "completion_length": 9.6875, + "epoch": 0.29209302325581393, + "grad_norm": 1.036843830460288, + "kl": 0.1337890625, + "learning_rate": 7.076350093109869e-07, + "loss": 0.0053, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.96875, + "step": 157 + }, + { + "completion_length": 8.65625, + "epoch": 0.293953488372093, + "grad_norm": 2.0858170963272933, + "kl": 0.202880859375, + "learning_rate": 7.057728119180633e-07, + "loss": 0.0081, + "reward": 1.875, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.96875, + "step": 158 + }, + { + "completion_length": 7.125, + "epoch": 0.2958139534883721, + "grad_norm": 4.893354091101479, + "kl": 0.142578125, + "learning_rate": 7.039106145251397e-07, + "loss": 0.0057, + "reward": 1.84375, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 1.0, + "step": 159 + }, + { + "completion_length": 7.0, + "epoch": 0.29767441860465116, + "grad_norm": 3.0194310109498543, + "kl": 0.1015625, + "learning_rate": 7.02048417132216e-07, + "loss": 0.0041, + "reward": 1.9375, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 160 + }, + { + "completion_length": 8.125, + "epoch": 0.29953488372093023, + "grad_norm": 6.331310958255691, + "kl": 0.11181640625, + "learning_rate": 7.001862197392923e-07, + "loss": 0.0045, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.96875, + "step": 161 + }, + { + "completion_length": 7.0, + "epoch": 0.3013953488372093, + "grad_norm": 2.394177509937314, + "kl": 0.0960693359375, + "learning_rate": 6.983240223463687e-07, + "loss": 0.0038, + "reward": 1.9375, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 162 + }, + { + "completion_length": 7.03125, + "epoch": 0.3032558139534884, + "grad_norm": 2.7312679147647128, + "kl": 0.26220703125, + "learning_rate": 6.964618249534451e-07, + "loss": 0.0105, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 163 + }, + { + "completion_length": 7.90625, + "epoch": 0.30511627906976746, + "grad_norm": 7.612379703453899, + "kl": 0.0810546875, + "learning_rate": 6.945996275605214e-07, + "loss": 0.0032, + "reward": 1.78125, + "reward_std": 0.3125, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9375, + "step": 164 + }, + { + "completion_length": 7.0625, + "epoch": 0.30697674418604654, + "grad_norm": 6.627287291006775, + "kl": 0.1181640625, + "learning_rate": 6.927374301675977e-07, + "loss": 0.0047, + "reward": 1.875, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 165 + }, + { + "completion_length": 11.4375, + "epoch": 0.30883720930232555, + "grad_norm": 0.8658365016756733, + "kl": 0.0728759765625, + "learning_rate": 6.908752327746741e-07, + "loss": 0.0029, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.96875, + "step": 166 + }, + { + "completion_length": 7.0, + "epoch": 0.31069767441860463, + "grad_norm": 3.3541621674028836, + "kl": 0.10302734375, + "learning_rate": 6.890130353817505e-07, + "loss": 0.0041, + "reward": 1.75, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 167 + }, + { + "completion_length": 8.46875, + "epoch": 0.3125581395348837, + "grad_norm": 3.6576806777317334, + "kl": 0.065673828125, + "learning_rate": 6.871508379888268e-07, + "loss": 0.0026, + "reward": 1.78125, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.96875, + "step": 168 + }, + { + "completion_length": 7.0, + "epoch": 0.3144186046511628, + "grad_norm": 5.652604698758747, + "kl": 0.081298828125, + "learning_rate": 6.852886405959031e-07, + "loss": 0.0033, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 169 + }, + { + "completion_length": 7.15625, + "epoch": 0.31627906976744186, + "grad_norm": 5.691302860509023, + "kl": 0.4453125, + "learning_rate": 6.834264432029795e-07, + "loss": 0.0179, + "reward": 1.90625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.96875, + "step": 170 + }, + { + "completion_length": 7.0, + "epoch": 0.31813953488372093, + "grad_norm": 2.4244268048163784, + "kl": 0.118408203125, + "learning_rate": 6.815642458100558e-07, + "loss": 0.0047, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 171 + }, + { + "completion_length": 7.3125, + "epoch": 0.32, + "grad_norm": 3.569544791260916, + "kl": 0.107666015625, + "learning_rate": 6.797020484171322e-07, + "loss": 0.0043, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.96875, + "step": 172 + }, + { + "completion_length": 7.28125, + "epoch": 0.3218604651162791, + "grad_norm": 10.176437017420042, + "kl": 0.091552734375, + "learning_rate": 6.778398510242084e-07, + "loss": 0.0037, + "reward": 1.90625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.96875, + "step": 173 + }, + { + "completion_length": 7.0, + "epoch": 0.32372093023255816, + "grad_norm": 7.974382130171821, + "kl": 0.2734375, + "learning_rate": 6.759776536312849e-07, + "loss": 0.0109, + "reward": 1.6875, + "reward_std": 0.2992308586835861, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.96875, + "step": 174 + }, + { + "completion_length": 7.0, + "epoch": 0.32558139534883723, + "grad_norm": 0.07161423174264428, + "kl": 0.064208984375, + "learning_rate": 6.741154562383612e-07, + "loss": 0.0026, + "reward": 1.875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 175 + }, + { + "completion_length": 7.0, + "epoch": 0.32744186046511625, + "grad_norm": 0.048623316045870835, + "kl": 0.08984375, + "learning_rate": 6.722532588454376e-07, + "loss": 0.0036, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 176 + }, + { + "completion_length": 7.0, + "epoch": 0.3293023255813953, + "grad_norm": 7.729895415965763, + "kl": 0.09619140625, + "learning_rate": 6.703910614525138e-07, + "loss": 0.0038, + "reward": 1.90625, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 177 + }, + { + "completion_length": 7.0, + "epoch": 0.3311627906976744, + "grad_norm": 6.37545593346638, + "kl": 0.0919189453125, + "learning_rate": 6.685288640595903e-07, + "loss": 0.0037, + "reward": 1.875, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 178 + }, + { + "completion_length": 6.9375, + "epoch": 0.3330232558139535, + "grad_norm": 5.631863993978278, + "kl": 0.19775390625, + "learning_rate": 6.666666666666666e-07, + "loss": 0.0079, + "reward": 1.71875, + "reward_std": 0.24467839300632477, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.96875, + "step": 179 + }, + { + "completion_length": 17.125, + "epoch": 0.33488372093023255, + "grad_norm": 13.084148416037696, + "kl": 0.11676025390625, + "learning_rate": 6.64804469273743e-07, + "loss": 0.0047, + "reward": 1.65625, + "reward_std": 0.24467839300632477, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.9375, + "step": 180 + }, + { + "completion_length": 7.0, + "epoch": 0.33674418604651163, + "grad_norm": 10.225739753784334, + "kl": 0.03125, + "learning_rate": 6.629422718808194e-07, + "loss": 0.0012, + "reward": 1.875, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 181 + }, + { + "completion_length": 7.0, + "epoch": 0.3386046511627907, + "grad_norm": 5.398144523878105, + "kl": 0.12841796875, + "learning_rate": 6.610800744878957e-07, + "loss": 0.0051, + "reward": 1.84375, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 1.0, + "step": 182 + }, + { + "completion_length": 7.03125, + "epoch": 0.3404651162790698, + "grad_norm": 4.702359002591123, + "kl": 0.130859375, + "learning_rate": 6.59217877094972e-07, + "loss": 0.0052, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 183 + }, + { + "completion_length": 7.0, + "epoch": 0.34232558139534885, + "grad_norm": 15.888076092278354, + "kl": 0.2685546875, + "learning_rate": 6.573556797020484e-07, + "loss": 0.0107, + "reward": 1.84375, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 1.0, + "step": 184 + }, + { + "completion_length": 7.9375, + "epoch": 0.34418604651162793, + "grad_norm": 6.865868810223541, + "kl": 0.0859375, + "learning_rate": 6.554934823091248e-07, + "loss": 0.0034, + "reward": 1.84375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 185 + }, + { + "completion_length": 6.96875, + "epoch": 0.34604651162790695, + "grad_norm": 3.812387943897516, + "kl": 0.09521484375, + "learning_rate": 6.536312849162011e-07, + "loss": 0.0038, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 186 + }, + { + "completion_length": 8.28125, + "epoch": 0.347906976744186, + "grad_norm": 4.292262610287447, + "kl": 0.138916015625, + "learning_rate": 6.517690875232774e-07, + "loss": 0.0056, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 187 + }, + { + "completion_length": 10.09375, + "epoch": 0.3497674418604651, + "grad_norm": 3.63154014941604, + "kl": 0.060791015625, + "learning_rate": 6.499068901303538e-07, + "loss": 0.0024, + "reward": 1.90625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.96875, + "step": 188 + }, + { + "completion_length": 6.9375, + "epoch": 0.3516279069767442, + "grad_norm": 6.044758542788755, + "kl": 0.1552734375, + "learning_rate": 6.480446927374302e-07, + "loss": 0.0062, + "reward": 1.90625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.96875, + "step": 189 + }, + { + "completion_length": 11.9375, + "epoch": 0.35348837209302325, + "grad_norm": 2.6916325618804535, + "kl": 0.095703125, + "learning_rate": 6.461824953445065e-07, + "loss": 0.0038, + "reward": 1.78125, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.96875, + "step": 190 + }, + { + "completion_length": 7.0, + "epoch": 0.3553488372093023, + "grad_norm": 60.36488192082516, + "kl": 0.111083984375, + "learning_rate": 6.443202979515828e-07, + "loss": 0.0044, + "reward": 1.84375, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 1.0, + "step": 191 + }, + { + "completion_length": 7.0, + "epoch": 0.3572093023255814, + "grad_norm": 2.816289812983186, + "kl": 0.0894775390625, + "learning_rate": 6.424581005586592e-07, + "loss": 0.0036, + "reward": 1.90625, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 192 + }, + { + "completion_length": 7.0, + "epoch": 0.3590697674418605, + "grad_norm": 2.8131180441644537, + "kl": 0.10009765625, + "learning_rate": 6.405959031657356e-07, + "loss": 0.004, + "reward": 1.6875, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 193 + }, + { + "completion_length": 7.0625, + "epoch": 0.36093023255813955, + "grad_norm": 6.892323176624426, + "kl": 0.20880126953125, + "learning_rate": 6.387337057728119e-07, + "loss": 0.0084, + "reward": 1.8125, + "reward_std": 0.3221687823534012, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.96875, + "step": 194 + }, + { + "completion_length": 7.03125, + "epoch": 0.3627906976744186, + "grad_norm": 3.8064960568722337, + "kl": 0.064208984375, + "learning_rate": 6.368715083798882e-07, + "loss": 0.0026, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 195 + }, + { + "completion_length": 8.71875, + "epoch": 0.36465116279069765, + "grad_norm": 6.817589260948733, + "kl": 0.08203125, + "learning_rate": 6.350093109869646e-07, + "loss": 0.0033, + "reward": 1.84375, + "reward_std": 0.20683756470680237, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.9375, + "step": 196 + }, + { + "completion_length": 6.96875, + "epoch": 0.3665116279069767, + "grad_norm": 11.024975594491176, + "kl": 0.19140625, + "learning_rate": 6.33147113594041e-07, + "loss": 0.0076, + "reward": 1.875, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 197 + }, + { + "completion_length": 7.0, + "epoch": 0.3683720930232558, + "grad_norm": 2.1624344858379247, + "kl": 0.08154296875, + "learning_rate": 6.312849162011172e-07, + "loss": 0.0033, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 198 + }, + { + "completion_length": 6.84375, + "epoch": 0.3702325581395349, + "grad_norm": 31.907666661851906, + "kl": 2.38427734375, + "learning_rate": 6.294227188081936e-07, + "loss": 0.0952, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.96875, + "step": 199 + }, + { + "completion_length": 8.8125, + "epoch": 0.37209302325581395, + "grad_norm": 10.535958775253093, + "kl": 0.1611328125, + "learning_rate": 6.275605214152699e-07, + "loss": 0.0065, + "reward": 1.875, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.96875, + "step": 200 + }, + { + "completion_length": 7.25, + "epoch": 0.373953488372093, + "grad_norm": 9.98204707838698, + "kl": 0.09619140625, + "learning_rate": 6.256983240223464e-07, + "loss": 0.0038, + "reward": 1.78125, + "reward_std": 0.33183756470680237, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9375, + "step": 201 + }, + { + "completion_length": 7.0, + "epoch": 0.3758139534883721, + "grad_norm": 4.674065297742783, + "kl": 0.11767578125, + "learning_rate": 6.238361266294226e-07, + "loss": 0.0047, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 202 + }, + { + "completion_length": 7.0, + "epoch": 0.3776744186046512, + "grad_norm": 2.8393505746902266, + "kl": 0.0784912109375, + "learning_rate": 6.21973929236499e-07, + "loss": 0.0031, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 203 + }, + { + "completion_length": 7.0, + "epoch": 0.37953488372093025, + "grad_norm": 7.698454890694392, + "kl": 0.099365234375, + "learning_rate": 6.201117318435754e-07, + "loss": 0.004, + "reward": 1.875, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 204 + }, + { + "completion_length": 7.71875, + "epoch": 0.3813953488372093, + "grad_norm": 1.1350389692124458, + "kl": 0.1121826171875, + "learning_rate": 6.182495344506518e-07, + "loss": 0.0045, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 205 + }, + { + "completion_length": 7.0, + "epoch": 0.38325581395348834, + "grad_norm": 4.456844226809678, + "kl": 0.10595703125, + "learning_rate": 6.16387337057728e-07, + "loss": 0.0042, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 206 + }, + { + "completion_length": 7.0, + "epoch": 0.3851162790697674, + "grad_norm": 2.9615671400739862, + "kl": 0.162109375, + "learning_rate": 6.145251396648044e-07, + "loss": 0.0065, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 207 + }, + { + "completion_length": 7.0, + "epoch": 0.3869767441860465, + "grad_norm": 0.025516856172865036, + "kl": 0.081298828125, + "learning_rate": 6.126629422718808e-07, + "loss": 0.0032, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 208 + }, + { + "completion_length": 8.3125, + "epoch": 0.38883720930232557, + "grad_norm": 1.6811567393411686, + "kl": 0.1015625, + "learning_rate": 6.108007448789572e-07, + "loss": 0.0041, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.96875, + "step": 209 + }, + { + "completion_length": 7.0, + "epoch": 0.39069767441860465, + "grad_norm": 0.020307809915519195, + "kl": 0.079345703125, + "learning_rate": 6.089385474860335e-07, + "loss": 0.0032, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 210 + }, + { + "completion_length": 7.0625, + "epoch": 0.3925581395348837, + "grad_norm": 5.211812136746312, + "kl": 0.169189453125, + "learning_rate": 6.070763500931098e-07, + "loss": 0.0068, + "reward": 1.90625, + "reward_std": 0.11967839300632477, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.96875, + "step": 211 + }, + { + "completion_length": 8.96875, + "epoch": 0.3944186046511628, + "grad_norm": 13.655033509366184, + "kl": 0.169677734375, + "learning_rate": 6.052141527001862e-07, + "loss": 0.0068, + "reward": 1.78125, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.96875, + "step": 212 + }, + { + "completion_length": 7.0, + "epoch": 0.39627906976744187, + "grad_norm": 6.491881821068701, + "kl": 0.30078125, + "learning_rate": 6.033519553072626e-07, + "loss": 0.0121, + "reward": 1.875, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 213 + }, + { + "completion_length": 7.0, + "epoch": 0.39813953488372095, + "grad_norm": 0.13283055646276293, + "kl": 0.06982421875, + "learning_rate": 6.014897579143389e-07, + "loss": 0.0028, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 214 + }, + { + "completion_length": 7.03125, + "epoch": 0.4, + "grad_norm": 6.103942324847313, + "kl": 0.2177734375, + "learning_rate": 5.996275605214152e-07, + "loss": 0.0087, + "reward": 1.8125, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.96875, + "step": 215 + }, + { + "completion_length": 7.21875, + "epoch": 0.4018604651162791, + "grad_norm": 10.263334158813597, + "kl": 0.16015625, + "learning_rate": 5.977653631284916e-07, + "loss": 0.0064, + "reward": 1.6875, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 216 + }, + { + "completion_length": 7.3125, + "epoch": 0.4037209302325581, + "grad_norm": 1.9299492654623225, + "kl": 0.16015625, + "learning_rate": 5.95903165735568e-07, + "loss": 0.0064, + "reward": 1.875, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.96875, + "step": 217 + }, + { + "completion_length": 7.0, + "epoch": 0.4055813953488372, + "grad_norm": 7.45438549421264, + "kl": 0.13232421875, + "learning_rate": 5.940409683426443e-07, + "loss": 0.0053, + "reward": 1.78125, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 1.0, + "step": 218 + }, + { + "completion_length": 7.0, + "epoch": 0.40744186046511627, + "grad_norm": 4.476139692668387, + "kl": 0.0908203125, + "learning_rate": 5.921787709497206e-07, + "loss": 0.0036, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 219 + }, + { + "completion_length": 7.25, + "epoch": 0.40930232558139534, + "grad_norm": 1.4730116662757038, + "kl": 0.121337890625, + "learning_rate": 5.90316573556797e-07, + "loss": 0.0049, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 220 + }, + { + "completion_length": 7.96875, + "epoch": 0.4111627906976744, + "grad_norm": 4.046363541720071, + "kl": 0.251953125, + "learning_rate": 5.884543761638734e-07, + "loss": 0.0101, + "reward": 1.84375, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 221 + }, + { + "completion_length": 7.0625, + "epoch": 0.4130232558139535, + "grad_norm": 3.090200609402099, + "kl": 0.10791015625, + "learning_rate": 5.865921787709497e-07, + "loss": 0.0043, + "reward": 1.78125, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 1.0, + "step": 222 + }, + { + "completion_length": 7.0, + "epoch": 0.41488372093023257, + "grad_norm": 0.0334131986440854, + "kl": 0.09033203125, + "learning_rate": 5.84729981378026e-07, + "loss": 0.0036, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 223 + }, + { + "completion_length": 7.0, + "epoch": 0.41674418604651164, + "grad_norm": 1.4885503664281547, + "kl": 0.2158203125, + "learning_rate": 5.828677839851024e-07, + "loss": 0.0086, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 224 + }, + { + "completion_length": 7.28125, + "epoch": 0.4186046511627907, + "grad_norm": 1.6308477873453437, + "kl": 0.1502685546875, + "learning_rate": 5.810055865921788e-07, + "loss": 0.006, + "reward": 1.75, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.96875, + "step": 225 + }, + { + "completion_length": 7.0, + "epoch": 0.4204651162790698, + "grad_norm": 3.3172611559649536, + "kl": 0.12353515625, + "learning_rate": 5.791433891992551e-07, + "loss": 0.005, + "reward": 1.90625, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 226 + }, + { + "completion_length": 9.1875, + "epoch": 0.4223255813953488, + "grad_norm": 1.1386114336817916, + "kl": 0.111083984375, + "learning_rate": 5.772811918063313e-07, + "loss": 0.0044, + "reward": 1.59375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.96875, + "step": 227 + }, + { + "completion_length": 7.15625, + "epoch": 0.4241860465116279, + "grad_norm": 3.2396190210407716, + "kl": 0.10986328125, + "learning_rate": 5.754189944134078e-07, + "loss": 0.0044, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.96875, + "step": 228 + }, + { + "completion_length": 7.0625, + "epoch": 0.42604651162790697, + "grad_norm": 1.556600357911171, + "kl": 0.179931640625, + "learning_rate": 5.735567970204841e-07, + "loss": 0.0072, + "reward": 1.8125, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.96875, + "step": 229 + }, + { + "completion_length": 7.0, + "epoch": 0.42790697674418604, + "grad_norm": 0.03271942763045044, + "kl": 0.103515625, + "learning_rate": 5.716945996275605e-07, + "loss": 0.0041, + "reward": 1.875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 230 + }, + { + "completion_length": 7.0, + "epoch": 0.4297674418604651, + "grad_norm": 7.651516677825048, + "kl": 0.19140625, + "learning_rate": 5.698324022346367e-07, + "loss": 0.0077, + "reward": 1.78125, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 1.0, + "step": 231 + }, + { + "completion_length": 7.0, + "epoch": 0.4316279069767442, + "grad_norm": 5.2262014297183885, + "kl": 0.1416015625, + "learning_rate": 5.679702048417132e-07, + "loss": 0.0057, + "reward": 1.625, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 232 + }, + { + "completion_length": 7.0, + "epoch": 0.43348837209302327, + "grad_norm": 0.15442492992450474, + "kl": 0.101806640625, + "learning_rate": 5.661080074487895e-07, + "loss": 0.0041, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 233 + }, + { + "completion_length": 7.0, + "epoch": 0.43534883720930234, + "grad_norm": 0.06415206895470296, + "kl": 0.087890625, + "learning_rate": 5.642458100558659e-07, + "loss": 0.0035, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 234 + }, + { + "completion_length": 7.0, + "epoch": 0.4372093023255814, + "grad_norm": 0.034775902561906914, + "kl": 0.066650390625, + "learning_rate": 5.623836126629423e-07, + "loss": 0.0027, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 235 + }, + { + "completion_length": 14.78125, + "epoch": 0.4390697674418605, + "grad_norm": 14.43230257502733, + "kl": 0.39892578125, + "learning_rate": 5.605214152700186e-07, + "loss": 0.016, + "reward": 1.8125, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.96875, + "step": 236 + }, + { + "completion_length": 7.0, + "epoch": 0.4409302325581395, + "grad_norm": 3.51925456394119, + "kl": 0.090087890625, + "learning_rate": 5.586592178770949e-07, + "loss": 0.0036, + "reward": 1.90625, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 237 + }, + { + "completion_length": 7.0, + "epoch": 0.4427906976744186, + "grad_norm": 0.0228077081712126, + "kl": 0.10400390625, + "learning_rate": 5.567970204841713e-07, + "loss": 0.0042, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 238 + }, + { + "completion_length": 7.1875, + "epoch": 0.44465116279069766, + "grad_norm": 6.808445664651886, + "kl": 0.543701171875, + "learning_rate": 5.549348230912477e-07, + "loss": 0.0217, + "reward": 1.84375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 239 + }, + { + "completion_length": 8.1875, + "epoch": 0.44651162790697674, + "grad_norm": 5.2093407518348105, + "kl": 0.09130859375, + "learning_rate": 5.53072625698324e-07, + "loss": 0.0037, + "reward": 1.8125, + "reward_std": 0.3221687823534012, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 240 + }, + { + "completion_length": 9.65625, + "epoch": 0.4483720930232558, + "grad_norm": 1.4400151470728306, + "kl": 0.11572265625, + "learning_rate": 5.512104283054003e-07, + "loss": 0.0046, + "reward": 1.84375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 241 + }, + { + "completion_length": 7.1875, + "epoch": 0.4502325581395349, + "grad_norm": 9.197357660702982, + "kl": 0.7919921875, + "learning_rate": 5.493482309124767e-07, + "loss": 0.0316, + "reward": 1.90625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.96875, + "step": 242 + }, + { + "completion_length": 7.0, + "epoch": 0.45209302325581396, + "grad_norm": 0.4782914189444419, + "kl": 0.2744140625, + "learning_rate": 5.474860335195531e-07, + "loss": 0.011, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 243 + }, + { + "completion_length": 7.0, + "epoch": 0.45395348837209304, + "grad_norm": 0.11971389840882889, + "kl": 0.15673828125, + "learning_rate": 5.456238361266294e-07, + "loss": 0.0062, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 244 + }, + { + "completion_length": 9.0625, + "epoch": 0.4558139534883721, + "grad_norm": 1.9214984823007695, + "kl": 0.0693359375, + "learning_rate": 5.437616387337057e-07, + "loss": 0.0028, + "reward": 1.84375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 245 + }, + { + "completion_length": 7.0, + "epoch": 0.4576744186046512, + "grad_norm": 0.03381418388775164, + "kl": 0.0423583984375, + "learning_rate": 5.418994413407821e-07, + "loss": 0.0017, + "reward": 1.875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 246 + }, + { + "completion_length": 7.0, + "epoch": 0.4595348837209302, + "grad_norm": 0.042937487469313834, + "kl": 0.0882568359375, + "learning_rate": 5.400372439478585e-07, + "loss": 0.0035, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 247 + }, + { + "completion_length": 7.25, + "epoch": 0.4613953488372093, + "grad_norm": 5.5513207586302205, + "kl": 0.232421875, + "learning_rate": 5.381750465549348e-07, + "loss": 0.0093, + "reward": 1.84375, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 248 + }, + { + "completion_length": 7.09375, + "epoch": 0.46325581395348836, + "grad_norm": 15.112638965830387, + "kl": 0.66943359375, + "learning_rate": 5.363128491620111e-07, + "loss": 0.0269, + "reward": 1.71875, + "reward_std": 0.3125, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.96875, + "step": 249 + }, + { + "completion_length": 7.0, + "epoch": 0.46511627906976744, + "grad_norm": 2.359771266102971, + "kl": 0.1044921875, + "learning_rate": 5.344506517690875e-07, + "loss": 0.0042, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 250 + }, + { + "completion_length": 7.0, + "epoch": 0.4669767441860465, + "grad_norm": 1.681670634515614, + "kl": 0.191162109375, + "learning_rate": 5.325884543761639e-07, + "loss": 0.0076, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 251 + }, + { + "completion_length": 7.0, + "epoch": 0.4688372093023256, + "grad_norm": 0.04934970561082363, + "kl": 0.11474609375, + "learning_rate": 5.307262569832402e-07, + "loss": 0.0046, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 252 + }, + { + "completion_length": 7.40625, + "epoch": 0.47069767441860466, + "grad_norm": 3.036432406648037, + "kl": 0.14208984375, + "learning_rate": 5.288640595903165e-07, + "loss": 0.0057, + "reward": 1.8125, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.96875, + "step": 253 + }, + { + "completion_length": 7.96875, + "epoch": 0.47255813953488374, + "grad_norm": 4.963714484948783, + "kl": 0.1572265625, + "learning_rate": 5.27001862197393e-07, + "loss": 0.0063, + "reward": 1.8125, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.96875, + "step": 254 + }, + { + "completion_length": 7.3125, + "epoch": 0.4744186046511628, + "grad_norm": 4.219848601538747, + "kl": 0.2041015625, + "learning_rate": 5.251396648044693e-07, + "loss": 0.0081, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 255 + }, + { + "completion_length": 7.25, + "epoch": 0.4762790697674419, + "grad_norm": 7.206472762375939, + "kl": 0.38671875, + "learning_rate": 5.232774674115455e-07, + "loss": 0.0155, + "reward": 1.78125, + "reward_std": 0.3125, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.96875, + "step": 256 + }, + { + "completion_length": 7.5, + "epoch": 0.4781395348837209, + "grad_norm": 4.572262159609816, + "kl": 0.15185546875, + "learning_rate": 5.214152700186219e-07, + "loss": 0.0061, + "reward": 1.875, + "reward_std": 0.18217839300632477, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 257 + }, + { + "completion_length": 7.0, + "epoch": 0.48, + "grad_norm": 0.027768748317704987, + "kl": 0.1005859375, + "learning_rate": 5.195530726256983e-07, + "loss": 0.004, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 258 + }, + { + "completion_length": 7.21875, + "epoch": 0.48186046511627906, + "grad_norm": 9.124546909432944, + "kl": 0.0908203125, + "learning_rate": 5.176908752327747e-07, + "loss": 0.0036, + "reward": 1.71875, + "reward_std": 0.3125, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.96875, + "step": 259 + }, + { + "completion_length": 7.0, + "epoch": 0.48372093023255813, + "grad_norm": 2.5628076244550657, + "kl": 0.0623779296875, + "learning_rate": 5.15828677839851e-07, + "loss": 0.0025, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 260 + }, + { + "completion_length": 7.0, + "epoch": 0.4855813953488372, + "grad_norm": 3.2894087854348193, + "kl": 0.11474609375, + "learning_rate": 5.139664804469273e-07, + "loss": 0.0046, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 261 + }, + { + "completion_length": 7.15625, + "epoch": 0.4874418604651163, + "grad_norm": 2.4792563568901596, + "kl": 0.0916748046875, + "learning_rate": 5.121042830540037e-07, + "loss": 0.0037, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.96875, + "step": 262 + }, + { + "completion_length": 7.0, + "epoch": 0.48930232558139536, + "grad_norm": 0.028923541299722258, + "kl": 0.10693359375, + "learning_rate": 5.102420856610801e-07, + "loss": 0.0043, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 263 + }, + { + "completion_length": 8.71875, + "epoch": 0.49116279069767443, + "grad_norm": 4.30338517541484, + "kl": 0.11962890625, + "learning_rate": 5.083798882681564e-07, + "loss": 0.0048, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 264 + }, + { + "completion_length": 7.0, + "epoch": 0.4930232558139535, + "grad_norm": 12.299856998296246, + "kl": 0.091552734375, + "learning_rate": 5.065176908752327e-07, + "loss": 0.0037, + "reward": 1.875, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 265 + }, + { + "completion_length": 7.0, + "epoch": 0.4948837209302326, + "grad_norm": 0.02648154378869004, + "kl": 0.0830078125, + "learning_rate": 5.046554934823091e-07, + "loss": 0.0033, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 266 + }, + { + "completion_length": 7.0, + "epoch": 0.4967441860465116, + "grad_norm": 4.43899924814322, + "kl": 0.11279296875, + "learning_rate": 5.027932960893855e-07, + "loss": 0.0045, + "reward": 1.90625, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 267 + }, + { + "completion_length": 7.25, + "epoch": 0.4986046511627907, + "grad_norm": 12.982619764639555, + "kl": 0.6875, + "learning_rate": 5.009310986964618e-07, + "loss": 0.0275, + "reward": 1.90625, + "reward_std": 0.11967839300632477, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.96875, + "step": 268 + }, + { + "completion_length": 7.0, + "epoch": 0.5004651162790698, + "grad_norm": 9.373611021036686, + "kl": 0.622802734375, + "learning_rate": 4.990689013035381e-07, + "loss": 0.0249, + "reward": 1.90625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.9375, + "step": 269 + }, + { + "completion_length": 7.15625, + "epoch": 0.5023255813953489, + "grad_norm": 0.05528330194635162, + "kl": 0.11181640625, + "learning_rate": 4.972067039106145e-07, + "loss": 0.0045, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 270 + }, + { + "completion_length": 7.09375, + "epoch": 0.5041860465116279, + "grad_norm": 7.280223588518196, + "kl": 0.48974609375, + "learning_rate": 4.953445065176908e-07, + "loss": 0.0196, + "reward": 1.875, + "reward_std": 0.18217839300632477, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.96875, + "step": 271 + }, + { + "completion_length": 7.0, + "epoch": 0.5060465116279069, + "grad_norm": 5.145299582423085, + "kl": 0.036376953125, + "learning_rate": 4.934823091247672e-07, + "loss": 0.0015, + "reward": 1.78125, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 1.0, + "step": 272 + }, + { + "completion_length": 7.0, + "epoch": 0.5079069767441861, + "grad_norm": 2.9958179869102652, + "kl": 0.30712890625, + "learning_rate": 4.916201117318435e-07, + "loss": 0.0122, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 273 + }, + { + "completion_length": 7.0, + "epoch": 0.5097674418604651, + "grad_norm": 13.603323523008575, + "kl": 0.0623779296875, + "learning_rate": 4.897579143389199e-07, + "loss": 0.0025, + "reward": 1.875, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 274 + }, + { + "completion_length": 6.96875, + "epoch": 0.5116279069767442, + "grad_norm": 7.088606205353909, + "kl": 0.4794921875, + "learning_rate": 4.878957169459962e-07, + "loss": 0.0193, + "reward": 1.84375, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 275 + }, + { + "completion_length": 7.0, + "epoch": 0.5134883720930232, + "grad_norm": 0.7486469672661343, + "kl": 0.21826171875, + "learning_rate": 4.860335195530726e-07, + "loss": 0.0087, + "reward": 1.71875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 1.0, + "step": 276 + }, + { + "completion_length": 7.0, + "epoch": 0.5153488372093024, + "grad_norm": 4.863346752255601, + "kl": 0.09124755859375, + "learning_rate": 4.84171322160149e-07, + "loss": 0.0037, + "reward": 1.78125, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 1.0, + "step": 277 + }, + { + "completion_length": 8.25, + "epoch": 0.5172093023255814, + "grad_norm": 5.173312524321795, + "kl": 0.122802734375, + "learning_rate": 4.823091247672253e-07, + "loss": 0.0049, + "reward": 1.84375, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.9375, + "step": 278 + }, + { + "completion_length": 7.15625, + "epoch": 0.5190697674418605, + "grad_norm": 10.853145641247945, + "kl": 0.65869140625, + "learning_rate": 4.804469273743016e-07, + "loss": 0.0264, + "reward": 1.875, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.96875, + "step": 279 + }, + { + "completion_length": 7.0, + "epoch": 0.5209302325581395, + "grad_norm": 4.3321461264584125, + "kl": 0.13330078125, + "learning_rate": 4.78584729981378e-07, + "loss": 0.0054, + "reward": 1.90625, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 280 + }, + { + "completion_length": 7.0, + "epoch": 0.5227906976744187, + "grad_norm": 0.07654331733721662, + "kl": 0.052001953125, + "learning_rate": 4.7672253258845435e-07, + "loss": 0.0021, + "reward": 1.875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 281 + }, + { + "completion_length": 7.0, + "epoch": 0.5246511627906977, + "grad_norm": 2.4138058495603305, + "kl": 0.096435546875, + "learning_rate": 4.7486033519553073e-07, + "loss": 0.0039, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 282 + }, + { + "completion_length": 7.0, + "epoch": 0.5265116279069767, + "grad_norm": 4.032075181464584, + "kl": 0.4013671875, + "learning_rate": 4.7299813780260705e-07, + "loss": 0.016, + "reward": 1.875, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.96875, + "step": 283 + }, + { + "completion_length": 7.0625, + "epoch": 0.5283720930232558, + "grad_norm": 4.056556205074768, + "kl": 0.35546875, + "learning_rate": 4.7113594040968343e-07, + "loss": 0.0142, + "reward": 1.90625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.96875, + "step": 284 + }, + { + "completion_length": 7.0, + "epoch": 0.5302325581395348, + "grad_norm": 7.269564564565014, + "kl": 0.1533203125, + "learning_rate": 4.6927374301675976e-07, + "loss": 0.0061, + "reward": 1.90625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.96875, + "step": 285 + }, + { + "completion_length": 7.21875, + "epoch": 0.532093023255814, + "grad_norm": 2.2545124308556264, + "kl": 0.169921875, + "learning_rate": 4.6741154562383613e-07, + "loss": 0.0068, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 286 + }, + { + "completion_length": 7.5, + "epoch": 0.533953488372093, + "grad_norm": 6.095343680272726, + "kl": 0.19140625, + "learning_rate": 4.6554934823091246e-07, + "loss": 0.0076, + "reward": 1.8125, + "reward_std": 0.25434717535972595, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 287 + }, + { + "completion_length": 9.8125, + "epoch": 0.5358139534883721, + "grad_norm": 1.7190811337843963, + "kl": 0.08056640625, + "learning_rate": 4.6368715083798884e-07, + "loss": 0.0032, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 288 + }, + { + "completion_length": 7.0, + "epoch": 0.5376744186046511, + "grad_norm": 4.53833890118138, + "kl": 0.139404296875, + "learning_rate": 4.6182495344506516e-07, + "loss": 0.0056, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 289 + }, + { + "completion_length": 7.125, + "epoch": 0.5395348837209303, + "grad_norm": 4.964525595962821, + "kl": 0.461181640625, + "learning_rate": 4.5996275605214154e-07, + "loss": 0.0185, + "reward": 1.90625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.96875, + "step": 290 + }, + { + "completion_length": 8.03125, + "epoch": 0.5413953488372093, + "grad_norm": 8.436635559347815, + "kl": 0.080810546875, + "learning_rate": 4.5810055865921786e-07, + "loss": 0.0032, + "reward": 1.78125, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.96875, + "step": 291 + }, + { + "completion_length": 7.25, + "epoch": 0.5432558139534883, + "grad_norm": 5.109777735725768, + "kl": 0.15185546875, + "learning_rate": 4.5623836126629424e-07, + "loss": 0.0061, + "reward": 1.875, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 292 + }, + { + "completion_length": 7.3125, + "epoch": 0.5451162790697675, + "grad_norm": 5.636561820946858, + "kl": 0.2255859375, + "learning_rate": 4.5437616387337056e-07, + "loss": 0.009, + "reward": 1.875, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 293 + }, + { + "completion_length": 10.71875, + "epoch": 0.5469767441860465, + "grad_norm": 5.814582727972166, + "kl": 0.102783203125, + "learning_rate": 4.5251396648044694e-07, + "loss": 0.0041, + "reward": 1.90625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.9375, + "step": 294 + }, + { + "completion_length": 7.0, + "epoch": 0.5488372093023256, + "grad_norm": 2.835094789148846, + "kl": 0.137451171875, + "learning_rate": 4.5065176908752327e-07, + "loss": 0.0055, + "reward": 1.90625, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 295 + }, + { + "completion_length": 7.1875, + "epoch": 0.5506976744186046, + "grad_norm": 3.169108370254635, + "kl": 0.236328125, + "learning_rate": 4.4878957169459964e-07, + "loss": 0.0095, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 296 + }, + { + "completion_length": 7.3125, + "epoch": 0.5525581395348838, + "grad_norm": 4.628942622058202, + "kl": 0.18310546875, + "learning_rate": 4.469273743016759e-07, + "loss": 0.0073, + "reward": 1.71875, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.9375, + "step": 297 + }, + { + "completion_length": 7.0, + "epoch": 0.5544186046511628, + "grad_norm": 0.024982443899385642, + "kl": 0.114501953125, + "learning_rate": 4.450651769087523e-07, + "loss": 0.0046, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 298 + }, + { + "completion_length": 7.0, + "epoch": 0.5562790697674419, + "grad_norm": 1.183766561494926, + "kl": 0.3369140625, + "learning_rate": 4.432029795158286e-07, + "loss": 0.0135, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 299 + }, + { + "completion_length": 7.03125, + "epoch": 0.5581395348837209, + "grad_norm": 1.5935962293928572, + "kl": 0.138671875, + "learning_rate": 4.41340782122905e-07, + "loss": 0.0056, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 300 + }, + { + "completion_length": 7.0, + "epoch": 0.56, + "grad_norm": 0.06183818985099608, + "kl": 0.09521484375, + "learning_rate": 4.394785847299813e-07, + "loss": 0.0038, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 301 + }, + { + "completion_length": 7.0625, + "epoch": 0.5618604651162791, + "grad_norm": 5.72273333039999, + "kl": 0.2216796875, + "learning_rate": 4.376163873370577e-07, + "loss": 0.0088, + "reward": 1.8125, + "reward_std": 0.3221687823534012, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 302 + }, + { + "completion_length": 7.0, + "epoch": 0.5637209302325581, + "grad_norm": 5.072723330827652, + "kl": 0.08642578125, + "learning_rate": 4.35754189944134e-07, + "loss": 0.0035, + "reward": 1.875, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.96875, + "step": 303 + }, + { + "completion_length": 7.25, + "epoch": 0.5655813953488372, + "grad_norm": 1.9138503647597314, + "kl": 0.094207763671875, + "learning_rate": 4.338919925512104e-07, + "loss": 0.0038, + "reward": 1.84375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 304 + }, + { + "completion_length": 7.28125, + "epoch": 0.5674418604651162, + "grad_norm": 4.450647519797281, + "kl": 0.26513671875, + "learning_rate": 4.320297951582867e-07, + "loss": 0.0106, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.96875, + "step": 305 + }, + { + "completion_length": 7.0, + "epoch": 0.5693023255813954, + "grad_norm": 0.42417670266280466, + "kl": 0.105224609375, + "learning_rate": 4.301675977653631e-07, + "loss": 0.0042, + "reward": 1.75, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 306 + }, + { + "completion_length": 8.53125, + "epoch": 0.5711627906976744, + "grad_norm": 4.285582112131496, + "kl": 0.1923828125, + "learning_rate": 4.283054003724394e-07, + "loss": 0.0077, + "reward": 1.65625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.90625, + "step": 307 + }, + { + "completion_length": 7.0, + "epoch": 0.5730232558139535, + "grad_norm": 0.016680620222379525, + "kl": 0.079345703125, + "learning_rate": 4.264432029795158e-07, + "loss": 0.0032, + "reward": 1.875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 308 + }, + { + "completion_length": 7.25, + "epoch": 0.5748837209302325, + "grad_norm": 0.7656733384141977, + "kl": 0.1826171875, + "learning_rate": 4.245810055865922e-07, + "loss": 0.0073, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.96875, + "step": 309 + }, + { + "completion_length": 7.09375, + "epoch": 0.5767441860465117, + "grad_norm": 1.8105190721760713, + "kl": 0.076416015625, + "learning_rate": 4.227188081936685e-07, + "loss": 0.0031, + "reward": 1.8125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.96875, + "step": 310 + }, + { + "completion_length": 7.0625, + "epoch": 0.5786046511627907, + "grad_norm": 7.489673212977552, + "kl": 0.310546875, + "learning_rate": 4.208566108007449e-07, + "loss": 0.0124, + "reward": 1.875, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 311 + }, + { + "completion_length": 7.15625, + "epoch": 0.5804651162790697, + "grad_norm": 2.9171394103856376, + "kl": 0.42578125, + "learning_rate": 4.189944134078212e-07, + "loss": 0.017, + "reward": 1.90625, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.96875, + "step": 312 + }, + { + "completion_length": 7.28125, + "epoch": 0.5823255813953488, + "grad_norm": 9.929497018406316, + "kl": 0.23681640625, + "learning_rate": 4.171322160148976e-07, + "loss": 0.0095, + "reward": 1.84375, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.9375, + "step": 313 + }, + { + "completion_length": 7.0, + "epoch": 0.5841860465116279, + "grad_norm": 5.297406946985658, + "kl": 0.098876953125, + "learning_rate": 4.152700186219739e-07, + "loss": 0.004, + "reward": 1.875, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.96875, + "step": 314 + }, + { + "completion_length": 7.0, + "epoch": 0.586046511627907, + "grad_norm": 0.08866476610588253, + "kl": 0.073974609375, + "learning_rate": 4.134078212290503e-07, + "loss": 0.003, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 315 + }, + { + "completion_length": 7.0, + "epoch": 0.587906976744186, + "grad_norm": 0.6987457530365371, + "kl": 0.26025390625, + "learning_rate": 4.115456238361266e-07, + "loss": 0.0104, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 316 + }, + { + "completion_length": 7.0, + "epoch": 0.5897674418604651, + "grad_norm": 0.1804740055488923, + "kl": 0.10302734375, + "learning_rate": 4.09683426443203e-07, + "loss": 0.0041, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 317 + }, + { + "completion_length": 7.0, + "epoch": 0.5916279069767442, + "grad_norm": 0.03796599062207493, + "kl": 0.0645751953125, + "learning_rate": 4.078212290502793e-07, + "loss": 0.0026, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 318 + }, + { + "completion_length": 7.0, + "epoch": 0.5934883720930233, + "grad_norm": 6.034967486510372, + "kl": 0.132568359375, + "learning_rate": 4.059590316573557e-07, + "loss": 0.0053, + "reward": 1.78125, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 1.0, + "step": 319 + }, + { + "completion_length": 7.6875, + "epoch": 0.5953488372093023, + "grad_norm": 4.1969462441707375, + "kl": 0.2021484375, + "learning_rate": 4.04096834264432e-07, + "loss": 0.0081, + "reward": 1.90625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.9375, + "step": 320 + }, + { + "completion_length": 7.15625, + "epoch": 0.5972093023255814, + "grad_norm": 15.34438171724722, + "kl": 0.146484375, + "learning_rate": 4.022346368715084e-07, + "loss": 0.0059, + "reward": 1.875, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.96875, + "step": 321 + }, + { + "completion_length": 9.5, + "epoch": 0.5990697674418605, + "grad_norm": 1.3373601507850006, + "kl": 0.2275390625, + "learning_rate": 4.003724394785847e-07, + "loss": 0.0091, + "reward": 1.84375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.9375, + "step": 322 + }, + { + "completion_length": 7.0, + "epoch": 0.6009302325581395, + "grad_norm": 0.052608789868671554, + "kl": 0.1181640625, + "learning_rate": 3.985102420856611e-07, + "loss": 0.0047, + "reward": 1.875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 323 + }, + { + "completion_length": 7.0, + "epoch": 0.6027906976744186, + "grad_norm": 5.706614929601725, + "kl": 0.22265625, + "learning_rate": 3.966480446927374e-07, + "loss": 0.0089, + "reward": 1.8125, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.96875, + "step": 324 + }, + { + "completion_length": 7.0, + "epoch": 0.6046511627906976, + "grad_norm": 4.246760392717802, + "kl": 0.1064453125, + "learning_rate": 3.947858472998138e-07, + "loss": 0.0043, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 325 + }, + { + "completion_length": 7.0, + "epoch": 0.6065116279069768, + "grad_norm": 6.09770485071996, + "kl": 0.16064453125, + "learning_rate": 3.9292364990689007e-07, + "loss": 0.0064, + "reward": 1.875, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 326 + }, + { + "completion_length": 7.3125, + "epoch": 0.6083720930232558, + "grad_norm": 3.6236287153759537, + "kl": 0.31298828125, + "learning_rate": 3.9106145251396645e-07, + "loss": 0.0125, + "reward": 1.875, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.96875, + "step": 327 + }, + { + "completion_length": 7.0, + "epoch": 0.6102325581395349, + "grad_norm": 0.021677692785096885, + "kl": 0.093994140625, + "learning_rate": 3.8919925512104277e-07, + "loss": 0.0038, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 328 + }, + { + "completion_length": 7.0, + "epoch": 0.6120930232558139, + "grad_norm": 0.07267604929918552, + "kl": 0.158203125, + "learning_rate": 3.8733705772811915e-07, + "loss": 0.0063, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 329 + }, + { + "completion_length": 7.0, + "epoch": 0.6139534883720931, + "grad_norm": 0.032083949737976944, + "kl": 0.0562744140625, + "learning_rate": 3.8547486033519547e-07, + "loss": 0.0023, + "reward": 1.875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 330 + }, + { + "completion_length": 7.03125, + "epoch": 0.6158139534883721, + "grad_norm": 0.7024693784622495, + "kl": 0.17529296875, + "learning_rate": 3.8361266294227185e-07, + "loss": 0.007, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 331 + }, + { + "completion_length": 9.59375, + "epoch": 0.6176744186046511, + "grad_norm": 7.3288072752732765, + "kl": 0.114013671875, + "learning_rate": 3.817504655493482e-07, + "loss": 0.0046, + "reward": 1.84375, + "reward_std": 0.3125, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.90625, + "step": 332 + }, + { + "completion_length": 7.0, + "epoch": 0.6195348837209302, + "grad_norm": 0.03335046773482252, + "kl": 0.068359375, + "learning_rate": 3.7988826815642455e-07, + "loss": 0.0027, + "reward": 1.875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 333 + }, + { + "completion_length": 7.0, + "epoch": 0.6213953488372093, + "grad_norm": 0.02308391451560933, + "kl": 0.0927734375, + "learning_rate": 3.7802607076350093e-07, + "loss": 0.0037, + "reward": 1.875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 334 + }, + { + "completion_length": 7.0, + "epoch": 0.6232558139534884, + "grad_norm": 2.619162120109373, + "kl": 0.1240234375, + "learning_rate": 3.7616387337057725e-07, + "loss": 0.005, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 335 + }, + { + "completion_length": 8.25, + "epoch": 0.6251162790697674, + "grad_norm": 2.0757890848944953, + "kl": 0.06396484375, + "learning_rate": 3.7430167597765363e-07, + "loss": 0.0026, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.96875, + "step": 336 + }, + { + "completion_length": 7.0, + "epoch": 0.6269767441860465, + "grad_norm": 0.025756331603180686, + "kl": 0.098388671875, + "learning_rate": 3.7243947858472996e-07, + "loss": 0.0039, + "reward": 1.875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 337 + }, + { + "completion_length": 7.0, + "epoch": 0.6288372093023256, + "grad_norm": 0.02218231287209564, + "kl": 0.125, + "learning_rate": 3.7057728119180633e-07, + "loss": 0.005, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 338 + }, + { + "completion_length": 7.0, + "epoch": 0.6306976744186047, + "grad_norm": 0.028748163976391, + "kl": 0.059326171875, + "learning_rate": 3.6871508379888266e-07, + "loss": 0.0024, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 339 + }, + { + "completion_length": 7.0, + "epoch": 0.6325581395348837, + "grad_norm": 2.5949024950710666, + "kl": 0.34521484375, + "learning_rate": 3.6685288640595904e-07, + "loss": 0.0139, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 340 + }, + { + "completion_length": 7.0, + "epoch": 0.6344186046511628, + "grad_norm": 3.5982092297114483, + "kl": 0.304931640625, + "learning_rate": 3.6499068901303536e-07, + "loss": 0.0122, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 341 + }, + { + "completion_length": 7.0, + "epoch": 0.6362790697674419, + "grad_norm": 4.061347074423092, + "kl": 0.080810546875, + "learning_rate": 3.6312849162011174e-07, + "loss": 0.0032, + "reward": 1.90625, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 342 + }, + { + "completion_length": 7.0, + "epoch": 0.6381395348837209, + "grad_norm": 9.399170598757712, + "kl": 0.26708984375, + "learning_rate": 3.6126629422718806e-07, + "loss": 0.0107, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 343 + }, + { + "completion_length": 7.0, + "epoch": 0.64, + "grad_norm": 0.044413226818219735, + "kl": 0.119873046875, + "learning_rate": 3.5940409683426444e-07, + "loss": 0.0048, + "reward": 1.875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 344 + }, + { + "completion_length": 7.0, + "epoch": 0.641860465116279, + "grad_norm": 14.060631435479708, + "kl": 0.104248046875, + "learning_rate": 3.5754189944134076e-07, + "loss": 0.0042, + "reward": 1.875, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 345 + }, + { + "completion_length": 7.0, + "epoch": 0.6437209302325582, + "grad_norm": 0.04910639065256545, + "kl": 0.0836181640625, + "learning_rate": 3.5567970204841714e-07, + "loss": 0.0033, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 346 + }, + { + "completion_length": 7.15625, + "epoch": 0.6455813953488372, + "grad_norm": 2.9046943470468434, + "kl": 0.09521484375, + "learning_rate": 3.5381750465549347e-07, + "loss": 0.0038, + "reward": 1.84375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 347 + }, + { + "completion_length": 7.0, + "epoch": 0.6474418604651163, + "grad_norm": 3.2728883025333886, + "kl": 0.30126953125, + "learning_rate": 3.5195530726256984e-07, + "loss": 0.0121, + "reward": 1.75, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 348 + }, + { + "completion_length": 8.09375, + "epoch": 0.6493023255813953, + "grad_norm": 7.3323913013299045, + "kl": 0.88525390625, + "learning_rate": 3.5009310986964617e-07, + "loss": 0.0354, + "reward": 1.78125, + "reward_std": 0.3125, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9375, + "step": 349 + }, + { + "completion_length": 7.0, + "epoch": 0.6511627906976745, + "grad_norm": 7.314655577306749, + "kl": 0.2744140625, + "learning_rate": 3.4823091247672255e-07, + "loss": 0.0109, + "reward": 1.90625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 350 + }, + { + "completion_length": 7.0, + "epoch": 0.6530232558139535, + "grad_norm": 3.4035386120445343, + "kl": 0.20068359375, + "learning_rate": 3.4636871508379887e-07, + "loss": 0.008, + "reward": 1.84375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 351 + }, + { + "completion_length": 7.0, + "epoch": 0.6548837209302325, + "grad_norm": 3.5026814484077176, + "kl": 0.12841796875, + "learning_rate": 3.4450651769087525e-07, + "loss": 0.0051, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 352 + }, + { + "completion_length": 7.0, + "epoch": 0.6567441860465116, + "grad_norm": 0.015064399208069866, + "kl": 0.084228515625, + "learning_rate": 3.4264432029795157e-07, + "loss": 0.0034, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 353 + }, + { + "completion_length": 7.0, + "epoch": 0.6586046511627907, + "grad_norm": 3.129320615833332, + "kl": 0.1796875, + "learning_rate": 3.407821229050279e-07, + "loss": 0.0072, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 354 + }, + { + "completion_length": 7.0, + "epoch": 0.6604651162790698, + "grad_norm": 42.02036408312087, + "kl": 2.39306640625, + "learning_rate": 3.389199255121042e-07, + "loss": 0.0959, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 355 + }, + { + "completion_length": 7.0625, + "epoch": 0.6623255813953488, + "grad_norm": 9.752409125812312, + "kl": 0.0966796875, + "learning_rate": 3.370577281191806e-07, + "loss": 0.0039, + "reward": 1.78125, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.96875, + "step": 356 + }, + { + "completion_length": 7.0, + "epoch": 0.6641860465116279, + "grad_norm": 3.1248583533872676, + "kl": 0.063232421875, + "learning_rate": 3.351955307262569e-07, + "loss": 0.0025, + "reward": 1.8125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 357 + }, + { + "completion_length": 7.0, + "epoch": 0.666046511627907, + "grad_norm": 1.4335963838624122, + "kl": 0.185546875, + "learning_rate": 3.333333333333333e-07, + "loss": 0.0074, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 358 + }, + { + "completion_length": 7.28125, + "epoch": 0.6679069767441861, + "grad_norm": 15.350088103007185, + "kl": 0.18310546875, + "learning_rate": 3.314711359404097e-07, + "loss": 0.0073, + "reward": 1.71875, + "reward_std": 0.24467839300632477, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.96875, + "step": 359 + }, + { + "completion_length": 7.0, + "epoch": 0.6697674418604651, + "grad_norm": 5.133479554063451, + "kl": 0.16845703125, + "learning_rate": 3.29608938547486e-07, + "loss": 0.0067, + "reward": 1.875, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 360 + }, + { + "completion_length": 7.0, + "epoch": 0.6716279069767442, + "grad_norm": 11.243038070654391, + "kl": 0.218994140625, + "learning_rate": 3.277467411545624e-07, + "loss": 0.0088, + "reward": 1.8125, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 361 + }, + { + "completion_length": 7.0625, + "epoch": 0.6734883720930233, + "grad_norm": 2.9430329732545455, + "kl": 0.197021484375, + "learning_rate": 3.258845437616387e-07, + "loss": 0.0079, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 362 + }, + { + "completion_length": 7.28125, + "epoch": 0.6753488372093023, + "grad_norm": 3.0740673268956136, + "kl": 0.3017578125, + "learning_rate": 3.240223463687151e-07, + "loss": 0.0121, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 363 + }, + { + "completion_length": 7.21875, + "epoch": 0.6772093023255814, + "grad_norm": 6.6442076897017275, + "kl": 0.12841796875, + "learning_rate": 3.221601489757914e-07, + "loss": 0.0051, + "reward": 1.75, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.96875, + "step": 364 + }, + { + "completion_length": 7.0, + "epoch": 0.6790697674418604, + "grad_norm": 2.6441460908704735, + "kl": 0.1217041015625, + "learning_rate": 3.202979515828678e-07, + "loss": 0.0049, + "reward": 1.9375, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 365 + }, + { + "completion_length": 7.0, + "epoch": 0.6809302325581396, + "grad_norm": 2.3640709283625845, + "kl": 0.2669677734375, + "learning_rate": 3.184357541899441e-07, + "loss": 0.0107, + "reward": 1.875, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 366 + }, + { + "completion_length": 7.0, + "epoch": 0.6827906976744186, + "grad_norm": 4.140065109309227, + "kl": 0.392578125, + "learning_rate": 3.165735567970205e-07, + "loss": 0.0157, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 367 + }, + { + "completion_length": 7.21875, + "epoch": 0.6846511627906977, + "grad_norm": 2.6435545501571793, + "kl": 0.26220703125, + "learning_rate": 3.147113594040968e-07, + "loss": 0.0105, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 368 + }, + { + "completion_length": 7.0, + "epoch": 0.6865116279069767, + "grad_norm": 6.966681356756185, + "kl": 0.16162109375, + "learning_rate": 3.128491620111732e-07, + "loss": 0.0065, + "reward": 1.875, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 369 + }, + { + "completion_length": 7.5625, + "epoch": 0.6883720930232559, + "grad_norm": 1.812937536917783, + "kl": 0.107421875, + "learning_rate": 3.109869646182495e-07, + "loss": 0.0043, + "reward": 1.84375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 370 + }, + { + "completion_length": 7.0, + "epoch": 0.6902325581395349, + "grad_norm": 6.464017885408894, + "kl": 0.395751953125, + "learning_rate": 3.091247672253259e-07, + "loss": 0.0159, + "reward": 1.84375, + "reward_std": 0.25966876745224, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 371 + }, + { + "completion_length": 7.0, + "epoch": 0.6920930232558139, + "grad_norm": 2.0592389921864975, + "kl": 0.118896484375, + "learning_rate": 3.072625698324022e-07, + "loss": 0.0048, + "reward": 1.90625, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 372 + }, + { + "completion_length": 7.0, + "epoch": 0.693953488372093, + "grad_norm": 6.290484355075438, + "kl": 0.130126953125, + "learning_rate": 3.054003724394786e-07, + "loss": 0.0052, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 373 + }, + { + "completion_length": 7.09375, + "epoch": 0.695813953488372, + "grad_norm": 4.888300134780872, + "kl": 0.111572265625, + "learning_rate": 3.035381750465549e-07, + "loss": 0.0045, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 374 + }, + { + "completion_length": 8.65625, + "epoch": 0.6976744186046512, + "grad_norm": 3.103561740616555, + "kl": 0.16259765625, + "learning_rate": 3.016759776536313e-07, + "loss": 0.0065, + "reward": 1.78125, + "reward_std": 0.23673085868358612, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.96875, + "step": 375 + }, + { + "completion_length": 7.0625, + "epoch": 0.6995348837209302, + "grad_norm": 5.23489488440174, + "kl": 0.5087890625, + "learning_rate": 2.998137802607076e-07, + "loss": 0.0204, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.96875, + "step": 376 + }, + { + "completion_length": 7.34375, + "epoch": 0.7013953488372093, + "grad_norm": 5.973091279895836, + "kl": 0.1875, + "learning_rate": 2.97951582867784e-07, + "loss": 0.0075, + "reward": 1.90625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.96875, + "step": 377 + }, + { + "completion_length": 7.0, + "epoch": 0.7032558139534884, + "grad_norm": 0.05760043664480089, + "kl": 0.128173828125, + "learning_rate": 2.960893854748603e-07, + "loss": 0.0051, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 378 + }, + { + "completion_length": 7.40625, + "epoch": 0.7051162790697675, + "grad_norm": 3.5873745374561232, + "kl": 0.116943359375, + "learning_rate": 2.942271880819367e-07, + "loss": 0.0047, + "reward": 1.875, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 379 + }, + { + "completion_length": 7.8125, + "epoch": 0.7069767441860465, + "grad_norm": 1.2375748875855552, + "kl": 0.3466796875, + "learning_rate": 2.92364990689013e-07, + "loss": 0.0139, + "reward": 1.8125, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.96875, + "step": 380 + }, + { + "completion_length": 7.0, + "epoch": 0.7088372093023256, + "grad_norm": 6.180204987533683, + "kl": 0.1533203125, + "learning_rate": 2.905027932960894e-07, + "loss": 0.0061, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 381 + }, + { + "completion_length": 7.03125, + "epoch": 0.7106976744186047, + "grad_norm": 5.706995587017942, + "kl": 0.2412109375, + "learning_rate": 2.8864059590316567e-07, + "loss": 0.0097, + "reward": 1.875, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 382 + }, + { + "completion_length": 7.25, + "epoch": 0.7125581395348837, + "grad_norm": 4.915608432523938, + "kl": 0.22265625, + "learning_rate": 2.8677839851024205e-07, + "loss": 0.0089, + "reward": 1.6875, + "reward_std": 0.26933756470680237, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.96875, + "step": 383 + }, + { + "completion_length": 7.0, + "epoch": 0.7144186046511628, + "grad_norm": 0.05555178214348584, + "kl": 0.13916015625, + "learning_rate": 2.849162011173184e-07, + "loss": 0.0056, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 384 + }, + { + "completion_length": 7.0, + "epoch": 0.7162790697674418, + "grad_norm": 5.766086190663749, + "kl": 0.3173828125, + "learning_rate": 2.8305400372439475e-07, + "loss": 0.0127, + "reward": 1.84375, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 1.0, + "step": 385 + }, + { + "completion_length": 7.21875, + "epoch": 0.718139534883721, + "grad_norm": 6.611501542031115, + "kl": 0.2373046875, + "learning_rate": 2.8119180633147113e-07, + "loss": 0.0095, + "reward": 1.78125, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.96875, + "step": 386 + }, + { + "completion_length": 7.28125, + "epoch": 0.72, + "grad_norm": 1.9880568934443987, + "kl": 0.22607421875, + "learning_rate": 2.7932960893854745e-07, + "loss": 0.009, + "reward": 1.8125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.96875, + "step": 387 + }, + { + "completion_length": 7.0, + "epoch": 0.7218604651162791, + "grad_norm": 2.4873546098679027, + "kl": 0.104248046875, + "learning_rate": 2.7746741154562383e-07, + "loss": 0.0042, + "reward": 1.90625, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 388 + }, + { + "completion_length": 9.15625, + "epoch": 0.7237209302325581, + "grad_norm": 2.047017393139286, + "kl": 0.3486328125, + "learning_rate": 2.7560521415270016e-07, + "loss": 0.014, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 389 + }, + { + "completion_length": 7.25, + "epoch": 0.7255813953488373, + "grad_norm": 0.6383176216196434, + "kl": 0.16748046875, + "learning_rate": 2.7374301675977653e-07, + "loss": 0.0067, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.96875, + "step": 390 + }, + { + "completion_length": 7.0, + "epoch": 0.7274418604651163, + "grad_norm": 0.09574564193181835, + "kl": 0.118408203125, + "learning_rate": 2.7188081936685286e-07, + "loss": 0.0047, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 391 + }, + { + "completion_length": 7.21875, + "epoch": 0.7293023255813953, + "grad_norm": 5.800573388094564, + "kl": 0.47021484375, + "learning_rate": 2.7001862197392924e-07, + "loss": 0.0188, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 392 + }, + { + "completion_length": 7.09375, + "epoch": 0.7311627906976744, + "grad_norm": 7.318866986194374, + "kl": 0.59375, + "learning_rate": 2.6815642458100556e-07, + "loss": 0.0237, + "reward": 1.8125, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.96875, + "step": 393 + }, + { + "completion_length": 7.0, + "epoch": 0.7330232558139534, + "grad_norm": 0.01637520100704435, + "kl": 0.101806640625, + "learning_rate": 2.6629422718808194e-07, + "loss": 0.0041, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 394 + }, + { + "completion_length": 7.0, + "epoch": 0.7348837209302326, + "grad_norm": 3.6332824654774813, + "kl": 0.10400390625, + "learning_rate": 2.6443202979515826e-07, + "loss": 0.0041, + "reward": 1.84375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 1.0, + "step": 395 + }, + { + "completion_length": 11.1875, + "epoch": 0.7367441860465116, + "grad_norm": 0.9227592826841687, + "kl": 0.120849609375, + "learning_rate": 2.6256983240223464e-07, + "loss": 0.0048, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.96875, + "step": 396 + }, + { + "completion_length": 7.125, + "epoch": 0.7386046511627907, + "grad_norm": 3.2818974969507617, + "kl": 0.30859375, + "learning_rate": 2.6070763500931096e-07, + "loss": 0.0124, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 397 + }, + { + "completion_length": 7.0, + "epoch": 0.7404651162790697, + "grad_norm": 0.03983347119149783, + "kl": 0.078369140625, + "learning_rate": 2.5884543761638734e-07, + "loss": 0.0031, + "reward": 1.75, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 398 + }, + { + "completion_length": 7.15625, + "epoch": 0.7423255813953489, + "grad_norm": 25.6616392517169, + "kl": 0.168701171875, + "learning_rate": 2.5698324022346367e-07, + "loss": 0.0067, + "reward": 1.78125, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.96875, + "step": 399 + }, + { + "completion_length": 12.25, + "epoch": 0.7441860465116279, + "grad_norm": 3.4325531227009374, + "kl": 0.07861328125, + "learning_rate": 2.5512104283054004e-07, + "loss": 0.0031, + "reward": 1.90625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.9375, + "step": 400 + }, + { + "completion_length": 7.84375, + "epoch": 0.746046511627907, + "grad_norm": 0.6229668892129623, + "kl": 0.1435546875, + "learning_rate": 2.5325884543761637e-07, + "loss": 0.0057, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.96875, + "step": 401 + }, + { + "completion_length": 8.65625, + "epoch": 0.747906976744186, + "grad_norm": 8.413973505811864, + "kl": 0.10546875, + "learning_rate": 2.5139664804469275e-07, + "loss": 0.0042, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 402 + }, + { + "completion_length": 7.0, + "epoch": 0.7497674418604651, + "grad_norm": 1.887663079931034, + "kl": 0.28515625, + "learning_rate": 2.4953445065176907e-07, + "loss": 0.0114, + "reward": 1.8125, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 403 + }, + { + "completion_length": 7.0, + "epoch": 0.7516279069767442, + "grad_norm": 3.052695034983716, + "kl": 0.142578125, + "learning_rate": 2.476722532588454e-07, + "loss": 0.0057, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 404 + }, + { + "completion_length": 13.0, + "epoch": 0.7534883720930232, + "grad_norm": 25.702962526182535, + "kl": 0.15625, + "learning_rate": 2.4581005586592177e-07, + "loss": 0.0062, + "reward": 1.75, + "reward_std": 0.2463996410369873, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.96875, + "step": 405 + }, + { + "completion_length": 7.0, + "epoch": 0.7553488372093023, + "grad_norm": 5.706063959994095, + "kl": 0.090087890625, + "learning_rate": 2.439478584729981e-07, + "loss": 0.0036, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 406 + }, + { + "completion_length": 7.15625, + "epoch": 0.7572093023255814, + "grad_norm": 2.541610376372466, + "kl": 0.17578125, + "learning_rate": 2.420856610800745e-07, + "loss": 0.007, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 407 + }, + { + "completion_length": 7.25, + "epoch": 0.7590697674418605, + "grad_norm": 6.463745084779171, + "kl": 0.16455078125, + "learning_rate": 2.402234636871508e-07, + "loss": 0.0066, + "reward": 1.84375, + "reward_std": 0.20683756470680237, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 408 + }, + { + "completion_length": 7.1875, + "epoch": 0.7609302325581395, + "grad_norm": 6.713784161549877, + "kl": 0.6689453125, + "learning_rate": 2.3836126629422718e-07, + "loss": 0.0269, + "reward": 1.84375, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 409 + }, + { + "completion_length": 7.0, + "epoch": 0.7627906976744186, + "grad_norm": 0.02555573778439531, + "kl": 0.0635986328125, + "learning_rate": 2.3649906890130353e-07, + "loss": 0.0025, + "reward": 1.875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 410 + }, + { + "completion_length": 7.0, + "epoch": 0.7646511627906977, + "grad_norm": 0.45113766053276283, + "kl": 0.310302734375, + "learning_rate": 2.3463687150837988e-07, + "loss": 0.0124, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 411 + }, + { + "completion_length": 7.46875, + "epoch": 0.7665116279069767, + "grad_norm": 0.9389815200447205, + "kl": 0.3212890625, + "learning_rate": 2.3277467411545623e-07, + "loss": 0.0129, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 412 + }, + { + "completion_length": 7.0, + "epoch": 0.7683720930232558, + "grad_norm": 0.03683901810467234, + "kl": 0.146484375, + "learning_rate": 2.3091247672253258e-07, + "loss": 0.0059, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 413 + }, + { + "completion_length": 9.125, + "epoch": 0.7702325581395348, + "grad_norm": 2.4552899691689722, + "kl": 0.0423583984375, + "learning_rate": 2.2905027932960893e-07, + "loss": 0.0017, + "reward": 1.8125, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 414 + }, + { + "completion_length": 7.0, + "epoch": 0.772093023255814, + "grad_norm": 0.10291839011391096, + "kl": 0.094970703125, + "learning_rate": 2.2718808193668528e-07, + "loss": 0.0038, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 415 + }, + { + "completion_length": 10.0625, + "epoch": 0.773953488372093, + "grad_norm": 6.931955667484236, + "kl": 0.20166015625, + "learning_rate": 2.2532588454376163e-07, + "loss": 0.0081, + "reward": 1.6875, + "reward_std": 0.375, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.90625, + "step": 416 + }, + { + "completion_length": 8.40625, + "epoch": 0.7758139534883721, + "grad_norm": 6.09198472562067, + "kl": 0.66552734375, + "learning_rate": 2.2346368715083796e-07, + "loss": 0.0265, + "reward": 1.90625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.96875, + "step": 417 + }, + { + "completion_length": 7.0, + "epoch": 0.7776744186046511, + "grad_norm": 2.36734367168827, + "kl": 0.0665283203125, + "learning_rate": 2.216014897579143e-07, + "loss": 0.0027, + "reward": 1.9375, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 418 + }, + { + "completion_length": 7.25, + "epoch": 0.7795348837209303, + "grad_norm": 4.06960401270215, + "kl": 0.15283203125, + "learning_rate": 2.1973929236499066e-07, + "loss": 0.0061, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.96875, + "step": 419 + }, + { + "completion_length": 7.0, + "epoch": 0.7813953488372093, + "grad_norm": 0.03272753742763259, + "kl": 0.1162109375, + "learning_rate": 2.17877094972067e-07, + "loss": 0.0046, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 420 + }, + { + "completion_length": 8.65625, + "epoch": 0.7832558139534884, + "grad_norm": 10.520165231988505, + "kl": 0.23193359375, + "learning_rate": 2.1601489757914336e-07, + "loss": 0.0092, + "reward": 1.625, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 421 + }, + { + "completion_length": 7.625, + "epoch": 0.7851162790697674, + "grad_norm": 9.782694231648783, + "kl": 0.185546875, + "learning_rate": 2.141527001862197e-07, + "loss": 0.0074, + "reward": 1.75, + "reward_std": 0.2992308586835861, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 422 + }, + { + "completion_length": 7.0, + "epoch": 0.7869767441860465, + "grad_norm": 31.26666755677358, + "kl": 0.405029296875, + "learning_rate": 2.122905027932961e-07, + "loss": 0.0162, + "reward": 1.90625, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 423 + }, + { + "completion_length": 7.0, + "epoch": 0.7888372093023256, + "grad_norm": 0.031605702352443296, + "kl": 0.108642578125, + "learning_rate": 2.1042830540037244e-07, + "loss": 0.0043, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 424 + }, + { + "completion_length": 7.25, + "epoch": 0.7906976744186046, + "grad_norm": 2.796064371547676, + "kl": 0.331787109375, + "learning_rate": 2.085661080074488e-07, + "loss": 0.0133, + "reward": 1.8125, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.96875, + "step": 425 + }, + { + "completion_length": 7.3125, + "epoch": 0.7925581395348837, + "grad_norm": 1.715419721323946, + "kl": 0.307373046875, + "learning_rate": 2.0670391061452514e-07, + "loss": 0.0124, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 426 + }, + { + "completion_length": 7.25, + "epoch": 0.7944186046511628, + "grad_norm": 3.4012806962391386, + "kl": 0.1259765625, + "learning_rate": 2.048417132216015e-07, + "loss": 0.0051, + "reward": 1.78125, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.96875, + "step": 427 + }, + { + "completion_length": 7.375, + "epoch": 0.7962790697674419, + "grad_norm": 3.9687104794309893, + "kl": 0.3046875, + "learning_rate": 2.0297951582867785e-07, + "loss": 0.0122, + "reward": 1.875, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 428 + }, + { + "completion_length": 7.0, + "epoch": 0.7981395348837209, + "grad_norm": 0.038946578926059544, + "kl": 0.1181640625, + "learning_rate": 2.011173184357542e-07, + "loss": 0.0047, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 429 + }, + { + "completion_length": 7.03125, + "epoch": 0.8, + "grad_norm": 5.768381660087222, + "kl": 0.20361328125, + "learning_rate": 1.9925512104283055e-07, + "loss": 0.0082, + "reward": 1.71875, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.96875, + "step": 430 + }, + { + "completion_length": 7.0, + "epoch": 0.8018604651162791, + "grad_norm": 6.681428053418435, + "kl": 0.116455078125, + "learning_rate": 1.973929236499069e-07, + "loss": 0.0047, + "reward": 1.75, + "reward_std": 0.3221687823534012, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 431 + }, + { + "completion_length": 7.0, + "epoch": 0.8037209302325582, + "grad_norm": 5.06861283760002, + "kl": 0.13916015625, + "learning_rate": 1.9553072625698322e-07, + "loss": 0.0056, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 432 + }, + { + "completion_length": 7.0, + "epoch": 0.8055813953488372, + "grad_norm": 0.44380376845753133, + "kl": 0.259765625, + "learning_rate": 1.9366852886405957e-07, + "loss": 0.0104, + "reward": 1.9375, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 433 + }, + { + "completion_length": 7.0, + "epoch": 0.8074418604651162, + "grad_norm": 2.0368099246344435, + "kl": 0.08984375, + "learning_rate": 1.9180633147113592e-07, + "loss": 0.0036, + "reward": 1.90625, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 434 + }, + { + "completion_length": 7.0, + "epoch": 0.8093023255813954, + "grad_norm": 1.7892346086244424, + "kl": 0.1533203125, + "learning_rate": 1.8994413407821228e-07, + "loss": 0.0061, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 435 + }, + { + "completion_length": 7.0, + "epoch": 0.8111627906976744, + "grad_norm": 0.023368778567537024, + "kl": 0.096435546875, + "learning_rate": 1.8808193668528863e-07, + "loss": 0.0039, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 436 + }, + { + "completion_length": 7.0, + "epoch": 0.8130232558139535, + "grad_norm": 0.05734305546297211, + "kl": 0.087890625, + "learning_rate": 1.8621973929236498e-07, + "loss": 0.0035, + "reward": 1.875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 437 + }, + { + "completion_length": 7.0, + "epoch": 0.8148837209302325, + "grad_norm": 0.026275681981962812, + "kl": 0.103515625, + "learning_rate": 1.8435754189944133e-07, + "loss": 0.0041, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 438 + }, + { + "completion_length": 7.28125, + "epoch": 0.8167441860465117, + "grad_norm": 1.349859991662317, + "kl": 0.11212158203125, + "learning_rate": 1.8249534450651768e-07, + "loss": 0.0045, + "reward": 1.84375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 439 + }, + { + "completion_length": 7.03125, + "epoch": 0.8186046511627907, + "grad_norm": 3.792764525583862, + "kl": 0.4970703125, + "learning_rate": 1.8063314711359403e-07, + "loss": 0.02, + "reward": 1.90625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.96875, + "step": 440 + }, + { + "completion_length": 8.0, + "epoch": 0.8204651162790698, + "grad_norm": 1.3457621629969303, + "kl": 0.14404296875, + "learning_rate": 1.7877094972067038e-07, + "loss": 0.0058, + "reward": 1.84375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 441 + }, + { + "completion_length": 9.3125, + "epoch": 0.8223255813953488, + "grad_norm": 2.7332918551666543, + "kl": 0.104736328125, + "learning_rate": 1.7690875232774673e-07, + "loss": 0.0042, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.96875, + "step": 442 + }, + { + "completion_length": 7.0625, + "epoch": 0.8241860465116279, + "grad_norm": 3.3177141179710588, + "kl": 0.12255859375, + "learning_rate": 1.7504655493482308e-07, + "loss": 0.0049, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.96875, + "step": 443 + }, + { + "completion_length": 11.25, + "epoch": 0.826046511627907, + "grad_norm": 17.80721487685858, + "kl": 0.208984375, + "learning_rate": 1.7318435754189943e-07, + "loss": 0.0084, + "reward": 1.71875, + "reward_std": 0.38466876745224, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.9375, + "step": 444 + }, + { + "completion_length": 7.0, + "epoch": 0.827906976744186, + "grad_norm": 1.2392993964899597, + "kl": 0.2119140625, + "learning_rate": 1.7132216014897579e-07, + "loss": 0.0085, + "reward": 1.84375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 1.0, + "step": 445 + }, + { + "completion_length": 7.0, + "epoch": 0.8297674418604651, + "grad_norm": 8.811266355510783, + "kl": 0.130859375, + "learning_rate": 1.694599627560521e-07, + "loss": 0.0052, + "reward": 1.90625, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 446 + }, + { + "completion_length": 7.0, + "epoch": 0.8316279069767442, + "grad_norm": 3.0167403426841974, + "kl": 0.095947265625, + "learning_rate": 1.6759776536312846e-07, + "loss": 0.0038, + "reward": 1.90625, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 447 + }, + { + "completion_length": 13.65625, + "epoch": 0.8334883720930233, + "grad_norm": 0.5231663822296391, + "kl": 0.076416015625, + "learning_rate": 1.6573556797020484e-07, + "loss": 0.0031, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.96875, + "step": 448 + }, + { + "completion_length": 7.0, + "epoch": 0.8353488372093023, + "grad_norm": 2.3731098838950366, + "kl": 0.1279296875, + "learning_rate": 1.638733705772812e-07, + "loss": 0.0051, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 449 + }, + { + "completion_length": 7.0, + "epoch": 0.8372093023255814, + "grad_norm": 2.3906909594561725, + "kl": 0.0791015625, + "learning_rate": 1.6201117318435754e-07, + "loss": 0.0032, + "reward": 1.84375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 1.0, + "step": 450 + }, + { + "completion_length": 8.6875, + "epoch": 0.8390697674418605, + "grad_norm": 5.535313714462902, + "kl": 0.1343994140625, + "learning_rate": 1.601489757914339e-07, + "loss": 0.0054, + "reward": 1.75, + "reward_std": 0.2992308586835861, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 451 + }, + { + "completion_length": 7.0, + "epoch": 0.8409302325581396, + "grad_norm": 2.4650154317824953, + "kl": 0.04742431640625, + "learning_rate": 1.5828677839851024e-07, + "loss": 0.0019, + "reward": 1.9375, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 452 + }, + { + "completion_length": 7.0, + "epoch": 0.8427906976744186, + "grad_norm": 0.03815817213324913, + "kl": 0.106201171875, + "learning_rate": 1.564245810055866e-07, + "loss": 0.0043, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 453 + }, + { + "completion_length": 7.15625, + "epoch": 0.8446511627906976, + "grad_norm": 5.139037724904585, + "kl": 0.19482421875, + "learning_rate": 1.5456238361266295e-07, + "loss": 0.0078, + "reward": 1.84375, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 454 + }, + { + "completion_length": 7.0, + "epoch": 0.8465116279069768, + "grad_norm": 2.150812823090355, + "kl": 0.0869140625, + "learning_rate": 1.527001862197393e-07, + "loss": 0.0035, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 455 + }, + { + "completion_length": 7.0, + "epoch": 0.8483720930232558, + "grad_norm": 0.06091183471397165, + "kl": 0.12939453125, + "learning_rate": 1.5083798882681565e-07, + "loss": 0.0052, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 456 + }, + { + "completion_length": 7.0, + "epoch": 0.8502325581395349, + "grad_norm": 2.2699661160465023, + "kl": 0.081298828125, + "learning_rate": 1.48975791433892e-07, + "loss": 0.0032, + "reward": 1.8125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 457 + }, + { + "completion_length": 7.0, + "epoch": 0.8520930232558139, + "grad_norm": 0.06428851542425035, + "kl": 0.0830078125, + "learning_rate": 1.4711359404096835e-07, + "loss": 0.0033, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 458 + }, + { + "completion_length": 7.0, + "epoch": 0.8539534883720931, + "grad_norm": 2.338050191000454, + "kl": 0.39892578125, + "learning_rate": 1.452513966480447e-07, + "loss": 0.016, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 459 + }, + { + "completion_length": 9.6875, + "epoch": 0.8558139534883721, + "grad_norm": 9.915004000911917, + "kl": 0.12841796875, + "learning_rate": 1.4338919925512102e-07, + "loss": 0.0051, + "reward": 1.75, + "reward_std": 0.4471687823534012, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 460 + }, + { + "completion_length": 7.71875, + "epoch": 0.8576744186046512, + "grad_norm": 8.775895303396899, + "kl": 0.416015625, + "learning_rate": 1.4152700186219738e-07, + "loss": 0.0166, + "reward": 1.65625, + "reward_std": 0.3846687823534012, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.90625, + "step": 461 + }, + { + "completion_length": 7.0, + "epoch": 0.8595348837209302, + "grad_norm": 5.590809145755228, + "kl": 0.1298828125, + "learning_rate": 1.3966480446927373e-07, + "loss": 0.0052, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 462 + }, + { + "completion_length": 10.53125, + "epoch": 0.8613953488372093, + "grad_norm": 3.177821609944796, + "kl": 0.1051025390625, + "learning_rate": 1.3780260707635008e-07, + "loss": 0.0042, + "reward": 1.78125, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.96875, + "step": 463 + }, + { + "completion_length": 7.125, + "epoch": 0.8632558139534884, + "grad_norm": 2.958043977529243, + "kl": 0.13525390625, + "learning_rate": 1.3594040968342643e-07, + "loss": 0.0054, + "reward": 1.78125, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 1.0, + "step": 464 + }, + { + "completion_length": 7.0, + "epoch": 0.8651162790697674, + "grad_norm": 0.03155221663252405, + "kl": 0.148681640625, + "learning_rate": 1.3407821229050278e-07, + "loss": 0.0059, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 465 + }, + { + "completion_length": 8.84375, + "epoch": 0.8669767441860465, + "grad_norm": 4.326010957178043, + "kl": 0.1474609375, + "learning_rate": 1.3221601489757913e-07, + "loss": 0.0059, + "reward": 1.65625, + "reward_std": 0.31684717535972595, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.9375, + "step": 466 + }, + { + "completion_length": 7.25, + "epoch": 0.8688372093023256, + "grad_norm": 1.5929956970865176, + "kl": 0.14306640625, + "learning_rate": 1.3035381750465548e-07, + "loss": 0.0057, + "reward": 1.84375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 467 + }, + { + "completion_length": 7.0, + "epoch": 0.8706976744186047, + "grad_norm": 7.080331333785726, + "kl": 0.070556640625, + "learning_rate": 1.2849162011173183e-07, + "loss": 0.0028, + "reward": 1.71875, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 1.0, + "step": 468 + }, + { + "completion_length": 7.0, + "epoch": 0.8725581395348837, + "grad_norm": 3.246397883840746, + "kl": 0.20849609375, + "learning_rate": 1.2662942271880818e-07, + "loss": 0.0083, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 469 + }, + { + "completion_length": 7.0, + "epoch": 0.8744186046511628, + "grad_norm": 3.3783622509537823, + "kl": 0.0865478515625, + "learning_rate": 1.2476722532588453e-07, + "loss": 0.0035, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 470 + }, + { + "completion_length": 7.0, + "epoch": 0.8762790697674419, + "grad_norm": 2.542539488948452, + "kl": 0.12744140625, + "learning_rate": 1.2290502793296089e-07, + "loss": 0.0051, + "reward": 1.90625, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 471 + }, + { + "completion_length": 7.0, + "epoch": 0.878139534883721, + "grad_norm": 2.8686857472208196, + "kl": 0.18408203125, + "learning_rate": 1.2104283054003724e-07, + "loss": 0.0074, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 472 + }, + { + "completion_length": 7.0, + "epoch": 0.88, + "grad_norm": 3.0368200476833436, + "kl": 0.16455078125, + "learning_rate": 1.1918063314711359e-07, + "loss": 0.0066, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 473 + }, + { + "completion_length": 7.25, + "epoch": 0.881860465116279, + "grad_norm": 7.608901431650917, + "kl": 0.640625, + "learning_rate": 1.1731843575418994e-07, + "loss": 0.0256, + "reward": 1.84375, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 474 + }, + { + "completion_length": 7.0, + "epoch": 0.8837209302325582, + "grad_norm": 6.805298725373223, + "kl": 0.626220703125, + "learning_rate": 1.1545623836126629e-07, + "loss": 0.025, + "reward": 1.84375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 1.0, + "step": 475 + }, + { + "completion_length": 9.125, + "epoch": 0.8855813953488372, + "grad_norm": 5.122731354719896, + "kl": 0.17578125, + "learning_rate": 1.1359404096834264e-07, + "loss": 0.007, + "reward": 1.8125, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.96875, + "step": 476 + }, + { + "completion_length": 7.0, + "epoch": 0.8874418604651163, + "grad_norm": 0.040397173013598145, + "kl": 0.09814453125, + "learning_rate": 1.1173184357541898e-07, + "loss": 0.0039, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 477 + }, + { + "completion_length": 7.0, + "epoch": 0.8893023255813953, + "grad_norm": 2.8351829254789727, + "kl": 0.1142578125, + "learning_rate": 1.0986964618249533e-07, + "loss": 0.0046, + "reward": 1.90625, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 478 + }, + { + "completion_length": 7.0, + "epoch": 0.8911627906976745, + "grad_norm": 6.222124990426549, + "kl": 0.234375, + "learning_rate": 1.0800744878957168e-07, + "loss": 0.0094, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 479 + }, + { + "completion_length": 7.0, + "epoch": 0.8930232558139535, + "grad_norm": 0.029016116972822817, + "kl": 0.12939453125, + "learning_rate": 1.0614525139664805e-07, + "loss": 0.0052, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 480 + }, + { + "completion_length": 7.25, + "epoch": 0.8948837209302326, + "grad_norm": 2.6815597960860376, + "kl": 0.1650390625, + "learning_rate": 1.042830540037244e-07, + "loss": 0.0066, + "reward": 1.78125, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.96875, + "step": 481 + }, + { + "completion_length": 8.1875, + "epoch": 0.8967441860465116, + "grad_norm": 7.332149951009178, + "kl": 0.13330078125, + "learning_rate": 1.0242085661080075e-07, + "loss": 0.0053, + "reward": 1.78125, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.96875, + "step": 482 + }, + { + "completion_length": 7.0, + "epoch": 0.8986046511627906, + "grad_norm": 0.02743375589032812, + "kl": 0.090087890625, + "learning_rate": 1.005586592178771e-07, + "loss": 0.0036, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 483 + }, + { + "completion_length": 7.0, + "epoch": 0.9004651162790698, + "grad_norm": 5.808996269984639, + "kl": 0.12451171875, + "learning_rate": 9.869646182495345e-08, + "loss": 0.005, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 484 + }, + { + "completion_length": 7.25, + "epoch": 0.9023255813953488, + "grad_norm": 0.7628945445022374, + "kl": 0.21484375, + "learning_rate": 9.683426443202979e-08, + "loss": 0.0086, + "reward": 1.84375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 485 + }, + { + "completion_length": 7.0, + "epoch": 0.9041860465116279, + "grad_norm": 4.704605869366476, + "kl": 0.3779296875, + "learning_rate": 9.497206703910614e-08, + "loss": 0.0151, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 486 + }, + { + "completion_length": 11.53125, + "epoch": 0.906046511627907, + "grad_norm": 2.1204619689316395, + "kl": 0.15234375, + "learning_rate": 9.310986964618249e-08, + "loss": 0.0061, + "reward": 1.78125, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9375, + "step": 487 + }, + { + "completion_length": 7.34375, + "epoch": 0.9079069767441861, + "grad_norm": 2.91107939031998, + "kl": 0.2178955078125, + "learning_rate": 9.124767225325884e-08, + "loss": 0.0087, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 488 + }, + { + "completion_length": 7.21875, + "epoch": 0.9097674418604651, + "grad_norm": 2.607117606617554, + "kl": 0.1689453125, + "learning_rate": 8.938547486033519e-08, + "loss": 0.0068, + "reward": 1.90625, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.96875, + "step": 489 + }, + { + "completion_length": 7.21875, + "epoch": 0.9116279069767442, + "grad_norm": 4.994939384451428, + "kl": 0.19384765625, + "learning_rate": 8.752327746741154e-08, + "loss": 0.0077, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 490 + }, + { + "completion_length": 9.34375, + "epoch": 0.9134883720930232, + "grad_norm": 3.354207128215869, + "kl": 0.14892578125, + "learning_rate": 8.566108007448789e-08, + "loss": 0.006, + "reward": 1.84375, + "reward_std": 0.2596687823534012, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.9375, + "step": 491 + }, + { + "completion_length": 14.78125, + "epoch": 0.9153488372093024, + "grad_norm": 4.642867073290136, + "kl": 0.075439453125, + "learning_rate": 8.379888268156423e-08, + "loss": 0.003, + "reward": 1.84375, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 492 + }, + { + "completion_length": 7.0, + "epoch": 0.9172093023255814, + "grad_norm": 0.03479355521987353, + "kl": 0.1025390625, + "learning_rate": 8.19366852886406e-08, + "loss": 0.0041, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 493 + }, + { + "completion_length": 7.0, + "epoch": 0.9190697674418604, + "grad_norm": 0.04893579430089106, + "kl": 0.06634521484375, + "learning_rate": 8.007448789571695e-08, + "loss": 0.0027, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 494 + }, + { + "completion_length": 7.0, + "epoch": 0.9209302325581395, + "grad_norm": 10.585172785164104, + "kl": 0.42236328125, + "learning_rate": 7.82122905027933e-08, + "loss": 0.0169, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 495 + }, + { + "completion_length": 7.5, + "epoch": 0.9227906976744186, + "grad_norm": 4.864822279720747, + "kl": 0.123779296875, + "learning_rate": 7.635009310986965e-08, + "loss": 0.005, + "reward": 1.84375, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 496 + }, + { + "completion_length": 7.21875, + "epoch": 0.9246511627906977, + "grad_norm": 3.147796919264851, + "kl": 0.158935546875, + "learning_rate": 7.4487895716946e-08, + "loss": 0.0063, + "reward": 1.4375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.96875, + "step": 497 + }, + { + "completion_length": 7.0, + "epoch": 0.9265116279069767, + "grad_norm": 2.9308789707828606, + "kl": 0.101318359375, + "learning_rate": 7.262569832402235e-08, + "loss": 0.0041, + "reward": 1.90625, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 498 + }, + { + "completion_length": 7.0, + "epoch": 0.9283720930232559, + "grad_norm": 2.8568571893814134, + "kl": 0.4677734375, + "learning_rate": 7.076350093109869e-08, + "loss": 0.0187, + "reward": 1.8125, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 499 + }, + { + "completion_length": 7.0, + "epoch": 0.9302325581395349, + "grad_norm": 0.03523435402716282, + "kl": 0.0418701171875, + "learning_rate": 6.890130353817504e-08, + "loss": 0.0017, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 500 + }, + { + "completion_length": 7.0, + "epoch": 0.932093023255814, + "grad_norm": 3.405435352426359, + "kl": 0.13330078125, + "learning_rate": 6.703910614525139e-08, + "loss": 0.0053, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 501 + }, + { + "completion_length": 7.0, + "epoch": 0.933953488372093, + "grad_norm": 4.679666176451282, + "kl": 0.21875, + "learning_rate": 6.517690875232774e-08, + "loss": 0.0088, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 502 + }, + { + "completion_length": 7.0, + "epoch": 0.935813953488372, + "grad_norm": 3.22839496036919, + "kl": 0.16552734375, + "learning_rate": 6.331471135940409e-08, + "loss": 0.0066, + "reward": 1.90625, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 503 + }, + { + "completion_length": 7.90625, + "epoch": 0.9376744186046512, + "grad_norm": 4.1167849039133175, + "kl": 0.19970703125, + "learning_rate": 6.145251396648044e-08, + "loss": 0.008, + "reward": 1.90625, + "reward_std": 0.11967839300632477, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.96875, + "step": 504 + }, + { + "completion_length": 7.0, + "epoch": 0.9395348837209302, + "grad_norm": 0.21703165114629014, + "kl": 0.14990234375, + "learning_rate": 5.9590316573556794e-08, + "loss": 0.006, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 505 + }, + { + "completion_length": 7.0, + "epoch": 0.9413953488372093, + "grad_norm": 0.16054964336375094, + "kl": 0.069091796875, + "learning_rate": 5.7728119180633145e-08, + "loss": 0.0028, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 506 + }, + { + "completion_length": 7.0, + "epoch": 0.9432558139534883, + "grad_norm": 0.03352956996508878, + "kl": 0.06787109375, + "learning_rate": 5.586592178770949e-08, + "loss": 0.0027, + "reward": 1.75, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 507 + }, + { + "completion_length": 7.0, + "epoch": 0.9451162790697675, + "grad_norm": 6.391544669711613, + "kl": 0.17431640625, + "learning_rate": 5.400372439478584e-08, + "loss": 0.007, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 508 + }, + { + "completion_length": 7.25, + "epoch": 0.9469767441860465, + "grad_norm": 2.7522410396977843, + "kl": 0.14306640625, + "learning_rate": 5.21415270018622e-08, + "loss": 0.0057, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 509 + }, + { + "completion_length": 7.0, + "epoch": 0.9488372093023256, + "grad_norm": 3.8521435564559217, + "kl": 0.13037109375, + "learning_rate": 5.027932960893855e-08, + "loss": 0.0052, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 510 + }, + { + "completion_length": 7.0, + "epoch": 0.9506976744186046, + "grad_norm": 0.027863920978306585, + "kl": 0.14013671875, + "learning_rate": 4.8417132216014893e-08, + "loss": 0.0056, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 511 + }, + { + "completion_length": 7.0, + "epoch": 0.9525581395348838, + "grad_norm": 0.025359619006064767, + "kl": 0.132080078125, + "learning_rate": 4.6554934823091244e-08, + "loss": 0.0053, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 512 + }, + { + "completion_length": 7.0, + "epoch": 0.9544186046511628, + "grad_norm": 0.023203369636127968, + "kl": 0.073486328125, + "learning_rate": 4.4692737430167595e-08, + "loss": 0.0029, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 513 + }, + { + "completion_length": 7.0, + "epoch": 0.9562790697674418, + "grad_norm": 1.6004859022960831, + "kl": 0.1650390625, + "learning_rate": 4.2830540037243946e-08, + "loss": 0.0066, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 514 + }, + { + "completion_length": 7.0, + "epoch": 0.958139534883721, + "grad_norm": 0.04480313386601444, + "kl": 0.125, + "learning_rate": 4.09683426443203e-08, + "loss": 0.005, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 515 + }, + { + "completion_length": 7.34375, + "epoch": 0.96, + "grad_norm": 8.569929577522952, + "kl": 0.0833740234375, + "learning_rate": 3.910614525139665e-08, + "loss": 0.0033, + "reward": 1.75, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.96875, + "step": 516 + }, + { + "completion_length": 7.90625, + "epoch": 0.9618604651162791, + "grad_norm": 1.8430630428960513, + "kl": 0.101318359375, + "learning_rate": 3.7243947858473e-08, + "loss": 0.0041, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 517 + }, + { + "completion_length": 7.0, + "epoch": 0.9637209302325581, + "grad_norm": 2.094931019527106, + "kl": 0.126708984375, + "learning_rate": 3.5381750465549344e-08, + "loss": 0.0051, + "reward": 1.9375, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 518 + }, + { + "completion_length": 7.0, + "epoch": 0.9655813953488372, + "grad_norm": 0.046525993922275806, + "kl": 0.1142578125, + "learning_rate": 3.3519553072625695e-08, + "loss": 0.0046, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 519 + }, + { + "completion_length": 7.46875, + "epoch": 0.9674418604651163, + "grad_norm": 3.1346333125962076, + "kl": 0.086181640625, + "learning_rate": 3.1657355679702046e-08, + "loss": 0.0034, + "reward": 1.84375, + "reward_std": 0.0625, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 520 + }, + { + "completion_length": 7.71875, + "epoch": 0.9693023255813954, + "grad_norm": 3.166713484203719, + "kl": 0.1318359375, + "learning_rate": 2.9795158286778397e-08, + "loss": 0.0053, + "reward": 1.84375, + "reward_std": 0.16456207633018494, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.9375, + "step": 521 + }, + { + "completion_length": 9.28125, + "epoch": 0.9711627906976744, + "grad_norm": 3.364806582786205, + "kl": 0.171630859375, + "learning_rate": 2.7932960893854745e-08, + "loss": 0.0069, + "reward": 1.84375, + "reward_std": 0.1875, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 522 + }, + { + "completion_length": 7.09375, + "epoch": 0.9730232558139534, + "grad_norm": 3.582506618774392, + "kl": 0.24365234375, + "learning_rate": 2.60707635009311e-08, + "loss": 0.0098, + "reward": 1.875, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.96875, + "step": 523 + }, + { + "completion_length": 7.0, + "epoch": 0.9748837209302326, + "grad_norm": 0.10864559009692361, + "kl": 0.06146240234375, + "learning_rate": 2.4208566108007447e-08, + "loss": 0.0025, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 524 + }, + { + "completion_length": 7.0, + "epoch": 0.9767441860465116, + "grad_norm": 5.2372214555634065, + "kl": 0.12109375, + "learning_rate": 2.2346368715083798e-08, + "loss": 0.0048, + "reward": 1.875, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 525 + }, + { + "completion_length": 7.0, + "epoch": 0.9786046511627907, + "grad_norm": 0.04672885193938469, + "kl": 0.15771484375, + "learning_rate": 2.048417132216015e-08, + "loss": 0.0063, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 526 + }, + { + "completion_length": 7.0, + "epoch": 0.9804651162790697, + "grad_norm": 0.744070844139475, + "kl": 0.13232421875, + "learning_rate": 1.86219739292365e-08, + "loss": 0.0053, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 527 + }, + { + "completion_length": 9.25, + "epoch": 0.9823255813953489, + "grad_norm": 2.7644117683785643, + "kl": 0.236572265625, + "learning_rate": 1.6759776536312847e-08, + "loss": 0.0095, + "reward": 1.75, + "reward_std": 0.25, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 528 + }, + { + "completion_length": 8.0, + "epoch": 0.9841860465116279, + "grad_norm": 1.6359506271175113, + "kl": 0.22265625, + "learning_rate": 1.4897579143389198e-08, + "loss": 0.0089, + "reward": 1.9375, + "reward_std": 0.125, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.96875, + "step": 529 + }, + { + "completion_length": 7.0, + "epoch": 0.986046511627907, + "grad_norm": 0.02564692985240761, + "kl": 0.147705078125, + "learning_rate": 1.303538175046555e-08, + "loss": 0.0059, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 530 + }, + { + "completion_length": 7.1875, + "epoch": 0.987906976744186, + "grad_norm": 1.4286490759609458, + "kl": 0.14208984375, + "learning_rate": 1.1173184357541899e-08, + "loss": 0.0057, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.96875, + "step": 531 + }, + { + "completion_length": 7.1875, + "epoch": 0.9897674418604652, + "grad_norm": 2.076394897503195, + "kl": 0.24609375, + "learning_rate": 9.31098696461825e-09, + "loss": 0.0098, + "reward": 1.90625, + "reward_std": 0.11967839300632477, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.96875, + "step": 532 + }, + { + "completion_length": 7.0, + "epoch": 0.9916279069767442, + "grad_norm": 16.12846129415168, + "kl": 0.875732421875, + "learning_rate": 7.448789571694599e-09, + "loss": 0.0351, + "reward": 1.65625, + "reward_std": 0.13466878235340118, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 1.0, + "step": 533 + }, + { + "completion_length": 7.0, + "epoch": 0.9934883720930232, + "grad_norm": 0.02077524351081622, + "kl": 0.078125, + "learning_rate": 5.5865921787709494e-09, + "loss": 0.0031, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 534 + }, + { + "completion_length": 7.0, + "epoch": 0.9953488372093023, + "grad_norm": 2.268978581943144, + "kl": 0.101318359375, + "learning_rate": 3.7243947858472996e-09, + "loss": 0.004, + "reward": 1.9375, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 535 + }, + { + "completion_length": 7.0625, + "epoch": 0.9972093023255814, + "grad_norm": 3.9428910672444815, + "kl": 0.25390625, + "learning_rate": 1.8621973929236498e-09, + "loss": 0.0101, + "reward": 1.8125, + "reward_std": 0.19716878235340118, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.96875, + "step": 536 + }, + { + "completion_length": 8.0, + "epoch": 0.9990697674418605, + "grad_norm": 0.6436676293459469, + "kl": 0.079833984375, + "learning_rate": 0.0, + "loss": 0.0032, + "reward": 1.96875, + "reward_std": 0.0625, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.96875, + "step": 537 + } + ], + "logging_steps": 1.0, + "max_steps": 537, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}