Spaces:
Running
Running
| { | |
| "model": "Qwen/Qwen2.5-0.5B-Instruct", | |
| "episodes": 5000, | |
| "epochs": 1, | |
| "batch_size": 8, | |
| "learning_rate": 5e-06, | |
| "global_step": 2500, | |
| "training_loss": 0.005647265207767487, | |
| "training_reward_curve": { | |
| "type": "env_http_reward", | |
| "note": "Reward from live environment via POST /reset + /step (MR-2 compliant). Not comparable to eval_reward which is clamped [0,1].", | |
| "mean_start": 0.13, | |
| "mean_end": 0.469 | |
| }, | |
| "eval_reward_before": { | |
| "Fraud detection": 0.0, | |
| "Decision accuracy": 0.0, | |
| "Evidence quality": 0.3333333333333333, | |
| "Calibration": 0.0, | |
| "Reasoning quality": 0.8333333333333334 | |
| }, | |
| "eval_reward_after": { | |
| "Fraud detection": 0.3333333333333333, | |
| "Decision accuracy": 1.0, | |
| "Evidence quality": 0.3333333333333333, | |
| "Calibration": 1.0, | |
| "Reasoning quality": 0.7916666666666666 | |
| }, | |
| "component_shift": { | |
| "before": { | |
| "Fraud detection": 0.0, | |
| "Decision accuracy": 0.0, | |
| "Evidence quality": 0.3333333333333333, | |
| "Calibration": 0.0, | |
| "Reasoning quality": 0.8333333333333334 | |
| }, | |
| "after": { | |
| "Fraud detection": 0.3333333333333333, | |
| "Decision accuracy": 1.0, | |
| "Evidence quality": 0.3333333333333333, | |
| "Calibration": 1.0, | |
| "Reasoning quality": 0.7916666666666666 | |
| } | |
| }, | |
| "log_history": [ | |
| { | |
| "loss": 0.0008, | |
| "grad_norm": 22.5, | |
| "learning_rate": 4.9900000000000005e-06, | |
| "rewards/reward_fn": 0.12996437549591064, | |
| "reward": 0.12996437549591064, | |
| "reward_std": 0.15663783259224145, | |
| "completion_length": 72.6125, | |
| "kl": 0.01886011641472578, | |
| "epoch": 0.002, | |
| "step": 5 | |
| }, | |
| { | |
| "loss": 0.0017, | |
| "grad_norm": 25.375, | |
| "learning_rate": 4.980000000000001e-06, | |
| "rewards/reward_fn": 0.28686500089243056, | |
| "reward": 0.28686500089243056, | |
| "reward_std": 0.1139603321440518, | |
| "completion_length": 71.45, | |
| "kl": 0.04206784293055534, | |
| "epoch": 0.004, | |
| "step": 10 | |
| }, | |
| { | |
| "loss": 0.0018, | |
| "grad_norm": 26.125, | |
| "learning_rate": 4.970000000000001e-06, | |
| "rewards/reward_fn": 0.33125562937930225, | |
| "reward": 0.33125562937930225, | |
| "reward_std": 0.10047997636720538, | |
| "completion_length": 69.7625, | |
| "kl": 0.04418694227933884, | |
| "epoch": 0.006, | |
| "step": 15 | |
| }, | |
| { | |
| "loss": 0.0024, | |
| "grad_norm": 29.5, | |
| "learning_rate": 4.960000000000001e-06, | |
| "rewards/reward_fn": 0.38998999876203017, | |
| "reward": 0.38998999876203017, | |
| "reward_std": 0.05469522252678871, | |
| "completion_length": 66.0125, | |
| "kl": 0.061039629578590396, | |
| "epoch": 0.008, | |
| "step": 20 | |
| }, | |
| { | |
| "loss": 0.0121, | |
| "grad_norm": 105.5, | |
| "learning_rate": 4.95e-06, | |
| "rewards/reward_fn": 0.31268125153146686, | |
| "reward": 0.31268125153146686, | |
| "reward_std": 0.05678519255015999, | |
| "completion_length": 68.3625, | |
| "kl": 0.30179612897336483, | |
| "epoch": 0.01, | |
| "step": 25 | |
| }, | |
| { | |
| "loss": 0.0028, | |
| "grad_norm": 31.0, | |
| "learning_rate": 4.94e-06, | |
| "rewards/reward_fn": 0.2681674983672565, | |
| "reward": 0.2681674983672565, | |
| "reward_std": 0.0353069698670879, | |
| "completion_length": 65.6875, | |
| "kl": 0.07095254212617874, | |
| "epoch": 0.012, | |
| "step": 30 | |
| }, | |
| { | |
| "loss": 0.0041, | |
| "grad_norm": 26.25, | |
| "learning_rate": 4.93e-06, | |
| "rewards/reward_fn": 0.3527887500880752, | |
| "reward": 0.3527887500880752, | |
| "reward_std": 0.05785412744153291, | |
| "completion_length": 63.4, | |
| "kl": 0.10367086306214332, | |
| "epoch": 0.014, | |
| "step": 35 | |
| }, | |
| { | |
| "loss": 0.0047, | |
| "grad_norm": 25.875, | |
| "learning_rate": 4.92e-06, | |
| "rewards/reward_fn": 0.34420499864791054, | |
| "reward": 0.34420499864791054, | |
| "reward_std": 0.06693777176551521, | |
| "completion_length": 63.125, | |
| "kl": 0.11868430003523826, | |
| "epoch": 0.016, | |
| "step": 40 | |
| }, | |
| { | |
| "loss": 0.0069, | |
| "grad_norm": 24.0, | |
| "learning_rate": 4.9100000000000004e-06, | |
| "rewards/reward_fn": 0.19720500293187798, | |
| "reward": 0.19720500293187798, | |
| "reward_std": 0.09952702496666462, | |
| "completion_length": 62.625, | |
| "kl": 0.17344569861888887, | |
| "epoch": 0.018, | |
| "step": 45 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 22.375, | |
| "learning_rate": 4.9000000000000005e-06, | |
| "rewards/reward_fn": 0.37614874897699335, | |
| "reward": 0.37614874897699335, | |
| "reward_std": 0.05041897173505276, | |
| "completion_length": 63.65, | |
| "kl": 0.14020639136433602, | |
| "epoch": 0.02, | |
| "step": 50 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 21.375, | |
| "learning_rate": 4.890000000000001e-06, | |
| "rewards/reward_fn": 0.20948062620591373, | |
| "reward": 0.20948062620591373, | |
| "reward_std": 0.11797074675559997, | |
| "completion_length": 65.2, | |
| "kl": 0.12582977935671807, | |
| "epoch": 0.022, | |
| "step": 55 | |
| }, | |
| { | |
| "loss": 0.0049, | |
| "grad_norm": 24.25, | |
| "learning_rate": 4.880000000000001e-06, | |
| "rewards/reward_fn": 0.2675649975077249, | |
| "reward": 0.2675649975077249, | |
| "reward_std": 0.10371575457975268, | |
| "completion_length": 65.775, | |
| "kl": 0.12268042787909508, | |
| "epoch": 0.024, | |
| "step": 60 | |
| }, | |
| { | |
| "loss": 0.0048, | |
| "grad_norm": 27.25, | |
| "learning_rate": 4.87e-06, | |
| "rewards/reward_fn": 0.17759875237825326, | |
| "reward": 0.17759875237825326, | |
| "reward_std": 0.09766199714504183, | |
| "completion_length": 64.4375, | |
| "kl": 0.1204748086631298, | |
| "epoch": 0.026, | |
| "step": 65 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 20.5, | |
| "learning_rate": 4.86e-06, | |
| "rewards/reward_fn": 0.3662331223487854, | |
| "reward": 0.3662331223487854, | |
| "reward_std": 0.13943021052982657, | |
| "completion_length": 65.525, | |
| "kl": 0.1409070000052452, | |
| "epoch": 0.028, | |
| "step": 70 | |
| }, | |
| { | |
| "loss": 0.0041, | |
| "grad_norm": 28.375, | |
| "learning_rate": 4.85e-06, | |
| "rewards/reward_fn": 0.35237686783075334, | |
| "reward": 0.35237686783075334, | |
| "reward_std": 0.14571735821664333, | |
| "completion_length": 67.475, | |
| "kl": 0.10161374881863594, | |
| "epoch": 0.03, | |
| "step": 75 | |
| }, | |
| { | |
| "loss": 0.0081, | |
| "grad_norm": 29.0, | |
| "learning_rate": 4.84e-06, | |
| "rewards/reward_fn": 0.34102812483906747, | |
| "reward": 0.34102812483906747, | |
| "reward_std": 0.12838326790370047, | |
| "completion_length": 66.0875, | |
| "kl": 0.2030480533838272, | |
| "epoch": 0.032, | |
| "step": 80 | |
| }, | |
| { | |
| "loss": 0.0043, | |
| "grad_norm": 29.0, | |
| "learning_rate": 4.83e-06, | |
| "rewards/reward_fn": 0.38738313168287275, | |
| "reward": 0.38738313168287275, | |
| "reward_std": 0.08913053697906434, | |
| "completion_length": 68.675, | |
| "kl": 0.10850983113050461, | |
| "epoch": 0.034, | |
| "step": 85 | |
| }, | |
| { | |
| "loss": 0.006, | |
| "grad_norm": 26.75, | |
| "learning_rate": 4.8200000000000004e-06, | |
| "rewards/reward_fn": 0.37884062230587007, | |
| "reward": 0.37884062230587007, | |
| "reward_std": 0.11409456301480532, | |
| "completion_length": 70.4, | |
| "kl": 0.1510870262980461, | |
| "epoch": 0.036, | |
| "step": 90 | |
| }, | |
| { | |
| "loss": 0.0042, | |
| "grad_norm": 28.5, | |
| "learning_rate": 4.8100000000000005e-06, | |
| "rewards/reward_fn": 0.3212599984370172, | |
| "reward": 0.3212599984370172, | |
| "reward_std": 0.11497495661024004, | |
| "completion_length": 68.7875, | |
| "kl": 0.10497871562838554, | |
| "epoch": 0.038, | |
| "step": 95 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 25.25, | |
| "learning_rate": 4.800000000000001e-06, | |
| "rewards/reward_fn": 0.2606187478464562, | |
| "reward": 0.2606187478464562, | |
| "reward_std": 0.11127406840678304, | |
| "completion_length": 70.1125, | |
| "kl": 0.13605541437864305, | |
| "epoch": 0.04, | |
| "step": 100 | |
| }, | |
| { | |
| "loss": 0.004, | |
| "grad_norm": 159.0, | |
| "learning_rate": 4.79e-06, | |
| "rewards/reward_fn": 0.3490543693304062, | |
| "reward": 0.3490543693304062, | |
| "reward_std": 0.17655093723442405, | |
| "completion_length": 71.875, | |
| "kl": 0.10081770941615105, | |
| "epoch": 0.042, | |
| "step": 105 | |
| }, | |
| { | |
| "loss": 0.0043, | |
| "grad_norm": 24.75, | |
| "learning_rate": 4.78e-06, | |
| "rewards/reward_fn": 0.36398687958717346, | |
| "reward": 0.36398687958717346, | |
| "reward_std": 0.15169972144067287, | |
| "completion_length": 72.0625, | |
| "kl": 0.10862671732902526, | |
| "epoch": 0.044, | |
| "step": 110 | |
| }, | |
| { | |
| "loss": 0.0037, | |
| "grad_norm": 23.625, | |
| "learning_rate": 4.77e-06, | |
| "rewards/reward_fn": 0.3380812492221594, | |
| "reward": 0.3380812492221594, | |
| "reward_std": 0.1447692496702075, | |
| "completion_length": 73.5125, | |
| "kl": 0.09267130568623543, | |
| "epoch": 0.046, | |
| "step": 115 | |
| }, | |
| { | |
| "loss": 0.0045, | |
| "grad_norm": 20.0, | |
| "learning_rate": 4.76e-06, | |
| "rewards/reward_fn": 0.39886312037706373, | |
| "reward": 0.39886312037706373, | |
| "reward_std": 0.13123975209891797, | |
| "completion_length": 75.6, | |
| "kl": 0.11189883872866631, | |
| "epoch": 0.048, | |
| "step": 120 | |
| }, | |
| { | |
| "loss": 0.0038, | |
| "grad_norm": 25.125, | |
| "learning_rate": 4.75e-06, | |
| "rewards/reward_fn": 0.4117881193757057, | |
| "reward": 0.4117881193757057, | |
| "reward_std": 0.1342116856947541, | |
| "completion_length": 77.5875, | |
| "kl": 0.09554292932152748, | |
| "epoch": 0.05, | |
| "step": 125 | |
| }, | |
| { | |
| "loss": 0.0042, | |
| "grad_norm": 22.125, | |
| "learning_rate": 4.74e-06, | |
| "rewards/reward_fn": 0.43608374893665314, | |
| "reward": 0.43608374893665314, | |
| "reward_std": 0.10520601402968169, | |
| "completion_length": 77.4625, | |
| "kl": 0.10588956028223037, | |
| "epoch": 0.052, | |
| "step": 130 | |
| }, | |
| { | |
| "loss": 0.0048, | |
| "grad_norm": 20.0, | |
| "learning_rate": 4.7300000000000005e-06, | |
| "rewards/reward_fn": 0.4558625012636185, | |
| "reward": 0.4558625012636185, | |
| "reward_std": 0.06957857511006296, | |
| "completion_length": 78.1875, | |
| "kl": 0.12035084962844848, | |
| "epoch": 0.054, | |
| "step": 135 | |
| }, | |
| { | |
| "loss": 0.0047, | |
| "grad_norm": 21.5, | |
| "learning_rate": 4.7200000000000005e-06, | |
| "rewards/reward_fn": 0.40547625310719015, | |
| "reward": 0.40547625310719015, | |
| "reward_std": 0.09707445108797401, | |
| "completion_length": 77.9, | |
| "kl": 0.11794439107179641, | |
| "epoch": 0.056, | |
| "step": 140 | |
| }, | |
| { | |
| "loss": 0.0042, | |
| "grad_norm": 22.875, | |
| "learning_rate": 4.71e-06, | |
| "rewards/reward_fn": 0.33485061936080457, | |
| "reward": 0.33485061936080457, | |
| "reward_std": 0.09993105094181373, | |
| "completion_length": 79.125, | |
| "kl": 0.10510653629899025, | |
| "epoch": 0.058, | |
| "step": 145 | |
| }, | |
| { | |
| "loss": 0.0042, | |
| "grad_norm": 19.5, | |
| "learning_rate": 4.7e-06, | |
| "rewards/reward_fn": 0.3309887422248721, | |
| "reward": 0.3309887422248721, | |
| "reward_std": 0.06645656500477344, | |
| "completion_length": 78.6125, | |
| "kl": 0.10446615666151046, | |
| "epoch": 0.06, | |
| "step": 150 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 22.375, | |
| "learning_rate": 4.69e-06, | |
| "rewards/reward_fn": 0.3643562486220617, | |
| "reward": 0.3643562486220617, | |
| "reward_std": 0.06148011786863208, | |
| "completion_length": 76.725, | |
| "kl": 0.1252933219075203, | |
| "epoch": 0.062, | |
| "step": 155 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 25.625, | |
| "learning_rate": 4.680000000000001e-06, | |
| "rewards/reward_fn": 0.3345375001837965, | |
| "reward": 0.3345375001837965, | |
| "reward_std": 0.0954028001986444, | |
| "completion_length": 75.45, | |
| "kl": 0.12887531742453576, | |
| "epoch": 0.064, | |
| "step": 160 | |
| }, | |
| { | |
| "loss": 0.0046, | |
| "grad_norm": 22.375, | |
| "learning_rate": 4.670000000000001e-06, | |
| "rewards/reward_fn": 0.3415462435106747, | |
| "reward": 0.3415462435106747, | |
| "reward_std": 0.07221131722908466, | |
| "completion_length": 75.9375, | |
| "kl": 0.11467845514416694, | |
| "epoch": 0.066, | |
| "step": 165 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 22.625, | |
| "learning_rate": 4.66e-06, | |
| "rewards/reward_fn": 0.38888062462210654, | |
| "reward": 0.38888062462210654, | |
| "reward_std": 0.11567582259885967, | |
| "completion_length": 76.375, | |
| "kl": 0.12577201426029205, | |
| "epoch": 0.068, | |
| "step": 170 | |
| }, | |
| { | |
| "loss": 0.0044, | |
| "grad_norm": 23.75, | |
| "learning_rate": 4.65e-06, | |
| "rewards/reward_fn": 0.27768062038812785, | |
| "reward": 0.27768062038812785, | |
| "reward_std": 0.04161863022018224, | |
| "completion_length": 75.275, | |
| "kl": 0.11094974502921104, | |
| "epoch": 0.07, | |
| "step": 175 | |
| }, | |
| { | |
| "loss": 0.0048, | |
| "grad_norm": 21.875, | |
| "learning_rate": 4.6400000000000005e-06, | |
| "rewards/reward_fn": 0.31952374114189297, | |
| "reward": 0.31952374114189297, | |
| "reward_std": 0.07254288513213396, | |
| "completion_length": 77.3625, | |
| "kl": 0.11998703256249428, | |
| "epoch": 0.072, | |
| "step": 180 | |
| }, | |
| { | |
| "loss": 0.0062, | |
| "grad_norm": 25.125, | |
| "learning_rate": 4.6300000000000006e-06, | |
| "rewards/reward_fn": 0.34861375503242015, | |
| "reward": 0.34861375503242015, | |
| "reward_std": 0.10978359731379897, | |
| "completion_length": 75.9625, | |
| "kl": 0.15532704591751098, | |
| "epoch": 0.074, | |
| "step": 185 | |
| }, | |
| { | |
| "loss": 0.0048, | |
| "grad_norm": 22.5, | |
| "learning_rate": 4.620000000000001e-06, | |
| "rewards/reward_fn": 0.3256543739698827, | |
| "reward": 0.3256543739698827, | |
| "reward_std": 0.07666705958545209, | |
| "completion_length": 76.825, | |
| "kl": 0.1190482571721077, | |
| "epoch": 0.076, | |
| "step": 190 | |
| }, | |
| { | |
| "loss": 0.0047, | |
| "grad_norm": 26.25, | |
| "learning_rate": 4.610000000000001e-06, | |
| "rewards/reward_fn": 0.3306443728506565, | |
| "reward": 0.3306443728506565, | |
| "reward_std": 0.12050404832698405, | |
| "completion_length": 76.2375, | |
| "kl": 0.11852994039654732, | |
| "epoch": 0.078, | |
| "step": 195 | |
| }, | |
| { | |
| "loss": 0.0041, | |
| "grad_norm": 23.625, | |
| "learning_rate": 4.600000000000001e-06, | |
| "rewards/reward_fn": 0.33713062135502697, | |
| "reward": 0.33713062135502697, | |
| "reward_std": 0.09549199095927179, | |
| "completion_length": 75.725, | |
| "kl": 0.10264018401503563, | |
| "epoch": 0.08, | |
| "step": 200 | |
| }, | |
| { | |
| "loss": 0.0047, | |
| "grad_norm": 21.0, | |
| "learning_rate": 4.590000000000001e-06, | |
| "rewards/reward_fn": 0.35562500059604646, | |
| "reward": 0.35562500059604646, | |
| "reward_std": 0.11331822639331221, | |
| "completion_length": 76.55, | |
| "kl": 0.11866414025425912, | |
| "epoch": 0.082, | |
| "step": 205 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 25.0, | |
| "learning_rate": 4.58e-06, | |
| "rewards/reward_fn": 0.3766068793833256, | |
| "reward": 0.3766068793833256, | |
| "reward_std": 0.10549901332706213, | |
| "completion_length": 73.0875, | |
| "kl": 0.14588759168982507, | |
| "epoch": 0.084, | |
| "step": 210 | |
| }, | |
| { | |
| "loss": 0.004, | |
| "grad_norm": 26.75, | |
| "learning_rate": 4.57e-06, | |
| "rewards/reward_fn": 0.38299812823534013, | |
| "reward": 0.38299812823534013, | |
| "reward_std": 0.09799009431153535, | |
| "completion_length": 73.625, | |
| "kl": 0.10109216421842575, | |
| "epoch": 0.086, | |
| "step": 215 | |
| }, | |
| { | |
| "loss": 0.0041, | |
| "grad_norm": 28.0, | |
| "learning_rate": 4.56e-06, | |
| "rewards/reward_fn": 0.37175500094890596, | |
| "reward": 0.37175500094890596, | |
| "reward_std": 0.10488205360015854, | |
| "completion_length": 72.3375, | |
| "kl": 0.10273240357637406, | |
| "epoch": 0.088, | |
| "step": 220 | |
| }, | |
| { | |
| "loss": 0.0042, | |
| "grad_norm": 22.5, | |
| "learning_rate": 4.5500000000000005e-06, | |
| "rewards/reward_fn": 0.3897412523627281, | |
| "reward": 0.3897412523627281, | |
| "reward_std": 0.14026562571525575, | |
| "completion_length": 74.6125, | |
| "kl": 0.1044769786298275, | |
| "epoch": 0.09, | |
| "step": 225 | |
| }, | |
| { | |
| "loss": 0.0039, | |
| "grad_norm": 23.375, | |
| "learning_rate": 4.540000000000001e-06, | |
| "rewards/reward_fn": 0.41331062465906143, | |
| "reward": 0.41331062465906143, | |
| "reward_std": 0.09166353384498507, | |
| "completion_length": 73.95, | |
| "kl": 0.0984603650867939, | |
| "epoch": 0.092, | |
| "step": 230 | |
| }, | |
| { | |
| "loss": 0.0043, | |
| "grad_norm": 25.25, | |
| "learning_rate": 4.530000000000001e-06, | |
| "rewards/reward_fn": 0.3803025022149086, | |
| "reward": 0.3803025022149086, | |
| "reward_std": 0.11351661148946732, | |
| "completion_length": 74.6, | |
| "kl": 0.1073625199496746, | |
| "epoch": 0.094, | |
| "step": 235 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 40.25, | |
| "learning_rate": 4.520000000000001e-06, | |
| "rewards/reward_fn": 0.37668500542640687, | |
| "reward": 0.37668500542640687, | |
| "reward_std": 0.14612680403515696, | |
| "completion_length": 73.75, | |
| "kl": 0.1317383050918579, | |
| "epoch": 0.096, | |
| "step": 240 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 28.25, | |
| "learning_rate": 4.510000000000001e-06, | |
| "rewards/reward_fn": 0.37795437276363375, | |
| "reward": 0.37795437276363375, | |
| "reward_std": 0.11509951823391021, | |
| "completion_length": 74.675, | |
| "kl": 0.13217320367693902, | |
| "epoch": 0.098, | |
| "step": 245 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 24.0, | |
| "learning_rate": 4.5e-06, | |
| "rewards/reward_fn": 0.4587212562561035, | |
| "reward": 0.4587212562561035, | |
| "reward_std": 0.05489388951100409, | |
| "completion_length": 72.6375, | |
| "kl": 0.14324783831834792, | |
| "epoch": 0.1, | |
| "step": 250 | |
| }, | |
| { | |
| "loss": 0.0062, | |
| "grad_norm": 28.25, | |
| "learning_rate": 4.49e-06, | |
| "rewards/reward_fn": 0.36413499563932417, | |
| "reward": 0.36413499563932417, | |
| "reward_std": 0.14084610100835562, | |
| "completion_length": 73.175, | |
| "kl": 0.1550510197877884, | |
| "epoch": 0.102, | |
| "step": 255 | |
| }, | |
| { | |
| "loss": 0.0051, | |
| "grad_norm": 22.75, | |
| "learning_rate": 4.48e-06, | |
| "rewards/reward_fn": 0.41378499418497083, | |
| "reward": 0.41378499418497083, | |
| "reward_std": 0.09481649375520647, | |
| "completion_length": 75.3625, | |
| "kl": 0.1283886268734932, | |
| "epoch": 0.104, | |
| "step": 260 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 27.25, | |
| "learning_rate": 4.47e-06, | |
| "rewards/reward_fn": 0.45114499926567075, | |
| "reward": 0.45114499926567075, | |
| "reward_std": 0.04992847095709294, | |
| "completion_length": 74.15, | |
| "kl": 0.1244954839348793, | |
| "epoch": 0.106, | |
| "step": 265 | |
| }, | |
| { | |
| "loss": 0.0046, | |
| "grad_norm": 27.625, | |
| "learning_rate": 4.4600000000000005e-06, | |
| "rewards/reward_fn": 0.4430062472820282, | |
| "reward": 0.4430062472820282, | |
| "reward_std": 0.06626461511477828, | |
| "completion_length": 73.775, | |
| "kl": 0.11374877691268921, | |
| "epoch": 0.108, | |
| "step": 270 | |
| }, | |
| { | |
| "loss": 0.0046, | |
| "grad_norm": 23.0, | |
| "learning_rate": 4.450000000000001e-06, | |
| "rewards/reward_fn": 0.3933093786239624, | |
| "reward": 0.3933093786239624, | |
| "reward_std": 0.07578937450889497, | |
| "completion_length": 73.4875, | |
| "kl": 0.11540523990988731, | |
| "epoch": 0.11, | |
| "step": 275 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 26.125, | |
| "learning_rate": 4.440000000000001e-06, | |
| "rewards/reward_fn": 0.34966062209568916, | |
| "reward": 0.34966062209568916, | |
| "reward_std": 0.12435578326694667, | |
| "completion_length": 75.5375, | |
| "kl": 0.13259521648287773, | |
| "epoch": 0.112, | |
| "step": 280 | |
| }, | |
| { | |
| "loss": 0.0039, | |
| "grad_norm": 24.625, | |
| "learning_rate": 4.430000000000001e-06, | |
| "rewards/reward_fn": 0.40091561824083327, | |
| "reward": 0.40091561824083327, | |
| "reward_std": 0.10028558413032443, | |
| "completion_length": 75.9875, | |
| "kl": 0.09868917912244797, | |
| "epoch": 0.114, | |
| "step": 285 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 24.0, | |
| "learning_rate": 4.42e-06, | |
| "rewards/reward_fn": 0.3874093756079674, | |
| "reward": 0.3874093756079674, | |
| "reward_std": 0.11204615456517786, | |
| "completion_length": 75.7, | |
| "kl": 0.1249419741332531, | |
| "epoch": 0.116, | |
| "step": 290 | |
| }, | |
| { | |
| "loss": 0.0039, | |
| "grad_norm": 21.5, | |
| "learning_rate": 4.41e-06, | |
| "rewards/reward_fn": 0.40853061974048616, | |
| "reward": 0.40853061974048616, | |
| "reward_std": 0.1080384837463498, | |
| "completion_length": 74.325, | |
| "kl": 0.0986015535891056, | |
| "epoch": 0.118, | |
| "step": 295 | |
| }, | |
| { | |
| "loss": 0.0037, | |
| "grad_norm": 24.75, | |
| "learning_rate": 4.4e-06, | |
| "rewards/reward_fn": 0.41577999889850614, | |
| "reward": 0.41577999889850614, | |
| "reward_std": 0.10275121238082648, | |
| "completion_length": 71.7125, | |
| "kl": 0.09238781034946442, | |
| "epoch": 0.12, | |
| "step": 300 | |
| }, | |
| { | |
| "loss": 0.0046, | |
| "grad_norm": 31.0, | |
| "learning_rate": 4.39e-06, | |
| "rewards/reward_fn": 0.4253518760204315, | |
| "reward": 0.4253518760204315, | |
| "reward_std": 0.09691239511594177, | |
| "completion_length": 72.15, | |
| "kl": 0.11536458730697632, | |
| "epoch": 0.122, | |
| "step": 305 | |
| }, | |
| { | |
| "loss": 0.0039, | |
| "grad_norm": 29.875, | |
| "learning_rate": 4.38e-06, | |
| "rewards/reward_fn": 0.437681245803833, | |
| "reward": 0.437681245803833, | |
| "reward_std": 0.08544279797933996, | |
| "completion_length": 73.7875, | |
| "kl": 0.09754163324832917, | |
| "epoch": 0.124, | |
| "step": 310 | |
| }, | |
| { | |
| "loss": 0.0066, | |
| "grad_norm": 23.5, | |
| "learning_rate": 4.3700000000000005e-06, | |
| "rewards/reward_fn": 0.4518656224012375, | |
| "reward": 0.4518656224012375, | |
| "reward_std": 0.07031336800428108, | |
| "completion_length": 72.075, | |
| "kl": 0.16523725241422654, | |
| "epoch": 0.126, | |
| "step": 315 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 22.875, | |
| "learning_rate": 4.360000000000001e-06, | |
| "rewards/reward_fn": 0.4219950050115585, | |
| "reward": 0.4219950050115585, | |
| "reward_std": 0.10292997076176107, | |
| "completion_length": 73.8375, | |
| "kl": 0.1384974516928196, | |
| "epoch": 0.128, | |
| "step": 320 | |
| }, | |
| { | |
| "loss": 0.0047, | |
| "grad_norm": 30.25, | |
| "learning_rate": 4.350000000000001e-06, | |
| "rewards/reward_fn": 0.3595293749123812, | |
| "reward": 0.3595293749123812, | |
| "reward_std": 0.09363621571101248, | |
| "completion_length": 73.2125, | |
| "kl": 0.11762727722525597, | |
| "epoch": 0.13, | |
| "step": 325 | |
| }, | |
| { | |
| "loss": 0.0048, | |
| "grad_norm": 20.875, | |
| "learning_rate": 4.34e-06, | |
| "rewards/reward_fn": 0.41323124766349795, | |
| "reward": 0.41323124766349795, | |
| "reward_std": 0.07829355036374182, | |
| "completion_length": 75.7375, | |
| "kl": 0.12051494792103767, | |
| "epoch": 0.132, | |
| "step": 330 | |
| }, | |
| { | |
| "loss": 0.0047, | |
| "grad_norm": 22.25, | |
| "learning_rate": 4.33e-06, | |
| "rewards/reward_fn": 0.4376243770122528, | |
| "reward": 0.4376243770122528, | |
| "reward_std": 0.08593541735317559, | |
| "completion_length": 74.2125, | |
| "kl": 0.11745435148477554, | |
| "epoch": 0.134, | |
| "step": 335 | |
| }, | |
| { | |
| "loss": 0.0042, | |
| "grad_norm": 21.875, | |
| "learning_rate": 4.32e-06, | |
| "rewards/reward_fn": 0.40846686959266665, | |
| "reward": 0.40846686959266665, | |
| "reward_std": 0.08748328550718724, | |
| "completion_length": 74.1375, | |
| "kl": 0.1059987798333168, | |
| "epoch": 0.136, | |
| "step": 340 | |
| }, | |
| { | |
| "loss": 0.0049, | |
| "grad_norm": 24.25, | |
| "learning_rate": 4.31e-06, | |
| "rewards/reward_fn": 0.4022818714380264, | |
| "reward": 0.4022818714380264, | |
| "reward_std": 0.12827726462855935, | |
| "completion_length": 73.7375, | |
| "kl": 0.12174244895577431, | |
| "epoch": 0.138, | |
| "step": 345 | |
| }, | |
| { | |
| "loss": 0.0067, | |
| "grad_norm": 25.375, | |
| "learning_rate": 4.3e-06, | |
| "rewards/reward_fn": 0.390729995071888, | |
| "reward": 0.390729995071888, | |
| "reward_std": 0.14277701806277038, | |
| "completion_length": 75.6625, | |
| "kl": 0.16858599781990052, | |
| "epoch": 0.14, | |
| "step": 350 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 20.25, | |
| "learning_rate": 4.2900000000000004e-06, | |
| "rewards/reward_fn": 0.430366250872612, | |
| "reward": 0.430366250872612, | |
| "reward_std": 0.10421461121877655, | |
| "completion_length": 75.8125, | |
| "kl": 0.13511769324541092, | |
| "epoch": 0.142, | |
| "step": 355 | |
| }, | |
| { | |
| "loss": 0.0043, | |
| "grad_norm": 24.5, | |
| "learning_rate": 4.2800000000000005e-06, | |
| "rewards/reward_fn": 0.4330950051546097, | |
| "reward": 0.4330950051546097, | |
| "reward_std": 0.09782969739753752, | |
| "completion_length": 74.5875, | |
| "kl": 0.10710541978478431, | |
| "epoch": 0.144, | |
| "step": 360 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 23.375, | |
| "learning_rate": 4.270000000000001e-06, | |
| "rewards/reward_fn": 0.4299006313085556, | |
| "reward": 0.4299006313085556, | |
| "reward_std": 0.09049425637349487, | |
| "completion_length": 74.0125, | |
| "kl": 0.13614988327026367, | |
| "epoch": 0.146, | |
| "step": 365 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 27.75, | |
| "learning_rate": 4.26e-06, | |
| "rewards/reward_fn": 0.4390475034713745, | |
| "reward": 0.4390475034713745, | |
| "reward_std": 0.08892962010577321, | |
| "completion_length": 74.075, | |
| "kl": 0.14286103025078772, | |
| "epoch": 0.148, | |
| "step": 370 | |
| }, | |
| { | |
| "loss": 0.0064, | |
| "grad_norm": 22.25, | |
| "learning_rate": 4.25e-06, | |
| "rewards/reward_fn": 0.4013475000858307, | |
| "reward": 0.4013475000858307, | |
| "reward_std": 0.13384937290102245, | |
| "completion_length": 74.2375, | |
| "kl": 0.15888455584645272, | |
| "epoch": 0.15, | |
| "step": 375 | |
| }, | |
| { | |
| "loss": 0.0044, | |
| "grad_norm": 24.5, | |
| "learning_rate": 4.24e-06, | |
| "rewards/reward_fn": 0.4265299946069717, | |
| "reward": 0.4265299946069717, | |
| "reward_std": 0.1011309385765344, | |
| "completion_length": 75.3125, | |
| "kl": 0.10906772464513778, | |
| "epoch": 0.152, | |
| "step": 380 | |
| }, | |
| { | |
| "loss": 0.0044, | |
| "grad_norm": 20.875, | |
| "learning_rate": 4.23e-06, | |
| "rewards/reward_fn": 0.4264387458562851, | |
| "reward": 0.4264387458562851, | |
| "reward_std": 0.09354419643059372, | |
| "completion_length": 75.5625, | |
| "kl": 0.11007295995950699, | |
| "epoch": 0.154, | |
| "step": 385 | |
| }, | |
| { | |
| "loss": 0.0042, | |
| "grad_norm": 20.75, | |
| "learning_rate": 4.22e-06, | |
| "rewards/reward_fn": 0.4492681235074997, | |
| "reward": 0.4492681235074997, | |
| "reward_std": 0.06859665396623313, | |
| "completion_length": 75.8125, | |
| "kl": 0.10581666082143784, | |
| "epoch": 0.156, | |
| "step": 390 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 22.125, | |
| "learning_rate": 4.21e-06, | |
| "rewards/reward_fn": 0.4439112454652786, | |
| "reward": 0.4439112454652786, | |
| "reward_std": 0.06631400538608431, | |
| "completion_length": 76.9375, | |
| "kl": 0.13060626164078712, | |
| "epoch": 0.158, | |
| "step": 395 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 21.625, | |
| "learning_rate": 4.2000000000000004e-06, | |
| "rewards/reward_fn": 0.42808999717235563, | |
| "reward": 0.42808999717235563, | |
| "reward_std": 0.10399190871976316, | |
| "completion_length": 77.9375, | |
| "kl": 0.12519487142562866, | |
| "epoch": 0.16, | |
| "step": 400 | |
| }, | |
| { | |
| "loss": 0.0047, | |
| "grad_norm": 23.875, | |
| "learning_rate": 4.1900000000000005e-06, | |
| "rewards/reward_fn": 0.4617268741130829, | |
| "reward": 0.4617268741130829, | |
| "reward_std": 0.024455197062343358, | |
| "completion_length": 78.35, | |
| "kl": 0.1178566724061966, | |
| "epoch": 0.162, | |
| "step": 405 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 22.125, | |
| "learning_rate": 4.18e-06, | |
| "rewards/reward_fn": 0.443281877040863, | |
| "reward": 0.443281877040863, | |
| "reward_std": 0.0685680250171572, | |
| "completion_length": 78.2125, | |
| "kl": 0.13702442422509192, | |
| "epoch": 0.164, | |
| "step": 410 | |
| }, | |
| { | |
| "loss": 0.0044, | |
| "grad_norm": 22.875, | |
| "learning_rate": 4.17e-06, | |
| "rewards/reward_fn": 0.4668212473392487, | |
| "reward": 0.4668212473392487, | |
| "reward_std": 0.011207807157188655, | |
| "completion_length": 77.575, | |
| "kl": 0.11055121570825577, | |
| "epoch": 0.166, | |
| "step": 415 | |
| }, | |
| { | |
| "loss": 0.0043, | |
| "grad_norm": 21.625, | |
| "learning_rate": 4.16e-06, | |
| "rewards/reward_fn": 0.4402331173419952, | |
| "reward": 0.4402331173419952, | |
| "reward_std": 0.058068437944166364, | |
| "completion_length": 77.5, | |
| "kl": 0.10733927562832832, | |
| "epoch": 0.168, | |
| "step": 420 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 19.875, | |
| "learning_rate": 4.15e-06, | |
| "rewards/reward_fn": 0.4000462591648102, | |
| "reward": 0.4000462591648102, | |
| "reward_std": 0.12040392335038633, | |
| "completion_length": 76.4, | |
| "kl": 0.1479562886059284, | |
| "epoch": 0.17, | |
| "step": 425 | |
| }, | |
| { | |
| "loss": 0.0043, | |
| "grad_norm": 24.75, | |
| "learning_rate": 4.14e-06, | |
| "rewards/reward_fn": 0.43545125126838685, | |
| "reward": 0.43545125126838685, | |
| "reward_std": 0.083320726826787, | |
| "completion_length": 77.6875, | |
| "kl": 0.10773153975605965, | |
| "epoch": 0.172, | |
| "step": 430 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 25.75, | |
| "learning_rate": 4.13e-06, | |
| "rewards/reward_fn": 0.44696625173091886, | |
| "reward": 0.44696625173091886, | |
| "reward_std": 0.0702914291061461, | |
| "completion_length": 77.6875, | |
| "kl": 0.14093699380755426, | |
| "epoch": 0.174, | |
| "step": 435 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 22.375, | |
| "learning_rate": 4.12e-06, | |
| "rewards/reward_fn": 0.4151681214570999, | |
| "reward": 0.4151681214570999, | |
| "reward_std": 0.13133891765028238, | |
| "completion_length": 77.3875, | |
| "kl": 0.14335689023137094, | |
| "epoch": 0.176, | |
| "step": 440 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 37.5, | |
| "learning_rate": 4.1100000000000005e-06, | |
| "rewards/reward_fn": 0.4624956250190735, | |
| "reward": 0.4624956250190735, | |
| "reward_std": 0.02820514002814889, | |
| "completion_length": 78.2625, | |
| "kl": 0.1433185674250126, | |
| "epoch": 0.178, | |
| "step": 445 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 24.5, | |
| "learning_rate": 4.1e-06, | |
| "rewards/reward_fn": 0.4109575003385544, | |
| "reward": 0.4109575003385544, | |
| "reward_std": 0.10657719178125262, | |
| "completion_length": 77.425, | |
| "kl": 0.14092796370387078, | |
| "epoch": 0.18, | |
| "step": 450 | |
| }, | |
| { | |
| "loss": 0.0049, | |
| "grad_norm": 19.25, | |
| "learning_rate": 4.09e-06, | |
| "rewards/reward_fn": 0.4427300065755844, | |
| "reward": 0.4427300065755844, | |
| "reward_std": 0.07360082946252078, | |
| "completion_length": 77.2, | |
| "kl": 0.12334202900528908, | |
| "epoch": 0.182, | |
| "step": 455 | |
| }, | |
| { | |
| "loss": 0.0046, | |
| "grad_norm": 21.25, | |
| "learning_rate": 4.08e-06, | |
| "rewards/reward_fn": 0.4475331217050552, | |
| "reward": 0.4475331217050552, | |
| "reward_std": 0.07192960330285132, | |
| "completion_length": 77.3, | |
| "kl": 0.1150731973350048, | |
| "epoch": 0.184, | |
| "step": 460 | |
| }, | |
| { | |
| "loss": 0.0047, | |
| "grad_norm": 23.625, | |
| "learning_rate": 4.07e-06, | |
| "rewards/reward_fn": 0.4609056174755096, | |
| "reward": 0.4609056174755096, | |
| "reward_std": 0.03011263143271208, | |
| "completion_length": 78.1375, | |
| "kl": 0.11868541091680526, | |
| "epoch": 0.186, | |
| "step": 465 | |
| }, | |
| { | |
| "loss": 0.0043, | |
| "grad_norm": 24.0, | |
| "learning_rate": 4.060000000000001e-06, | |
| "rewards/reward_fn": 0.43565624952316284, | |
| "reward": 0.43565624952316284, | |
| "reward_std": 0.09692498000804335, | |
| "completion_length": 77.7875, | |
| "kl": 0.10825898423790932, | |
| "epoch": 0.188, | |
| "step": 470 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 22.625, | |
| "learning_rate": 4.05e-06, | |
| "rewards/reward_fn": 0.4492074936628342, | |
| "reward": 0.4492074936628342, | |
| "reward_std": 0.054914072714746, | |
| "completion_length": 77.5375, | |
| "kl": 0.13897996991872788, | |
| "epoch": 0.19, | |
| "step": 475 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 20.25, | |
| "learning_rate": 4.04e-06, | |
| "rewards/reward_fn": 0.4413031220436096, | |
| "reward": 0.4413031220436096, | |
| "reward_std": 0.09486053336877376, | |
| "completion_length": 77.65, | |
| "kl": 0.12382525056600571, | |
| "epoch": 0.192, | |
| "step": 480 | |
| }, | |
| { | |
| "loss": 0.0048, | |
| "grad_norm": 25.375, | |
| "learning_rate": 4.03e-06, | |
| "rewards/reward_fn": 0.4565912544727325, | |
| "reward": 0.4565912544727325, | |
| "reward_std": 0.048468802426941696, | |
| "completion_length": 77.8125, | |
| "kl": 0.12045493870973586, | |
| "epoch": 0.194, | |
| "step": 485 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 20.5, | |
| "learning_rate": 4.0200000000000005e-06, | |
| "rewards/reward_fn": 0.43663875162601473, | |
| "reward": 0.43663875162601473, | |
| "reward_std": 0.08210341725498438, | |
| "completion_length": 78.4, | |
| "kl": 0.12900268211960791, | |
| "epoch": 0.196, | |
| "step": 490 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 20.0, | |
| "learning_rate": 4.0100000000000006e-06, | |
| "rewards/reward_fn": 0.45537562370300294, | |
| "reward": 0.45537562370300294, | |
| "reward_std": 0.0482569785322994, | |
| "completion_length": 76.7125, | |
| "kl": 0.14251393526792527, | |
| "epoch": 0.198, | |
| "step": 495 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 25.875, | |
| "learning_rate": 4.000000000000001e-06, | |
| "rewards/reward_fn": 0.4432006269693375, | |
| "reward": 0.4432006269693375, | |
| "reward_std": 0.06335797258652746, | |
| "completion_length": 73.8625, | |
| "kl": 0.1357534795999527, | |
| "epoch": 0.2, | |
| "step": 500 | |
| }, | |
| { | |
| "loss": 0.0047, | |
| "grad_norm": 22.0, | |
| "learning_rate": 3.990000000000001e-06, | |
| "rewards/reward_fn": 0.4444637417793274, | |
| "reward": 0.4444637417793274, | |
| "reward_std": 0.07501828772947192, | |
| "completion_length": 77.9375, | |
| "kl": 0.11708598956465721, | |
| "epoch": 0.202, | |
| "step": 505 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 23.5, | |
| "learning_rate": 3.980000000000001e-06, | |
| "rewards/reward_fn": 0.4472743809223175, | |
| "reward": 0.4472743809223175, | |
| "reward_std": 0.05749309537932277, | |
| "completion_length": 74.5875, | |
| "kl": 0.14097338169813156, | |
| "epoch": 0.204, | |
| "step": 510 | |
| }, | |
| { | |
| "loss": 0.0046, | |
| "grad_norm": 24.5, | |
| "learning_rate": 3.97e-06, | |
| "rewards/reward_fn": 0.44517249464988706, | |
| "reward": 0.44517249464988706, | |
| "reward_std": 0.04624197790399194, | |
| "completion_length": 74.3625, | |
| "kl": 0.11476076990365983, | |
| "epoch": 0.206, | |
| "step": 515 | |
| }, | |
| { | |
| "loss": 0.0045, | |
| "grad_norm": 23.0, | |
| "learning_rate": 3.96e-06, | |
| "rewards/reward_fn": 0.46824624538421633, | |
| "reward": 0.46824624538421633, | |
| "reward_std": 0.01596033286768943, | |
| "completion_length": 76.65, | |
| "kl": 0.11366325318813324, | |
| "epoch": 0.208, | |
| "step": 520 | |
| }, | |
| { | |
| "loss": 0.0063, | |
| "grad_norm": 32.25, | |
| "learning_rate": 3.95e-06, | |
| "rewards/reward_fn": 0.42703562378883364, | |
| "reward": 0.42703562378883364, | |
| "reward_std": 0.12119532297365368, | |
| "completion_length": 72.925, | |
| "kl": 0.15730374231934546, | |
| "epoch": 0.21, | |
| "step": 525 | |
| }, | |
| { | |
| "loss": 0.0062, | |
| "grad_norm": 21.75, | |
| "learning_rate": 3.94e-06, | |
| "rewards/reward_fn": 0.4221406221389771, | |
| "reward": 0.4221406221389771, | |
| "reward_std": 0.10592716310638935, | |
| "completion_length": 74.5125, | |
| "kl": 0.15577242150902748, | |
| "epoch": 0.212, | |
| "step": 530 | |
| }, | |
| { | |
| "loss": 0.0045, | |
| "grad_norm": 21.125, | |
| "learning_rate": 3.9300000000000005e-06, | |
| "rewards/reward_fn": 0.4661912500858307, | |
| "reward": 0.4661912500858307, | |
| "reward_std": 0.020173130772309377, | |
| "completion_length": 75.8375, | |
| "kl": 0.11145939379930496, | |
| "epoch": 0.214, | |
| "step": 535 | |
| }, | |
| { | |
| "loss": 0.0049, | |
| "grad_norm": 24.25, | |
| "learning_rate": 3.920000000000001e-06, | |
| "rewards/reward_fn": 0.441836878657341, | |
| "reward": 0.441836878657341, | |
| "reward_std": 0.07485336323734373, | |
| "completion_length": 76.2125, | |
| "kl": 0.12274321988224983, | |
| "epoch": 0.216, | |
| "step": 540 | |
| }, | |
| { | |
| "loss": 0.0071, | |
| "grad_norm": 27.875, | |
| "learning_rate": 3.910000000000001e-06, | |
| "rewards/reward_fn": 0.41665250062942505, | |
| "reward": 0.41665250062942505, | |
| "reward_std": 0.11695102071389556, | |
| "completion_length": 75.5375, | |
| "kl": 0.1784944050014019, | |
| "epoch": 0.218, | |
| "step": 545 | |
| }, | |
| { | |
| "loss": 0.0049, | |
| "grad_norm": 22.125, | |
| "learning_rate": 3.900000000000001e-06, | |
| "rewards/reward_fn": 0.46246500313282013, | |
| "reward": 0.46246500313282013, | |
| "reward_std": 0.025297004880849273, | |
| "completion_length": 77.3125, | |
| "kl": 0.1214751310646534, | |
| "epoch": 0.22, | |
| "step": 550 | |
| }, | |
| { | |
| "loss": 0.0046, | |
| "grad_norm": 22.0, | |
| "learning_rate": 3.89e-06, | |
| "rewards/reward_fn": 0.4644468754529953, | |
| "reward": 0.4644468754529953, | |
| "reward_std": 0.012496462906710804, | |
| "completion_length": 75.7375, | |
| "kl": 0.11581535264849663, | |
| "epoch": 0.222, | |
| "step": 555 | |
| }, | |
| { | |
| "loss": 0.0082, | |
| "grad_norm": 36.0, | |
| "learning_rate": 3.88e-06, | |
| "rewards/reward_fn": 0.4410806208848953, | |
| "reward": 0.4410806208848953, | |
| "reward_std": 0.06957816896028816, | |
| "completion_length": 74.8875, | |
| "kl": 0.2040191449224949, | |
| "epoch": 0.224, | |
| "step": 560 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 20.25, | |
| "learning_rate": 3.87e-06, | |
| "rewards/reward_fn": 0.4340968787670135, | |
| "reward": 0.4340968787670135, | |
| "reward_std": 0.08061990649439395, | |
| "completion_length": 75.425, | |
| "kl": 0.13628464713692665, | |
| "epoch": 0.226, | |
| "step": 565 | |
| }, | |
| { | |
| "loss": 0.0051, | |
| "grad_norm": 25.0, | |
| "learning_rate": 3.86e-06, | |
| "rewards/reward_fn": 0.4491756230592728, | |
| "reward": 0.4491756230592728, | |
| "reward_std": 0.05307391991373152, | |
| "completion_length": 75.0875, | |
| "kl": 0.12775095850229262, | |
| "epoch": 0.228, | |
| "step": 570 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 23.5, | |
| "learning_rate": 3.85e-06, | |
| "rewards/reward_fn": 0.44790937304496764, | |
| "reward": 0.44790937304496764, | |
| "reward_std": 0.04875197249930352, | |
| "completion_length": 76.5625, | |
| "kl": 0.13741603270173072, | |
| "epoch": 0.23, | |
| "step": 575 | |
| }, | |
| { | |
| "loss": 0.0075, | |
| "grad_norm": 20.875, | |
| "learning_rate": 3.8400000000000005e-06, | |
| "rewards/reward_fn": 0.4636618733406067, | |
| "reward": 0.4636618733406067, | |
| "reward_std": 0.027970095619093627, | |
| "completion_length": 75.525, | |
| "kl": 0.18872758597135544, | |
| "epoch": 0.232, | |
| "step": 580 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 22.875, | |
| "learning_rate": 3.830000000000001e-06, | |
| "rewards/reward_fn": 0.44757687151432035, | |
| "reward": 0.44757687151432035, | |
| "reward_std": 0.05606174336280674, | |
| "completion_length": 78.5875, | |
| "kl": 0.143553277105093, | |
| "epoch": 0.234, | |
| "step": 585 | |
| }, | |
| { | |
| "loss": 0.0046, | |
| "grad_norm": 21.75, | |
| "learning_rate": 3.820000000000001e-06, | |
| "rewards/reward_fn": 0.474083748459816, | |
| "reward": 0.474083748459816, | |
| "reward_std": 0.013858947483822704, | |
| "completion_length": 77.1375, | |
| "kl": 0.1158306747674942, | |
| "epoch": 0.236, | |
| "step": 590 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 23.125, | |
| "learning_rate": 3.8100000000000004e-06, | |
| "rewards/reward_fn": 0.46378999650478364, | |
| "reward": 0.46378999650478364, | |
| "reward_std": 0.02867411085171625, | |
| "completion_length": 78.075, | |
| "kl": 0.1382530927658081, | |
| "epoch": 0.238, | |
| "step": 595 | |
| }, | |
| { | |
| "loss": 0.0069, | |
| "grad_norm": 20.875, | |
| "learning_rate": 3.8000000000000005e-06, | |
| "rewards/reward_fn": 0.44207625091075897, | |
| "reward": 0.44207625091075897, | |
| "reward_std": 0.07887064684182406, | |
| "completion_length": 78.2125, | |
| "kl": 0.17263479977846147, | |
| "epoch": 0.24, | |
| "step": 600 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 22.875, | |
| "learning_rate": 3.79e-06, | |
| "rewards/reward_fn": 0.45089874863624574, | |
| "reward": 0.45089874863624574, | |
| "reward_std": 0.05866381305968389, | |
| "completion_length": 78.15, | |
| "kl": 0.14266471862792968, | |
| "epoch": 0.242, | |
| "step": 605 | |
| }, | |
| { | |
| "loss": 0.0064, | |
| "grad_norm": 20.875, | |
| "learning_rate": 3.7800000000000002e-06, | |
| "rewards/reward_fn": 0.44535249173641206, | |
| "reward": 0.44535249173641206, | |
| "reward_std": 0.06417759947944432, | |
| "completion_length": 77.725, | |
| "kl": 0.15949834659695625, | |
| "epoch": 0.244, | |
| "step": 610 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 21.5, | |
| "learning_rate": 3.7700000000000003e-06, | |
| "rewards/reward_fn": 0.45778937339782716, | |
| "reward": 0.45778937339782716, | |
| "reward_std": 0.03863266622647643, | |
| "completion_length": 78.0375, | |
| "kl": 0.14478488713502885, | |
| "epoch": 0.246, | |
| "step": 615 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 19.5, | |
| "learning_rate": 3.7600000000000004e-06, | |
| "rewards/reward_fn": 0.4707600027322769, | |
| "reward": 0.4707600027322769, | |
| "reward_std": 0.01137657801155001, | |
| "completion_length": 78.65, | |
| "kl": 0.12897173911333085, | |
| "epoch": 0.248, | |
| "step": 620 | |
| }, | |
| { | |
| "loss": 0.0051, | |
| "grad_norm": 17.875, | |
| "learning_rate": 3.7500000000000005e-06, | |
| "rewards/reward_fn": 0.46352937519550325, | |
| "reward": 0.46352937519550325, | |
| "reward_std": 0.024159080686513335, | |
| "completion_length": 77.9375, | |
| "kl": 0.1265575334429741, | |
| "epoch": 0.25, | |
| "step": 625 | |
| }, | |
| { | |
| "loss": 0.0065, | |
| "grad_norm": 27.0, | |
| "learning_rate": 3.74e-06, | |
| "rewards/reward_fn": 0.42510437667369844, | |
| "reward": 0.42510437667369844, | |
| "reward_std": 0.0986353380489163, | |
| "completion_length": 77.4875, | |
| "kl": 0.16288376674056054, | |
| "epoch": 0.252, | |
| "step": 630 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 27.25, | |
| "learning_rate": 3.7300000000000003e-06, | |
| "rewards/reward_fn": 0.45724311769008635, | |
| "reward": 0.45724311769008635, | |
| "reward_std": 0.04627569923177362, | |
| "completion_length": 79.15, | |
| "kl": 0.14429674297571182, | |
| "epoch": 0.254, | |
| "step": 635 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 21.0, | |
| "learning_rate": 3.7200000000000004e-06, | |
| "rewards/reward_fn": 0.45629062950611116, | |
| "reward": 0.45629062950611116, | |
| "reward_std": 0.04499068569857627, | |
| "completion_length": 78.575, | |
| "kl": 0.13493222519755363, | |
| "epoch": 0.256, | |
| "step": 640 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 21.875, | |
| "learning_rate": 3.7100000000000005e-06, | |
| "rewards/reward_fn": 0.45364187359809877, | |
| "reward": 0.45364187359809877, | |
| "reward_std": 0.06047176127322018, | |
| "completion_length": 78.075, | |
| "kl": 0.14743178635835646, | |
| "epoch": 0.258, | |
| "step": 645 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 20.125, | |
| "learning_rate": 3.7e-06, | |
| "rewards/reward_fn": 0.46636125445365906, | |
| "reward": 0.46636125445365906, | |
| "reward_std": 0.024842010554857553, | |
| "completion_length": 77.7, | |
| "kl": 0.12465962767601013, | |
| "epoch": 0.26, | |
| "step": 650 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 20.625, | |
| "learning_rate": 3.6900000000000002e-06, | |
| "rewards/reward_fn": 0.46913875043392184, | |
| "reward": 0.46913875043392184, | |
| "reward_std": 0.014119817013852298, | |
| "completion_length": 79.1375, | |
| "kl": 0.13569475561380387, | |
| "epoch": 0.262, | |
| "step": 655 | |
| }, | |
| { | |
| "loss": 0.0068, | |
| "grad_norm": 20.25, | |
| "learning_rate": 3.6800000000000003e-06, | |
| "rewards/reward_fn": 0.44111000895500185, | |
| "reward": 0.44111000895500185, | |
| "reward_std": 0.09162386588286608, | |
| "completion_length": 78.7125, | |
| "kl": 0.17013774663209916, | |
| "epoch": 0.264, | |
| "step": 660 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 23.625, | |
| "learning_rate": 3.6700000000000004e-06, | |
| "rewards/reward_fn": 0.4559825032949448, | |
| "reward": 0.4559825032949448, | |
| "reward_std": 0.062304181954823436, | |
| "completion_length": 77.8875, | |
| "kl": 0.13616653084754943, | |
| "epoch": 0.266, | |
| "step": 665 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 21.375, | |
| "learning_rate": 3.66e-06, | |
| "rewards/reward_fn": 0.45857687294483185, | |
| "reward": 0.45857687294483185, | |
| "reward_std": 0.027881676610559226, | |
| "completion_length": 77.1125, | |
| "kl": 0.1250321976840496, | |
| "epoch": 0.268, | |
| "step": 670 | |
| }, | |
| { | |
| "loss": 0.0062, | |
| "grad_norm": 21.875, | |
| "learning_rate": 3.65e-06, | |
| "rewards/reward_fn": 0.46213499903678895, | |
| "reward": 0.46213499903678895, | |
| "reward_std": 0.026366882980801164, | |
| "completion_length": 78.0625, | |
| "kl": 0.1547384850680828, | |
| "epoch": 0.27, | |
| "step": 675 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 22.875, | |
| "learning_rate": 3.6400000000000003e-06, | |
| "rewards/reward_fn": 0.4569937527179718, | |
| "reward": 0.4569937527179718, | |
| "reward_std": 0.04252268351847306, | |
| "completion_length": 77.85, | |
| "kl": 0.14238858669996263, | |
| "epoch": 0.272, | |
| "step": 680 | |
| }, | |
| { | |
| "loss": 0.0061, | |
| "grad_norm": 22.125, | |
| "learning_rate": 3.6300000000000004e-06, | |
| "rewards/reward_fn": 0.4151043713092804, | |
| "reward": 0.4151043713092804, | |
| "reward_std": 0.12278079790994526, | |
| "completion_length": 77.125, | |
| "kl": 0.15316254496574402, | |
| "epoch": 0.274, | |
| "step": 685 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 24.125, | |
| "learning_rate": 3.62e-06, | |
| "rewards/reward_fn": 0.45251187682151794, | |
| "reward": 0.45251187682151794, | |
| "reward_std": 0.05636680471943691, | |
| "completion_length": 78.075, | |
| "kl": 0.14139395952224731, | |
| "epoch": 0.276, | |
| "step": 690 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 24.375, | |
| "learning_rate": 3.61e-06, | |
| "rewards/reward_fn": 0.462823748588562, | |
| "reward": 0.462823748588562, | |
| "reward_std": 0.021253089199308305, | |
| "completion_length": 77.7625, | |
| "kl": 0.1295616790652275, | |
| "epoch": 0.278, | |
| "step": 695 | |
| }, | |
| { | |
| "loss": 0.0046, | |
| "grad_norm": 25.75, | |
| "learning_rate": 3.6000000000000003e-06, | |
| "rewards/reward_fn": 0.4587912499904633, | |
| "reward": 0.4587912499904633, | |
| "reward_std": 0.03155275412136689, | |
| "completion_length": 79.1375, | |
| "kl": 0.11457905992865562, | |
| "epoch": 0.28, | |
| "step": 700 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 20.5, | |
| "learning_rate": 3.5900000000000004e-06, | |
| "rewards/reward_fn": 0.45730499029159544, | |
| "reward": 0.45730499029159544, | |
| "reward_std": 0.04703305826988071, | |
| "completion_length": 77.0, | |
| "kl": 0.14419187232851982, | |
| "epoch": 0.282, | |
| "step": 705 | |
| }, | |
| { | |
| "loss": 0.0061, | |
| "grad_norm": 19.125, | |
| "learning_rate": 3.58e-06, | |
| "rewards/reward_fn": 0.44802438020706176, | |
| "reward": 0.44802438020706176, | |
| "reward_std": 0.05318908016197384, | |
| "completion_length": 76.4375, | |
| "kl": 0.15136009827256203, | |
| "epoch": 0.284, | |
| "step": 710 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 21.0, | |
| "learning_rate": 3.57e-06, | |
| "rewards/reward_fn": 0.45345875024795534, | |
| "reward": 0.45345875024795534, | |
| "reward_std": 0.05543687182944268, | |
| "completion_length": 77.1125, | |
| "kl": 0.14021009653806688, | |
| "epoch": 0.286, | |
| "step": 715 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 21.25, | |
| "learning_rate": 3.5600000000000002e-06, | |
| "rewards/reward_fn": 0.45240687429904936, | |
| "reward": 0.45240687429904936, | |
| "reward_std": 0.05269500815775245, | |
| "completion_length": 77.8125, | |
| "kl": 0.1341713160276413, | |
| "epoch": 0.288, | |
| "step": 720 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 23.375, | |
| "learning_rate": 3.5500000000000003e-06, | |
| "rewards/reward_fn": 0.4583718776702881, | |
| "reward": 0.4583718776702881, | |
| "reward_std": 0.04077405421994627, | |
| "completion_length": 78.5375, | |
| "kl": 0.13093890696763993, | |
| "epoch": 0.29, | |
| "step": 725 | |
| }, | |
| { | |
| "loss": 0.0072, | |
| "grad_norm": 20.125, | |
| "learning_rate": 3.54e-06, | |
| "rewards/reward_fn": 0.434508752822876, | |
| "reward": 0.434508752822876, | |
| "reward_std": 0.09370574047788978, | |
| "completion_length": 76.6375, | |
| "kl": 0.18059465438127517, | |
| "epoch": 0.292, | |
| "step": 730 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 22.375, | |
| "learning_rate": 3.53e-06, | |
| "rewards/reward_fn": 0.4609118640422821, | |
| "reward": 0.4609118640422821, | |
| "reward_std": 0.04159380637574941, | |
| "completion_length": 77.5875, | |
| "kl": 0.14632384702563286, | |
| "epoch": 0.294, | |
| "step": 735 | |
| }, | |
| { | |
| "loss": 0.0064, | |
| "grad_norm": 21.5, | |
| "learning_rate": 3.52e-06, | |
| "rewards/reward_fn": 0.416993123292923, | |
| "reward": 0.416993123292923, | |
| "reward_std": 0.11569311295170337, | |
| "completion_length": 76.4875, | |
| "kl": 0.15963388308882714, | |
| "epoch": 0.296, | |
| "step": 740 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 22.0, | |
| "learning_rate": 3.5100000000000003e-06, | |
| "rewards/reward_fn": 0.4675106227397919, | |
| "reward": 0.4675106227397919, | |
| "reward_std": 0.013280918868258596, | |
| "completion_length": 78.3, | |
| "kl": 0.13514449894428254, | |
| "epoch": 0.298, | |
| "step": 745 | |
| }, | |
| { | |
| "loss": 0.0048, | |
| "grad_norm": 20.0, | |
| "learning_rate": 3.5e-06, | |
| "rewards/reward_fn": 0.45719312131404877, | |
| "reward": 0.45719312131404877, | |
| "reward_std": 0.03967158079613, | |
| "completion_length": 78.35, | |
| "kl": 0.1188413679599762, | |
| "epoch": 0.3, | |
| "step": 750 | |
| }, | |
| { | |
| "loss": 0.0061, | |
| "grad_norm": 23.875, | |
| "learning_rate": 3.49e-06, | |
| "rewards/reward_fn": 0.45698000490665436, | |
| "reward": 0.45698000490665436, | |
| "reward_std": 0.040315793512854727, | |
| "completion_length": 77.0875, | |
| "kl": 0.15275436490774155, | |
| "epoch": 0.302, | |
| "step": 755 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 20.75, | |
| "learning_rate": 3.48e-06, | |
| "rewards/reward_fn": 0.4397631257772446, | |
| "reward": 0.4397631257772446, | |
| "reward_std": 0.05836378745734692, | |
| "completion_length": 78.3, | |
| "kl": 0.14592362120747565, | |
| "epoch": 0.304, | |
| "step": 760 | |
| }, | |
| { | |
| "loss": 0.0063, | |
| "grad_norm": 23.625, | |
| "learning_rate": 3.4700000000000002e-06, | |
| "rewards/reward_fn": 0.43903999626636503, | |
| "reward": 0.43903999626636503, | |
| "reward_std": 0.08307434991002083, | |
| "completion_length": 78.5875, | |
| "kl": 0.1567191883921623, | |
| "epoch": 0.306, | |
| "step": 765 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 16.25, | |
| "learning_rate": 3.46e-06, | |
| "rewards/reward_fn": 0.46542062163352965, | |
| "reward": 0.46542062163352965, | |
| "reward_std": 0.024025356164202094, | |
| "completion_length": 78.125, | |
| "kl": 0.14618832543492316, | |
| "epoch": 0.308, | |
| "step": 770 | |
| }, | |
| { | |
| "loss": 0.0061, | |
| "grad_norm": 25.375, | |
| "learning_rate": 3.45e-06, | |
| "rewards/reward_fn": 0.46039311587810516, | |
| "reward": 0.46039311587810516, | |
| "reward_std": 0.03917545401491225, | |
| "completion_length": 76.3375, | |
| "kl": 0.15213449746370317, | |
| "epoch": 0.31, | |
| "step": 775 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 23.5, | |
| "learning_rate": 3.44e-06, | |
| "rewards/reward_fn": 0.4595668792724609, | |
| "reward": 0.4595668792724609, | |
| "reward_std": 0.03896486459998414, | |
| "completion_length": 77.925, | |
| "kl": 0.1365116611123085, | |
| "epoch": 0.312, | |
| "step": 780 | |
| }, | |
| { | |
| "loss": 0.0077, | |
| "grad_norm": 22.375, | |
| "learning_rate": 3.4300000000000006e-06, | |
| "rewards/reward_fn": 0.4467168778181076, | |
| "reward": 0.4467168778181076, | |
| "reward_std": 0.06692771762609481, | |
| "completion_length": 77.9, | |
| "kl": 0.19316297993063927, | |
| "epoch": 0.314, | |
| "step": 785 | |
| }, | |
| { | |
| "loss": 0.0064, | |
| "grad_norm": 21.0, | |
| "learning_rate": 3.4200000000000007e-06, | |
| "rewards/reward_fn": 0.4581025063991547, | |
| "reward": 0.4581025063991547, | |
| "reward_std": 0.043769028829410674, | |
| "completion_length": 75.4875, | |
| "kl": 0.161041110008955, | |
| "epoch": 0.316, | |
| "step": 790 | |
| }, | |
| { | |
| "loss": 0.0077, | |
| "grad_norm": 24.5, | |
| "learning_rate": 3.4100000000000004e-06, | |
| "rewards/reward_fn": 0.4519962579011917, | |
| "reward": 0.4519962579011917, | |
| "reward_std": 0.07411843243753538, | |
| "completion_length": 75.875, | |
| "kl": 0.19167449921369553, | |
| "epoch": 0.318, | |
| "step": 795 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 23.125, | |
| "learning_rate": 3.4000000000000005e-06, | |
| "rewards/reward_fn": 0.45835437476634977, | |
| "reward": 0.45835437476634977, | |
| "reward_std": 0.03227461196947843, | |
| "completion_length": 76.7125, | |
| "kl": 0.14751672148704528, | |
| "epoch": 0.32, | |
| "step": 800 | |
| }, | |
| { | |
| "loss": 0.006, | |
| "grad_norm": 20.75, | |
| "learning_rate": 3.3900000000000006e-06, | |
| "rewards/reward_fn": 0.4664868742227554, | |
| "reward": 0.4664868742227554, | |
| "reward_std": 0.0312751340912655, | |
| "completion_length": 75.575, | |
| "kl": 0.15016857534646988, | |
| "epoch": 0.322, | |
| "step": 805 | |
| }, | |
| { | |
| "loss": 0.0073, | |
| "grad_norm": 18.0, | |
| "learning_rate": 3.3800000000000007e-06, | |
| "rewards/reward_fn": 0.45181562900543215, | |
| "reward": 0.45181562900543215, | |
| "reward_std": 0.06425200761295854, | |
| "completion_length": 77.3125, | |
| "kl": 0.18286750614643096, | |
| "epoch": 0.324, | |
| "step": 810 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 21.75, | |
| "learning_rate": 3.3700000000000003e-06, | |
| "rewards/reward_fn": 0.45358812212944033, | |
| "reward": 0.45358812212944033, | |
| "reward_std": 0.05638027461245656, | |
| "completion_length": 77.3125, | |
| "kl": 0.14141111373901366, | |
| "epoch": 0.326, | |
| "step": 815 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 20.125, | |
| "learning_rate": 3.3600000000000004e-06, | |
| "rewards/reward_fn": 0.46734937429428103, | |
| "reward": 0.46734937429428103, | |
| "reward_std": 0.02419458368094638, | |
| "completion_length": 77.1, | |
| "kl": 0.13360125049948693, | |
| "epoch": 0.328, | |
| "step": 820 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 23.75, | |
| "learning_rate": 3.3500000000000005e-06, | |
| "rewards/reward_fn": 0.45999937057495116, | |
| "reward": 0.45999937057495116, | |
| "reward_std": 0.0442831747001037, | |
| "completion_length": 76.3375, | |
| "kl": 0.14247470945119858, | |
| "epoch": 0.33, | |
| "step": 825 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 25.0, | |
| "learning_rate": 3.3400000000000006e-06, | |
| "rewards/reward_fn": 0.4691068768501282, | |
| "reward": 0.4691068768501282, | |
| "reward_std": 0.013599430792964995, | |
| "completion_length": 76.95, | |
| "kl": 0.1476905442774296, | |
| "epoch": 0.332, | |
| "step": 830 | |
| }, | |
| { | |
| "loss": 0.0064, | |
| "grad_norm": 22.25, | |
| "learning_rate": 3.3300000000000003e-06, | |
| "rewards/reward_fn": 0.43902124762535094, | |
| "reward": 0.43902124762535094, | |
| "reward_std": 0.09761263309046626, | |
| "completion_length": 76.625, | |
| "kl": 0.16074835285544395, | |
| "epoch": 0.334, | |
| "step": 835 | |
| }, | |
| { | |
| "loss": 0.0064, | |
| "grad_norm": 22.375, | |
| "learning_rate": 3.3200000000000004e-06, | |
| "rewards/reward_fn": 0.4529812455177307, | |
| "reward": 0.4529812455177307, | |
| "reward_std": 0.04783163331449032, | |
| "completion_length": 77.6125, | |
| "kl": 0.1611533671617508, | |
| "epoch": 0.336, | |
| "step": 840 | |
| }, | |
| { | |
| "loss": 0.0064, | |
| "grad_norm": 24.375, | |
| "learning_rate": 3.3100000000000005e-06, | |
| "rewards/reward_fn": 0.45019249618053436, | |
| "reward": 0.45019249618053436, | |
| "reward_std": 0.0602539261453785, | |
| "completion_length": 76.8125, | |
| "kl": 0.1599690869450569, | |
| "epoch": 0.338, | |
| "step": 845 | |
| }, | |
| { | |
| "loss": 0.0062, | |
| "grad_norm": 25.125, | |
| "learning_rate": 3.3000000000000006e-06, | |
| "rewards/reward_fn": 0.4448312520980835, | |
| "reward": 0.4448312520980835, | |
| "reward_std": 0.08103471701033413, | |
| "completion_length": 74.9875, | |
| "kl": 0.15435032844543456, | |
| "epoch": 0.34, | |
| "step": 850 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 24.5, | |
| "learning_rate": 3.2900000000000003e-06, | |
| "rewards/reward_fn": 0.4587881326675415, | |
| "reward": 0.4587881326675415, | |
| "reward_std": 0.03882696847431362, | |
| "completion_length": 75.775, | |
| "kl": 0.14002252742648125, | |
| "epoch": 0.342, | |
| "step": 855 | |
| }, | |
| { | |
| "loss": 0.0074, | |
| "grad_norm": 20.875, | |
| "learning_rate": 3.2800000000000004e-06, | |
| "rewards/reward_fn": 0.4507631242275238, | |
| "reward": 0.4507631242275238, | |
| "reward_std": 0.05658294195309281, | |
| "completion_length": 76.5125, | |
| "kl": 0.18587008863687515, | |
| "epoch": 0.344, | |
| "step": 860 | |
| }, | |
| { | |
| "loss": 0.0051, | |
| "grad_norm": 20.75, | |
| "learning_rate": 3.2700000000000005e-06, | |
| "rewards/reward_fn": 0.46349187195301056, | |
| "reward": 0.46349187195301056, | |
| "reward_std": 0.032273246673867106, | |
| "completion_length": 77.775, | |
| "kl": 0.1273516111075878, | |
| "epoch": 0.346, | |
| "step": 865 | |
| }, | |
| { | |
| "loss": 0.0063, | |
| "grad_norm": 23.5, | |
| "learning_rate": 3.2600000000000006e-06, | |
| "rewards/reward_fn": 0.45839687883853913, | |
| "reward": 0.45839687883853913, | |
| "reward_std": 0.041816312330774964, | |
| "completion_length": 76.4375, | |
| "kl": 0.15825477614998817, | |
| "epoch": 0.348, | |
| "step": 870 | |
| }, | |
| { | |
| "loss": 0.0065, | |
| "grad_norm": 21.375, | |
| "learning_rate": 3.2500000000000002e-06, | |
| "rewards/reward_fn": 0.45482062697410586, | |
| "reward": 0.45482062697410586, | |
| "reward_std": 0.0653240518644452, | |
| "completion_length": 76.7125, | |
| "kl": 0.16268835961818695, | |
| "epoch": 0.35, | |
| "step": 875 | |
| }, | |
| { | |
| "loss": 0.0063, | |
| "grad_norm": 21.375, | |
| "learning_rate": 3.2400000000000003e-06, | |
| "rewards/reward_fn": 0.4411893755197525, | |
| "reward": 0.4411893755197525, | |
| "reward_std": 0.08931890472304076, | |
| "completion_length": 76.125, | |
| "kl": 0.15622055530548096, | |
| "epoch": 0.352, | |
| "step": 880 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 24.375, | |
| "learning_rate": 3.2300000000000004e-06, | |
| "rewards/reward_fn": 0.4543387472629547, | |
| "reward": 0.4543387472629547, | |
| "reward_std": 0.05997409771662206, | |
| "completion_length": 77.4875, | |
| "kl": 0.14859429150819778, | |
| "epoch": 0.354, | |
| "step": 885 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 23.5, | |
| "learning_rate": 3.2200000000000005e-06, | |
| "rewards/reward_fn": 0.4376031279563904, | |
| "reward": 0.4376031279563904, | |
| "reward_std": 0.07789694773964584, | |
| "completion_length": 78.6875, | |
| "kl": 0.13580713272094727, | |
| "epoch": 0.356, | |
| "step": 890 | |
| }, | |
| { | |
| "loss": 0.007, | |
| "grad_norm": 27.0, | |
| "learning_rate": 3.21e-06, | |
| "rewards/reward_fn": 0.4595912516117096, | |
| "reward": 0.4595912516117096, | |
| "reward_std": 0.03936622152104974, | |
| "completion_length": 77.8, | |
| "kl": 0.17524173483252525, | |
| "epoch": 0.358, | |
| "step": 895 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 20.625, | |
| "learning_rate": 3.2000000000000003e-06, | |
| "rewards/reward_fn": 0.45086687207221987, | |
| "reward": 0.45086687207221987, | |
| "reward_std": 0.06653416159097106, | |
| "completion_length": 78.225, | |
| "kl": 0.13200628608465195, | |
| "epoch": 0.36, | |
| "step": 900 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 21.5, | |
| "learning_rate": 3.1900000000000004e-06, | |
| "rewards/reward_fn": 0.44835312366485597, | |
| "reward": 0.44835312366485597, | |
| "reward_std": 0.061607802627258935, | |
| "completion_length": 78.45, | |
| "kl": 0.14225002825260163, | |
| "epoch": 0.362, | |
| "step": 905 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 21.875, | |
| "learning_rate": 3.1800000000000005e-06, | |
| "rewards/reward_fn": 0.45717000365257265, | |
| "reward": 0.45717000365257265, | |
| "reward_std": 0.041998466942459345, | |
| "completion_length": 78.5125, | |
| "kl": 0.12377910763025284, | |
| "epoch": 0.364, | |
| "step": 910 | |
| }, | |
| { | |
| "loss": 0.0048, | |
| "grad_norm": 22.75, | |
| "learning_rate": 3.17e-06, | |
| "rewards/reward_fn": 0.4658468782901764, | |
| "reward": 0.4658468782901764, | |
| "reward_std": 0.021458613453432918, | |
| "completion_length": 77.6625, | |
| "kl": 0.12044140994548798, | |
| "epoch": 0.366, | |
| "step": 915 | |
| }, | |
| { | |
| "loss": 0.0068, | |
| "grad_norm": 22.875, | |
| "learning_rate": 3.1600000000000002e-06, | |
| "rewards/reward_fn": 0.4457137495279312, | |
| "reward": 0.4457137495279312, | |
| "reward_std": 0.07773053634446114, | |
| "completion_length": 77.0125, | |
| "kl": 0.16893841549754143, | |
| "epoch": 0.368, | |
| "step": 920 | |
| }, | |
| { | |
| "loss": 0.0072, | |
| "grad_norm": 22.875, | |
| "learning_rate": 3.1500000000000003e-06, | |
| "rewards/reward_fn": 0.4391768783330917, | |
| "reward": 0.4391768783330917, | |
| "reward_std": 0.08680278662359342, | |
| "completion_length": 76.8375, | |
| "kl": 0.1803253024816513, | |
| "epoch": 0.37, | |
| "step": 925 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 22.625, | |
| "learning_rate": 3.1400000000000004e-06, | |
| "rewards/reward_fn": 0.4521843731403351, | |
| "reward": 0.4521843731403351, | |
| "reward_std": 0.06457424827385694, | |
| "completion_length": 77.875, | |
| "kl": 0.14004313349723815, | |
| "epoch": 0.372, | |
| "step": 930 | |
| }, | |
| { | |
| "loss": 0.0061, | |
| "grad_norm": 22.625, | |
| "learning_rate": 3.13e-06, | |
| "rewards/reward_fn": 0.4524868756532669, | |
| "reward": 0.4524868756532669, | |
| "reward_std": 0.048214147449471056, | |
| "completion_length": 77.3375, | |
| "kl": 0.15322432667016983, | |
| "epoch": 0.374, | |
| "step": 935 | |
| }, | |
| { | |
| "loss": 0.0071, | |
| "grad_norm": 22.375, | |
| "learning_rate": 3.12e-06, | |
| "rewards/reward_fn": 0.4452850043773651, | |
| "reward": 0.4452850043773651, | |
| "reward_std": 0.07152452755253762, | |
| "completion_length": 77.6, | |
| "kl": 0.17651870474219322, | |
| "epoch": 0.376, | |
| "step": 940 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 21.25, | |
| "learning_rate": 3.1100000000000003e-06, | |
| "rewards/reward_fn": 0.4586562544107437, | |
| "reward": 0.4586562544107437, | |
| "reward_std": 0.04483227517921477, | |
| "completion_length": 78.25, | |
| "kl": 0.13818887621164322, | |
| "epoch": 0.378, | |
| "step": 945 | |
| }, | |
| { | |
| "loss": 0.0051, | |
| "grad_norm": 21.75, | |
| "learning_rate": 3.1000000000000004e-06, | |
| "rewards/reward_fn": 0.4671787559986115, | |
| "reward": 0.4671787559986115, | |
| "reward_std": 0.02326571140438318, | |
| "completion_length": 78.575, | |
| "kl": 0.1284794516861439, | |
| "epoch": 0.38, | |
| "step": 950 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 24.25, | |
| "learning_rate": 3.09e-06, | |
| "rewards/reward_fn": 0.4639474958181381, | |
| "reward": 0.4639474958181381, | |
| "reward_std": 0.03198056248947978, | |
| "completion_length": 78.525, | |
| "kl": 0.1249109148979187, | |
| "epoch": 0.382, | |
| "step": 955 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 26.5, | |
| "learning_rate": 3.08e-06, | |
| "rewards/reward_fn": 0.4462443798780441, | |
| "reward": 0.4462443798780441, | |
| "reward_std": 0.06451276817824692, | |
| "completion_length": 77.025, | |
| "kl": 0.13725997805595397, | |
| "epoch": 0.384, | |
| "step": 960 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 24.375, | |
| "learning_rate": 3.0700000000000003e-06, | |
| "rewards/reward_fn": 0.43646687269210815, | |
| "reward": 0.43646687269210815, | |
| "reward_std": 0.10176013394957409, | |
| "completion_length": 77.9625, | |
| "kl": 0.14228134751319885, | |
| "epoch": 0.386, | |
| "step": 965 | |
| }, | |
| { | |
| "loss": 0.0051, | |
| "grad_norm": 21.875, | |
| "learning_rate": 3.0600000000000003e-06, | |
| "rewards/reward_fn": 0.45134938657283785, | |
| "reward": 0.45134938657283785, | |
| "reward_std": 0.06808145013637841, | |
| "completion_length": 78.4875, | |
| "kl": 0.1274636261165142, | |
| "epoch": 0.388, | |
| "step": 970 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 21.125, | |
| "learning_rate": 3.05e-06, | |
| "rewards/reward_fn": 0.43994062542915346, | |
| "reward": 0.43994062542915346, | |
| "reward_std": 0.09077681568451226, | |
| "completion_length": 77.375, | |
| "kl": 0.13992855474352836, | |
| "epoch": 0.39, | |
| "step": 975 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 20.25, | |
| "learning_rate": 3.04e-06, | |
| "rewards/reward_fn": 0.4459106236696243, | |
| "reward": 0.4459106236696243, | |
| "reward_std": 0.07173144910484552, | |
| "completion_length": 77.6875, | |
| "kl": 0.13269591480493545, | |
| "epoch": 0.392, | |
| "step": 980 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 20.5, | |
| "learning_rate": 3.0300000000000002e-06, | |
| "rewards/reward_fn": 0.452276873588562, | |
| "reward": 0.452276873588562, | |
| "reward_std": 0.058129315462429075, | |
| "completion_length": 76.2125, | |
| "kl": 0.14192070737481116, | |
| "epoch": 0.394, | |
| "step": 985 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 21.75, | |
| "learning_rate": 3.0200000000000003e-06, | |
| "rewards/reward_fn": 0.4616843730211258, | |
| "reward": 0.4616843730211258, | |
| "reward_std": 0.027600679779425263, | |
| "completion_length": 76.6, | |
| "kl": 0.12908575385808946, | |
| "epoch": 0.396, | |
| "step": 990 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 20.125, | |
| "learning_rate": 3.01e-06, | |
| "rewards/reward_fn": 0.45958187282085416, | |
| "reward": 0.45958187282085416, | |
| "reward_std": 0.041698419768363235, | |
| "completion_length": 77.6375, | |
| "kl": 0.14417157247662543, | |
| "epoch": 0.398, | |
| "step": 995 | |
| }, | |
| { | |
| "loss": 0.0047, | |
| "grad_norm": 21.875, | |
| "learning_rate": 3e-06, | |
| "rewards/reward_fn": 0.45577124059200286, | |
| "reward": 0.45577124059200286, | |
| "reward_std": 0.061070334317628296, | |
| "completion_length": 77.5625, | |
| "kl": 0.11728422567248345, | |
| "epoch": 0.4, | |
| "step": 1000 | |
| }, | |
| { | |
| "loss": 0.0043, | |
| "grad_norm": 23.0, | |
| "learning_rate": 2.99e-06, | |
| "rewards/reward_fn": 0.4588618755340576, | |
| "reward": 0.4588618755340576, | |
| "reward_std": 0.036662753293057904, | |
| "completion_length": 76.8625, | |
| "kl": 0.10696139335632324, | |
| "epoch": 0.402, | |
| "step": 1005 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 22.125, | |
| "learning_rate": 2.9800000000000003e-06, | |
| "rewards/reward_fn": 0.4509599953889847, | |
| "reward": 0.4509599953889847, | |
| "reward_std": 0.04541698046959937, | |
| "completion_length": 78.7375, | |
| "kl": 0.1384617082774639, | |
| "epoch": 0.404, | |
| "step": 1010 | |
| }, | |
| { | |
| "loss": 0.0048, | |
| "grad_norm": 25.25, | |
| "learning_rate": 2.97e-06, | |
| "rewards/reward_fn": 0.4705031216144562, | |
| "reward": 0.4705031216144562, | |
| "reward_std": 0.013416963210329414, | |
| "completion_length": 76.9, | |
| "kl": 0.11976072862744332, | |
| "epoch": 0.406, | |
| "step": 1015 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 22.75, | |
| "learning_rate": 2.96e-06, | |
| "rewards/reward_fn": 0.46544250547885896, | |
| "reward": 0.46544250547885896, | |
| "reward_std": 0.026560991373844444, | |
| "completion_length": 76.4625, | |
| "kl": 0.13976338282227516, | |
| "epoch": 0.408, | |
| "step": 1020 | |
| }, | |
| { | |
| "loss": 0.0062, | |
| "grad_norm": 21.125, | |
| "learning_rate": 2.95e-06, | |
| "rewards/reward_fn": 0.46479061543941497, | |
| "reward": 0.46479061543941497, | |
| "reward_std": 0.02370762478094548, | |
| "completion_length": 75.7625, | |
| "kl": 0.1556813433766365, | |
| "epoch": 0.41, | |
| "step": 1025 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 19.625, | |
| "learning_rate": 2.9400000000000002e-06, | |
| "rewards/reward_fn": 0.4578593820333481, | |
| "reward": 0.4578593820333481, | |
| "reward_std": 0.04385443233186379, | |
| "completion_length": 76.7875, | |
| "kl": 0.13361710608005523, | |
| "epoch": 0.412, | |
| "step": 1030 | |
| }, | |
| { | |
| "loss": 0.0081, | |
| "grad_norm": 25.875, | |
| "learning_rate": 2.93e-06, | |
| "rewards/reward_fn": 0.42873625457286835, | |
| "reward": 0.42873625457286835, | |
| "reward_std": 0.11082857861183584, | |
| "completion_length": 74.8125, | |
| "kl": 0.20237903594970702, | |
| "epoch": 0.414, | |
| "step": 1035 | |
| }, | |
| { | |
| "loss": 0.0051, | |
| "grad_norm": 18.5, | |
| "learning_rate": 2.92e-06, | |
| "rewards/reward_fn": 0.4592456161975861, | |
| "reward": 0.4592456161975861, | |
| "reward_std": 0.042502091301139446, | |
| "completion_length": 76.25, | |
| "kl": 0.1265183039009571, | |
| "epoch": 0.416, | |
| "step": 1040 | |
| }, | |
| { | |
| "loss": 0.006, | |
| "grad_norm": 21.0, | |
| "learning_rate": 2.91e-06, | |
| "rewards/reward_fn": 0.44570625126361846, | |
| "reward": 0.44570625126361846, | |
| "reward_std": 0.07765153090003878, | |
| "completion_length": 76.525, | |
| "kl": 0.14906007573008537, | |
| "epoch": 0.418, | |
| "step": 1045 | |
| }, | |
| { | |
| "loss": 0.0067, | |
| "grad_norm": 20.75, | |
| "learning_rate": 2.9e-06, | |
| "rewards/reward_fn": 0.4367462515830994, | |
| "reward": 0.4367462515830994, | |
| "reward_std": 0.0928474075277336, | |
| "completion_length": 77.8, | |
| "kl": 0.16817878931760788, | |
| "epoch": 0.42, | |
| "step": 1050 | |
| }, | |
| { | |
| "loss": 0.0066, | |
| "grad_norm": 20.25, | |
| "learning_rate": 2.89e-06, | |
| "rewards/reward_fn": 0.45984499156475067, | |
| "reward": 0.45984499156475067, | |
| "reward_std": 0.03933965916512534, | |
| "completion_length": 77.275, | |
| "kl": 0.16533141881227492, | |
| "epoch": 0.422, | |
| "step": 1055 | |
| }, | |
| { | |
| "loss": 0.0063, | |
| "grad_norm": 22.25, | |
| "learning_rate": 2.88e-06, | |
| "rewards/reward_fn": 0.43899562656879426, | |
| "reward": 0.43899562656879426, | |
| "reward_std": 0.08788106166757644, | |
| "completion_length": 75.75, | |
| "kl": 0.15769053027033805, | |
| "epoch": 0.424, | |
| "step": 1060 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 22.25, | |
| "learning_rate": 2.87e-06, | |
| "rewards/reward_fn": 0.423912501335144, | |
| "reward": 0.423912501335144, | |
| "reward_std": 0.11766294327098877, | |
| "completion_length": 76.475, | |
| "kl": 0.14753883704543114, | |
| "epoch": 0.426, | |
| "step": 1065 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 25.25, | |
| "learning_rate": 2.86e-06, | |
| "rewards/reward_fn": 0.46032374203205106, | |
| "reward": 0.46032374203205106, | |
| "reward_std": 0.03572893298696726, | |
| "completion_length": 77.825, | |
| "kl": 0.13863224387168885, | |
| "epoch": 0.428, | |
| "step": 1070 | |
| }, | |
| { | |
| "loss": 0.0049, | |
| "grad_norm": 20.625, | |
| "learning_rate": 2.85e-06, | |
| "rewards/reward_fn": 0.4649974972009659, | |
| "reward": 0.4649974972009659, | |
| "reward_std": 0.03036914155818522, | |
| "completion_length": 75.8, | |
| "kl": 0.12206159606575966, | |
| "epoch": 0.43, | |
| "step": 1075 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 20.125, | |
| "learning_rate": 2.84e-06, | |
| "rewards/reward_fn": 0.4573018759489059, | |
| "reward": 0.4573018759489059, | |
| "reward_std": 0.06353021854301914, | |
| "completion_length": 77.575, | |
| "kl": 0.1350351519882679, | |
| "epoch": 0.432, | |
| "step": 1080 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 22.25, | |
| "learning_rate": 2.83e-06, | |
| "rewards/reward_fn": 0.43689249753952025, | |
| "reward": 0.43689249753952025, | |
| "reward_std": 0.09802878738846629, | |
| "completion_length": 76.1625, | |
| "kl": 0.13650911152362824, | |
| "epoch": 0.434, | |
| "step": 1085 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 21.875, | |
| "learning_rate": 2.82e-06, | |
| "rewards/reward_fn": 0.465862500667572, | |
| "reward": 0.465862500667572, | |
| "reward_std": 0.023497561831027268, | |
| "completion_length": 77.325, | |
| "kl": 0.13802992850542067, | |
| "epoch": 0.436, | |
| "step": 1090 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 22.75, | |
| "learning_rate": 2.8100000000000006e-06, | |
| "rewards/reward_fn": 0.4442381262779236, | |
| "reward": 0.4442381262779236, | |
| "reward_std": 0.0735843145288527, | |
| "completion_length": 77.4875, | |
| "kl": 0.14121268913149834, | |
| "epoch": 0.438, | |
| "step": 1095 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 22.375, | |
| "learning_rate": 2.8000000000000003e-06, | |
| "rewards/reward_fn": 0.44780624806880953, | |
| "reward": 0.44780624806880953, | |
| "reward_std": 0.08063485231250525, | |
| "completion_length": 77.8, | |
| "kl": 0.14030690044164656, | |
| "epoch": 0.44, | |
| "step": 1100 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 20.5, | |
| "learning_rate": 2.7900000000000004e-06, | |
| "rewards/reward_fn": 0.45844624638557435, | |
| "reward": 0.45844624638557435, | |
| "reward_std": 0.05268092898186296, | |
| "completion_length": 77.7, | |
| "kl": 0.14547136351466178, | |
| "epoch": 0.442, | |
| "step": 1105 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 19.5, | |
| "learning_rate": 2.7800000000000005e-06, | |
| "rewards/reward_fn": 0.43803000152111055, | |
| "reward": 0.43803000152111055, | |
| "reward_std": 0.10099421259947121, | |
| "completion_length": 78.0375, | |
| "kl": 0.14180475547909738, | |
| "epoch": 0.444, | |
| "step": 1110 | |
| }, | |
| { | |
| "loss": 0.0045, | |
| "grad_norm": 21.375, | |
| "learning_rate": 2.7700000000000006e-06, | |
| "rewards/reward_fn": 0.4747843772172928, | |
| "reward": 0.4747843772172928, | |
| "reward_std": 0.01129134335787967, | |
| "completion_length": 77.5875, | |
| "kl": 0.11243945509195327, | |
| "epoch": 0.446, | |
| "step": 1115 | |
| }, | |
| { | |
| "loss": 0.0061, | |
| "grad_norm": 21.625, | |
| "learning_rate": 2.7600000000000003e-06, | |
| "rewards/reward_fn": 0.4492399960756302, | |
| "reward": 0.4492399960756302, | |
| "reward_std": 0.06491480625700205, | |
| "completion_length": 78.9375, | |
| "kl": 0.15152825638651848, | |
| "epoch": 0.448, | |
| "step": 1120 | |
| }, | |
| { | |
| "loss": 0.0068, | |
| "grad_norm": 23.25, | |
| "learning_rate": 2.7500000000000004e-06, | |
| "rewards/reward_fn": 0.44580812752246857, | |
| "reward": 0.44580812752246857, | |
| "reward_std": 0.08126231417991221, | |
| "completion_length": 78.075, | |
| "kl": 0.1693297281861305, | |
| "epoch": 0.45, | |
| "step": 1125 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 20.125, | |
| "learning_rate": 2.7400000000000004e-06, | |
| "rewards/reward_fn": 0.4513862580060959, | |
| "reward": 0.4513862580060959, | |
| "reward_std": 0.04983757671434432, | |
| "completion_length": 75.9625, | |
| "kl": 0.1426179051399231, | |
| "epoch": 0.452, | |
| "step": 1130 | |
| }, | |
| { | |
| "loss": 0.0061, | |
| "grad_norm": 21.875, | |
| "learning_rate": 2.7300000000000005e-06, | |
| "rewards/reward_fn": 0.449181866645813, | |
| "reward": 0.449181866645813, | |
| "reward_std": 0.05518764650914818, | |
| "completion_length": 76.775, | |
| "kl": 0.1532064698636532, | |
| "epoch": 0.454, | |
| "step": 1135 | |
| }, | |
| { | |
| "loss": 0.0061, | |
| "grad_norm": 21.5, | |
| "learning_rate": 2.7200000000000002e-06, | |
| "rewards/reward_fn": 0.45171125829219816, | |
| "reward": 0.45171125829219816, | |
| "reward_std": 0.05260382960550487, | |
| "completion_length": 77.9375, | |
| "kl": 0.15185603350400925, | |
| "epoch": 0.456, | |
| "step": 1140 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 21.375, | |
| "learning_rate": 2.7100000000000003e-06, | |
| "rewards/reward_fn": 0.4606387555599213, | |
| "reward": 0.4606387555599213, | |
| "reward_std": 0.03888747000601143, | |
| "completion_length": 76.275, | |
| "kl": 0.1298865035176277, | |
| "epoch": 0.458, | |
| "step": 1145 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 24.375, | |
| "learning_rate": 2.7000000000000004e-06, | |
| "rewards/reward_fn": 0.43960937559604646, | |
| "reward": 0.43960937559604646, | |
| "reward_std": 0.1048707491834648, | |
| "completion_length": 77.4625, | |
| "kl": 0.14175623878836632, | |
| "epoch": 0.46, | |
| "step": 1150 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 20.375, | |
| "learning_rate": 2.6900000000000005e-06, | |
| "rewards/reward_fn": 0.44681625366210936, | |
| "reward": 0.44681625366210936, | |
| "reward_std": 0.07011992897605523, | |
| "completion_length": 78.4375, | |
| "kl": 0.12534804567694663, | |
| "epoch": 0.462, | |
| "step": 1155 | |
| }, | |
| { | |
| "loss": 0.0044, | |
| "grad_norm": 20.5, | |
| "learning_rate": 2.68e-06, | |
| "rewards/reward_fn": 0.4641531229019165, | |
| "reward": 0.4641531229019165, | |
| "reward_std": 0.025870742078404875, | |
| "completion_length": 78.5, | |
| "kl": 0.10891071110963821, | |
| "epoch": 0.464, | |
| "step": 1160 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 21.125, | |
| "learning_rate": 2.6700000000000003e-06, | |
| "rewards/reward_fn": 0.4655087530612946, | |
| "reward": 0.4655087530612946, | |
| "reward_std": 0.03508747317828238, | |
| "completion_length": 78.4125, | |
| "kl": 0.1420759491622448, | |
| "epoch": 0.466, | |
| "step": 1165 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 19.875, | |
| "learning_rate": 2.6600000000000004e-06, | |
| "rewards/reward_fn": 0.45731625854969027, | |
| "reward": 0.45731625854969027, | |
| "reward_std": 0.03957532516214997, | |
| "completion_length": 78.4375, | |
| "kl": 0.13185337632894517, | |
| "epoch": 0.468, | |
| "step": 1170 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 22.25, | |
| "learning_rate": 2.6500000000000005e-06, | |
| "rewards/reward_fn": 0.44373124837875366, | |
| "reward": 0.44373124837875366, | |
| "reward_std": 0.07896788076031953, | |
| "completion_length": 76.2625, | |
| "kl": 0.13021735474467278, | |
| "epoch": 0.47, | |
| "step": 1175 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 20.75, | |
| "learning_rate": 2.64e-06, | |
| "rewards/reward_fn": 0.45281187295913694, | |
| "reward": 0.45281187295913694, | |
| "reward_std": 0.05061942492611706, | |
| "completion_length": 78.2375, | |
| "kl": 0.13731320798397065, | |
| "epoch": 0.472, | |
| "step": 1180 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 20.875, | |
| "learning_rate": 2.6300000000000002e-06, | |
| "rewards/reward_fn": 0.4562093824148178, | |
| "reward": 0.4562093824148178, | |
| "reward_std": 0.040638361941091716, | |
| "completion_length": 77.3625, | |
| "kl": 0.1344783328473568, | |
| "epoch": 0.474, | |
| "step": 1185 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 18.125, | |
| "learning_rate": 2.6200000000000003e-06, | |
| "rewards/reward_fn": 0.4580037474632263, | |
| "reward": 0.4580037474632263, | |
| "reward_std": 0.05447399332770146, | |
| "completion_length": 78.775, | |
| "kl": 0.12405369728803635, | |
| "epoch": 0.476, | |
| "step": 1190 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 21.875, | |
| "learning_rate": 2.6100000000000004e-06, | |
| "rewards/reward_fn": 0.45782187283039094, | |
| "reward": 0.45782187283039094, | |
| "reward_std": 0.04352846188703552, | |
| "completion_length": 78.2375, | |
| "kl": 0.14039622321724893, | |
| "epoch": 0.478, | |
| "step": 1195 | |
| }, | |
| { | |
| "loss": 0.0049, | |
| "grad_norm": 20.75, | |
| "learning_rate": 2.6e-06, | |
| "rewards/reward_fn": 0.470593124628067, | |
| "reward": 0.470593124628067, | |
| "reward_std": 0.007097184634767472, | |
| "completion_length": 77.8875, | |
| "kl": 0.1220773808658123, | |
| "epoch": 0.48, | |
| "step": 1200 | |
| }, | |
| { | |
| "loss": 0.0068, | |
| "grad_norm": 19.125, | |
| "learning_rate": 2.59e-06, | |
| "rewards/reward_fn": 0.45062249302864077, | |
| "reward": 0.45062249302864077, | |
| "reward_std": 0.06360151261324062, | |
| "completion_length": 78.1875, | |
| "kl": 0.16903574615716935, | |
| "epoch": 0.482, | |
| "step": 1205 | |
| }, | |
| { | |
| "loss": 0.007, | |
| "grad_norm": 21.875, | |
| "learning_rate": 2.5800000000000003e-06, | |
| "rewards/reward_fn": 0.45593812465667727, | |
| "reward": 0.45593812465667727, | |
| "reward_std": 0.036037556815426794, | |
| "completion_length": 77.325, | |
| "kl": 0.17566560804843903, | |
| "epoch": 0.484, | |
| "step": 1210 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 20.5, | |
| "learning_rate": 2.5700000000000004e-06, | |
| "rewards/reward_fn": 0.45412937700748446, | |
| "reward": 0.45412937700748446, | |
| "reward_std": 0.060397130448836836, | |
| "completion_length": 78.6625, | |
| "kl": 0.14816011264920234, | |
| "epoch": 0.486, | |
| "step": 1215 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 21.625, | |
| "learning_rate": 2.56e-06, | |
| "rewards/reward_fn": 0.4632093787193298, | |
| "reward": 0.4632093787193298, | |
| "reward_std": 0.044997752620838584, | |
| "completion_length": 79.2625, | |
| "kl": 0.1313982665538788, | |
| "epoch": 0.488, | |
| "step": 1220 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 21.75, | |
| "learning_rate": 2.55e-06, | |
| "rewards/reward_fn": 0.4657293736934662, | |
| "reward": 0.4657293736934662, | |
| "reward_std": 0.022073199006263165, | |
| "completion_length": 78.9, | |
| "kl": 0.12473629713058472, | |
| "epoch": 0.49, | |
| "step": 1225 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 23.125, | |
| "learning_rate": 2.5400000000000002e-06, | |
| "rewards/reward_fn": 0.4348493814468384, | |
| "reward": 0.4348493814468384, | |
| "reward_std": 0.07554407969582826, | |
| "completion_length": 79.5125, | |
| "kl": 0.1481925331056118, | |
| "epoch": 0.492, | |
| "step": 1230 | |
| }, | |
| { | |
| "loss": 0.0077, | |
| "grad_norm": 24.0, | |
| "learning_rate": 2.5300000000000003e-06, | |
| "rewards/reward_fn": 0.43550437688827515, | |
| "reward": 0.43550437688827515, | |
| "reward_std": 0.10341594566125423, | |
| "completion_length": 79.35, | |
| "kl": 0.1917330376803875, | |
| "epoch": 0.494, | |
| "step": 1235 | |
| }, | |
| { | |
| "loss": 0.0066, | |
| "grad_norm": 22.375, | |
| "learning_rate": 2.52e-06, | |
| "rewards/reward_fn": 0.46648249924182894, | |
| "reward": 0.46648249924182894, | |
| "reward_std": 0.030170188657939433, | |
| "completion_length": 78.1375, | |
| "kl": 0.16498119458556176, | |
| "epoch": 0.496, | |
| "step": 1240 | |
| }, | |
| { | |
| "loss": 0.0065, | |
| "grad_norm": 22.875, | |
| "learning_rate": 2.51e-06, | |
| "rewards/reward_fn": 0.450721874833107, | |
| "reward": 0.450721874833107, | |
| "reward_std": 0.054543074569664896, | |
| "completion_length": 78.0125, | |
| "kl": 0.16144041568040848, | |
| "epoch": 0.498, | |
| "step": 1245 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 19.875, | |
| "learning_rate": 2.5e-06, | |
| "rewards/reward_fn": 0.47377062737941744, | |
| "reward": 0.47377062737941744, | |
| "reward_std": 0.011301003873813897, | |
| "completion_length": 77.975, | |
| "kl": 0.13532672077417374, | |
| "epoch": 0.5, | |
| "step": 1250 | |
| }, | |
| { | |
| "loss": 0.0049, | |
| "grad_norm": 18.875, | |
| "learning_rate": 2.4900000000000003e-06, | |
| "rewards/reward_fn": 0.4674256265163422, | |
| "reward": 0.4674256265163422, | |
| "reward_std": 0.0174906364409253, | |
| "completion_length": 79.825, | |
| "kl": 0.12367920055985451, | |
| "epoch": 0.502, | |
| "step": 1255 | |
| }, | |
| { | |
| "loss": 0.0064, | |
| "grad_norm": 22.25, | |
| "learning_rate": 2.4800000000000004e-06, | |
| "rewards/reward_fn": 0.4363387554883957, | |
| "reward": 0.4363387554883957, | |
| "reward_std": 0.10196942522889003, | |
| "completion_length": 78.7, | |
| "kl": 0.15936801359057426, | |
| "epoch": 0.504, | |
| "step": 1260 | |
| }, | |
| { | |
| "loss": 0.0069, | |
| "grad_norm": 21.625, | |
| "learning_rate": 2.47e-06, | |
| "rewards/reward_fn": 0.45225499868392943, | |
| "reward": 0.45225499868392943, | |
| "reward_std": 0.059183214767836036, | |
| "completion_length": 78.9125, | |
| "kl": 0.17264233008027077, | |
| "epoch": 0.506, | |
| "step": 1265 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 22.375, | |
| "learning_rate": 2.46e-06, | |
| "rewards/reward_fn": 0.4540149927139282, | |
| "reward": 0.4540149927139282, | |
| "reward_std": 0.05151141767855734, | |
| "completion_length": 78.875, | |
| "kl": 0.14860266521573068, | |
| "epoch": 0.508, | |
| "step": 1270 | |
| }, | |
| { | |
| "loss": 0.0051, | |
| "grad_norm": 20.75, | |
| "learning_rate": 2.4500000000000003e-06, | |
| "rewards/reward_fn": 0.46672500371932985, | |
| "reward": 0.46672500371932985, | |
| "reward_std": 0.0238963620737195, | |
| "completion_length": 79.5875, | |
| "kl": 0.12659351155161858, | |
| "epoch": 0.51, | |
| "step": 1275 | |
| }, | |
| { | |
| "loss": 0.006, | |
| "grad_norm": 20.75, | |
| "learning_rate": 2.4400000000000004e-06, | |
| "rewards/reward_fn": 0.4593931257724762, | |
| "reward": 0.4593931257724762, | |
| "reward_std": 0.0301577219623141, | |
| "completion_length": 79.65, | |
| "kl": 0.15002150908112527, | |
| "epoch": 0.512, | |
| "step": 1280 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 18.0, | |
| "learning_rate": 2.43e-06, | |
| "rewards/reward_fn": 0.4625087469816208, | |
| "reward": 0.4625087469816208, | |
| "reward_std": 0.03460253309458494, | |
| "completion_length": 79.1125, | |
| "kl": 0.14838578924536705, | |
| "epoch": 0.514, | |
| "step": 1285 | |
| }, | |
| { | |
| "loss": 0.0047, | |
| "grad_norm": 18.375, | |
| "learning_rate": 2.42e-06, | |
| "rewards/reward_fn": 0.4678725004196167, | |
| "reward": 0.4678725004196167, | |
| "reward_std": 0.02502680493053049, | |
| "completion_length": 78.5125, | |
| "kl": 0.11876562908291817, | |
| "epoch": 0.516, | |
| "step": 1290 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 21.0, | |
| "learning_rate": 2.4100000000000002e-06, | |
| "rewards/reward_fn": 0.44823938310146333, | |
| "reward": 0.44823938310146333, | |
| "reward_std": 0.04440039648325182, | |
| "completion_length": 78.2, | |
| "kl": 0.13063137009739875, | |
| "epoch": 0.518, | |
| "step": 1295 | |
| }, | |
| { | |
| "loss": 0.0061, | |
| "grad_norm": 25.125, | |
| "learning_rate": 2.4000000000000003e-06, | |
| "rewards/reward_fn": 0.44891312420368196, | |
| "reward": 0.44891312420368196, | |
| "reward_std": 0.07504934098105878, | |
| "completion_length": 78.3625, | |
| "kl": 0.15268274173140525, | |
| "epoch": 0.52, | |
| "step": 1300 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 20.75, | |
| "learning_rate": 2.39e-06, | |
| "rewards/reward_fn": 0.4568462461233139, | |
| "reward": 0.4568462461233139, | |
| "reward_std": 0.056088435545098035, | |
| "completion_length": 79.4125, | |
| "kl": 0.14741537049412728, | |
| "epoch": 0.522, | |
| "step": 1305 | |
| }, | |
| { | |
| "loss": 0.0069, | |
| "grad_norm": 21.375, | |
| "learning_rate": 2.38e-06, | |
| "rewards/reward_fn": 0.4558568805456161, | |
| "reward": 0.4558568805456161, | |
| "reward_std": 0.05745224840939045, | |
| "completion_length": 78.8375, | |
| "kl": 0.1723767749965191, | |
| "epoch": 0.524, | |
| "step": 1310 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 20.5, | |
| "learning_rate": 2.37e-06, | |
| "rewards/reward_fn": 0.4718281179666519, | |
| "reward": 0.4718281179666519, | |
| "reward_std": 0.014395223173778504, | |
| "completion_length": 78.9875, | |
| "kl": 0.14733590111136435, | |
| "epoch": 0.526, | |
| "step": 1315 | |
| }, | |
| { | |
| "loss": 0.006, | |
| "grad_norm": 21.0, | |
| "learning_rate": 2.3600000000000003e-06, | |
| "rewards/reward_fn": 0.4554156303405762, | |
| "reward": 0.4554156303405762, | |
| "reward_std": 0.05159756838111207, | |
| "completion_length": 79.175, | |
| "kl": 0.14898578226566314, | |
| "epoch": 0.528, | |
| "step": 1320 | |
| }, | |
| { | |
| "loss": 0.0075, | |
| "grad_norm": 21.0, | |
| "learning_rate": 2.35e-06, | |
| "rewards/reward_fn": 0.4550568699836731, | |
| "reward": 0.4550568699836731, | |
| "reward_std": 0.06614897139370442, | |
| "completion_length": 78.3, | |
| "kl": 0.18677168115973472, | |
| "epoch": 0.53, | |
| "step": 1325 | |
| }, | |
| { | |
| "loss": 0.007, | |
| "grad_norm": 22.375, | |
| "learning_rate": 2.3400000000000005e-06, | |
| "rewards/reward_fn": 0.44545812010765073, | |
| "reward": 0.44545812010765073, | |
| "reward_std": 0.07346066441386938, | |
| "completion_length": 79.625, | |
| "kl": 0.1740099720656872, | |
| "epoch": 0.532, | |
| "step": 1330 | |
| }, | |
| { | |
| "loss": 0.0071, | |
| "grad_norm": 19.0, | |
| "learning_rate": 2.33e-06, | |
| "rewards/reward_fn": 0.45567687749862673, | |
| "reward": 0.45567687749862673, | |
| "reward_std": 0.05622612689621746, | |
| "completion_length": 79.0875, | |
| "kl": 0.17623607516288758, | |
| "epoch": 0.534, | |
| "step": 1335 | |
| }, | |
| { | |
| "loss": 0.0061, | |
| "grad_norm": 21.25, | |
| "learning_rate": 2.3200000000000002e-06, | |
| "rewards/reward_fn": 0.4575300008058548, | |
| "reward": 0.4575300008058548, | |
| "reward_std": 0.06325785100925714, | |
| "completion_length": 78.85, | |
| "kl": 0.1518963485956192, | |
| "epoch": 0.536, | |
| "step": 1340 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 19.875, | |
| "learning_rate": 2.3100000000000003e-06, | |
| "rewards/reward_fn": 0.4771687567234039, | |
| "reward": 0.4771687567234039, | |
| "reward_std": 0.01310007597785443, | |
| "completion_length": 78.25, | |
| "kl": 0.14608021229505538, | |
| "epoch": 0.538, | |
| "step": 1345 | |
| }, | |
| { | |
| "loss": 0.0076, | |
| "grad_norm": 22.125, | |
| "learning_rate": 2.3000000000000004e-06, | |
| "rewards/reward_fn": 0.440699377655983, | |
| "reward": 0.440699377655983, | |
| "reward_std": 0.07997361421585084, | |
| "completion_length": 78.5, | |
| "kl": 0.19073922261595727, | |
| "epoch": 0.54, | |
| "step": 1350 | |
| }, | |
| { | |
| "loss": 0.0061, | |
| "grad_norm": 23.375, | |
| "learning_rate": 2.29e-06, | |
| "rewards/reward_fn": 0.46355812549591063, | |
| "reward": 0.46355812549591063, | |
| "reward_std": 0.042679897602647544, | |
| "completion_length": 78.8125, | |
| "kl": 0.15155968442559242, | |
| "epoch": 0.542, | |
| "step": 1355 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 19.375, | |
| "learning_rate": 2.28e-06, | |
| "rewards/reward_fn": 0.47639000713825225, | |
| "reward": 0.47639000713825225, | |
| "reward_std": 0.018505998922046275, | |
| "completion_length": 79.6375, | |
| "kl": 0.13178130090236664, | |
| "epoch": 0.544, | |
| "step": 1360 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 19.0, | |
| "learning_rate": 2.2700000000000003e-06, | |
| "rewards/reward_fn": 0.463755002617836, | |
| "reward": 0.463755002617836, | |
| "reward_std": 0.02542402143590152, | |
| "completion_length": 78.775, | |
| "kl": 0.1374554641544819, | |
| "epoch": 0.546, | |
| "step": 1365 | |
| }, | |
| { | |
| "loss": 0.0067, | |
| "grad_norm": 22.25, | |
| "learning_rate": 2.2600000000000004e-06, | |
| "rewards/reward_fn": 0.45715188086032865, | |
| "reward": 0.45715188086032865, | |
| "reward_std": 0.0426810149801895, | |
| "completion_length": 79.3375, | |
| "kl": 0.16694772839546204, | |
| "epoch": 0.548, | |
| "step": 1370 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 19.875, | |
| "learning_rate": 2.25e-06, | |
| "rewards/reward_fn": 0.46370500326156616, | |
| "reward": 0.46370500326156616, | |
| "reward_std": 0.023371401114854962, | |
| "completion_length": 78.325, | |
| "kl": 0.13150209859013556, | |
| "epoch": 0.55, | |
| "step": 1375 | |
| }, | |
| { | |
| "loss": 0.0061, | |
| "grad_norm": 19.375, | |
| "learning_rate": 2.24e-06, | |
| "rewards/reward_fn": 0.44826062619686124, | |
| "reward": 0.44826062619686124, | |
| "reward_std": 0.06448173672542908, | |
| "completion_length": 78.8375, | |
| "kl": 0.15249428376555443, | |
| "epoch": 0.552, | |
| "step": 1380 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 23.25, | |
| "learning_rate": 2.2300000000000002e-06, | |
| "rewards/reward_fn": 0.46055562794208527, | |
| "reward": 0.46055562794208527, | |
| "reward_std": 0.04732920726528391, | |
| "completion_length": 78.475, | |
| "kl": 0.13974663913249968, | |
| "epoch": 0.554, | |
| "step": 1385 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 20.625, | |
| "learning_rate": 2.2200000000000003e-06, | |
| "rewards/reward_fn": 0.4677337437868118, | |
| "reward": 0.4677337437868118, | |
| "reward_std": 0.02635425798362121, | |
| "completion_length": 78.4125, | |
| "kl": 0.1423714838922024, | |
| "epoch": 0.556, | |
| "step": 1390 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 19.125, | |
| "learning_rate": 2.21e-06, | |
| "rewards/reward_fn": 0.4616131275892258, | |
| "reward": 0.4616131275892258, | |
| "reward_std": 0.03302627064986154, | |
| "completion_length": 78.9375, | |
| "kl": 0.1436442255973816, | |
| "epoch": 0.558, | |
| "step": 1395 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 20.875, | |
| "learning_rate": 2.2e-06, | |
| "rewards/reward_fn": 0.46321562230587005, | |
| "reward": 0.46321562230587005, | |
| "reward_std": 0.04756553352344781, | |
| "completion_length": 78.5, | |
| "kl": 0.1262364447116852, | |
| "epoch": 0.56, | |
| "step": 1400 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 20.0, | |
| "learning_rate": 2.19e-06, | |
| "rewards/reward_fn": 0.44202812314033507, | |
| "reward": 0.44202812314033507, | |
| "reward_std": 0.0773768131621182, | |
| "completion_length": 78.9125, | |
| "kl": 0.14810121133923532, | |
| "epoch": 0.562, | |
| "step": 1405 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 20.125, | |
| "learning_rate": 2.1800000000000003e-06, | |
| "rewards/reward_fn": 0.46586625576019286, | |
| "reward": 0.46586625576019286, | |
| "reward_std": 0.032051419792696836, | |
| "completion_length": 78.5625, | |
| "kl": 0.1482535183429718, | |
| "epoch": 0.564, | |
| "step": 1410 | |
| }, | |
| { | |
| "loss": 0.0048, | |
| "grad_norm": 24.75, | |
| "learning_rate": 2.17e-06, | |
| "rewards/reward_fn": 0.46913000345230105, | |
| "reward": 0.46913000345230105, | |
| "reward_std": 0.032656107540242375, | |
| "completion_length": 78.5, | |
| "kl": 0.11947640255093575, | |
| "epoch": 0.566, | |
| "step": 1415 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 19.125, | |
| "learning_rate": 2.16e-06, | |
| "rewards/reward_fn": 0.439087501168251, | |
| "reward": 0.439087501168251, | |
| "reward_std": 0.09692132237832993, | |
| "completion_length": 79.6625, | |
| "kl": 0.1427506759762764, | |
| "epoch": 0.568, | |
| "step": 1420 | |
| }, | |
| { | |
| "loss": 0.0069, | |
| "grad_norm": 21.75, | |
| "learning_rate": 2.15e-06, | |
| "rewards/reward_fn": 0.4551968663930893, | |
| "reward": 0.4551968663930893, | |
| "reward_std": 0.043816833925666286, | |
| "completion_length": 77.55, | |
| "kl": 0.17263874933123588, | |
| "epoch": 0.57, | |
| "step": 1425 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 20.625, | |
| "learning_rate": 2.1400000000000003e-06, | |
| "rewards/reward_fn": 0.4475862592458725, | |
| "reward": 0.4475862592458725, | |
| "reward_std": 0.061305654630996284, | |
| "completion_length": 79.3375, | |
| "kl": 0.1420199103653431, | |
| "epoch": 0.572, | |
| "step": 1430 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 19.5, | |
| "learning_rate": 2.13e-06, | |
| "rewards/reward_fn": 0.4612312436103821, | |
| "reward": 0.4612312436103821, | |
| "reward_std": 0.04303327279048972, | |
| "completion_length": 78.9125, | |
| "kl": 0.13926436081528665, | |
| "epoch": 0.574, | |
| "step": 1435 | |
| }, | |
| { | |
| "loss": 0.0079, | |
| "grad_norm": 38.0, | |
| "learning_rate": 2.12e-06, | |
| "rewards/reward_fn": 0.4627299964427948, | |
| "reward": 0.4627299964427948, | |
| "reward_std": 0.042749036371242256, | |
| "completion_length": 77.8625, | |
| "kl": 0.1982392191886902, | |
| "epoch": 0.576, | |
| "step": 1440 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 18.75, | |
| "learning_rate": 2.11e-06, | |
| "rewards/reward_fn": 0.4696300059556961, | |
| "reward": 0.4696300059556961, | |
| "reward_std": 0.029448882048018276, | |
| "completion_length": 79.9125, | |
| "kl": 0.14228403344750404, | |
| "epoch": 0.578, | |
| "step": 1445 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 20.5, | |
| "learning_rate": 2.1000000000000002e-06, | |
| "rewards/reward_fn": 0.4625418782234192, | |
| "reward": 0.4625418782234192, | |
| "reward_std": 0.023554211598820984, | |
| "completion_length": 78.7375, | |
| "kl": 0.12463297769427299, | |
| "epoch": 0.58, | |
| "step": 1450 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 21.0, | |
| "learning_rate": 2.09e-06, | |
| "rewards/reward_fn": 0.4666400045156479, | |
| "reward": 0.4666400045156479, | |
| "reward_std": 0.01901569733163342, | |
| "completion_length": 79.2875, | |
| "kl": 0.13114793226122856, | |
| "epoch": 0.582, | |
| "step": 1455 | |
| }, | |
| { | |
| "loss": 0.0074, | |
| "grad_norm": 22.125, | |
| "learning_rate": 2.08e-06, | |
| "rewards/reward_fn": 0.4477406233549118, | |
| "reward": 0.4477406233549118, | |
| "reward_std": 0.06840260641183704, | |
| "completion_length": 78.7625, | |
| "kl": 0.1860959157347679, | |
| "epoch": 0.584, | |
| "step": 1460 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 18.25, | |
| "learning_rate": 2.07e-06, | |
| "rewards/reward_fn": 0.47093687057495115, | |
| "reward": 0.47093687057495115, | |
| "reward_std": 0.009799153183121235, | |
| "completion_length": 77.5, | |
| "kl": 0.1460045598447323, | |
| "epoch": 0.586, | |
| "step": 1465 | |
| }, | |
| { | |
| "loss": 0.006, | |
| "grad_norm": 20.375, | |
| "learning_rate": 2.06e-06, | |
| "rewards/reward_fn": 0.4671725004911423, | |
| "reward": 0.4671725004911423, | |
| "reward_std": 0.029030334879644216, | |
| "completion_length": 78.0125, | |
| "kl": 0.14976133704185485, | |
| "epoch": 0.588, | |
| "step": 1470 | |
| }, | |
| { | |
| "loss": 0.0051, | |
| "grad_norm": 19.625, | |
| "learning_rate": 2.05e-06, | |
| "rewards/reward_fn": 0.4621724963188171, | |
| "reward": 0.4621724963188171, | |
| "reward_std": 0.042897804221138355, | |
| "completion_length": 78.5875, | |
| "kl": 0.1281472846865654, | |
| "epoch": 0.59, | |
| "step": 1475 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 20.875, | |
| "learning_rate": 2.04e-06, | |
| "rewards/reward_fn": 0.4698318690061569, | |
| "reward": 0.4698318690061569, | |
| "reward_std": 0.023317102977307512, | |
| "completion_length": 78.5375, | |
| "kl": 0.12476283833384513, | |
| "epoch": 0.592, | |
| "step": 1480 | |
| }, | |
| { | |
| "loss": 0.0062, | |
| "grad_norm": 22.375, | |
| "learning_rate": 2.0300000000000005e-06, | |
| "rewards/reward_fn": 0.45168625712394717, | |
| "reward": 0.45168625712394717, | |
| "reward_std": 0.06396679894533008, | |
| "completion_length": 78.775, | |
| "kl": 0.15403145402669907, | |
| "epoch": 0.594, | |
| "step": 1485 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 20.0, | |
| "learning_rate": 2.02e-06, | |
| "rewards/reward_fn": 0.45557625591754913, | |
| "reward": 0.45557625591754913, | |
| "reward_std": 0.04759975708439015, | |
| "completion_length": 77.5875, | |
| "kl": 0.14153004586696624, | |
| "epoch": 0.596, | |
| "step": 1490 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 22.625, | |
| "learning_rate": 2.0100000000000002e-06, | |
| "rewards/reward_fn": 0.45877124965190885, | |
| "reward": 0.45877124965190885, | |
| "reward_std": 0.038299218472093347, | |
| "completion_length": 79.1875, | |
| "kl": 0.1336129680275917, | |
| "epoch": 0.598, | |
| "step": 1495 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 21.0, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "rewards/reward_fn": 0.45951750576496125, | |
| "reward": 0.45951750576496125, | |
| "reward_std": 0.043301355338189754, | |
| "completion_length": 78.8625, | |
| "kl": 0.1319414682686329, | |
| "epoch": 0.6, | |
| "step": 1500 | |
| }, | |
| { | |
| "loss": 0.0062, | |
| "grad_norm": 19.25, | |
| "learning_rate": 1.9900000000000004e-06, | |
| "rewards/reward_fn": 0.4404612571001053, | |
| "reward": 0.4404612571001053, | |
| "reward_std": 0.07990776300430298, | |
| "completion_length": 77.75, | |
| "kl": 0.15399570986628533, | |
| "epoch": 0.602, | |
| "step": 1505 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 22.375, | |
| "learning_rate": 1.98e-06, | |
| "rewards/reward_fn": 0.4647749960422516, | |
| "reward": 0.4647749960422516, | |
| "reward_std": 0.047874861588934434, | |
| "completion_length": 78.975, | |
| "kl": 0.14728261902928352, | |
| "epoch": 0.604, | |
| "step": 1510 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 19.25, | |
| "learning_rate": 1.97e-06, | |
| "rewards/reward_fn": 0.45807936787605286, | |
| "reward": 0.45807936787605286, | |
| "reward_std": 0.060872105229645965, | |
| "completion_length": 78.6625, | |
| "kl": 0.1391053855419159, | |
| "epoch": 0.606, | |
| "step": 1515 | |
| }, | |
| { | |
| "loss": 0.0069, | |
| "grad_norm": 20.125, | |
| "learning_rate": 1.9600000000000003e-06, | |
| "rewards/reward_fn": 0.4504493743181229, | |
| "reward": 0.4504493743181229, | |
| "reward_std": 0.06272484959335997, | |
| "completion_length": 78.0125, | |
| "kl": 0.17193232327699662, | |
| "epoch": 0.608, | |
| "step": 1520 | |
| }, | |
| { | |
| "loss": 0.0068, | |
| "grad_norm": 20.5, | |
| "learning_rate": 1.9500000000000004e-06, | |
| "rewards/reward_fn": 0.439431244134903, | |
| "reward": 0.439431244134903, | |
| "reward_std": 0.07358825565315782, | |
| "completion_length": 78.675, | |
| "kl": 0.16932241916656493, | |
| "epoch": 0.61, | |
| "step": 1525 | |
| }, | |
| { | |
| "loss": 0.0061, | |
| "grad_norm": 21.5, | |
| "learning_rate": 1.94e-06, | |
| "rewards/reward_fn": 0.4701712429523468, | |
| "reward": 0.4701712429523468, | |
| "reward_std": 0.025754676898941398, | |
| "completion_length": 78.45, | |
| "kl": 0.1536574937403202, | |
| "epoch": 0.612, | |
| "step": 1530 | |
| }, | |
| { | |
| "loss": 0.0051, | |
| "grad_norm": 19.5, | |
| "learning_rate": 1.93e-06, | |
| "rewards/reward_fn": 0.4685331225395203, | |
| "reward": 0.4685331225395203, | |
| "reward_std": 0.02594901086995378, | |
| "completion_length": 78.625, | |
| "kl": 0.12761929631233215, | |
| "epoch": 0.614, | |
| "step": 1535 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 23.0, | |
| "learning_rate": 1.9200000000000003e-06, | |
| "rewards/reward_fn": 0.46238250136375425, | |
| "reward": 0.46238250136375425, | |
| "reward_std": 0.04514178307726979, | |
| "completion_length": 77.6125, | |
| "kl": 0.1310683749616146, | |
| "epoch": 0.616, | |
| "step": 1540 | |
| }, | |
| { | |
| "loss": 0.0048, | |
| "grad_norm": 26.375, | |
| "learning_rate": 1.9100000000000003e-06, | |
| "rewards/reward_fn": 0.46453936994075773, | |
| "reward": 0.46453936994075773, | |
| "reward_std": 0.05459905466996133, | |
| "completion_length": 78.525, | |
| "kl": 0.12022457122802735, | |
| "epoch": 0.618, | |
| "step": 1545 | |
| }, | |
| { | |
| "loss": 0.0064, | |
| "grad_norm": 19.0, | |
| "learning_rate": 1.9000000000000002e-06, | |
| "rewards/reward_fn": 0.46645999848842623, | |
| "reward": 0.46645999848842623, | |
| "reward_std": 0.024052193760871886, | |
| "completion_length": 78.6875, | |
| "kl": 0.16018542796373367, | |
| "epoch": 0.62, | |
| "step": 1550 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 20.5, | |
| "learning_rate": 1.8900000000000001e-06, | |
| "rewards/reward_fn": 0.4562818706035614, | |
| "reward": 0.4562818706035614, | |
| "reward_std": 0.043363090697675945, | |
| "completion_length": 78.7375, | |
| "kl": 0.1383350558578968, | |
| "epoch": 0.622, | |
| "step": 1555 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 20.125, | |
| "learning_rate": 1.8800000000000002e-06, | |
| "rewards/reward_fn": 0.47267499268054963, | |
| "reward": 0.47267499268054963, | |
| "reward_std": 0.03722939351573586, | |
| "completion_length": 78.1625, | |
| "kl": 0.14105435311794282, | |
| "epoch": 0.624, | |
| "step": 1560 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 19.625, | |
| "learning_rate": 1.87e-06, | |
| "rewards/reward_fn": 0.4576281189918518, | |
| "reward": 0.4576281189918518, | |
| "reward_std": 0.05807865222450346, | |
| "completion_length": 79.4, | |
| "kl": 0.13264633268117904, | |
| "epoch": 0.626, | |
| "step": 1565 | |
| }, | |
| { | |
| "loss": 0.0069, | |
| "grad_norm": 21.0, | |
| "learning_rate": 1.8600000000000002e-06, | |
| "rewards/reward_fn": 0.42023812532424926, | |
| "reward": 0.42023812532424926, | |
| "reward_std": 0.11829792927019298, | |
| "completion_length": 76.625, | |
| "kl": 0.17229357063770295, | |
| "epoch": 0.628, | |
| "step": 1570 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 21.5, | |
| "learning_rate": 1.85e-06, | |
| "rewards/reward_fn": 0.46860812306404115, | |
| "reward": 0.46860812306404115, | |
| "reward_std": 0.03086728664347902, | |
| "completion_length": 78.7875, | |
| "kl": 0.13762294948101045, | |
| "epoch": 0.63, | |
| "step": 1575 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 22.625, | |
| "learning_rate": 1.8400000000000002e-06, | |
| "rewards/reward_fn": 0.46147062480449674, | |
| "reward": 0.46147062480449674, | |
| "reward_std": 0.04284065024694428, | |
| "completion_length": 77.675, | |
| "kl": 0.1324526160955429, | |
| "epoch": 0.632, | |
| "step": 1580 | |
| }, | |
| { | |
| "loss": 0.0065, | |
| "grad_norm": 21.5, | |
| "learning_rate": 1.83e-06, | |
| "rewards/reward_fn": 0.4482531249523163, | |
| "reward": 0.4482531249523163, | |
| "reward_std": 0.07075127304997295, | |
| "completion_length": 75.85, | |
| "kl": 0.16137402653694152, | |
| "epoch": 0.634, | |
| "step": 1585 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 21.75, | |
| "learning_rate": 1.8200000000000002e-06, | |
| "rewards/reward_fn": 0.4649731248617172, | |
| "reward": 0.4649731248617172, | |
| "reward_std": 0.027589096594601868, | |
| "completion_length": 77.5625, | |
| "kl": 0.12578429877758027, | |
| "epoch": 0.636, | |
| "step": 1590 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 20.0, | |
| "learning_rate": 1.81e-06, | |
| "rewards/reward_fn": 0.46504874527454376, | |
| "reward": 0.46504874527454376, | |
| "reward_std": 0.02663288627518341, | |
| "completion_length": 78.3875, | |
| "kl": 0.12880957499146461, | |
| "epoch": 0.638, | |
| "step": 1595 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 20.125, | |
| "learning_rate": 1.8000000000000001e-06, | |
| "rewards/reward_fn": 0.4477743715047836, | |
| "reward": 0.4477743715047836, | |
| "reward_std": 0.06575249675661325, | |
| "completion_length": 76.825, | |
| "kl": 0.14026456847786903, | |
| "epoch": 0.64, | |
| "step": 1600 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 21.375, | |
| "learning_rate": 1.79e-06, | |
| "rewards/reward_fn": 0.46851625442504885, | |
| "reward": 0.46851625442504885, | |
| "reward_std": 0.03404894776176661, | |
| "completion_length": 78.3, | |
| "kl": 0.13332988694310188, | |
| "epoch": 0.642, | |
| "step": 1605 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 20.0, | |
| "learning_rate": 1.7800000000000001e-06, | |
| "rewards/reward_fn": 0.45667624771595, | |
| "reward": 0.45667624771595, | |
| "reward_std": 0.05264936711173505, | |
| "completion_length": 78.5875, | |
| "kl": 0.14069104120135306, | |
| "epoch": 0.644, | |
| "step": 1610 | |
| }, | |
| { | |
| "loss": 0.0065, | |
| "grad_norm": 23.625, | |
| "learning_rate": 1.77e-06, | |
| "rewards/reward_fn": 0.4541974991559982, | |
| "reward": 0.4541974991559982, | |
| "reward_std": 0.06876377174630761, | |
| "completion_length": 78.325, | |
| "kl": 0.1617581441998482, | |
| "epoch": 0.646, | |
| "step": 1615 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 18.875, | |
| "learning_rate": 1.76e-06, | |
| "rewards/reward_fn": 0.47011750638484956, | |
| "reward": 0.47011750638484956, | |
| "reward_std": 0.027857921156100928, | |
| "completion_length": 78.5, | |
| "kl": 0.1346297614276409, | |
| "epoch": 0.648, | |
| "step": 1620 | |
| }, | |
| { | |
| "loss": 0.0065, | |
| "grad_norm": 22.0, | |
| "learning_rate": 1.75e-06, | |
| "rewards/reward_fn": 0.4520518720149994, | |
| "reward": 0.4520518720149994, | |
| "reward_std": 0.0729821051703766, | |
| "completion_length": 77.825, | |
| "kl": 0.161976557970047, | |
| "epoch": 0.65, | |
| "step": 1625 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 21.0, | |
| "learning_rate": 1.74e-06, | |
| "rewards/reward_fn": 0.4532381296157837, | |
| "reward": 0.4532381296157837, | |
| "reward_std": 0.06829985191579908, | |
| "completion_length": 77.525, | |
| "kl": 0.13216826990246772, | |
| "epoch": 0.652, | |
| "step": 1630 | |
| }, | |
| { | |
| "loss": 0.0051, | |
| "grad_norm": 19.125, | |
| "learning_rate": 1.73e-06, | |
| "rewards/reward_fn": 0.4630106300115585, | |
| "reward": 0.4630106300115585, | |
| "reward_std": 0.05130832166178152, | |
| "completion_length": 78.8875, | |
| "kl": 0.12697028666734694, | |
| "epoch": 0.654, | |
| "step": 1635 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 21.625, | |
| "learning_rate": 1.72e-06, | |
| "rewards/reward_fn": 0.4494799941778183, | |
| "reward": 0.4494799941778183, | |
| "reward_std": 0.06570386737585068, | |
| "completion_length": 76.425, | |
| "kl": 0.14841574504971505, | |
| "epoch": 0.656, | |
| "step": 1640 | |
| }, | |
| { | |
| "loss": 0.0048, | |
| "grad_norm": 20.25, | |
| "learning_rate": 1.7100000000000004e-06, | |
| "rewards/reward_fn": 0.45594811737537383, | |
| "reward": 0.45594811737537383, | |
| "reward_std": 0.05052668444113806, | |
| "completion_length": 79.3875, | |
| "kl": 0.11954338103532791, | |
| "epoch": 0.658, | |
| "step": 1645 | |
| }, | |
| { | |
| "loss": 0.0051, | |
| "grad_norm": 19.75, | |
| "learning_rate": 1.7000000000000002e-06, | |
| "rewards/reward_fn": 0.46476125419139863, | |
| "reward": 0.46476125419139863, | |
| "reward_std": 0.043870922236237675, | |
| "completion_length": 78.7875, | |
| "kl": 0.1275065064430237, | |
| "epoch": 0.66, | |
| "step": 1650 | |
| }, | |
| { | |
| "loss": 0.0049, | |
| "grad_norm": 21.125, | |
| "learning_rate": 1.6900000000000003e-06, | |
| "rewards/reward_fn": 0.46913999915122984, | |
| "reward": 0.46913999915122984, | |
| "reward_std": 0.006915005797054619, | |
| "completion_length": 78.5375, | |
| "kl": 0.12140461131930351, | |
| "epoch": 0.662, | |
| "step": 1655 | |
| }, | |
| { | |
| "loss": 0.0051, | |
| "grad_norm": 19.625, | |
| "learning_rate": 1.6800000000000002e-06, | |
| "rewards/reward_fn": 0.4625368744134903, | |
| "reward": 0.4625368744134903, | |
| "reward_std": 0.02914451065007597, | |
| "completion_length": 77.95, | |
| "kl": 0.1269746668636799, | |
| "epoch": 0.664, | |
| "step": 1660 | |
| }, | |
| { | |
| "loss": 0.0066, | |
| "grad_norm": 24.125, | |
| "learning_rate": 1.6700000000000003e-06, | |
| "rewards/reward_fn": 0.4738156259059906, | |
| "reward": 0.4738156259059906, | |
| "reward_std": 0.020672354963608086, | |
| "completion_length": 77.8, | |
| "kl": 0.16429368406534195, | |
| "epoch": 0.666, | |
| "step": 1665 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 20.375, | |
| "learning_rate": 1.6600000000000002e-06, | |
| "rewards/reward_fn": 0.4726106315851212, | |
| "reward": 0.4726106315851212, | |
| "reward_std": 0.011511084495577962, | |
| "completion_length": 79.3375, | |
| "kl": 0.13863546177744865, | |
| "epoch": 0.668, | |
| "step": 1670 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 21.5, | |
| "learning_rate": 1.6500000000000003e-06, | |
| "rewards/reward_fn": 0.46120937168598175, | |
| "reward": 0.46120937168598175, | |
| "reward_std": 0.04015400728676468, | |
| "completion_length": 78.25, | |
| "kl": 0.13780420050024986, | |
| "epoch": 0.67, | |
| "step": 1675 | |
| }, | |
| { | |
| "loss": 0.0061, | |
| "grad_norm": 24.5, | |
| "learning_rate": 1.6400000000000002e-06, | |
| "rewards/reward_fn": 0.4591624945402145, | |
| "reward": 0.4591624945402145, | |
| "reward_std": 0.0403320163837634, | |
| "completion_length": 78.8875, | |
| "kl": 0.15152628272771834, | |
| "epoch": 0.672, | |
| "step": 1680 | |
| }, | |
| { | |
| "loss": 0.0051, | |
| "grad_norm": 20.625, | |
| "learning_rate": 1.6300000000000003e-06, | |
| "rewards/reward_fn": 0.46432062685489656, | |
| "reward": 0.46432062685489656, | |
| "reward_std": 0.03836179277859628, | |
| "completion_length": 78.275, | |
| "kl": 0.12853171303868294, | |
| "epoch": 0.674, | |
| "step": 1685 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 22.25, | |
| "learning_rate": 1.6200000000000002e-06, | |
| "rewards/reward_fn": 0.4714624971151352, | |
| "reward": 0.4714624971151352, | |
| "reward_std": 0.028523307130672037, | |
| "completion_length": 78.3125, | |
| "kl": 0.14340822845697404, | |
| "epoch": 0.676, | |
| "step": 1690 | |
| }, | |
| { | |
| "loss": 0.006, | |
| "grad_norm": 22.75, | |
| "learning_rate": 1.6100000000000003e-06, | |
| "rewards/reward_fn": 0.4591949999332428, | |
| "reward": 0.4591949999332428, | |
| "reward_std": 0.038035544892773034, | |
| "completion_length": 78.15, | |
| "kl": 0.14982439056038857, | |
| "epoch": 0.678, | |
| "step": 1695 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 21.25, | |
| "learning_rate": 1.6000000000000001e-06, | |
| "rewards/reward_fn": 0.4653699994087219, | |
| "reward": 0.4653699994087219, | |
| "reward_std": 0.020481601386563852, | |
| "completion_length": 76.7625, | |
| "kl": 0.14447411969304086, | |
| "epoch": 0.68, | |
| "step": 1700 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 20.125, | |
| "learning_rate": 1.5900000000000002e-06, | |
| "rewards/reward_fn": 0.4341324925422668, | |
| "reward": 0.4341324925422668, | |
| "reward_std": 0.09427430615760386, | |
| "completion_length": 77.8, | |
| "kl": 0.14823570474982262, | |
| "epoch": 0.682, | |
| "step": 1705 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 22.25, | |
| "learning_rate": 1.5800000000000001e-06, | |
| "rewards/reward_fn": 0.45423062741756437, | |
| "reward": 0.45423062741756437, | |
| "reward_std": 0.04875338152050972, | |
| "completion_length": 79.475, | |
| "kl": 0.1357567824423313, | |
| "epoch": 0.684, | |
| "step": 1710 | |
| }, | |
| { | |
| "loss": 0.0064, | |
| "grad_norm": 22.375, | |
| "learning_rate": 1.5700000000000002e-06, | |
| "rewards/reward_fn": 0.4559962421655655, | |
| "reward": 0.4559962421655655, | |
| "reward_std": 0.06438031857833267, | |
| "completion_length": 78.4875, | |
| "kl": 0.15894640609622002, | |
| "epoch": 0.686, | |
| "step": 1715 | |
| }, | |
| { | |
| "loss": 0.0065, | |
| "grad_norm": 19.875, | |
| "learning_rate": 1.56e-06, | |
| "rewards/reward_fn": 0.4572743773460388, | |
| "reward": 0.4572743773460388, | |
| "reward_std": 0.04752160895150155, | |
| "completion_length": 79.1125, | |
| "kl": 0.16350691244006157, | |
| "epoch": 0.688, | |
| "step": 1720 | |
| }, | |
| { | |
| "loss": 0.007, | |
| "grad_norm": 22.875, | |
| "learning_rate": 1.5500000000000002e-06, | |
| "rewards/reward_fn": 0.451460000872612, | |
| "reward": 0.451460000872612, | |
| "reward_std": 0.06449790641199797, | |
| "completion_length": 78.6375, | |
| "kl": 0.17609091848134995, | |
| "epoch": 0.69, | |
| "step": 1725 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 18.0, | |
| "learning_rate": 1.54e-06, | |
| "rewards/reward_fn": 0.46678187251091, | |
| "reward": 0.46678187251091, | |
| "reward_std": 0.028732791543006897, | |
| "completion_length": 77.3875, | |
| "kl": 0.12948581501841544, | |
| "epoch": 0.692, | |
| "step": 1730 | |
| }, | |
| { | |
| "loss": 0.006, | |
| "grad_norm": 17.75, | |
| "learning_rate": 1.5300000000000002e-06, | |
| "rewards/reward_fn": 0.4550174981355667, | |
| "reward": 0.4550174981355667, | |
| "reward_std": 0.06296568798134103, | |
| "completion_length": 78.3875, | |
| "kl": 0.14993617683649063, | |
| "epoch": 0.694, | |
| "step": 1735 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 24.0, | |
| "learning_rate": 1.52e-06, | |
| "rewards/reward_fn": 0.45879937410354615, | |
| "reward": 0.45879937410354615, | |
| "reward_std": 0.04999549321364612, | |
| "completion_length": 77.975, | |
| "kl": 0.13548573106527328, | |
| "epoch": 0.696, | |
| "step": 1740 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 19.5, | |
| "learning_rate": 1.5100000000000002e-06, | |
| "rewards/reward_fn": 0.4645506262779236, | |
| "reward": 0.4645506262779236, | |
| "reward_std": 0.025334799219854175, | |
| "completion_length": 79.025, | |
| "kl": 0.1345573790371418, | |
| "epoch": 0.698, | |
| "step": 1745 | |
| }, | |
| { | |
| "loss": 0.0072, | |
| "grad_norm": 20.625, | |
| "learning_rate": 1.5e-06, | |
| "rewards/reward_fn": 0.4387993663549423, | |
| "reward": 0.4387993663549423, | |
| "reward_std": 0.0916181854379829, | |
| "completion_length": 78.8125, | |
| "kl": 0.18051299825310707, | |
| "epoch": 0.7, | |
| "step": 1750 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 19.375, | |
| "learning_rate": 1.4900000000000001e-06, | |
| "rewards/reward_fn": 0.4713474988937378, | |
| "reward": 0.4713474988937378, | |
| "reward_std": 0.02058067887555808, | |
| "completion_length": 78.3875, | |
| "kl": 0.14509371370077134, | |
| "epoch": 0.702, | |
| "step": 1755 | |
| }, | |
| { | |
| "loss": 0.0062, | |
| "grad_norm": 20.625, | |
| "learning_rate": 1.48e-06, | |
| "rewards/reward_fn": 0.45444686710834503, | |
| "reward": 0.45444686710834503, | |
| "reward_std": 0.06304303905926645, | |
| "completion_length": 77.1, | |
| "kl": 0.1549811489880085, | |
| "epoch": 0.704, | |
| "step": 1760 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 21.625, | |
| "learning_rate": 1.4700000000000001e-06, | |
| "rewards/reward_fn": 0.4627524971961975, | |
| "reward": 0.4627524971961975, | |
| "reward_std": 0.04254062173422426, | |
| "completion_length": 78.4625, | |
| "kl": 0.1384074404835701, | |
| "epoch": 0.706, | |
| "step": 1765 | |
| }, | |
| { | |
| "loss": 0.0062, | |
| "grad_norm": 23.5, | |
| "learning_rate": 1.46e-06, | |
| "rewards/reward_fn": 0.45813000202178955, | |
| "reward": 0.45813000202178955, | |
| "reward_std": 0.04893373708473518, | |
| "completion_length": 78.0, | |
| "kl": 0.15400241911411286, | |
| "epoch": 0.708, | |
| "step": 1770 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 20.125, | |
| "learning_rate": 1.45e-06, | |
| "rewards/reward_fn": 0.4570950001478195, | |
| "reward": 0.4570950001478195, | |
| "reward_std": 0.04461987121030688, | |
| "completion_length": 78.6625, | |
| "kl": 0.14551043882966042, | |
| "epoch": 0.71, | |
| "step": 1775 | |
| }, | |
| { | |
| "loss": 0.0061, | |
| "grad_norm": 19.0, | |
| "learning_rate": 1.44e-06, | |
| "rewards/reward_fn": 0.45253312587738037, | |
| "reward": 0.45253312587738037, | |
| "reward_std": 0.04993348123971373, | |
| "completion_length": 79.375, | |
| "kl": 0.15369636416435242, | |
| "epoch": 0.712, | |
| "step": 1780 | |
| }, | |
| { | |
| "loss": 0.006, | |
| "grad_norm": 22.5, | |
| "learning_rate": 1.43e-06, | |
| "rewards/reward_fn": 0.4615906268358231, | |
| "reward": 0.4615906268358231, | |
| "reward_std": 0.0613109068479389, | |
| "completion_length": 76.65, | |
| "kl": 0.14984343126416205, | |
| "epoch": 0.714, | |
| "step": 1785 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 20.5, | |
| "learning_rate": 1.42e-06, | |
| "rewards/reward_fn": 0.4519368767738342, | |
| "reward": 0.4519368767738342, | |
| "reward_std": 0.05483808619901538, | |
| "completion_length": 78.6, | |
| "kl": 0.12471728846430778, | |
| "epoch": 0.716, | |
| "step": 1790 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 20.375, | |
| "learning_rate": 1.41e-06, | |
| "rewards/reward_fn": 0.4455212503671646, | |
| "reward": 0.4455212503671646, | |
| "reward_std": 0.06304481262341141, | |
| "completion_length": 78.2625, | |
| "kl": 0.14399517476558685, | |
| "epoch": 0.718, | |
| "step": 1795 | |
| }, | |
| { | |
| "loss": 0.0071, | |
| "grad_norm": 20.5, | |
| "learning_rate": 1.4000000000000001e-06, | |
| "rewards/reward_fn": 0.44046937823295595, | |
| "reward": 0.44046937823295595, | |
| "reward_std": 0.08519753144355491, | |
| "completion_length": 78.35, | |
| "kl": 0.17743645012378692, | |
| "epoch": 0.72, | |
| "step": 1800 | |
| }, | |
| { | |
| "loss": 0.0066, | |
| "grad_norm": 20.375, | |
| "learning_rate": 1.3900000000000002e-06, | |
| "rewards/reward_fn": 0.44510937929153443, | |
| "reward": 0.44510937929153443, | |
| "reward_std": 0.064357951504644, | |
| "completion_length": 78.325, | |
| "kl": 0.16583998426795005, | |
| "epoch": 0.722, | |
| "step": 1805 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 23.5, | |
| "learning_rate": 1.3800000000000001e-06, | |
| "rewards/reward_fn": 0.4451799988746643, | |
| "reward": 0.4451799988746643, | |
| "reward_std": 0.06354925713967532, | |
| "completion_length": 78.675, | |
| "kl": 0.1327526532113552, | |
| "epoch": 0.724, | |
| "step": 1810 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 23.875, | |
| "learning_rate": 1.3700000000000002e-06, | |
| "rewards/reward_fn": 0.4643556296825409, | |
| "reward": 0.4643556296825409, | |
| "reward_std": 0.03843736774288118, | |
| "completion_length": 77.925, | |
| "kl": 0.13086711019277572, | |
| "epoch": 0.726, | |
| "step": 1815 | |
| }, | |
| { | |
| "loss": 0.0047, | |
| "grad_norm": 20.5, | |
| "learning_rate": 1.3600000000000001e-06, | |
| "rewards/reward_fn": 0.47222812473773956, | |
| "reward": 0.47222812473773956, | |
| "reward_std": 0.014235112490132451, | |
| "completion_length": 78.6625, | |
| "kl": 0.11623715609312057, | |
| "epoch": 0.728, | |
| "step": 1820 | |
| }, | |
| { | |
| "loss": 0.0072, | |
| "grad_norm": 20.875, | |
| "learning_rate": 1.3500000000000002e-06, | |
| "rewards/reward_fn": 0.4484899967908859, | |
| "reward": 0.4484899967908859, | |
| "reward_std": 0.06967922276817262, | |
| "completion_length": 77.825, | |
| "kl": 0.17991492599248887, | |
| "epoch": 0.73, | |
| "step": 1825 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 24.5, | |
| "learning_rate": 1.34e-06, | |
| "rewards/reward_fn": 0.46708749830722807, | |
| "reward": 0.46708749830722807, | |
| "reward_std": 0.02486464052926749, | |
| "completion_length": 78.5875, | |
| "kl": 0.13221397027373313, | |
| "epoch": 0.732, | |
| "step": 1830 | |
| }, | |
| { | |
| "loss": 0.0073, | |
| "grad_norm": 18.375, | |
| "learning_rate": 1.3300000000000002e-06, | |
| "rewards/reward_fn": 0.45467875599861146, | |
| "reward": 0.45467875599861146, | |
| "reward_std": 0.06859695718158036, | |
| "completion_length": 78.8375, | |
| "kl": 0.18368308618664742, | |
| "epoch": 0.734, | |
| "step": 1835 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 20.0, | |
| "learning_rate": 1.32e-06, | |
| "rewards/reward_fn": 0.4613149970769882, | |
| "reward": 0.4613149970769882, | |
| "reward_std": 0.03261192251229659, | |
| "completion_length": 78.35, | |
| "kl": 0.12882784008979797, | |
| "epoch": 0.736, | |
| "step": 1840 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 24.125, | |
| "learning_rate": 1.3100000000000002e-06, | |
| "rewards/reward_fn": 0.4719943791627884, | |
| "reward": 0.4719943791627884, | |
| "reward_std": 0.009089648583903908, | |
| "completion_length": 76.3375, | |
| "kl": 0.13276104778051376, | |
| "epoch": 0.738, | |
| "step": 1845 | |
| }, | |
| { | |
| "loss": 0.0051, | |
| "grad_norm": 18.875, | |
| "learning_rate": 1.3e-06, | |
| "rewards/reward_fn": 0.45800375044345853, | |
| "reward": 0.45800375044345853, | |
| "reward_std": 0.0387735236203298, | |
| "completion_length": 79.45, | |
| "kl": 0.1281396232545376, | |
| "epoch": 0.74, | |
| "step": 1850 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 19.75, | |
| "learning_rate": 1.2900000000000001e-06, | |
| "rewards/reward_fn": 0.46116250157356264, | |
| "reward": 0.46116250157356264, | |
| "reward_std": 0.03875681417994201, | |
| "completion_length": 79.425, | |
| "kl": 0.1366500124335289, | |
| "epoch": 0.742, | |
| "step": 1855 | |
| }, | |
| { | |
| "loss": 0.006, | |
| "grad_norm": 21.25, | |
| "learning_rate": 1.28e-06, | |
| "rewards/reward_fn": 0.4411043733358383, | |
| "reward": 0.4411043733358383, | |
| "reward_std": 0.07198944769334048, | |
| "completion_length": 78.2625, | |
| "kl": 0.14997942075133325, | |
| "epoch": 0.744, | |
| "step": 1860 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 20.0, | |
| "learning_rate": 1.2700000000000001e-06, | |
| "rewards/reward_fn": 0.4610200017690659, | |
| "reward": 0.4610200017690659, | |
| "reward_std": 0.028940725000575186, | |
| "completion_length": 79.1, | |
| "kl": 0.1250425823032856, | |
| "epoch": 0.746, | |
| "step": 1865 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 20.125, | |
| "learning_rate": 1.26e-06, | |
| "rewards/reward_fn": 0.44400312602519987, | |
| "reward": 0.44400312602519987, | |
| "reward_std": 0.07846251965966075, | |
| "completion_length": 79.3, | |
| "kl": 0.14869983717799187, | |
| "epoch": 0.748, | |
| "step": 1870 | |
| }, | |
| { | |
| "loss": 0.0071, | |
| "grad_norm": 20.75, | |
| "learning_rate": 1.25e-06, | |
| "rewards/reward_fn": 0.44204375743865965, | |
| "reward": 0.44204375743865965, | |
| "reward_std": 0.09281483425293117, | |
| "completion_length": 78.525, | |
| "kl": 0.17744441479444503, | |
| "epoch": 0.75, | |
| "step": 1875 | |
| }, | |
| { | |
| "loss": 0.0044, | |
| "grad_norm": 23.0, | |
| "learning_rate": 1.2400000000000002e-06, | |
| "rewards/reward_fn": 0.4658162444829941, | |
| "reward": 0.4658162444829941, | |
| "reward_std": 0.02522226042347029, | |
| "completion_length": 78.8, | |
| "kl": 0.11068192198872566, | |
| "epoch": 0.752, | |
| "step": 1880 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 21.0, | |
| "learning_rate": 1.23e-06, | |
| "rewards/reward_fn": 0.4581906199455261, | |
| "reward": 0.4581906199455261, | |
| "reward_std": 0.03355656263884157, | |
| "completion_length": 77.15, | |
| "kl": 0.13748234882950783, | |
| "epoch": 0.754, | |
| "step": 1885 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 21.875, | |
| "learning_rate": 1.2200000000000002e-06, | |
| "rewards/reward_fn": 0.45401187539100646, | |
| "reward": 0.45401187539100646, | |
| "reward_std": 0.051014326070435344, | |
| "completion_length": 78.05, | |
| "kl": 0.14651698172092437, | |
| "epoch": 0.756, | |
| "step": 1890 | |
| }, | |
| { | |
| "loss": 0.0051, | |
| "grad_norm": 21.625, | |
| "learning_rate": 1.21e-06, | |
| "rewards/reward_fn": 0.47379874885082246, | |
| "reward": 0.47379874885082246, | |
| "reward_std": 0.015307459211908282, | |
| "completion_length": 78.8125, | |
| "kl": 0.12749662175774573, | |
| "epoch": 0.758, | |
| "step": 1895 | |
| }, | |
| { | |
| "loss": 0.007, | |
| "grad_norm": 20.875, | |
| "learning_rate": 1.2000000000000002e-06, | |
| "rewards/reward_fn": 0.44789875447750094, | |
| "reward": 0.44789875447750094, | |
| "reward_std": 0.0735718347132206, | |
| "completion_length": 78.3, | |
| "kl": 0.1748662807047367, | |
| "epoch": 0.76, | |
| "step": 1900 | |
| }, | |
| { | |
| "loss": 0.0061, | |
| "grad_norm": 25.875, | |
| "learning_rate": 1.19e-06, | |
| "rewards/reward_fn": 0.4480418682098389, | |
| "reward": 0.4480418682098389, | |
| "reward_std": 0.068895304761827, | |
| "completion_length": 79.225, | |
| "kl": 0.15129087641835212, | |
| "epoch": 0.762, | |
| "step": 1905 | |
| }, | |
| { | |
| "loss": 0.0143, | |
| "grad_norm": 21.75, | |
| "learning_rate": 1.1800000000000001e-06, | |
| "rewards/reward_fn": 0.4410243809223175, | |
| "reward": 0.4410243809223175, | |
| "reward_std": 0.08341788314282894, | |
| "completion_length": 76.725, | |
| "kl": 0.3567336067557335, | |
| "epoch": 0.764, | |
| "step": 1910 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 17.875, | |
| "learning_rate": 1.1700000000000002e-06, | |
| "rewards/reward_fn": 0.4720318764448166, | |
| "reward": 0.4720318764448166, | |
| "reward_std": 0.032237262232229114, | |
| "completion_length": 78.0375, | |
| "kl": 0.14518789127469062, | |
| "epoch": 0.766, | |
| "step": 1915 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 20.875, | |
| "learning_rate": 1.1600000000000001e-06, | |
| "rewards/reward_fn": 0.4640293687582016, | |
| "reward": 0.4640293687582016, | |
| "reward_std": 0.030499694612808527, | |
| "completion_length": 78.4, | |
| "kl": 0.1303658217191696, | |
| "epoch": 0.768, | |
| "step": 1920 | |
| }, | |
| { | |
| "loss": 0.0073, | |
| "grad_norm": 27.875, | |
| "learning_rate": 1.1500000000000002e-06, | |
| "rewards/reward_fn": 0.44796750247478484, | |
| "reward": 0.44796750247478484, | |
| "reward_std": 0.09624997415812686, | |
| "completion_length": 78.65, | |
| "kl": 0.18188868314027787, | |
| "epoch": 0.77, | |
| "step": 1925 | |
| }, | |
| { | |
| "loss": 0.0065, | |
| "grad_norm": 20.625, | |
| "learning_rate": 1.14e-06, | |
| "rewards/reward_fn": 0.45636438131332396, | |
| "reward": 0.45636438131332396, | |
| "reward_std": 0.08401111733401194, | |
| "completion_length": 77.075, | |
| "kl": 0.1628888465464115, | |
| "epoch": 0.772, | |
| "step": 1930 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 19.375, | |
| "learning_rate": 1.1300000000000002e-06, | |
| "rewards/reward_fn": 0.4671268731355667, | |
| "reward": 0.4671268731355667, | |
| "reward_std": 0.024293193663470446, | |
| "completion_length": 78.3875, | |
| "kl": 0.13081972151994706, | |
| "epoch": 0.774, | |
| "step": 1935 | |
| }, | |
| { | |
| "loss": 0.0068, | |
| "grad_norm": 20.375, | |
| "learning_rate": 1.12e-06, | |
| "rewards/reward_fn": 0.45347937643527986, | |
| "reward": 0.45347937643527986, | |
| "reward_std": 0.06302163258660584, | |
| "completion_length": 79.0, | |
| "kl": 0.17032922431826591, | |
| "epoch": 0.776, | |
| "step": 1940 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 20.0, | |
| "learning_rate": 1.1100000000000002e-06, | |
| "rewards/reward_fn": 0.45575874745845796, | |
| "reward": 0.45575874745845796, | |
| "reward_std": 0.0562650595093146, | |
| "completion_length": 78.85, | |
| "kl": 0.12893958985805512, | |
| "epoch": 0.778, | |
| "step": 1945 | |
| }, | |
| { | |
| "loss": 0.0061, | |
| "grad_norm": 21.25, | |
| "learning_rate": 1.1e-06, | |
| "rewards/reward_fn": 0.4621687412261963, | |
| "reward": 0.4621687412261963, | |
| "reward_std": 0.0637435567798093, | |
| "completion_length": 78.725, | |
| "kl": 0.15337565019726754, | |
| "epoch": 0.78, | |
| "step": 1950 | |
| }, | |
| { | |
| "loss": 0.0064, | |
| "grad_norm": 23.25, | |
| "learning_rate": 1.0900000000000002e-06, | |
| "rewards/reward_fn": 0.46070688366889956, | |
| "reward": 0.46070688366889956, | |
| "reward_std": 0.03493543366203085, | |
| "completion_length": 77.375, | |
| "kl": 0.1603299029171467, | |
| "epoch": 0.782, | |
| "step": 1955 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 22.25, | |
| "learning_rate": 1.08e-06, | |
| "rewards/reward_fn": 0.46500125527381897, | |
| "reward": 0.46500125527381897, | |
| "reward_std": 0.024533626122865825, | |
| "completion_length": 78.7125, | |
| "kl": 0.1345980040729046, | |
| "epoch": 0.784, | |
| "step": 1960 | |
| }, | |
| { | |
| "loss": 0.0072, | |
| "grad_norm": 22.0, | |
| "learning_rate": 1.0700000000000001e-06, | |
| "rewards/reward_fn": 0.43493750393390657, | |
| "reward": 0.43493750393390657, | |
| "reward_std": 0.11233580666594208, | |
| "completion_length": 78.8625, | |
| "kl": 0.18072494119405746, | |
| "epoch": 0.786, | |
| "step": 1965 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 20.5, | |
| "learning_rate": 1.06e-06, | |
| "rewards/reward_fn": 0.46299062967300414, | |
| "reward": 0.46299062967300414, | |
| "reward_std": 0.0409371492365608, | |
| "completion_length": 78.275, | |
| "kl": 0.12988597080111502, | |
| "epoch": 0.788, | |
| "step": 1970 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 21.25, | |
| "learning_rate": 1.0500000000000001e-06, | |
| "rewards/reward_fn": 0.4538056284189224, | |
| "reward": 0.4538056284189224, | |
| "reward_std": 0.04634799053892493, | |
| "completion_length": 78.4375, | |
| "kl": 0.1439467839896679, | |
| "epoch": 0.79, | |
| "step": 1975 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 20.625, | |
| "learning_rate": 1.04e-06, | |
| "rewards/reward_fn": 0.4611237466335297, | |
| "reward": 0.4611237466335297, | |
| "reward_std": 0.04574344952125102, | |
| "completion_length": 78.6, | |
| "kl": 0.13221421986818313, | |
| "epoch": 0.792, | |
| "step": 1980 | |
| }, | |
| { | |
| "loss": 0.0062, | |
| "grad_norm": 22.375, | |
| "learning_rate": 1.03e-06, | |
| "rewards/reward_fn": 0.4470118790864944, | |
| "reward": 0.4470118790864944, | |
| "reward_std": 0.08011215794831514, | |
| "completion_length": 79.0, | |
| "kl": 0.15436191707849503, | |
| "epoch": 0.794, | |
| "step": 1985 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 20.75, | |
| "learning_rate": 1.02e-06, | |
| "rewards/reward_fn": 0.46021624803543093, | |
| "reward": 0.46021624803543093, | |
| "reward_std": 0.040483302506618205, | |
| "completion_length": 79.525, | |
| "kl": 0.13130446001887322, | |
| "epoch": 0.796, | |
| "step": 1990 | |
| }, | |
| { | |
| "loss": 0.0062, | |
| "grad_norm": 21.625, | |
| "learning_rate": 1.01e-06, | |
| "rewards/reward_fn": 0.45304437875747683, | |
| "reward": 0.45304437875747683, | |
| "reward_std": 0.06350767945405096, | |
| "completion_length": 77.9125, | |
| "kl": 0.1545679196715355, | |
| "epoch": 0.798, | |
| "step": 1995 | |
| }, | |
| { | |
| "loss": 0.0063, | |
| "grad_norm": 20.875, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "rewards/reward_fn": 0.46792625188827514, | |
| "reward": 0.46792625188827514, | |
| "reward_std": 0.029082121956162155, | |
| "completion_length": 77.5375, | |
| "kl": 0.15671682581305504, | |
| "epoch": 0.8, | |
| "step": 2000 | |
| }, | |
| { | |
| "loss": 0.007, | |
| "grad_norm": 22.625, | |
| "learning_rate": 9.9e-07, | |
| "rewards/reward_fn": 0.4429012507200241, | |
| "reward": 0.4429012507200241, | |
| "reward_std": 0.0852669625543058, | |
| "completion_length": 78.9125, | |
| "kl": 0.17405613735318184, | |
| "epoch": 0.802, | |
| "step": 2005 | |
| }, | |
| { | |
| "loss": 0.0063, | |
| "grad_norm": 20.0, | |
| "learning_rate": 9.800000000000001e-07, | |
| "rewards/reward_fn": 0.4483262479305267, | |
| "reward": 0.4483262479305267, | |
| "reward_std": 0.07652467372827232, | |
| "completion_length": 77.325, | |
| "kl": 0.15809645801782607, | |
| "epoch": 0.804, | |
| "step": 2010 | |
| }, | |
| { | |
| "loss": 0.0061, | |
| "grad_norm": 22.125, | |
| "learning_rate": 9.7e-07, | |
| "rewards/reward_fn": 0.45815313160419463, | |
| "reward": 0.45815313160419463, | |
| "reward_std": 0.045375860878266394, | |
| "completion_length": 78.275, | |
| "kl": 0.1531553089618683, | |
| "epoch": 0.806, | |
| "step": 2015 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 20.0, | |
| "learning_rate": 9.600000000000001e-07, | |
| "rewards/reward_fn": 0.4604393750429153, | |
| "reward": 0.4604393750429153, | |
| "reward_std": 0.04589560895692557, | |
| "completion_length": 77.825, | |
| "kl": 0.1458041973412037, | |
| "epoch": 0.808, | |
| "step": 2020 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 23.625, | |
| "learning_rate": 9.500000000000001e-07, | |
| "rewards/reward_fn": 0.4627868801355362, | |
| "reward": 0.4627868801355362, | |
| "reward_std": 0.041009452322032305, | |
| "completion_length": 78.1625, | |
| "kl": 0.1340768076479435, | |
| "epoch": 0.81, | |
| "step": 2025 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 20.625, | |
| "learning_rate": 9.400000000000001e-07, | |
| "rewards/reward_fn": 0.47766625583171846, | |
| "reward": 0.47766625583171846, | |
| "reward_std": 0.008443673443980514, | |
| "completion_length": 79.0625, | |
| "kl": 0.1261758454144001, | |
| "epoch": 0.812, | |
| "step": 2030 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 20.25, | |
| "learning_rate": 9.300000000000001e-07, | |
| "rewards/reward_fn": 0.45916875302791593, | |
| "reward": 0.45916875302791593, | |
| "reward_std": 0.05537645731819794, | |
| "completion_length": 78.35, | |
| "kl": 0.13748721331357955, | |
| "epoch": 0.814, | |
| "step": 2035 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 19.75, | |
| "learning_rate": 9.200000000000001e-07, | |
| "rewards/reward_fn": 0.4704318791627884, | |
| "reward": 0.4704318791627884, | |
| "reward_std": 0.02074106188956648, | |
| "completion_length": 78.2375, | |
| "kl": 0.1439397320151329, | |
| "epoch": 0.816, | |
| "step": 2040 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 23.25, | |
| "learning_rate": 9.100000000000001e-07, | |
| "rewards/reward_fn": 0.457552495598793, | |
| "reward": 0.457552495598793, | |
| "reward_std": 0.04766743449727073, | |
| "completion_length": 79.125, | |
| "kl": 0.14166640490293503, | |
| "epoch": 0.818, | |
| "step": 2045 | |
| }, | |
| { | |
| "loss": 0.0048, | |
| "grad_norm": 25.0, | |
| "learning_rate": 9.000000000000001e-07, | |
| "rewards/reward_fn": 0.46860311925411224, | |
| "reward": 0.46860311925411224, | |
| "reward_std": 0.03131808526813984, | |
| "completion_length": 78.3, | |
| "kl": 0.12001164257526398, | |
| "epoch": 0.82, | |
| "step": 2050 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 18.5, | |
| "learning_rate": 8.900000000000001e-07, | |
| "rewards/reward_fn": 0.4583775013685226, | |
| "reward": 0.4583775013685226, | |
| "reward_std": 0.0501600137562491, | |
| "completion_length": 78.65, | |
| "kl": 0.13961323350667953, | |
| "epoch": 0.822, | |
| "step": 2055 | |
| }, | |
| { | |
| "loss": 0.0071, | |
| "grad_norm": 20.625, | |
| "learning_rate": 8.8e-07, | |
| "rewards/reward_fn": 0.4555699944496155, | |
| "reward": 0.4555699944496155, | |
| "reward_std": 0.05186676031444222, | |
| "completion_length": 77.7875, | |
| "kl": 0.17678724601864815, | |
| "epoch": 0.824, | |
| "step": 2060 | |
| }, | |
| { | |
| "loss": 0.0048, | |
| "grad_norm": 21.375, | |
| "learning_rate": 8.7e-07, | |
| "rewards/reward_fn": 0.46306562423706055, | |
| "reward": 0.46306562423706055, | |
| "reward_std": 0.025608734460547566, | |
| "completion_length": 78.45, | |
| "kl": 0.1207703597843647, | |
| "epoch": 0.826, | |
| "step": 2065 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 20.375, | |
| "learning_rate": 8.6e-07, | |
| "rewards/reward_fn": 0.4619231253862381, | |
| "reward": 0.4619231253862381, | |
| "reward_std": 0.05284600446466357, | |
| "completion_length": 78.5625, | |
| "kl": 0.14405835717916488, | |
| "epoch": 0.828, | |
| "step": 2070 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 19.75, | |
| "learning_rate": 8.500000000000001e-07, | |
| "rewards/reward_fn": 0.46677875220775605, | |
| "reward": 0.46677875220775605, | |
| "reward_std": 0.02917533617001027, | |
| "completion_length": 76.925, | |
| "kl": 0.14229361489415168, | |
| "epoch": 0.83, | |
| "step": 2075 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 20.875, | |
| "learning_rate": 8.400000000000001e-07, | |
| "rewards/reward_fn": 0.46606625616550446, | |
| "reward": 0.46606625616550446, | |
| "reward_std": 0.0280997826019302, | |
| "completion_length": 77.9125, | |
| "kl": 0.13288158997893335, | |
| "epoch": 0.832, | |
| "step": 2080 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 26.75, | |
| "learning_rate": 8.300000000000001e-07, | |
| "rewards/reward_fn": 0.4598168820142746, | |
| "reward": 0.4598168820142746, | |
| "reward_std": 0.03902562449220568, | |
| "completion_length": 78.4625, | |
| "kl": 0.13612622767686844, | |
| "epoch": 0.834, | |
| "step": 2085 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 20.375, | |
| "learning_rate": 8.200000000000001e-07, | |
| "rewards/reward_fn": 0.4634856253862381, | |
| "reward": 0.4634856253862381, | |
| "reward_std": 0.03273412830894813, | |
| "completion_length": 78.8875, | |
| "kl": 0.12488429546356201, | |
| "epoch": 0.836, | |
| "step": 2090 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 19.625, | |
| "learning_rate": 8.100000000000001e-07, | |
| "rewards/reward_fn": 0.469024994969368, | |
| "reward": 0.469024994969368, | |
| "reward_std": 0.025262853922322394, | |
| "completion_length": 79.1, | |
| "kl": 0.12443113997578621, | |
| "epoch": 0.838, | |
| "step": 2095 | |
| }, | |
| { | |
| "loss": 0.006, | |
| "grad_norm": 21.875, | |
| "learning_rate": 8.000000000000001e-07, | |
| "rewards/reward_fn": 0.4686718791723251, | |
| "reward": 0.4686718791723251, | |
| "reward_std": 0.03224018139299005, | |
| "completion_length": 77.9125, | |
| "kl": 0.15120850279927253, | |
| "epoch": 0.84, | |
| "step": 2100 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 24.125, | |
| "learning_rate": 7.900000000000001e-07, | |
| "rewards/reward_fn": 0.4641831278800964, | |
| "reward": 0.4641831278800964, | |
| "reward_std": 0.045767600310500714, | |
| "completion_length": 77.9875, | |
| "kl": 0.14055218696594238, | |
| "epoch": 0.842, | |
| "step": 2105 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 22.625, | |
| "learning_rate": 7.8e-07, | |
| "rewards/reward_fn": 0.44297937452793124, | |
| "reward": 0.44297937452793124, | |
| "reward_std": 0.0778072669985704, | |
| "completion_length": 77.5375, | |
| "kl": 0.147323065251112, | |
| "epoch": 0.844, | |
| "step": 2110 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 20.25, | |
| "learning_rate": 7.7e-07, | |
| "rewards/reward_fn": 0.4716887503862381, | |
| "reward": 0.4716887503862381, | |
| "reward_std": 0.02907162085175514, | |
| "completion_length": 78.675, | |
| "kl": 0.1302117206156254, | |
| "epoch": 0.846, | |
| "step": 2115 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 19.875, | |
| "learning_rate": 7.6e-07, | |
| "rewards/reward_fn": 0.4569631278514862, | |
| "reward": 0.4569631278514862, | |
| "reward_std": 0.04407282890751958, | |
| "completion_length": 79.1125, | |
| "kl": 0.13647983074188233, | |
| "epoch": 0.848, | |
| "step": 2120 | |
| }, | |
| { | |
| "loss": 0.0046, | |
| "grad_norm": 21.75, | |
| "learning_rate": 7.5e-07, | |
| "rewards/reward_fn": 0.47706499695777893, | |
| "reward": 0.47706499695777893, | |
| "reward_std": 0.009564002160914242, | |
| "completion_length": 79.5625, | |
| "kl": 0.11507855504751205, | |
| "epoch": 0.85, | |
| "step": 2125 | |
| }, | |
| { | |
| "loss": 0.0062, | |
| "grad_norm": 21.625, | |
| "learning_rate": 7.4e-07, | |
| "rewards/reward_fn": 0.4304031223058701, | |
| "reward": 0.4304031223058701, | |
| "reward_std": 0.08106931184884161, | |
| "completion_length": 78.85, | |
| "kl": 0.15556320548057556, | |
| "epoch": 0.852, | |
| "step": 2130 | |
| }, | |
| { | |
| "loss": 0.006, | |
| "grad_norm": 23.375, | |
| "learning_rate": 7.3e-07, | |
| "rewards/reward_fn": 0.44377937018871305, | |
| "reward": 0.44377937018871305, | |
| "reward_std": 0.08343072717543691, | |
| "completion_length": 78.675, | |
| "kl": 0.14879855364561081, | |
| "epoch": 0.854, | |
| "step": 2135 | |
| }, | |
| { | |
| "loss": 0.0064, | |
| "grad_norm": 23.5, | |
| "learning_rate": 7.2e-07, | |
| "rewards/reward_fn": 0.45706000328063967, | |
| "reward": 0.45706000328063967, | |
| "reward_std": 0.043012913013808426, | |
| "completion_length": 78.25, | |
| "kl": 0.1595211073756218, | |
| "epoch": 0.856, | |
| "step": 2140 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 19.0, | |
| "learning_rate": 7.1e-07, | |
| "rewards/reward_fn": 0.4507762461900711, | |
| "reward": 0.4507762461900711, | |
| "reward_std": 0.0820188666926697, | |
| "completion_length": 78.475, | |
| "kl": 0.1462649531662464, | |
| "epoch": 0.858, | |
| "step": 2145 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 19.875, | |
| "learning_rate": 7.000000000000001e-07, | |
| "rewards/reward_fn": 0.46033436954021456, | |
| "reward": 0.46033436954021456, | |
| "reward_std": 0.05020685677882284, | |
| "completion_length": 77.6375, | |
| "kl": 0.13829350471496582, | |
| "epoch": 0.86, | |
| "step": 2150 | |
| }, | |
| { | |
| "loss": 0.0065, | |
| "grad_norm": 20.375, | |
| "learning_rate": 6.900000000000001e-07, | |
| "rewards/reward_fn": 0.44231187999248506, | |
| "reward": 0.44231187999248506, | |
| "reward_std": 0.0736640966264531, | |
| "completion_length": 77.6125, | |
| "kl": 0.1614016644656658, | |
| "epoch": 0.862, | |
| "step": 2155 | |
| }, | |
| { | |
| "loss": 0.0061, | |
| "grad_norm": 19.875, | |
| "learning_rate": 6.800000000000001e-07, | |
| "rewards/reward_fn": 0.45599688291549684, | |
| "reward": 0.45599688291549684, | |
| "reward_std": 0.06550167343229987, | |
| "completion_length": 78.2125, | |
| "kl": 0.15353991836309433, | |
| "epoch": 0.864, | |
| "step": 2160 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 20.125, | |
| "learning_rate": 6.7e-07, | |
| "rewards/reward_fn": 0.4656537532806396, | |
| "reward": 0.4656537532806396, | |
| "reward_std": 0.025680063420441, | |
| "completion_length": 77.125, | |
| "kl": 0.1347724623978138, | |
| "epoch": 0.866, | |
| "step": 2165 | |
| }, | |
| { | |
| "loss": 0.0067, | |
| "grad_norm": 20.75, | |
| "learning_rate": 6.6e-07, | |
| "rewards/reward_fn": 0.4520668715238571, | |
| "reward": 0.4520668715238571, | |
| "reward_std": 0.07245250167325139, | |
| "completion_length": 78.4375, | |
| "kl": 0.16720658987760545, | |
| "epoch": 0.868, | |
| "step": 2170 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 22.5, | |
| "learning_rate": 6.5e-07, | |
| "rewards/reward_fn": 0.46412250101566316, | |
| "reward": 0.46412250101566316, | |
| "reward_std": 0.03379640890052542, | |
| "completion_length": 77.8625, | |
| "kl": 0.134855917096138, | |
| "epoch": 0.87, | |
| "step": 2175 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 24.25, | |
| "learning_rate": 6.4e-07, | |
| "rewards/reward_fn": 0.4482081264257431, | |
| "reward": 0.4482081264257431, | |
| "reward_std": 0.07765977667877451, | |
| "completion_length": 78.7875, | |
| "kl": 0.13678457364439964, | |
| "epoch": 0.872, | |
| "step": 2180 | |
| }, | |
| { | |
| "loss": 0.0079, | |
| "grad_norm": 23.0, | |
| "learning_rate": 6.3e-07, | |
| "rewards/reward_fn": 0.4295049995183945, | |
| "reward": 0.4295049995183945, | |
| "reward_std": 0.12403819523751736, | |
| "completion_length": 76.825, | |
| "kl": 0.1965901866555214, | |
| "epoch": 0.874, | |
| "step": 2185 | |
| }, | |
| { | |
| "loss": 0.0069, | |
| "grad_norm": 20.0, | |
| "learning_rate": 6.200000000000001e-07, | |
| "rewards/reward_fn": 0.4354356348514557, | |
| "reward": 0.4354356348514557, | |
| "reward_std": 0.10014819449279458, | |
| "completion_length": 78.525, | |
| "kl": 0.17188069224357605, | |
| "epoch": 0.876, | |
| "step": 2190 | |
| }, | |
| { | |
| "loss": 0.0069, | |
| "grad_norm": 25.875, | |
| "learning_rate": 6.100000000000001e-07, | |
| "rewards/reward_fn": 0.4368724972009659, | |
| "reward": 0.4368724972009659, | |
| "reward_std": 0.09698029151186346, | |
| "completion_length": 78.4875, | |
| "kl": 0.17358247861266135, | |
| "epoch": 0.878, | |
| "step": 2195 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 20.25, | |
| "learning_rate": 6.000000000000001e-07, | |
| "rewards/reward_fn": 0.4686275005340576, | |
| "reward": 0.4686275005340576, | |
| "reward_std": 0.02711519307922572, | |
| "completion_length": 78.2125, | |
| "kl": 0.14824069589376448, | |
| "epoch": 0.88, | |
| "step": 2200 | |
| }, | |
| { | |
| "loss": 0.0063, | |
| "grad_norm": 21.125, | |
| "learning_rate": 5.900000000000001e-07, | |
| "rewards/reward_fn": 0.4606556236743927, | |
| "reward": 0.4606556236743927, | |
| "reward_std": 0.057382132229395214, | |
| "completion_length": 78.55, | |
| "kl": 0.1566497005522251, | |
| "epoch": 0.882, | |
| "step": 2205 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 21.125, | |
| "learning_rate": 5.800000000000001e-07, | |
| "rewards/reward_fn": 0.4678006261587143, | |
| "reward": 0.4678006261587143, | |
| "reward_std": 0.028731092542875557, | |
| "completion_length": 77.9625, | |
| "kl": 0.14399609267711638, | |
| "epoch": 0.884, | |
| "step": 2210 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 18.875, | |
| "learning_rate": 5.7e-07, | |
| "rewards/reward_fn": 0.45871124863624574, | |
| "reward": 0.45871124863624574, | |
| "reward_std": 0.061426320811733603, | |
| "completion_length": 77.2625, | |
| "kl": 0.13430218696594237, | |
| "epoch": 0.886, | |
| "step": 2215 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 21.875, | |
| "learning_rate": 5.6e-07, | |
| "rewards/reward_fn": 0.46343562602996824, | |
| "reward": 0.46343562602996824, | |
| "reward_std": 0.048576657217927276, | |
| "completion_length": 78.0375, | |
| "kl": 0.13749194145202637, | |
| "epoch": 0.888, | |
| "step": 2220 | |
| }, | |
| { | |
| "loss": 0.005, | |
| "grad_norm": 21.0, | |
| "learning_rate": 5.5e-07, | |
| "rewards/reward_fn": 0.45629812180995943, | |
| "reward": 0.45629812180995943, | |
| "reward_std": 0.05870918773580343, | |
| "completion_length": 79.225, | |
| "kl": 0.1249243251979351, | |
| "epoch": 0.89, | |
| "step": 2225 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 19.875, | |
| "learning_rate": 5.4e-07, | |
| "rewards/reward_fn": 0.4663606256246567, | |
| "reward": 0.4663606256246567, | |
| "reward_std": 0.03157033738680184, | |
| "completion_length": 77.35, | |
| "kl": 0.1368165969848633, | |
| "epoch": 0.892, | |
| "step": 2230 | |
| }, | |
| { | |
| "loss": 0.0061, | |
| "grad_norm": 20.625, | |
| "learning_rate": 5.3e-07, | |
| "rewards/reward_fn": 0.4638850033283234, | |
| "reward": 0.4638850033283234, | |
| "reward_std": 0.04434651714982465, | |
| "completion_length": 77.3, | |
| "kl": 0.1524613842368126, | |
| "epoch": 0.894, | |
| "step": 2235 | |
| }, | |
| { | |
| "loss": 0.0064, | |
| "grad_norm": 22.0, | |
| "learning_rate": 5.2e-07, | |
| "rewards/reward_fn": 0.4575281262397766, | |
| "reward": 0.4575281262397766, | |
| "reward_std": 0.0726023374358192, | |
| "completion_length": 77.475, | |
| "kl": 0.16076251789927481, | |
| "epoch": 0.896, | |
| "step": 2240 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 21.0, | |
| "learning_rate": 5.1e-07, | |
| "rewards/reward_fn": 0.4624299943447113, | |
| "reward": 0.4624299943447113, | |
| "reward_std": 0.03603266594000161, | |
| "completion_length": 77.6375, | |
| "kl": 0.14188418835401534, | |
| "epoch": 0.898, | |
| "step": 2245 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 21.125, | |
| "learning_rate": 5.000000000000001e-07, | |
| "rewards/reward_fn": 0.4630206227302551, | |
| "reward": 0.4630206227302551, | |
| "reward_std": 0.04361341076437384, | |
| "completion_length": 79.175, | |
| "kl": 0.1345802366733551, | |
| "epoch": 0.9, | |
| "step": 2250 | |
| }, | |
| { | |
| "loss": 0.0066, | |
| "grad_norm": 21.5, | |
| "learning_rate": 4.900000000000001e-07, | |
| "rewards/reward_fn": 0.45432437062263487, | |
| "reward": 0.45432437062263487, | |
| "reward_std": 0.06151717790635303, | |
| "completion_length": 77.125, | |
| "kl": 0.16546293646097182, | |
| "epoch": 0.902, | |
| "step": 2255 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 20.875, | |
| "learning_rate": 4.800000000000001e-07, | |
| "rewards/reward_fn": 0.4623143792152405, | |
| "reward": 0.4623143792152405, | |
| "reward_std": 0.04675731394672766, | |
| "completion_length": 78.6875, | |
| "kl": 0.13369161933660506, | |
| "epoch": 0.904, | |
| "step": 2260 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 20.875, | |
| "learning_rate": 4.7000000000000005e-07, | |
| "rewards/reward_fn": 0.4644206166267395, | |
| "reward": 0.4644206166267395, | |
| "reward_std": 0.03467130603967235, | |
| "completion_length": 78.0875, | |
| "kl": 0.13208074048161506, | |
| "epoch": 0.906, | |
| "step": 2265 | |
| }, | |
| { | |
| "loss": 0.006, | |
| "grad_norm": 20.625, | |
| "learning_rate": 4.6000000000000004e-07, | |
| "rewards/reward_fn": 0.4571474939584732, | |
| "reward": 0.4571474939584732, | |
| "reward_std": 0.05679858090588823, | |
| "completion_length": 77.3, | |
| "kl": 0.1495486691594124, | |
| "epoch": 0.908, | |
| "step": 2270 | |
| }, | |
| { | |
| "loss": 0.007, | |
| "grad_norm": 23.625, | |
| "learning_rate": 4.5000000000000003e-07, | |
| "rewards/reward_fn": 0.42995937168598175, | |
| "reward": 0.42995937168598175, | |
| "reward_std": 0.10549633367918432, | |
| "completion_length": 77.675, | |
| "kl": 0.17482125535607337, | |
| "epoch": 0.91, | |
| "step": 2275 | |
| }, | |
| { | |
| "loss": 0.0051, | |
| "grad_norm": 22.125, | |
| "learning_rate": 4.4e-07, | |
| "rewards/reward_fn": 0.4737518787384033, | |
| "reward": 0.4737518787384033, | |
| "reward_std": 0.009154988103546202, | |
| "completion_length": 78.3, | |
| "kl": 0.1262164294719696, | |
| "epoch": 0.912, | |
| "step": 2280 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 23.5, | |
| "learning_rate": 4.3e-07, | |
| "rewards/reward_fn": 0.46946938037872316, | |
| "reward": 0.46946938037872316, | |
| "reward_std": 0.03301746472716331, | |
| "completion_length": 78.7125, | |
| "kl": 0.14844730645418167, | |
| "epoch": 0.914, | |
| "step": 2285 | |
| }, | |
| { | |
| "loss": 0.0078, | |
| "grad_norm": 21.25, | |
| "learning_rate": 4.2000000000000006e-07, | |
| "rewards/reward_fn": 0.44804688096046447, | |
| "reward": 0.44804688096046447, | |
| "reward_std": 0.06899331058375538, | |
| "completion_length": 78.1875, | |
| "kl": 0.19460128620266914, | |
| "epoch": 0.916, | |
| "step": 2290 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 22.5, | |
| "learning_rate": 4.1000000000000004e-07, | |
| "rewards/reward_fn": 0.4682606279850006, | |
| "reward": 0.4682606279850006, | |
| "reward_std": 0.05492446586722508, | |
| "completion_length": 78.1375, | |
| "kl": 0.13396066278219224, | |
| "epoch": 0.918, | |
| "step": 2295 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 18.875, | |
| "learning_rate": 4.0000000000000003e-07, | |
| "rewards/reward_fn": 0.45140312910079955, | |
| "reward": 0.45140312910079955, | |
| "reward_std": 0.04784779482288286, | |
| "completion_length": 78.45, | |
| "kl": 0.14303272366523742, | |
| "epoch": 0.92, | |
| "step": 2300 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 21.875, | |
| "learning_rate": 3.9e-07, | |
| "rewards/reward_fn": 0.4413818746805191, | |
| "reward": 0.4413818746805191, | |
| "reward_std": 0.07519180465023964, | |
| "completion_length": 78.7, | |
| "kl": 0.14771961718797683, | |
| "epoch": 0.922, | |
| "step": 2305 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 21.375, | |
| "learning_rate": 3.8e-07, | |
| "rewards/reward_fn": 0.46304125487804415, | |
| "reward": 0.46304125487804415, | |
| "reward_std": 0.042501320654992014, | |
| "completion_length": 78.475, | |
| "kl": 0.13340821117162704, | |
| "epoch": 0.924, | |
| "step": 2310 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 20.5, | |
| "learning_rate": 3.7e-07, | |
| "rewards/reward_fn": 0.46366499960422514, | |
| "reward": 0.46366499960422514, | |
| "reward_std": 0.04564647800289094, | |
| "completion_length": 78.75, | |
| "kl": 0.142959389090538, | |
| "epoch": 0.926, | |
| "step": 2315 | |
| }, | |
| { | |
| "loss": 0.007, | |
| "grad_norm": 21.5, | |
| "learning_rate": 3.6e-07, | |
| "rewards/reward_fn": 0.44440999925136565, | |
| "reward": 0.44440999925136565, | |
| "reward_std": 0.09289236271288245, | |
| "completion_length": 77.9125, | |
| "kl": 0.17386788129806519, | |
| "epoch": 0.928, | |
| "step": 2320 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 19.375, | |
| "learning_rate": 3.5000000000000004e-07, | |
| "rewards/reward_fn": 0.44386188089847567, | |
| "reward": 0.44386188089847567, | |
| "reward_std": 0.07522995788604021, | |
| "completion_length": 77.9375, | |
| "kl": 0.14186157137155533, | |
| "epoch": 0.93, | |
| "step": 2325 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 20.875, | |
| "learning_rate": 3.4000000000000003e-07, | |
| "rewards/reward_fn": 0.4641656279563904, | |
| "reward": 0.4641656279563904, | |
| "reward_std": 0.04270973342936486, | |
| "completion_length": 78.125, | |
| "kl": 0.13535649850964546, | |
| "epoch": 0.932, | |
| "step": 2330 | |
| }, | |
| { | |
| "loss": 0.0075, | |
| "grad_norm": 22.625, | |
| "learning_rate": 3.3e-07, | |
| "rewards/reward_fn": 0.4358831226825714, | |
| "reward": 0.4358831226825714, | |
| "reward_std": 0.09227959238924086, | |
| "completion_length": 78.2625, | |
| "kl": 0.18859679996967316, | |
| "epoch": 0.934, | |
| "step": 2335 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 19.75, | |
| "learning_rate": 3.2e-07, | |
| "rewards/reward_fn": 0.46294688284397123, | |
| "reward": 0.46294688284397123, | |
| "reward_std": 0.04259873778792098, | |
| "completion_length": 78.7375, | |
| "kl": 0.14116661995649338, | |
| "epoch": 0.936, | |
| "step": 2340 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 26.875, | |
| "learning_rate": 3.1000000000000005e-07, | |
| "rewards/reward_fn": 0.45820999443531035, | |
| "reward": 0.45820999443531035, | |
| "reward_std": 0.049100439576432106, | |
| "completion_length": 78.6875, | |
| "kl": 0.1331377424299717, | |
| "epoch": 0.938, | |
| "step": 2345 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 24.125, | |
| "learning_rate": 3.0000000000000004e-07, | |
| "rewards/reward_fn": 0.4553199976682663, | |
| "reward": 0.4553199976682663, | |
| "reward_std": 0.06563679699320346, | |
| "completion_length": 77.9125, | |
| "kl": 0.13799761608242989, | |
| "epoch": 0.94, | |
| "step": 2350 | |
| }, | |
| { | |
| "loss": 0.007, | |
| "grad_norm": 23.375, | |
| "learning_rate": 2.9000000000000003e-07, | |
| "rewards/reward_fn": 0.45740562677383423, | |
| "reward": 0.45740562677383423, | |
| "reward_std": 0.05445564701221883, | |
| "completion_length": 77.05, | |
| "kl": 0.17594465613365173, | |
| "epoch": 0.942, | |
| "step": 2355 | |
| }, | |
| { | |
| "loss": 0.0066, | |
| "grad_norm": 28.25, | |
| "learning_rate": 2.8e-07, | |
| "rewards/reward_fn": 0.463620001077652, | |
| "reward": 0.463620001077652, | |
| "reward_std": 0.051476556318812074, | |
| "completion_length": 77.0625, | |
| "kl": 0.1647212788462639, | |
| "epoch": 0.944, | |
| "step": 2360 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 19.625, | |
| "learning_rate": 2.7e-07, | |
| "rewards/reward_fn": 0.46528937220573424, | |
| "reward": 0.46528937220573424, | |
| "reward_std": 0.022915772977285087, | |
| "completion_length": 78.4625, | |
| "kl": 0.132051981985569, | |
| "epoch": 0.946, | |
| "step": 2365 | |
| }, | |
| { | |
| "loss": 0.0069, | |
| "grad_norm": 19.875, | |
| "learning_rate": 2.6e-07, | |
| "rewards/reward_fn": 0.4677406221628189, | |
| "reward": 0.4677406221628189, | |
| "reward_std": 0.05549450130201876, | |
| "completion_length": 78.125, | |
| "kl": 0.1719025544822216, | |
| "epoch": 0.948, | |
| "step": 2370 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 21.0, | |
| "learning_rate": 2.5000000000000004e-07, | |
| "rewards/reward_fn": 0.45939249396324155, | |
| "reward": 0.45939249396324155, | |
| "reward_std": 0.0691711014136672, | |
| "completion_length": 78.875, | |
| "kl": 0.14711649268865584, | |
| "epoch": 0.95, | |
| "step": 2375 | |
| }, | |
| { | |
| "loss": 0.0069, | |
| "grad_norm": 23.125, | |
| "learning_rate": 2.4000000000000003e-07, | |
| "rewards/reward_fn": 0.4654675006866455, | |
| "reward": 0.4654675006866455, | |
| "reward_std": 0.030214719858486207, | |
| "completion_length": 77.5, | |
| "kl": 0.1717626817524433, | |
| "epoch": 0.952, | |
| "step": 2380 | |
| }, | |
| { | |
| "loss": 0.0063, | |
| "grad_norm": 17.125, | |
| "learning_rate": 2.3000000000000002e-07, | |
| "rewards/reward_fn": 0.45497375130653384, | |
| "reward": 0.45497375130653384, | |
| "reward_std": 0.06808556367177516, | |
| "completion_length": 77.825, | |
| "kl": 0.15703836753964423, | |
| "epoch": 0.954, | |
| "step": 2385 | |
| }, | |
| { | |
| "loss": 0.006, | |
| "grad_norm": 22.75, | |
| "learning_rate": 2.2e-07, | |
| "rewards/reward_fn": 0.44606186747550963, | |
| "reward": 0.44606186747550963, | |
| "reward_std": 0.08989615420578048, | |
| "completion_length": 78.0875, | |
| "kl": 0.15060136690735818, | |
| "epoch": 0.956, | |
| "step": 2390 | |
| }, | |
| { | |
| "loss": 0.0051, | |
| "grad_norm": 20.5, | |
| "learning_rate": 2.1000000000000003e-07, | |
| "rewards/reward_fn": 0.47334000170230867, | |
| "reward": 0.47334000170230867, | |
| "reward_std": 0.029013207624666394, | |
| "completion_length": 78.675, | |
| "kl": 0.1277802363038063, | |
| "epoch": 0.958, | |
| "step": 2395 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 21.125, | |
| "learning_rate": 2.0000000000000002e-07, | |
| "rewards/reward_fn": 0.4681881219148636, | |
| "reward": 0.4681881219148636, | |
| "reward_std": 0.02434324522037059, | |
| "completion_length": 79.625, | |
| "kl": 0.1305567964911461, | |
| "epoch": 0.96, | |
| "step": 2400 | |
| }, | |
| { | |
| "loss": 0.006, | |
| "grad_norm": 18.25, | |
| "learning_rate": 1.9e-07, | |
| "rewards/reward_fn": 0.44461186826229093, | |
| "reward": 0.44461186826229093, | |
| "reward_std": 0.07648974631447344, | |
| "completion_length": 79.3375, | |
| "kl": 0.14975779727101327, | |
| "epoch": 0.962, | |
| "step": 2405 | |
| }, | |
| { | |
| "loss": 0.0049, | |
| "grad_norm": 19.125, | |
| "learning_rate": 1.8e-07, | |
| "rewards/reward_fn": 0.46052125096321106, | |
| "reward": 0.46052125096321106, | |
| "reward_std": 0.0383532726438716, | |
| "completion_length": 77.55, | |
| "kl": 0.12272944673895836, | |
| "epoch": 0.964, | |
| "step": 2410 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 19.625, | |
| "learning_rate": 1.7000000000000001e-07, | |
| "rewards/reward_fn": 0.4670031249523163, | |
| "reward": 0.4670031249523163, | |
| "reward_std": 0.03299781592795625, | |
| "completion_length": 78.025, | |
| "kl": 0.14154839739203454, | |
| "epoch": 0.966, | |
| "step": 2415 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 22.375, | |
| "learning_rate": 1.6e-07, | |
| "rewards/reward_fn": 0.46661687791347506, | |
| "reward": 0.46661687791347506, | |
| "reward_std": 0.02604542833287269, | |
| "completion_length": 77.2, | |
| "kl": 0.14838956594467162, | |
| "epoch": 0.968, | |
| "step": 2420 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 22.5, | |
| "learning_rate": 1.5000000000000002e-07, | |
| "rewards/reward_fn": 0.455153751373291, | |
| "reward": 0.455153751373291, | |
| "reward_std": 0.04773070907685906, | |
| "completion_length": 78.75, | |
| "kl": 0.1395164869725704, | |
| "epoch": 0.97, | |
| "step": 2425 | |
| }, | |
| { | |
| "loss": 0.0055, | |
| "grad_norm": 18.625, | |
| "learning_rate": 1.4e-07, | |
| "rewards/reward_fn": 0.46802250742912294, | |
| "reward": 0.46802250742912294, | |
| "reward_std": 0.03353580196853727, | |
| "completion_length": 78.1375, | |
| "kl": 0.13703610971570016, | |
| "epoch": 0.972, | |
| "step": 2430 | |
| }, | |
| { | |
| "loss": 0.0058, | |
| "grad_norm": 19.75, | |
| "learning_rate": 1.3e-07, | |
| "rewards/reward_fn": 0.44542625546455383, | |
| "reward": 0.44542625546455383, | |
| "reward_std": 0.07354072753805667, | |
| "completion_length": 79.375, | |
| "kl": 0.1450169213116169, | |
| "epoch": 0.974, | |
| "step": 2435 | |
| }, | |
| { | |
| "loss": 0.0059, | |
| "grad_norm": 22.5, | |
| "learning_rate": 1.2000000000000002e-07, | |
| "rewards/reward_fn": 0.45854686498641967, | |
| "reward": 0.45854686498641967, | |
| "reward_std": 0.05262974831275642, | |
| "completion_length": 78.1625, | |
| "kl": 0.1467783972620964, | |
| "epoch": 0.976, | |
| "step": 2440 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 21.5, | |
| "learning_rate": 1.1e-07, | |
| "rewards/reward_fn": 0.4662149965763092, | |
| "reward": 0.4662149965763092, | |
| "reward_std": 0.030651798704639077, | |
| "completion_length": 77.7375, | |
| "kl": 0.1360873505473137, | |
| "epoch": 0.978, | |
| "step": 2445 | |
| }, | |
| { | |
| "loss": 0.0063, | |
| "grad_norm": 19.75, | |
| "learning_rate": 1.0000000000000001e-07, | |
| "rewards/reward_fn": 0.4552900016307831, | |
| "reward": 0.4552900016307831, | |
| "reward_std": 0.05980135982390493, | |
| "completion_length": 76.8625, | |
| "kl": 0.15636155605316163, | |
| "epoch": 0.98, | |
| "step": 2450 | |
| }, | |
| { | |
| "loss": 0.0053, | |
| "grad_norm": 22.0, | |
| "learning_rate": 9e-08, | |
| "rewards/reward_fn": 0.45424186289310453, | |
| "reward": 0.45424186289310453, | |
| "reward_std": 0.06591771512757987, | |
| "completion_length": 77.5625, | |
| "kl": 0.13355037719011306, | |
| "epoch": 0.982, | |
| "step": 2455 | |
| }, | |
| { | |
| "loss": 0.0067, | |
| "grad_norm": 24.375, | |
| "learning_rate": 8e-08, | |
| "rewards/reward_fn": 0.438731250166893, | |
| "reward": 0.438731250166893, | |
| "reward_std": 0.10588476944249123, | |
| "completion_length": 77.7, | |
| "kl": 0.1672614686191082, | |
| "epoch": 0.984, | |
| "step": 2460 | |
| }, | |
| { | |
| "loss": 0.0045, | |
| "grad_norm": 20.125, | |
| "learning_rate": 7e-08, | |
| "rewards/reward_fn": 0.4748474985361099, | |
| "reward": 0.4748474985361099, | |
| "reward_std": 0.011794954282231629, | |
| "completion_length": 78.425, | |
| "kl": 0.11192921400070191, | |
| "epoch": 0.986, | |
| "step": 2465 | |
| }, | |
| { | |
| "loss": 0.0056, | |
| "grad_norm": 21.75, | |
| "learning_rate": 6.000000000000001e-08, | |
| "rewards/reward_fn": 0.47034125924110415, | |
| "reward": 0.47034125924110415, | |
| "reward_std": 0.028933694993611425, | |
| "completion_length": 77.525, | |
| "kl": 0.13980434015393256, | |
| "epoch": 0.988, | |
| "step": 2470 | |
| }, | |
| { | |
| "loss": 0.007, | |
| "grad_norm": 20.625, | |
| "learning_rate": 5.0000000000000004e-08, | |
| "rewards/reward_fn": 0.46100749671459196, | |
| "reward": 0.46100749671459196, | |
| "reward_std": 0.046814579702913764, | |
| "completion_length": 78.225, | |
| "kl": 0.17536836490035057, | |
| "epoch": 0.99, | |
| "step": 2475 | |
| }, | |
| { | |
| "loss": 0.0052, | |
| "grad_norm": 21.0, | |
| "learning_rate": 4e-08, | |
| "rewards/reward_fn": 0.45782187581062317, | |
| "reward": 0.45782187581062317, | |
| "reward_std": 0.06049414209555835, | |
| "completion_length": 78.275, | |
| "kl": 0.13118749782443045, | |
| "epoch": 0.992, | |
| "step": 2480 | |
| }, | |
| { | |
| "loss": 0.0063, | |
| "grad_norm": 20.625, | |
| "learning_rate": 3.0000000000000004e-08, | |
| "rewards/reward_fn": 0.44442749917507174, | |
| "reward": 0.44442749917507174, | |
| "reward_std": 0.08098832431714982, | |
| "completion_length": 77.925, | |
| "kl": 0.158622158318758, | |
| "epoch": 0.994, | |
| "step": 2485 | |
| }, | |
| { | |
| "loss": 0.0054, | |
| "grad_norm": 22.625, | |
| "learning_rate": 2e-08, | |
| "rewards/reward_fn": 0.47422375380992887, | |
| "reward": 0.47422375380992887, | |
| "reward_std": 0.018112805008422585, | |
| "completion_length": 77.9875, | |
| "kl": 0.13429155126214026, | |
| "epoch": 0.996, | |
| "step": 2490 | |
| }, | |
| { | |
| "loss": 0.0071, | |
| "grad_norm": 20.0, | |
| "learning_rate": 1e-08, | |
| "rewards/reward_fn": 0.4550556272268295, | |
| "reward": 0.4550556272268295, | |
| "reward_std": 0.06616670698858798, | |
| "completion_length": 78.4625, | |
| "kl": 0.17691104635596275, | |
| "epoch": 0.998, | |
| "step": 2495 | |
| }, | |
| { | |
| "loss": 0.0057, | |
| "grad_norm": 20.125, | |
| "learning_rate": 0.0, | |
| "rewards/reward_fn": 0.46899437308311465, | |
| "reward": 0.46899437308311465, | |
| "reward_std": 0.03491774908034131, | |
| "completion_length": 78.35, | |
| "kl": 0.14370609149336816, | |
| "epoch": 1.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "train_runtime": 10996.7976, | |
| "train_samples_per_second": 0.455, | |
| "train_steps_per_second": 0.227, | |
| "total_flos": 0.0, | |
| "train_loss": 0.005647265207767487, | |
| "epoch": 1.0, | |
| "step": 2500 | |
| } | |
| ] | |
| } |