diff --git "a/checkpoint-1500/trainer_state.json" "b/checkpoint-1500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1500/trainer_state.json" @@ -0,0 +1,48034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 282.3125, + "completions/mean_terminated_length": 282.3125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.34285064321011305, + "epoch": 0.002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6038885116577148, + "learning_rate": 0.0, + "loss": 0.1215, + "num_tokens": 9066.0, + "reward": 2.776546001434326, + "reward_std": 3.6707262992858887, + "rewards/fitness_reward/mean": 2.6902005672454834, + "rewards/fitness_reward/std": 3.9962387084960938, + "rewards/kidney_reward/mean": -0.014779508113861084, + "rewards/kidney_reward/std": 0.944055438041687, + "rewards/length2tails_reward/mean": 0.3619094491004944, + "rewards/length2tails_reward/std": 0.47263848781585693, + "rewards/thermo_reward/mean": 0.006515428423881531, + "rewards/thermo_reward/std": 1.582302212715149, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 265.3125, + "completions/mean_terminated_length": 265.3125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.25550550501793623, + "epoch": 0.004, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1680359840393066, + "learning_rate": 4e-08, + "loss": -0.0098, + "num_tokens": 17588.0, + "reward": 2.6952362060546875, + "reward_std": 3.8378028869628906, + "rewards/fitness_reward/mean": 2.763430118560791, + "rewards/fitness_reward/std": 4.361652374267578, + "rewards/kidney_reward/mean": -0.05466890335083008, + "rewards/kidney_reward/std": 1.101677656173706, + "rewards/length2tails_reward/mean": 0.49158233404159546, + "rewards/length2tails_reward/std": 0.4806991517543793, + "rewards/thermo_reward/mean": -0.3275108337402344, + "rewards/thermo_reward/std": 1.8423668146133423, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 269.9375, + "completions/mean_terminated_length": 269.9375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.30424352758564055, + "epoch": 0.006, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1832661628723145, + "learning_rate": 8e-08, + "loss": 0.0287, + "num_tokens": 26258.0, + "reward": 2.8955960273742676, + "reward_std": 4.059200763702393, + "rewards/fitness_reward/mean": 3.0105772018432617, + "rewards/fitness_reward/std": 4.357508659362793, + "rewards/kidney_reward/mean": -0.23322094976902008, + "rewards/kidney_reward/std": 0.9928128719329834, + "rewards/length2tails_reward/mean": 0.4026995301246643, + "rewards/length2tails_reward/std": 0.4532533884048462, + "rewards/thermo_reward/mean": -0.19809073209762573, + "rewards/thermo_reward/std": 1.4632333517074585, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 275.1875, + "completions/mean_terminated_length": 275.1875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.28253951808437705, + "epoch": 0.008, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9522150754928589, + "learning_rate": 1.2e-07, + "loss": 0.0271, + "num_tokens": 35096.0, + "reward": 1.998507022857666, + "reward_std": 3.9490599632263184, + "rewards/fitness_reward/mean": 2.1376171112060547, + "rewards/fitness_reward/std": 4.451599597930908, + "rewards/kidney_reward/mean": -0.14740341901779175, + "rewards/kidney_reward/std": 1.0896557569503784, + "rewards/length2tails_reward/mean": 0.4439179301261902, + "rewards/length2tails_reward/std": 0.4727405905723572, + "rewards/thermo_reward/mean": -0.3527754545211792, + "rewards/thermo_reward/std": 1.5906970500946045, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 258.15625, + "completions/mean_terminated_length": 258.15625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.14575099013745785, + "epoch": 0.01, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8847610950469971, + "learning_rate": 1.6e-07, + "loss": -0.0549, + "num_tokens": 43389.0, + "reward": 3.631153106689453, + "reward_std": 3.643018960952759, + "rewards/fitness_reward/mean": 3.822523593902588, + "rewards/fitness_reward/std": 3.948245048522949, + "rewards/kidney_reward/mean": -0.1129472404718399, + "rewards/kidney_reward/std": 1.0505248308181763, + "rewards/length2tails_reward/mean": 0.4005865156650543, + "rewards/length2tails_reward/std": 0.44541364908218384, + "rewards/thermo_reward/mean": -0.4700867533683777, + "rewards/thermo_reward/std": 1.7214210033416748, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "entropy": 0.30541147477924824, + "epoch": 0.012, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3191102743148804, + "learning_rate": 2e-07, + "loss": 0.0231, + "num_tokens": 52123.0, + "reward": 3.4447648525238037, + "reward_std": 3.701854944229126, + "rewards/fitness_reward/mean": 3.469925880432129, + "rewards/fitness_reward/std": 3.4349849224090576, + "rewards/kidney_reward/mean": 0.07564692199230194, + "rewards/kidney_reward/std": 1.061210036277771, + "rewards/length2tails_reward/mean": 0.3852400779724121, + "rewards/length2tails_reward/std": 0.46152031421661377, + "rewards/thermo_reward/mean": -0.3185890316963196, + "rewards/thermo_reward/std": 1.1925057172775269, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 266.53125, + "completions/mean_terminated_length": 266.53125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.17479356518015265, + "epoch": 0.014, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453187346458435, + "learning_rate": 2.4e-07, + "loss": -0.0039, + "num_tokens": 60684.0, + "reward": 4.399518013000488, + "reward_std": 3.6989517211914062, + "rewards/fitness_reward/mean": 4.309534072875977, + "rewards/fitness_reward/std": 3.625988483428955, + "rewards/kidney_reward/mean": 0.16553248465061188, + "rewards/kidney_reward/std": 1.1728154420852661, + "rewards/length2tails_reward/mean": 0.41060203313827515, + "rewards/length2tails_reward/std": 0.45657584071159363, + "rewards/thermo_reward/mean": -0.19086632132530212, + "rewards/thermo_reward/std": 1.5577343702316284, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 278.0, + "completions/mean_terminated_length": 278.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "entropy": 0.2710487926378846, + "epoch": 0.016, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3498140573501587, + "learning_rate": 2.8e-07, + "loss": 0.0108, + "num_tokens": 69612.0, + "reward": 2.644428253173828, + "reward_std": 4.3834662437438965, + "rewards/fitness_reward/mean": 2.7699179649353027, + "rewards/fitness_reward/std": 4.351969242095947, + "rewards/kidney_reward/mean": -0.1517922580242157, + "rewards/kidney_reward/std": 1.004507303237915, + "rewards/length2tails_reward/mean": 0.4273234009742737, + "rewards/length2tails_reward/std": 0.4685462415218353, + "rewards/thermo_reward/mean": -0.31284886598587036, + "rewards/thermo_reward/std": 1.7761188745498657, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 284.0, + "completions/mean_terminated_length": 284.0, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.16439750418066978, + "epoch": 0.018, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4144439697265625, + "learning_rate": 3.2e-07, + "loss": 0.1057, + "num_tokens": 78732.0, + "reward": 2.6186068058013916, + "reward_std": 3.714634895324707, + "rewards/fitness_reward/mean": 2.789903163909912, + "rewards/fitness_reward/std": 4.058469295501709, + "rewards/kidney_reward/mean": -0.16366836428642273, + "rewards/kidney_reward/std": 1.120846152305603, + "rewards/length2tails_reward/mean": 0.4343424439430237, + "rewards/length2tails_reward/std": 0.47384053468704224, + "rewards/thermo_reward/mean": -0.3960953652858734, + "rewards/thermo_reward/std": 1.533713459968567, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 263.21875, + "completions/mean_terminated_length": 263.21875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "entropy": 0.213692031102255, + "epoch": 0.02, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2669860124588013, + "learning_rate": 3.6e-07, + "loss": -0.0059, + "num_tokens": 87187.0, + "reward": 2.8165817260742188, + "reward_std": 3.6723098754882812, + "rewards/fitness_reward/mean": 3.185605049133301, + "rewards/fitness_reward/std": 4.036534309387207, + "rewards/kidney_reward/mean": -0.2615082561969757, + "rewards/kidney_reward/std": 0.9169155359268188, + "rewards/length2tails_reward/mean": 0.3918038606643677, + "rewards/length2tails_reward/std": 0.46539491415023804, + "rewards/thermo_reward/mean": -0.6724401712417603, + "rewards/thermo_reward/std": 1.888286828994751, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 276.03125, + "completions/mean_terminated_length": 276.03125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.1332897578831762, + "epoch": 0.022, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8590123057365417, + "learning_rate": 4e-07, + "loss": 0.0672, + "num_tokens": 96052.0, + "reward": 2.619511127471924, + "reward_std": 3.0328433513641357, + "rewards/fitness_reward/mean": 2.949721336364746, + "rewards/fitness_reward/std": 3.4479706287384033, + "rewards/kidney_reward/mean": -0.3590899705886841, + "rewards/kidney_reward/std": 0.6021451950073242, + "rewards/length2tails_reward/mean": 0.24954092502593994, + "rewards/length2tails_reward/std": 0.4148816764354706, + "rewards/thermo_reward/mean": -0.42610087990760803, + "rewards/thermo_reward/std": 1.485501766204834, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 294.0, + "completions/mean_terminated_length": 294.0, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.2908358834683895, + "epoch": 0.024, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5993109941482544, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.1808, + "num_tokens": 105492.0, + "reward": 2.504091501235962, + "reward_std": 3.4993772506713867, + "rewards/fitness_reward/mean": 2.8465797901153564, + "rewards/fitness_reward/std": 4.194863319396973, + "rewards/kidney_reward/mean": -0.10735474526882172, + "rewards/kidney_reward/std": 1.0824220180511475, + "rewards/length2tails_reward/mean": 0.41566556692123413, + "rewards/length2tails_reward/std": 0.4747588336467743, + "rewards/thermo_reward/mean": -0.7854543328285217, + "rewards/thermo_reward/std": 1.695159912109375, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 265.0625, + "completions/mean_terminated_length": 265.0625, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.2911716205999255, + "epoch": 0.026, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40619695186615, + "learning_rate": 4.8e-07, + "loss": 0.004, + "num_tokens": 114006.0, + "reward": 1.9322597980499268, + "reward_std": 4.115525722503662, + "rewards/fitness_reward/mean": 2.053403615951538, + "rewards/fitness_reward/std": 4.407435417175293, + "rewards/kidney_reward/mean": -0.31295573711395264, + "rewards/kidney_reward/std": 0.8682083487510681, + "rewards/length2tails_reward/mean": 0.48870545625686646, + "rewards/length2tails_reward/std": 0.49721574783325195, + "rewards/thermo_reward/mean": -0.17368489503860474, + "rewards/thermo_reward/std": 2.0285637378692627, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 279.15625, + "completions/mean_terminated_length": 279.15625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.26627546083182096, + "epoch": 0.028, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0304509401321411, + "learning_rate": 5.2e-07, + "loss": 0.0939, + "num_tokens": 122971.0, + "reward": 2.5467944145202637, + "reward_std": 3.6509149074554443, + "rewards/fitness_reward/mean": 2.655552387237549, + "rewards/fitness_reward/std": 4.0566606521606445, + "rewards/kidney_reward/mean": 0.1742018312215805, + "rewards/kidney_reward/std": 1.108739972114563, + "rewards/length2tails_reward/mean": 0.346333771944046, + "rewards/length2tails_reward/std": 0.43246445059776306, + "rewards/thermo_reward/mean": -0.564885139465332, + "rewards/thermo_reward/std": 1.453627109527588, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "entropy": 0.09859434771351516, + "epoch": 0.03, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0298631191253662, + "learning_rate": 5.6e-07, + "loss": 0.0469, + "num_tokens": 131707.0, + "reward": 3.6903483867645264, + "reward_std": 3.1678555011749268, + "rewards/fitness_reward/mean": 3.889268398284912, + "rewards/fitness_reward/std": 3.267036199569702, + "rewards/kidney_reward/mean": -0.27444595098495483, + "rewards/kidney_reward/std": 0.8988531231880188, + "rewards/length2tails_reward/mean": 0.25988560914993286, + "rewards/length2tails_reward/std": 0.39783328771591187, + "rewards/thermo_reward/mean": -0.25333669781684875, + "rewards/thermo_reward/std": 1.2771704196929932, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 267.28125, + "completions/mean_terminated_length": 267.28125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.19521328411065042, + "epoch": 0.032, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1851061582565308, + "learning_rate": 6e-07, + "loss": 0.0212, + "num_tokens": 140292.0, + "reward": 2.5893378257751465, + "reward_std": 3.584918737411499, + "rewards/fitness_reward/mean": 2.814847946166992, + "rewards/fitness_reward/std": 3.982393741607666, + "rewards/kidney_reward/mean": -0.11568474769592285, + "rewards/kidney_reward/std": 0.9866483807563782, + "rewards/length2tails_reward/mean": 0.3492621183395386, + "rewards/length2tails_reward/std": 0.45565006136894226, + "rewards/thermo_reward/mean": -0.5099663138389587, + "rewards/thermo_reward/std": 1.4580680131912231, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 282.5625, + "completions/mean_terminated_length": 282.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.28495415579527617, + "epoch": 0.034, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2181986570358276, + "learning_rate": 6.4e-07, + "loss": 0.0974, + "num_tokens": 149366.0, + "reward": 2.808767795562744, + "reward_std": 3.643484115600586, + "rewards/fitness_reward/mean": 3.0713696479797363, + "rewards/fitness_reward/std": 3.9968082904815674, + "rewards/kidney_reward/mean": -0.382036954164505, + "rewards/kidney_reward/std": 0.8268358707427979, + "rewards/length2tails_reward/mean": 0.5024834871292114, + "rewards/length2tails_reward/std": 0.4791508615016937, + "rewards/thermo_reward/mean": -0.3944079279899597, + "rewards/thermo_reward/std": 1.9147756099700928, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 277.9375, + "completions/mean_terminated_length": 277.9375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.17844699136912823, + "epoch": 0.036, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2913175821304321, + "learning_rate": 6.800000000000001e-07, + "loss": 0.072, + "num_tokens": 158292.0, + "reward": 2.8846983909606934, + "reward_std": 3.638531446456909, + "rewards/fitness_reward/mean": 3.058784008026123, + "rewards/fitness_reward/std": 3.7510721683502197, + "rewards/kidney_reward/mean": -0.2737460136413574, + "rewards/kidney_reward/std": 0.8914804458618164, + "rewards/length2tails_reward/mean": 0.34188228845596313, + "rewards/length2tails_reward/std": 0.4734570384025574, + "rewards/thermo_reward/mean": -0.24536648392677307, + "rewards/thermo_reward/std": 1.4068657159805298, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 297.625, + "completions/mean_terminated_length": 297.625, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "entropy": 0.19913436053320765, + "epoch": 0.038, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7019699811935425, + "learning_rate": 7.2e-07, + "loss": 0.1602, + "num_tokens": 167848.0, + "reward": 2.3710553646087646, + "reward_std": 4.5642547607421875, + "rewards/fitness_reward/mean": 2.7442526817321777, + "rewards/fitness_reward/std": 4.607006072998047, + "rewards/kidney_reward/mean": -0.2314104288816452, + "rewards/kidney_reward/std": 0.9755641222000122, + "rewards/length2tails_reward/mean": 0.4541005790233612, + "rewards/length2tails_reward/std": 0.48273563385009766, + "rewards/thermo_reward/mean": -0.7420345544815063, + "rewards/thermo_reward/std": 1.7983129024505615, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 274.25, + "completions/mean_terminated_length": 274.25, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "entropy": 0.10643508494831622, + "epoch": 0.04, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2317397594451904, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0675, + "num_tokens": 176656.0, + "reward": 3.6184439659118652, + "reward_std": 3.254441738128662, + "rewards/fitness_reward/mean": 4.041838645935059, + "rewards/fitness_reward/std": 3.4221901893615723, + "rewards/kidney_reward/mean": -0.1963132917881012, + "rewards/kidney_reward/std": 0.9940592050552368, + "rewards/length2tails_reward/mean": 0.3017134666442871, + "rewards/length2tails_reward/std": 0.44021928310394287, + "rewards/thermo_reward/mean": -0.8013330698013306, + "rewards/thermo_reward/std": 1.6256381273269653, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 283.46875, + "completions/mean_terminated_length": 283.46875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1744343377649784, + "epoch": 0.042, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2164242267608643, + "learning_rate": 8e-07, + "loss": 0.1111, + "num_tokens": 185759.0, + "reward": 2.710228443145752, + "reward_std": 3.926434278488159, + "rewards/fitness_reward/mean": 2.9178555011749268, + "rewards/fitness_reward/std": 3.801013946533203, + "rewards/kidney_reward/mean": -0.17946933209896088, + "rewards/kidney_reward/std": 1.0358121395111084, + "rewards/length2tails_reward/mean": 0.2689725160598755, + "rewards/length2tails_reward/std": 0.4235849678516388, + "rewards/thermo_reward/mean": -0.3702709972858429, + "rewards/thermo_reward/std": 1.3275806903839111, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 263.4375, + "completions/mean_terminated_length": 263.4375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.35666807275265455, + "epoch": 0.044, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3615895509719849, + "learning_rate": 8.399999999999999e-07, + "loss": -0.0103, + "num_tokens": 194221.0, + "reward": 1.7372136116027832, + "reward_std": 4.236116409301758, + "rewards/fitness_reward/mean": 1.6128908395767212, + "rewards/fitness_reward/std": 4.648111343383789, + "rewards/kidney_reward/mean": 0.22530707716941833, + "rewards/kidney_reward/std": 0.9966040253639221, + "rewards/length2tails_reward/mean": 0.5139718055725098, + "rewards/length2tails_reward/std": 0.48198941349983215, + "rewards/thermo_reward/mean": -0.23364755511283875, + "rewards/thermo_reward/std": 1.8280906677246094, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 293.46875, + "completions/mean_terminated_length": 293.46875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.2110906399320811, + "epoch": 0.046, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1795854568481445, + "learning_rate": 8.799999999999999e-07, + "loss": 0.1542, + "num_tokens": 203644.0, + "reward": 2.4099345207214355, + "reward_std": 3.9786722660064697, + "rewards/fitness_reward/mean": 2.844266891479492, + "rewards/fitness_reward/std": 4.1898393630981445, + "rewards/kidney_reward/mean": -0.45548105239868164, + "rewards/kidney_reward/std": 0.7641519904136658, + "rewards/length2tails_reward/mean": 0.49857550859451294, + "rewards/length2tails_reward/std": 0.4670303165912628, + "rewards/thermo_reward/mean": -0.6624711751937866, + "rewards/thermo_reward/std": 1.6689636707305908, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "entropy": 0.2916894480586052, + "epoch": 0.048, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.159227967262268, + "learning_rate": 9.2e-07, + "loss": 0.0248, + "num_tokens": 212328.0, + "reward": 2.407467842102051, + "reward_std": 3.439573287963867, + "rewards/fitness_reward/mean": 2.5296783447265625, + "rewards/fitness_reward/std": 3.747447967529297, + "rewards/kidney_reward/mean": -0.2728431224822998, + "rewards/kidney_reward/std": 0.8738398551940918, + "rewards/length2tails_reward/mean": 0.3025975227355957, + "rewards/length2tails_reward/std": 0.44032588601112366, + "rewards/thermo_reward/mean": -0.1228766143321991, + "rewards/thermo_reward/std": 1.4348689317703247, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 265.75, + "completions/mean_terminated_length": 265.75, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.1331020297948271, + "epoch": 0.05, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2533458471298218, + "learning_rate": 9.6e-07, + "loss": -0.0039, + "num_tokens": 220864.0, + "reward": 3.805389881134033, + "reward_std": 3.130521774291992, + "rewards/fitness_reward/mean": 3.9648332595825195, + "rewards/fitness_reward/std": 3.3284666538238525, + "rewards/kidney_reward/mean": -0.11595182865858078, + "rewards/kidney_reward/std": 0.9243037104606628, + "rewards/length2tails_reward/mean": 0.3001132011413574, + "rewards/length2tails_reward/std": 0.4152694344520569, + "rewards/thermo_reward/mean": -0.3529908061027527, + "rewards/thermo_reward/std": 1.5167909860610962, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 267.125, + "completions/mean_terminated_length": 267.125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.16058688261546195, + "epoch": 0.052, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6334291100502014, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 229444.0, + "reward": 3.3169748783111572, + "reward_std": 3.120516538619995, + "rewards/fitness_reward/mean": 3.4925124645233154, + "rewards/fitness_reward/std": 3.340493679046631, + "rewards/kidney_reward/mean": -0.30361461639404297, + "rewards/kidney_reward/std": 0.7561451196670532, + "rewards/length2tails_reward/mean": 0.3624529242515564, + "rewards/length2tails_reward/std": 0.4537730813026428, + "rewards/thermo_reward/mean": -0.22868666052818298, + "rewards/thermo_reward/std": 1.5443941354751587, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 266.5625, + "completions/mean_terminated_length": 266.5625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "entropy": 0.12170495046302676, + "epoch": 0.054, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9874021410942078, + "learning_rate": 1.04e-06, + "loss": -0.0104, + "num_tokens": 238006.0, + "reward": 3.653249979019165, + "reward_std": 3.816178560256958, + "rewards/fitness_reward/mean": 3.920667886734009, + "rewards/fitness_reward/std": 4.002020835876465, + "rewards/kidney_reward/mean": -0.01481878012418747, + "rewards/kidney_reward/std": 1.17020845413208, + "rewards/length2tails_reward/mean": 0.45274999737739563, + "rewards/length2tails_reward/std": 0.4590936601161957, + "rewards/thermo_reward/mean": -0.7463918924331665, + "rewards/thermo_reward/std": 1.5766205787658691, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 264.9375, + "completions/mean_terminated_length": 264.9375, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 0.1064134482294321, + "epoch": 0.056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6786694526672363, + "learning_rate": 1.08e-06, + "loss": -0.0297, + "num_tokens": 246516.0, + "reward": 3.909611701965332, + "reward_std": 3.2675232887268066, + "rewards/fitness_reward/mean": 4.002963066101074, + "rewards/fitness_reward/std": 3.269036054611206, + "rewards/kidney_reward/mean": 0.0017276406288146973, + "rewards/kidney_reward/std": 1.061722993850708, + "rewards/length2tails_reward/mean": 0.3930842876434326, + "rewards/length2tails_reward/std": 0.44041237235069275, + "rewards/thermo_reward/mean": -0.3849724531173706, + "rewards/thermo_reward/std": 1.5721300840377808, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 285.46875, + "completions/mean_terminated_length": 285.46875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "entropy": 0.2716119010001421, + "epoch": 0.058, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.758548617362976, + "learning_rate": 1.12e-06, + "loss": 0.0978, + "num_tokens": 255683.0, + "reward": 2.6089797019958496, + "reward_std": 4.1650495529174805, + "rewards/fitness_reward/mean": 2.7181997299194336, + "rewards/fitness_reward/std": 4.425798416137695, + "rewards/kidney_reward/mean": -0.15508919954299927, + "rewards/kidney_reward/std": 0.9946020245552063, + "rewards/length2tails_reward/mean": 0.5060371160507202, + "rewards/length2tails_reward/std": 0.4584464728832245, + "rewards/thermo_reward/mean": -0.31636953353881836, + "rewards/thermo_reward/std": 1.4987841844558716, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 293.46875, + "completions/mean_terminated_length": 293.46875, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.25898733153007925, + "epoch": 0.06, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0409340858459473, + "learning_rate": 1.16e-06, + "loss": 0.1339, + "num_tokens": 265106.0, + "reward": 1.6009509563446045, + "reward_std": 3.8154296875, + "rewards/fitness_reward/mean": 1.8939182758331299, + "rewards/fitness_reward/std": 4.212096691131592, + "rewards/kidney_reward/mean": -0.3592352867126465, + "rewards/kidney_reward/std": 0.9245181679725647, + "rewards/length2tails_reward/mean": 0.3944413661956787, + "rewards/length2tails_reward/std": 0.46978193521499634, + "rewards/thermo_reward/mean": -0.4239196479320526, + "rewards/thermo_reward/std": 1.8588840961456299, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 279.90625, + "completions/mean_terminated_length": 279.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12356520257890224, + "epoch": 0.062, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1687328815460205, + "learning_rate": 1.2e-06, + "loss": 0.0934, + "num_tokens": 274095.0, + "reward": 3.447176456451416, + "reward_std": 3.1346819400787354, + "rewards/fitness_reward/mean": 3.8587636947631836, + "rewards/fitness_reward/std": 3.590867280960083, + "rewards/kidney_reward/mean": -0.4081210494041443, + "rewards/kidney_reward/std": 0.8285349607467651, + "rewards/length2tails_reward/mean": 0.46461570262908936, + "rewards/length2tails_reward/std": 0.47861263155937195, + "rewards/thermo_reward/mean": -0.6473608613014221, + "rewards/thermo_reward/std": 1.6887931823730469, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 260.15625, + "completions/mean_terminated_length": 260.15625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.13539172057062387, + "epoch": 0.064, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3024259805679321, + "learning_rate": 1.24e-06, + "loss": -0.0461, + "num_tokens": 282452.0, + "reward": 2.7450294494628906, + "reward_std": 3.819532632827759, + "rewards/fitness_reward/mean": 2.8516581058502197, + "rewards/fitness_reward/std": 4.211026668548584, + "rewards/kidney_reward/mean": -0.03711742162704468, + "rewards/kidney_reward/std": 0.9770359396934509, + "rewards/length2tails_reward/mean": 0.3360474109649658, + "rewards/length2tails_reward/std": 0.44527024030685425, + "rewards/thermo_reward/mean": -0.3441632390022278, + "rewards/thermo_reward/std": 1.6381678581237793, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 266.96875, + "completions/mean_terminated_length": 266.96875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.21443682396784425, + "epoch": 0.066, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1235257387161255, + "learning_rate": 1.28e-06, + "loss": 0.0131, + "num_tokens": 291027.0, + "reward": 2.749849319458008, + "reward_std": 3.7275326251983643, + "rewards/fitness_reward/mean": 3.279467821121216, + "rewards/fitness_reward/std": 3.8192431926727295, + "rewards/kidney_reward/mean": -0.3075902462005615, + "rewards/kidney_reward/std": 1.0802315473556519, + "rewards/length2tails_reward/mean": 0.40814924240112305, + "rewards/length2tails_reward/std": 0.46687832474708557, + "rewards/thermo_reward/mean": -0.9557211995124817, + "rewards/thermo_reward/std": 1.525609016418457, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.11994511261582375, + "epoch": 0.068, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5663752555847168, + "learning_rate": 1.32e-06, + "loss": 0.0547, + "num_tokens": 299778.0, + "reward": 2.085150718688965, + "reward_std": 3.8342347145080566, + "rewards/fitness_reward/mean": 2.482593059539795, + "rewards/fitness_reward/std": 3.909547805786133, + "rewards/kidney_reward/mean": -0.08659157156944275, + "rewards/kidney_reward/std": 1.036144495010376, + "rewards/length2tails_reward/mean": 0.29886335134506226, + "rewards/length2tails_reward/std": 0.4395237863063812, + "rewards/thermo_reward/mean": -0.8577248454093933, + "rewards/thermo_reward/std": 1.518059492111206, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 286.9375, + "completions/mean_terminated_length": 286.9375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.1442709225229919, + "epoch": 0.07, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0670015811920166, + "learning_rate": 1.3600000000000001e-06, + "loss": 0.1441, + "num_tokens": 308992.0, + "reward": 3.33197283744812, + "reward_std": 3.3288204669952393, + "rewards/fitness_reward/mean": 3.8933303356170654, + "rewards/fitness_reward/std": 3.8310139179229736, + "rewards/kidney_reward/mean": -0.3780807852745056, + "rewards/kidney_reward/std": 0.9036357998847961, + "rewards/length2tails_reward/mean": 0.39739546179771423, + "rewards/length2tails_reward/std": 0.4599010646343231, + "rewards/thermo_reward/mean": -0.9433316588401794, + "rewards/thermo_reward/std": 1.4738272428512573, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 274.46875, + "completions/mean_terminated_length": 274.46875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.09205999784171581, + "epoch": 0.072, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3518712520599365, + "learning_rate": 1.4e-06, + "loss": 0.0977, + "num_tokens": 317807.0, + "reward": 3.7158541679382324, + "reward_std": 3.447343349456787, + "rewards/fitness_reward/mean": 3.822787284851074, + "rewards/fitness_reward/std": 3.425808906555176, + "rewards/kidney_reward/mean": -0.19986845552921295, + "rewards/kidney_reward/std": 0.9525273442268372, + "rewards/length2tails_reward/mean": 0.289717435836792, + "rewards/length2tails_reward/std": 0.3901156783103943, + "rewards/thermo_reward/mean": -0.15885674953460693, + "rewards/thermo_reward/std": 1.5303553342819214, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 262.0, + "completions/mean_terminated_length": 262.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 0.23849095264449716, + "epoch": 0.074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8961380124092102, + "learning_rate": 1.44e-06, + "loss": -0.0484, + "num_tokens": 326223.0, + "reward": 3.2912168502807617, + "reward_std": 3.498786687850952, + "rewards/fitness_reward/mean": 3.485775947570801, + "rewards/fitness_reward/std": 3.9797747135162354, + "rewards/kidney_reward/mean": -0.08097781985998154, + "rewards/kidney_reward/std": 1.0017220973968506, + "rewards/length2tails_reward/mean": 0.45231589674949646, + "rewards/length2tails_reward/std": 0.4590526819229126, + "rewards/thermo_reward/mean": -0.5342980027198792, + "rewards/thermo_reward/std": 1.7394862174987793, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 271.0625, + "completions/mean_terminated_length": 271.0625, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.15560346003621817, + "epoch": 0.076, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8480332493782043, + "learning_rate": 1.48e-06, + "loss": 0.0411, + "num_tokens": 334929.0, + "reward": 3.250786304473877, + "reward_std": 3.4832067489624023, + "rewards/fitness_reward/mean": 3.512643814086914, + "rewards/fitness_reward/std": 3.6526308059692383, + "rewards/kidney_reward/mean": -0.3572557866573334, + "rewards/kidney_reward/std": 0.8072547912597656, + "rewards/length2tails_reward/mean": 0.4068295955657959, + "rewards/length2tails_reward/std": 0.46265122294425964, + "rewards/thermo_reward/mean": -0.3698734939098358, + "rewards/thermo_reward/std": 1.6630305051803589, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 281.3125, + "completions/mean_terminated_length": 281.3125, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.15662717120721936, + "epoch": 0.078, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.380427360534668, + "learning_rate": 1.5199999999999998e-06, + "loss": 0.1041, + "num_tokens": 343963.0, + "reward": 3.2305877208709717, + "reward_std": 3.6741504669189453, + "rewards/fitness_reward/mean": 3.7796645164489746, + "rewards/fitness_reward/std": 4.052594184875488, + "rewards/kidney_reward/mean": -0.5030925273895264, + "rewards/kidney_reward/std": 0.9221724271774292, + "rewards/length2tails_reward/mean": 0.4580508768558502, + "rewards/length2tails_reward/std": 0.4660351276397705, + "rewards/thermo_reward/mean": -0.8240861892700195, + "rewards/thermo_reward/std": 1.6861786842346191, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 260.53125, + "completions/mean_terminated_length": 260.53125, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "entropy": 0.08550065802410245, + "epoch": 0.08, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.308832049369812, + "learning_rate": 1.5599999999999999e-06, + "loss": -0.0705, + "num_tokens": 352332.0, + "reward": 3.550997734069824, + "reward_std": 3.215728282928467, + "rewards/fitness_reward/mean": 3.882805824279785, + "rewards/fitness_reward/std": 3.2661876678466797, + "rewards/kidney_reward/mean": -0.331551194190979, + "rewards/kidney_reward/std": 1.047247290611267, + "rewards/length2tails_reward/mean": 0.3643738627433777, + "rewards/length2tails_reward/std": 0.45199695229530334, + "rewards/thermo_reward/mean": -0.5142515301704407, + "rewards/thermo_reward/std": 1.6633535623550415, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.0, + "completions/max_terminated_length": 570.0, + "completions/mean_length": 287.9375, + "completions/mean_terminated_length": 287.9375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.23963367426767945, + "epoch": 0.082, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.635361909866333, + "learning_rate": 1.6e-06, + "loss": 0.0942, + "num_tokens": 361578.0, + "reward": 2.2229952812194824, + "reward_std": 4.70599889755249, + "rewards/fitness_reward/mean": 2.4011287689208984, + "rewards/fitness_reward/std": 4.697346210479736, + "rewards/kidney_reward/mean": -0.21263191103935242, + "rewards/kidney_reward/std": 0.9460816383361816, + "rewards/length2tails_reward/mean": 0.49359941482543945, + "rewards/length2tails_reward/std": 0.4644700288772583, + "rewards/thermo_reward/mean": -0.39043474197387695, + "rewards/thermo_reward/std": 1.7590357065200806, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 264.3125, + "completions/mean_terminated_length": 264.3125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.17565014283172786, + "epoch": 0.084, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.304567813873291, + "learning_rate": 1.6399999999999998e-06, + "loss": -0.0273, + "num_tokens": 370068.0, + "reward": 2.681387424468994, + "reward_std": 2.717798948287964, + "rewards/fitness_reward/mean": 3.064466953277588, + "rewards/fitness_reward/std": 3.145963668823242, + "rewards/kidney_reward/mean": -0.30574914813041687, + "rewards/kidney_reward/std": 0.6779427528381348, + "rewards/length2tails_reward/mean": 0.283324658870697, + "rewards/length2tails_reward/std": 0.4304597079753876, + "rewards/thermo_reward/mean": -0.6020721197128296, + "rewards/thermo_reward/std": 1.501791000366211, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 265.625, + "completions/mean_terminated_length": 265.625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.049185109324753284, + "epoch": 0.086, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6598054766654968, + "learning_rate": 1.6799999999999998e-06, + "loss": -0.0068, + "num_tokens": 378600.0, + "reward": 3.6152701377868652, + "reward_std": 2.4673726558685303, + "rewards/fitness_reward/mean": 3.865413188934326, + "rewards/fitness_reward/std": 2.656116247177124, + "rewards/kidney_reward/mean": 0.025645185261964798, + "rewards/kidney_reward/std": 0.9594531059265137, + "rewards/length2tails_reward/mean": 0.2298400104045868, + "rewards/length2tails_reward/std": 0.4059338867664337, + "rewards/thermo_reward/mean": -0.6408505439758301, + "rewards/thermo_reward/std": 1.0837303400039673, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 284.875, + "completions/mean_terminated_length": 284.875, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "entropy": 0.14746736688539386, + "epoch": 0.088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9718717336654663, + "learning_rate": 1.7199999999999998e-06, + "loss": 0.1251, + "num_tokens": 387748.0, + "reward": 3.236013650894165, + "reward_std": 3.3577160835266113, + "rewards/fitness_reward/mean": 3.322758913040161, + "rewards/fitness_reward/std": 4.015753269195557, + "rewards/kidney_reward/mean": -0.2718885838985443, + "rewards/kidney_reward/std": 1.0613834857940674, + "rewards/length2tails_reward/mean": 0.35513967275619507, + "rewards/length2tails_reward/std": 0.465355783700943, + "rewards/thermo_reward/mean": -0.07917150855064392, + "rewards/thermo_reward/std": 1.645909070968628, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 262.375, + "completions/mean_terminated_length": 262.375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.13994689658284187, + "epoch": 0.09, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.789834976196289, + "learning_rate": 1.7599999999999999e-06, + "loss": -0.0738, + "num_tokens": 396176.0, + "reward": 3.187227487564087, + "reward_std": 2.7353832721710205, + "rewards/fitness_reward/mean": 3.8397207260131836, + "rewards/fitness_reward/std": 3.054448366165161, + "rewards/kidney_reward/mean": -0.5340659618377686, + "rewards/kidney_reward/std": 0.7410010099411011, + "rewards/length2tails_reward/mean": 0.4532582461833954, + "rewards/length2tails_reward/std": 0.4908135235309601, + "rewards/thermo_reward/mean": -0.9975494146347046, + "rewards/thermo_reward/std": 1.4557503461837769, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 315.625, + "completions/mean_terminated_length": 301.4838562011719, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.3240436161868274, + "epoch": 0.092, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.263629913330078, + "learning_rate": 1.8e-06, + "loss": 0.2438, + "num_tokens": 406308.0, + "reward": 2.244203805923462, + "reward_std": 3.7210023403167725, + "rewards/fitness_reward/mean": 2.758302927017212, + "rewards/fitness_reward/std": 4.335577964782715, + "rewards/kidney_reward/mean": -0.517410397529602, + "rewards/kidney_reward/std": 0.6929426193237305, + "rewards/length2tails_reward/mean": 0.5104854702949524, + "rewards/length2tails_reward/std": 0.45414960384368896, + "rewards/thermo_reward/mean": -0.7660301923751831, + "rewards/thermo_reward/std": 1.8981391191482544, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.0, + "completions/max_terminated_length": 578.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.21957311406731606, + "epoch": 0.094, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5725150108337402, + "learning_rate": 1.84e-06, + "loss": 0.0397, + "num_tokens": 415073.0, + "reward": 3.440593719482422, + "reward_std": 3.907878875732422, + "rewards/fitness_reward/mean": 3.6773455142974854, + "rewards/fitness_reward/std": 4.009237289428711, + "rewards/kidney_reward/mean": -0.3265707492828369, + "rewards/kidney_reward/std": 0.8724467754364014, + "rewards/length2tails_reward/mean": 0.4737565517425537, + "rewards/length2tails_reward/std": 0.47977402806282043, + "rewards/thermo_reward/mean": -0.38381147384643555, + "rewards/thermo_reward/std": 1.7817882299423218, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 280.71875, + "completions/mean_terminated_length": 280.71875, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.20995492348447442, + "epoch": 0.096, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4390766620635986, + "learning_rate": 1.8799999999999998e-06, + "loss": 0.1414, + "num_tokens": 424088.0, + "reward": 2.7158737182617188, + "reward_std": 3.9450368881225586, + "rewards/fitness_reward/mean": 3.2037267684936523, + "rewards/fitness_reward/std": 4.233373165130615, + "rewards/kidney_reward/mean": -0.2726379632949829, + "rewards/kidney_reward/std": 1.0136029720306396, + "rewards/length2tails_reward/mean": 0.4437826871871948, + "rewards/length2tails_reward/std": 0.4791125953197479, + "rewards/thermo_reward/mean": -0.9249591827392578, + "rewards/thermo_reward/std": 1.5671353340148926, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 259.40625, + "completions/mean_terminated_length": 259.40625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "entropy": 0.09386752406135201, + "epoch": 0.098, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9598005414009094, + "learning_rate": 1.92e-06, + "loss": -0.0616, + "num_tokens": 432421.0, + "reward": 3.1751842498779297, + "reward_std": 2.8628969192504883, + "rewards/fitness_reward/mean": 3.399622678756714, + "rewards/fitness_reward/std": 3.3204360008239746, + "rewards/kidney_reward/mean": -0.05504148453474045, + "rewards/kidney_reward/std": 0.9885132312774658, + "rewards/length2tails_reward/mean": 0.3248463273048401, + "rewards/length2tails_reward/std": 0.42490923404693604, + "rewards/thermo_reward/mean": -0.5562581419944763, + "rewards/thermo_reward/std": 1.7294560670852661, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 276.375, + "completions/mean_terminated_length": 276.375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.2751795544754714, + "epoch": 0.1, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5895367860794067, + "learning_rate": 1.96e-06, + "loss": 0.0493, + "num_tokens": 441297.0, + "reward": 3.4684815406799316, + "reward_std": 3.365799903869629, + "rewards/fitness_reward/mean": 3.8252737522125244, + "rewards/fitness_reward/std": 3.701256513595581, + "rewards/kidney_reward/mean": -0.21321097016334534, + "rewards/kidney_reward/std": 1.0331931114196777, + "rewards/length2tails_reward/mean": 0.5216790437698364, + "rewards/length2tails_reward/std": 0.475973904132843, + "rewards/thermo_reward/mean": -0.7612127065658569, + "rewards/thermo_reward/std": 1.7431429624557495, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 275.6875, + "completions/mean_terminated_length": 275.6875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.16012710286304355, + "epoch": 0.102, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3491517305374146, + "learning_rate": 2e-06, + "loss": 0.0725, + "num_tokens": 450151.0, + "reward": 3.006770133972168, + "reward_std": 3.639115810394287, + "rewards/fitness_reward/mean": 3.219278335571289, + "rewards/fitness_reward/std": 3.9509146213531494, + "rewards/kidney_reward/mean": -0.12363478541374207, + "rewards/kidney_reward/std": 1.1107032299041748, + "rewards/length2tails_reward/mean": 0.3590978980064392, + "rewards/length2tails_reward/std": 0.4454381763935089, + "rewards/thermo_reward/mean": -0.48093003034591675, + "rewards/thermo_reward/std": 1.5363606214523315, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 275.875, + "completions/mean_terminated_length": 275.875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.15937610948458314, + "epoch": 0.104, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.491953730583191, + "learning_rate": 1.9999756307053944e-06, + "loss": 0.0897, + "num_tokens": 459011.0, + "reward": 2.9640777111053467, + "reward_std": 3.2988362312316895, + "rewards/fitness_reward/mean": 3.421854257583618, + "rewards/fitness_reward/std": 3.4940311908721924, + "rewards/kidney_reward/mean": -0.35488569736480713, + "rewards/kidney_reward/std": 0.8144198060035706, + "rewards/length2tails_reward/mean": 0.32573986053466797, + "rewards/length2tails_reward/std": 0.43565720319747925, + "rewards/thermo_reward/mean": -0.7235372066497803, + "rewards/thermo_reward/std": 1.4692848920822144, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 291.40625, + "completions/mean_terminated_length": 276.4838562011719, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.17189960647374392, + "epoch": 0.106, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6252596378326416, + "learning_rate": 1.999902524009304e-06, + "loss": 0.1744, + "num_tokens": 468368.0, + "reward": 3.256671667098999, + "reward_std": 3.157939910888672, + "rewards/fitness_reward/mean": 3.558439016342163, + "rewards/fitness_reward/std": 3.760011672973633, + "rewards/kidney_reward/mean": -0.20778009295463562, + "rewards/kidney_reward/std": 0.9953776597976685, + "rewards/length2tails_reward/mean": 0.41051942110061646, + "rewards/length2tails_reward/std": 0.47235342860221863, + "rewards/thermo_reward/mean": -0.6010138988494873, + "rewards/thermo_reward/std": 1.5003546476364136, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 267.5, + "completions/mean_terminated_length": 267.5, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.18772483337670565, + "epoch": 0.108, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5053292512893677, + "learning_rate": 1.999780683474845e-06, + "loss": 0.0136, + "num_tokens": 476960.0, + "reward": 3.904463529586792, + "reward_std": 3.229468822479248, + "rewards/fitness_reward/mean": 4.0994086265563965, + "rewards/fitness_reward/std": 3.267956018447876, + "rewards/kidney_reward/mean": 0.038502246141433716, + "rewards/kidney_reward/std": 1.0834846496582031, + "rewards/length2tails_reward/mean": 0.42780017852783203, + "rewards/length2tails_reward/std": 0.4738711416721344, + "rewards/thermo_reward/mean": -0.6422922611236572, + "rewards/thermo_reward/std": 1.6311054229736328, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 280.375, + "completions/mean_terminated_length": 280.375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.16723995190113783, + "epoch": 0.11, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2620344161987305, + "learning_rate": 1.999610115040354e-06, + "loss": 0.0829, + "num_tokens": 485964.0, + "reward": 2.829023838043213, + "reward_std": 3.2344512939453125, + "rewards/fitness_reward/mean": 3.2663230895996094, + "rewards/fitness_reward/std": 3.6452345848083496, + "rewards/kidney_reward/mean": -0.43373996019363403, + "rewards/kidney_reward/std": 0.8998933434486389, + "rewards/length2tails_reward/mean": 0.39806175231933594, + "rewards/length2tails_reward/std": 0.4529305696487427, + "rewards/thermo_reward/mean": -0.6398894786834717, + "rewards/thermo_reward/std": 1.7521892786026, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.15543030109256506, + "epoch": 0.112, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3704290390014648, + "learning_rate": 1.9993908270190957e-06, + "loss": 0.05, + "num_tokens": 494708.0, + "reward": 2.963658571243286, + "reward_std": 2.9826788902282715, + "rewards/fitness_reward/mean": 3.5461554527282715, + "rewards/fitness_reward/std": 3.4882194995880127, + "rewards/kidney_reward/mean": -0.32047683000564575, + "rewards/kidney_reward/std": 0.8685289621353149, + "rewards/length2tails_reward/mean": 0.405619353055954, + "rewards/length2tails_reward/std": 0.4598184823989868, + "rewards/thermo_reward/mean": -1.0473260879516602, + "rewards/thermo_reward/std": 1.5689518451690674, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 286.75, + "completions/mean_terminated_length": 286.75, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 0.20465043978765607, + "epoch": 0.114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5429962873458862, + "learning_rate": 1.999122830098858e-06, + "loss": 0.1211, + "num_tokens": 503916.0, + "reward": 2.5245821475982666, + "reward_std": 3.371739625930786, + "rewards/fitness_reward/mean": 2.8426780700683594, + "rewards/fitness_reward/std": 3.971247434616089, + "rewards/kidney_reward/mean": -0.447263240814209, + "rewards/kidney_reward/std": 0.7793542146682739, + "rewards/length2tails_reward/mean": 0.41784876585006714, + "rewards/length2tails_reward/std": 0.4716041386127472, + "rewards/thermo_reward/mean": -0.39785265922546387, + "rewards/thermo_reward/std": 1.7068020105361938, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 283.96875, + "completions/mean_terminated_length": 283.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10966742131859064, + "epoch": 0.116, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2801357507705688, + "learning_rate": 1.998806137341434e-06, + "loss": 0.1257, + "num_tokens": 513035.0, + "reward": 2.501692771911621, + "reward_std": 3.395608425140381, + "rewards/fitness_reward/mean": 2.803565263748169, + "rewards/fitness_reward/std": 3.8261919021606445, + "rewards/kidney_reward/mean": -0.06101062148809433, + "rewards/kidney_reward/std": 0.9440979361534119, + "rewards/length2tails_reward/mean": 0.30113983154296875, + "rewards/length2tails_reward/std": 0.4402500092983246, + "rewards/thermo_reward/mean": -0.6933040022850037, + "rewards/thermo_reward/std": 1.348304033279419, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 266.46875, + "completions/mean_terminated_length": 266.46875, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "entropy": 0.10635471204295754, + "epoch": 0.118, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.288246750831604, + "learning_rate": 1.998440764181981e-06, + "loss": -0.0024, + "num_tokens": 521594.0, + "reward": 2.6305971145629883, + "reward_std": 3.2401773929595947, + "rewards/fitness_reward/mean": 3.008155345916748, + "rewards/fitness_reward/std": 3.672124147415161, + "rewards/kidney_reward/mean": -0.308817982673645, + "rewards/kidney_reward/std": 0.7729810476303101, + "rewards/length2tails_reward/mean": 0.29512399435043335, + "rewards/length2tails_reward/std": 0.42243650555610657, + "rewards/thermo_reward/mean": -0.5938605666160583, + "rewards/thermo_reward/std": 1.6341571807861328, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12811401998624206, + "epoch": 0.12, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5416864156723022, + "learning_rate": 1.9980267284282714e-06, + "loss": 0.0415, + "num_tokens": 530305.0, + "reward": 3.8451623916625977, + "reward_std": 2.8841209411621094, + "rewards/fitness_reward/mean": 4.159316062927246, + "rewards/fitness_reward/std": 2.810502767562866, + "rewards/kidney_reward/mean": -0.11423890292644501, + "rewards/kidney_reward/std": 0.9905949831008911, + "rewards/length2tails_reward/mean": 0.29447993636131287, + "rewards/length2tails_reward/std": 0.4234199523925781, + "rewards/thermo_reward/mean": -0.6613084077835083, + "rewards/thermo_reward/std": 1.4590983390808105, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 270.21875, + "completions/mean_terminated_length": 270.21875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.21011533495038748, + "epoch": 0.122, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1377606391906738, + "learning_rate": 1.997564050259824e-06, + "loss": -0.0385, + "num_tokens": 538984.0, + "reward": 3.4698257446289062, + "reward_std": 2.9833648204803467, + "rewards/fitness_reward/mean": 3.722118616104126, + "rewards/fitness_reward/std": 3.0271809101104736, + "rewards/kidney_reward/mean": -0.19583800435066223, + "rewards/kidney_reward/std": 0.8872043490409851, + "rewards/length2tails_reward/mean": 0.35539913177490234, + "rewards/length2tails_reward/std": 0.45690247416496277, + "rewards/thermo_reward/mean": -0.4864477515220642, + "rewards/thermo_reward/std": 1.4413105249404907, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 274.65625, + "completions/mean_terminated_length": 274.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1759026509243995, + "epoch": 0.124, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6390390396118164, + "learning_rate": 1.99705275222692e-06, + "loss": 0.0489, + "num_tokens": 547805.0, + "reward": 2.411684036254883, + "reward_std": 3.4048962593078613, + "rewards/fitness_reward/mean": 2.772763729095459, + "rewards/fitness_reward/std": 3.615152597427368, + "rewards/kidney_reward/mean": -0.2250068634748459, + "rewards/kidney_reward/std": 0.8008552193641663, + "rewards/length2tails_reward/mean": 0.38801515102386475, + "rewards/length2tails_reward/std": 0.4778028428554535, + "rewards/thermo_reward/mean": -0.6911601424217224, + "rewards/thermo_reward/std": 1.3928868770599365, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 257.09375, + "completions/mean_terminated_length": 257.09375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.12114082940388471, + "epoch": 0.126, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0728001594543457, + "learning_rate": 1.9964928592495045e-06, + "loss": -0.0602, + "num_tokens": 556064.0, + "reward": 2.318950653076172, + "reward_std": 3.8857407569885254, + "rewards/fitness_reward/mean": 2.7293825149536133, + "rewards/fitness_reward/std": 3.9336018562316895, + "rewards/kidney_reward/mean": -0.12004195153713226, + "rewards/kidney_reward/std": 1.0272102355957031, + "rewards/length2tails_reward/mean": 0.3241163492202759, + "rewards/length2tails_reward/std": 0.4461221396923065, + "rewards/thermo_reward/mean": -0.8628798723220825, + "rewards/thermo_reward/std": 1.4122726917266846, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 266.34375, + "completions/mean_terminated_length": 266.34375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.09513700450770557, + "epoch": 0.128, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4474369287490845, + "learning_rate": 1.99588439861597e-06, + "loss": -0.0035, + "num_tokens": 564619.0, + "reward": 2.8130290508270264, + "reward_std": 2.8655166625976562, + "rewards/fitness_reward/mean": 3.189012289047241, + "rewards/fitness_reward/std": 3.4911272525787354, + "rewards/kidney_reward/mean": -0.19488850235939026, + "rewards/kidney_reward/std": 0.8539829254150391, + "rewards/length2tails_reward/mean": 0.3629145622253418, + "rewards/length2tails_reward/std": 0.4654383659362793, + "rewards/thermo_reward/mean": -0.7385349273681641, + "rewards/thermo_reward/std": 1.4404512643814087, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 273.71875, + "completions/mean_terminated_length": 273.71875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.13064821506850421, + "epoch": 0.13, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.423954963684082, + "learning_rate": 1.995227399981831e-06, + "loss": 0.0646, + "num_tokens": 573410.0, + "reward": 2.9144697189331055, + "reward_std": 2.6013689041137695, + "rewards/fitness_reward/mean": 3.6918673515319824, + "rewards/fitness_reward/std": 3.162837266921997, + "rewards/kidney_reward/mean": -0.5046856999397278, + "rewards/kidney_reward/std": 0.8479384779930115, + "rewards/length2tails_reward/mean": 0.39057034254074097, + "rewards/length2tails_reward/std": 0.47401538491249084, + "rewards/thermo_reward/mean": -1.2453944683074951, + "rewards/thermo_reward/std": 1.5381916761398315, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 263.75, + "completions/mean_terminated_length": 263.75, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.16724270419217646, + "epoch": 0.132, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4178781509399414, + "learning_rate": 1.994521895368273e-06, + "loss": -0.0306, + "num_tokens": 581882.0, + "reward": 3.1069273948669434, + "reward_std": 3.656845808029175, + "rewards/fitness_reward/mean": 3.6763453483581543, + "rewards/fitness_reward/std": 3.7498345375061035, + "rewards/kidney_reward/mean": -0.46478116512298584, + "rewards/kidney_reward/std": 0.780349612236023, + "rewards/length2tails_reward/mean": 0.4894040822982788, + "rewards/length2tails_reward/std": 0.4659845530986786, + "rewards/thermo_reward/mean": -0.9187557101249695, + "rewards/thermo_reward/std": 1.7948275804519653, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 263.46875, + "completions/mean_terminated_length": 263.46875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.047335606650449336, + "epoch": 0.134, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.666408360004425, + "learning_rate": 1.9937679191605962e-06, + "loss": -0.0395, + "num_tokens": 590345.0, + "reward": 3.4559826850891113, + "reward_std": 2.2802734375, + "rewards/fitness_reward/mean": 4.01068115234375, + "rewards/fitness_reward/std": 2.509319305419922, + "rewards/kidney_reward/mean": -0.4572184383869171, + "rewards/kidney_reward/std": 0.5606120228767395, + "rewards/length2tails_reward/mean": 0.30433884263038635, + "rewards/length2tails_reward/std": 0.44804126024246216, + "rewards/thermo_reward/mean": -0.8043481111526489, + "rewards/thermo_reward/std": 1.5644283294677734, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 249.03125, + "completions/mean_terminated_length": 249.03125, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.12496072240173817, + "epoch": 0.136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8123031258583069, + "learning_rate": 1.992965508106537e-06, + "loss": -0.1554, + "num_tokens": 598346.0, + "reward": 3.622310161590576, + "reward_std": 3.287194013595581, + "rewards/fitness_reward/mean": 3.8683528900146484, + "rewards/fitness_reward/std": 3.616657018661499, + "rewards/kidney_reward/mean": -0.33187222480773926, + "rewards/kidney_reward/std": 0.8844914436340332, + "rewards/length2tails_reward/mean": 0.40272101759910583, + "rewards/length2tails_reward/std": 0.4587315022945404, + "rewards/thermo_reward/mean": -0.3615736961364746, + "rewards/thermo_reward/std": 1.7587852478027344, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.06588866747915745, + "epoch": 0.138, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8075075149536133, + "learning_rate": 1.9921147013144777e-06, + "loss": 0.084, + "num_tokens": 607094.0, + "reward": 3.5608577728271484, + "reward_std": 2.032958745956421, + "rewards/fitness_reward/mean": 3.9016268253326416, + "rewards/fitness_reward/std": 1.998185634613037, + "rewards/kidney_reward/mean": -0.2879283130168915, + "rewards/kidney_reward/std": 0.7849275469779968, + "rewards/length2tails_reward/mean": 0.1897910237312317, + "rewards/length2tails_reward/std": 0.3825874924659729, + "rewards/thermo_reward/mean": -0.488505482673645, + "rewards/thermo_reward/std": 1.2325689792633057, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.0, + "completions/max_terminated_length": 578.0, + "completions/mean_length": 280.0, + "completions/mean_terminated_length": 280.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.1318660757970065, + "epoch": 0.14, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7601933479309082, + "learning_rate": 1.9912155402515414e-06, + "loss": 0.0819, + "num_tokens": 616086.0, + "reward": 3.302825927734375, + "reward_std": 3.520420789718628, + "rewards/fitness_reward/mean": 3.4547269344329834, + "rewards/fitness_reward/std": 4.018453598022461, + "rewards/kidney_reward/mean": -0.33877238631248474, + "rewards/kidney_reward/std": 1.066430687904358, + "rewards/length2tails_reward/mean": 0.5047099590301514, + "rewards/length2tails_reward/std": 0.45460987091064453, + "rewards/thermo_reward/mean": -0.21738454699516296, + "rewards/thermo_reward/std": 1.8911807537078857, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 281.125, + "completions/mean_terminated_length": 281.125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12423830945044756, + "epoch": 0.142, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3924140930175781, + "learning_rate": 1.99026806874157e-06, + "loss": 0.1156, + "num_tokens": 625114.0, + "reward": 3.2361419200897217, + "reward_std": 3.2574002742767334, + "rewards/fitness_reward/mean": 3.6235194206237793, + "rewards/fitness_reward/std": 3.311436176300049, + "rewards/kidney_reward/mean": -0.20599408447742462, + "rewards/kidney_reward/std": 0.8880826830863953, + "rewards/length2tails_reward/mean": 0.3809414505958557, + "rewards/length2tails_reward/std": 0.4668664038181305, + "rewards/thermo_reward/mean": -0.7592315673828125, + "rewards/thermo_reward/std": 1.6003990173339844, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 273.75, + "completions/mean_terminated_length": 273.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09949495655018836, + "epoch": 0.144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9823671579360962, + "learning_rate": 1.9892723329629885e-06, + "loss": 0.0914, + "num_tokens": 633906.0, + "reward": 3.915363073348999, + "reward_std": 2.6797749996185303, + "rewards/fitness_reward/mean": 4.155492782592773, + "rewards/fitness_reward/std": 2.356550455093384, + "rewards/kidney_reward/mean": -0.17341646552085876, + "rewards/kidney_reward/std": 0.8984330892562866, + "rewards/length2tails_reward/mean": 0.26437491178512573, + "rewards/length2tails_reward/std": 0.3996802270412445, + "rewards/thermo_reward/mean": -0.43903061747550964, + "rewards/thermo_reward/std": 1.5646251440048218, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 267.6875, + "completions/mean_terminated_length": 267.6875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.10478552733547986, + "epoch": 0.146, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3388633728027344, + "learning_rate": 1.9882283814465526e-06, + "loss": -0.0059, + "num_tokens": 642504.0, + "reward": 2.3275299072265625, + "reward_std": 3.3928511142730713, + "rewards/fitness_reward/mean": 2.9948863983154297, + "rewards/fitness_reward/std": 3.664214849472046, + "rewards/kidney_reward/mean": -0.5589306354522705, + "rewards/kidney_reward/std": 0.6325205564498901, + "rewards/length2tails_reward/mean": 0.3547426760196686, + "rewards/length2tails_reward/std": 0.4702369272708893, + "rewards/thermo_reward/mean": -0.9531527757644653, + "rewards/thermo_reward/std": 1.475315809249878, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 265.6875, + "completions/mean_terminated_length": 265.6875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.05924345087260008, + "epoch": 0.148, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8762544989585876, + "learning_rate": 1.987136265072988e-06, + "loss": -0.0119, + "num_tokens": 651038.0, + "reward": 3.500439167022705, + "reward_std": 3.052443265914917, + "rewards/fitness_reward/mean": 4.001605033874512, + "rewards/fitness_reward/std": 2.928560733795166, + "rewards/kidney_reward/mean": -0.3237060010433197, + "rewards/kidney_reward/std": 0.9022273421287537, + "rewards/length2tails_reward/mean": 0.34477800130844116, + "rewards/length2tails_reward/std": 0.44316500425338745, + "rewards/thermo_reward/mean": -0.8510144948959351, + "rewards/thermo_reward/std": 1.473467469215393, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 273.3125, + "completions/mean_terminated_length": 273.3125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.38166644517332315, + "epoch": 0.15, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.392104983329773, + "learning_rate": 1.985996037070505e-06, + "loss": -0.0404, + "num_tokens": 659816.0, + "reward": 3.276812791824341, + "reward_std": 3.1364307403564453, + "rewards/fitness_reward/mean": 3.4767651557922363, + "rewards/fitness_reward/std": 3.71610689163208, + "rewards/kidney_reward/mean": -0.04081631079316139, + "rewards/kidney_reward/std": 0.9693803191184998, + "rewards/length2tails_reward/mean": 0.48275813460350037, + "rewards/length2tails_reward/std": 0.4699109196662903, + "rewards/thermo_reward/mean": -0.6004676818847656, + "rewards/thermo_reward/std": 1.6504100561141968, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 265.03125, + "completions/mean_terminated_length": 265.03125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.09045771509408951, + "epoch": 0.152, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0095329284667969, + "learning_rate": 1.984807753012208e-06, + "loss": -0.02, + "num_tokens": 668329.0, + "reward": 3.8370518684387207, + "reward_std": 2.9943735599517822, + "rewards/fitness_reward/mean": 4.162042617797852, + "rewards/fitness_reward/std": 3.4031171798706055, + "rewards/kidney_reward/mean": -0.1037202849984169, + "rewards/kidney_reward/std": 0.8663031458854675, + "rewards/length2tails_reward/mean": 0.40801501274108887, + "rewards/length2tails_reward/std": 0.46140018105506897, + "rewards/thermo_reward/mean": -0.7502682209014893, + "rewards/thermo_reward/std": 1.5727473497390747, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 263.96875, + "completions/mean_terminated_length": 263.96875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "entropy": 0.07936325226910412, + "epoch": 0.154, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5812162160873413, + "learning_rate": 1.983571470813386e-06, + "loss": -0.0325, + "num_tokens": 676808.0, + "reward": 3.7221837043762207, + "reward_std": 2.660025119781494, + "rewards/fitness_reward/mean": 4.209771156311035, + "rewards/fitness_reward/std": 2.9468605518341064, + "rewards/kidney_reward/mean": -0.5054516196250916, + "rewards/kidney_reward/std": 0.7025943398475647, + "rewards/length2tails_reward/mean": 0.4130633771419525, + "rewards/length2tails_reward/std": 0.4388565123081207, + "rewards/thermo_reward/mean": -0.6762548089027405, + "rewards/thermo_reward/std": 1.6007850170135498, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 260.59375, + "completions/mean_terminated_length": 260.59375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.054782358231022954, + "epoch": 0.156, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.408692479133606, + "learning_rate": 1.9822872507286887e-06, + "loss": -0.077, + "num_tokens": 685179.0, + "reward": 3.3060789108276367, + "reward_std": 2.6914637088775635, + "rewards/fitness_reward/mean": 3.7634127140045166, + "rewards/fitness_reward/std": 2.9260480403900146, + "rewards/kidney_reward/mean": -0.3477899730205536, + "rewards/kidney_reward/std": 0.8083656430244446, + "rewards/length2tails_reward/mean": 0.30918532609939575, + "rewards/length2tails_reward/std": 0.4387562870979309, + "rewards/thermo_reward/mean": -0.7214701175689697, + "rewards/thermo_reward/std": 1.4108153581619263, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 278.375, + "completions/mean_terminated_length": 278.375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.1216822536662221, + "epoch": 0.158, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7516916990280151, + "learning_rate": 1.9809551553491913e-06, + "loss": 0.0891, + "num_tokens": 694119.0, + "reward": 3.180351972579956, + "reward_std": 3.2595674991607666, + "rewards/fitness_reward/mean": 3.630638599395752, + "rewards/fitness_reward/std": 3.309762477874756, + "rewards/kidney_reward/mean": -0.2499833106994629, + "rewards/kidney_reward/std": 1.0745337009429932, + "rewards/length2tails_reward/mean": 0.4155610203742981, + "rewards/length2tails_reward/std": 0.47090789675712585, + "rewards/thermo_reward/mean": -0.8583704233169556, + "rewards/thermo_reward/std": 1.5894389152526855, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 265.625, + "completions/mean_terminated_length": 265.625, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.06150074442848563, + "epoch": 0.16, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0668549537658691, + "learning_rate": 1.979575249599344e-06, + "loss": -0.0093, + "num_tokens": 702651.0, + "reward": 3.1801319122314453, + "reward_std": 2.3890247344970703, + "rewards/fitness_reward/mean": 3.6655666828155518, + "rewards/fitness_reward/std": 2.888410806655884, + "rewards/kidney_reward/mean": -0.5205367803573608, + "rewards/kidney_reward/std": 0.7554165124893188, + "rewards/length2tails_reward/mean": 0.30073481798171997, + "rewards/length2tails_reward/std": 0.43407922983169556, + "rewards/thermo_reward/mean": -0.600700318813324, + "rewards/thermo_reward/std": 1.570206880569458, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 278.59375, + "completions/mean_terminated_length": 278.59375, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.19582223054021597, + "epoch": 0.162, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9738500118255615, + "learning_rate": 1.9781476007338054e-06, + "loss": 0.0849, + "num_tokens": 711598.0, + "reward": 3.531404495239258, + "reward_std": 2.798060417175293, + "rewards/fitness_reward/mean": 4.016422748565674, + "rewards/fitness_reward/std": 3.2364284992218018, + "rewards/kidney_reward/mean": -0.40827667713165283, + "rewards/kidney_reward/std": 0.8911892771720886, + "rewards/length2tails_reward/mean": 0.4315437376499176, + "rewards/length2tails_reward/std": 0.46955406665802, + "rewards/thermo_reward/mean": -0.7775313258171082, + "rewards/thermo_reward/std": 1.5042880773544312, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 274.75, + "completions/mean_terminated_length": 259.2903137207031, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 0.09533343114890158, + "epoch": 0.164, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.523869037628174, + "learning_rate": 1.9766722783341677e-06, + "loss": 0.086, + "num_tokens": 720422.0, + "reward": 3.6216368675231934, + "reward_std": 2.9999029636383057, + "rewards/fitness_reward/mean": 4.240582466125488, + "rewards/fitness_reward/std": 3.2040698528289795, + "rewards/kidney_reward/mean": -0.39840295910835266, + "rewards/kidney_reward/std": 1.0155638456344604, + "rewards/length2tails_reward/mean": 0.434470534324646, + "rewards/length2tails_reward/std": 0.46228936314582825, + "rewards/thermo_reward/mean": -1.0567247867584229, + "rewards/thermo_reward/std": 1.665966272354126, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 266.875, + "completions/mean_terminated_length": 266.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0562637432012707, + "epoch": 0.166, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1357522010803223, + "learning_rate": 1.975149354305563e-06, + "loss": 0.0135, + "num_tokens": 728994.0, + "reward": 3.007725715637207, + "reward_std": 2.753161668777466, + "rewards/fitness_reward/mean": 3.52304744720459, + "rewards/fitness_reward/std": 2.9334521293640137, + "rewards/kidney_reward/mean": -0.41844168305397034, + "rewards/kidney_reward/std": 0.6413934230804443, + "rewards/length2tails_reward/mean": 0.2917965352535248, + "rewards/length2tails_reward/std": 0.4327235817909241, + "rewards/thermo_reward/mean": -0.7580999135971069, + "rewards/thermo_reward/std": 1.4481271505355835, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 286.65625, + "completions/mean_terminated_length": 286.65625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13436559145338833, + "epoch": 0.168, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7011663913726807, + "learning_rate": 1.97357890287316e-06, + "loss": 0.1895, + "num_tokens": 738199.0, + "reward": 4.253416538238525, + "reward_std": 2.8524696826934814, + "rewards/fitness_reward/mean": 4.332322597503662, + "rewards/fitness_reward/std": 2.9005343914031982, + "rewards/kidney_reward/mean": -0.07999514043331146, + "rewards/kidney_reward/std": 1.0148781538009644, + "rewards/length2tails_reward/mean": 0.33597058057785034, + "rewards/length2tails_reward/std": 0.43135184049606323, + "rewards/thermo_reward/mean": -0.24580200016498566, + "rewards/thermo_reward/std": 1.51665461063385, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 282.0625, + "completions/mean_terminated_length": 282.0625, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "entropy": 0.17252697050571442, + "epoch": 0.17, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1756560802459717, + "learning_rate": 1.9719610005785463e-06, + "loss": 0.1212, + "num_tokens": 747257.0, + "reward": 2.786454200744629, + "reward_std": 3.7640724182128906, + "rewards/fitness_reward/mean": 3.159298896789551, + "rewards/fitness_reward/std": 4.33671236038208, + "rewards/kidney_reward/mean": -0.3573535680770874, + "rewards/kidney_reward/std": 0.8194970488548279, + "rewards/length2tails_reward/mean": 0.4246785044670105, + "rewards/length2tails_reward/std": 0.48692333698272705, + "rewards/thermo_reward/mean": -0.600675106048584, + "rewards/thermo_reward/std": 1.7436631917953491, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 287.21875, + "completions/mean_terminated_length": 287.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12859952612780035, + "epoch": 0.172, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2280685901641846, + "learning_rate": 1.9702957262759963e-06, + "loss": 0.1583, + "num_tokens": 756480.0, + "reward": 2.5727219581604004, + "reward_std": 3.09424090385437, + "rewards/fitness_reward/mean": 2.9395320415496826, + "rewards/fitness_reward/std": 3.4791810512542725, + "rewards/kidney_reward/mean": -0.23735620081424713, + "rewards/kidney_reward/std": 0.9329817891120911, + "rewards/length2tails_reward/mean": 0.30392399430274963, + "rewards/length2tails_reward/std": 0.45537668466567993, + "rewards/thermo_reward/mean": -0.6482260227203369, + "rewards/thermo_reward/std": 1.1679952144622803, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 273.875, + "completions/mean_terminated_length": 273.875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.12523282784968615, + "epoch": 0.174, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.673491954803467, + "learning_rate": 1.968583161128631e-06, + "loss": 0.0818, + "num_tokens": 765276.0, + "reward": 3.5213098526000977, + "reward_std": 2.275979518890381, + "rewards/fitness_reward/mean": 4.062276363372803, + "rewards/fitness_reward/std": 2.692859172821045, + "rewards/kidney_reward/mean": -0.4449991285800934, + "rewards/kidney_reward/std": 0.7791681885719299, + "rewards/length2tails_reward/mean": 0.3919871151447296, + "rewards/length2tails_reward/std": 0.4710420072078705, + "rewards/thermo_reward/mean": -0.8329271674156189, + "rewards/thermo_reward/std": 1.5729801654815674, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 276.0625, + "completions/mean_terminated_length": 276.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0831142186652869, + "epoch": 0.176, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.926175594329834, + "learning_rate": 1.9668233886044593e-06, + "loss": 0.0891, + "num_tokens": 774142.0, + "reward": 3.4537410736083984, + "reward_std": 2.8645174503326416, + "rewards/fitness_reward/mean": 3.9672062397003174, + "rewards/fitness_reward/std": 3.019857883453369, + "rewards/kidney_reward/mean": -0.44299253821372986, + "rewards/kidney_reward/std": 0.9003056287765503, + "rewards/length2tails_reward/mean": 0.3812631368637085, + "rewards/length2tails_reward/std": 0.455536812543869, + "rewards/thermo_reward/mean": -0.7745689153671265, + "rewards/thermo_reward/std": 1.6042786836624146, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 281.78125, + "completions/mean_terminated_length": 266.5483703613281, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.09161855978891253, + "epoch": 0.178, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.947537422180176, + "learning_rate": 1.9650164944723112e-06, + "loss": 0.123, + "num_tokens": 783191.0, + "reward": 3.7828750610351562, + "reward_std": 2.4898288249969482, + "rewards/fitness_reward/mean": 4.1685638427734375, + "rewards/fitness_reward/std": 2.7823469638824463, + "rewards/kidney_reward/mean": -0.2978300452232361, + "rewards/kidney_reward/std": 0.8650789856910706, + "rewards/length2tails_reward/mean": 0.3725648522377014, + "rewards/length2tails_reward/std": 0.44893646240234375, + "rewards/thermo_reward/mean": -0.6598291397094727, + "rewards/thermo_reward/std": 1.685317873954773, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 266.84375, + "completions/mean_terminated_length": 266.84375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 0.07641523564234376, + "epoch": 0.18, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4239964485168457, + "learning_rate": 1.963162566797658e-06, + "loss": -0.01, + "num_tokens": 791762.0, + "reward": 3.6204564571380615, + "reward_std": 3.3779494762420654, + "rewards/fitness_reward/mean": 4.185644149780273, + "rewards/fitness_reward/std": 3.01949405670166, + "rewards/kidney_reward/mean": -0.34998631477355957, + "rewards/kidney_reward/std": 0.845792829990387, + "rewards/length2tails_reward/mean": 0.3621581792831421, + "rewards/length2tails_reward/std": 0.44562968611717224, + "rewards/thermo_reward/mean": -0.961467981338501, + "rewards/thermo_reward/std": 1.495065450668335, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 277.8125, + "completions/mean_terminated_length": 277.8125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.083618309115991, + "epoch": 0.182, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8104112148284912, + "learning_rate": 1.9612616959383188e-06, + "loss": 0.093, + "num_tokens": 800684.0, + "reward": 3.970524787902832, + "reward_std": 2.618279218673706, + "rewards/fitness_reward/mean": 4.449347496032715, + "rewards/fitness_reward/std": 2.936826467514038, + "rewards/kidney_reward/mean": -0.2391345500946045, + "rewards/kidney_reward/std": 1.0095757246017456, + "rewards/length2tails_reward/mean": 0.45980584621429443, + "rewards/length2tails_reward/std": 0.45448037981987, + "rewards/thermo_reward/mean": -0.9484134316444397, + "rewards/thermo_reward/std": 1.801936388015747, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 266.90625, + "completions/mean_terminated_length": 266.90625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.05782184097915888, + "epoch": 0.184, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44250741600990295, + "learning_rate": 1.9593139745400573e-06, + "loss": -0.0085, + "num_tokens": 809257.0, + "reward": 4.371463298797607, + "reward_std": 1.7807085514068604, + "rewards/fitness_reward/mean": 4.943473815917969, + "rewards/fitness_reward/std": 1.6741914749145508, + "rewards/kidney_reward/mean": -0.32885757088661194, + "rewards/kidney_reward/std": 0.9096606373786926, + "rewards/length2tails_reward/mean": 0.3909035325050354, + "rewards/length2tails_reward/std": 0.4627934992313385, + "rewards/thermo_reward/mean": -1.0106147527694702, + "rewards/thermo_reward/std": 1.459147572517395, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 265.59375, + "completions/mean_terminated_length": 265.59375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.13330286438576877, + "epoch": 0.186, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.56676983833313, + "learning_rate": 1.957319497532067e-06, + "loss": -0.0201, + "num_tokens": 817788.0, + "reward": 3.7524971961975098, + "reward_std": 2.9713351726531982, + "rewards/fitness_reward/mean": 4.150823593139648, + "rewards/fitness_reward/std": 3.1422393321990967, + "rewards/kidney_reward/mean": -0.12235195934772491, + "rewards/kidney_reward/std": 1.0004616975784302, + "rewards/length2tails_reward/mean": 0.4478749632835388, + "rewards/length2tails_reward/std": 0.4696106016635895, + "rewards/thermo_reward/mean": -0.898237943649292, + "rewards/thermo_reward/std": 1.6778795719146729, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 266.84375, + "completions/mean_terminated_length": 266.84375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.05517435655929148, + "epoch": 0.188, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18077127635478973, + "learning_rate": 1.9552783621223435e-06, + "loss": -0.0066, + "num_tokens": 826359.0, + "reward": 4.230867862701416, + "reward_std": 1.6108834743499756, + "rewards/fitness_reward/mean": 4.840484619140625, + "rewards/fitness_reward/std": 1.6709182262420654, + "rewards/kidney_reward/mean": -0.1514122188091278, + "rewards/kidney_reward/std": 0.9166415333747864, + "rewards/length2tails_reward/mean": 0.3408389687538147, + "rewards/length2tails_reward/std": 0.445056289434433, + "rewards/thermo_reward/mean": -1.2382405996322632, + "rewards/thermo_reward/std": 1.488350749015808, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 265.9375, + "completions/mean_terminated_length": 265.9375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.06825397931970656, + "epoch": 0.19, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9398273229598999, + "learning_rate": 1.953190667792947e-06, + "loss": -0.0041, + "num_tokens": 834901.0, + "reward": 3.9511187076568604, + "reward_std": 2.8120744228363037, + "rewards/fitness_reward/mean": 4.169844150543213, + "rewards/fitness_reward/std": 2.7375926971435547, + "rewards/kidney_reward/mean": -0.06316429376602173, + "rewards/kidney_reward/std": 1.0275908708572388, + "rewards/length2tails_reward/mean": 0.2595304250717163, + "rewards/length2tails_reward/std": 0.3988112211227417, + "rewards/thermo_reward/mean": -0.5040514469146729, + "rewards/thermo_reward/std": 1.4217562675476074, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 274.59375, + "completions/mean_terminated_length": 274.59375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.07531367521733046, + "epoch": 0.192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9723393321037292, + "learning_rate": 1.9510565162951534e-06, + "loss": 0.0485, + "num_tokens": 843720.0, + "reward": 3.428475856781006, + "reward_std": 3.01780104637146, + "rewards/fitness_reward/mean": 3.9550271034240723, + "rewards/fitness_reward/std": 3.3777401447296143, + "rewards/kidney_reward/mean": -0.39997780323028564, + "rewards/kidney_reward/std": 0.961540162563324, + "rewards/length2tails_reward/mean": 0.4574941396713257, + "rewards/length2tails_reward/std": 0.46799859404563904, + "rewards/thermo_reward/mean": -0.8818715810775757, + "rewards/thermo_reward/std": 1.7305375337600708, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 275.0, + "completions/mean_terminated_length": 275.0, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.09688100079074502, + "epoch": 0.194, + "frac_reward_zero_std": 0.0, + "grad_norm": NaN, + "learning_rate": 1.9488760116444964e-06, + "loss": 0.0737, + "num_tokens": 852552.0, + "reward": 3.657806873321533, + "reward_std": 2.6497535705566406, + "rewards/fitness_reward/mean": 4.058182716369629, + "rewards/fitness_reward/std": 3.0874576568603516, + "rewards/kidney_reward/mean": -0.2950325906276703, + "rewards/kidney_reward/std": 0.8756623268127441, + "rewards/length2tails_reward/mean": 0.4380313456058502, + "rewards/length2tails_reward/std": 0.47069379687309265, + "rewards/thermo_reward/mean": -0.7247352600097656, + "rewards/thermo_reward/std": 1.6496366262435913, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 268.6875, + "completions/mean_terminated_length": 268.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.05954432673752308, + "epoch": 0.196, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.811093270778656, + "learning_rate": 1.9488760116444964e-06, + "loss": 0.0184, + "num_tokens": 861182.0, + "reward": 3.9953298568725586, + "reward_std": 2.4319796562194824, + "rewards/fitness_reward/mean": 4.486875057220459, + "rewards/fitness_reward/std": 2.3484222888946533, + "rewards/kidney_reward/mean": -0.2622259259223938, + "rewards/kidney_reward/std": 1.0675276517868042, + "rewards/length2tails_reward/mean": 0.37341785430908203, + "rewards/length2tails_reward/std": 0.45848578214645386, + "rewards/thermo_reward/mean": -0.907573401927948, + "rewards/thermo_reward/std": 1.5665401220321655, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 275.71875, + "completions/mean_terminated_length": 260.2903137207031, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.13262386550195515, + "epoch": 0.198, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.5504302978515625, + "learning_rate": 1.9466492601156963e-06, + "loss": 0.1024, + "num_tokens": 870037.0, + "reward": 2.7974579334259033, + "reward_std": 2.8245115280151367, + "rewards/fitness_reward/mean": 3.18468976020813, + "rewards/fitness_reward/std": 3.181276798248291, + "rewards/kidney_reward/mean": -0.3316881060600281, + "rewards/kidney_reward/std": 0.7468500137329102, + "rewards/length2tails_reward/mean": 0.2897987961769104, + "rewards/length2tails_reward/std": 0.4419374465942383, + "rewards/thermo_reward/mean": -0.587674617767334, + "rewards/thermo_reward/std": 1.3871588706970215, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 283.9375, + "completions/mean_terminated_length": 283.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12895289808511734, + "epoch": 0.2, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.474762201309204, + "learning_rate": 1.944376370237481e-06, + "loss": 0.1547, + "num_tokens": 879155.0, + "reward": 4.534636497497559, + "reward_std": 2.668832778930664, + "rewards/fitness_reward/mean": 4.991495132446289, + "rewards/fitness_reward/std": 2.8266241550445557, + "rewards/kidney_reward/mean": -0.2184591293334961, + "rewards/kidney_reward/std": 1.0288792848587036, + "rewards/length2tails_reward/mean": 0.5330479145050049, + "rewards/length2tails_reward/std": 0.44932231307029724, + "rewards/thermo_reward/mean": -0.9617821574211121, + "rewards/thermo_reward/std": 1.8963667154312134, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 273.09375, + "completions/mean_terminated_length": 273.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08518685982562602, + "epoch": 0.202, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.835224986076355, + "learning_rate": 1.9420574527872966e-06, + "loss": 0.0636, + "num_tokens": 887926.0, + "reward": 3.950529098510742, + "reward_std": 2.339932918548584, + "rewards/fitness_reward/mean": 4.407957077026367, + "rewards/fitness_reward/std": 2.659775733947754, + "rewards/kidney_reward/mean": 0.050680406391620636, + "rewards/kidney_reward/std": 0.8810163736343384, + "rewards/length2tails_reward/mean": 0.29464665055274963, + "rewards/length2tails_reward/std": 0.4334779679775238, + "rewards/thermo_reward/mean": -1.1128590106964111, + "rewards/thermo_reward/std": 1.0931978225708008, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 286.875, + "completions/mean_terminated_length": 286.875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.130730795674026, + "epoch": 0.204, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2770278453826904, + "learning_rate": 1.9396926207859082e-06, + "loss": 0.138, + "num_tokens": 897138.0, + "reward": 3.412665367126465, + "reward_std": 3.999739408493042, + "rewards/fitness_reward/mean": 3.8380305767059326, + "rewards/fitness_reward/std": 3.9377808570861816, + "rewards/kidney_reward/mean": -0.5111966133117676, + "rewards/kidney_reward/std": 1.0602242946624756, + "rewards/length2tails_reward/mean": 0.5344866514205933, + "rewards/length2tails_reward/std": 0.46006661653518677, + "rewards/thermo_reward/mean": -0.6067769527435303, + "rewards/thermo_reward/std": 1.68917715549469, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 266.84375, + "completions/mean_terminated_length": 266.84375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.09578574495390058, + "epoch": 0.206, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2568925619125366, + "learning_rate": 1.9372819894918914e-06, + "loss": -0.0062, + "num_tokens": 905709.0, + "reward": 4.232032299041748, + "reward_std": 2.5994482040405273, + "rewards/fitness_reward/mean": 4.540476322174072, + "rewards/fitness_reward/std": 2.9304285049438477, + "rewards/kidney_reward/mean": -0.3461866080760956, + "rewards/kidney_reward/std": 0.9004994630813599, + "rewards/length2tails_reward/mean": 0.47418013215065, + "rewards/length2tails_reward/std": 0.44722241163253784, + "rewards/thermo_reward/mean": -0.5077908635139465, + "rewards/thermo_reward/std": 1.6851264238357544, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 269.6875, + "completions/mean_terminated_length": 269.6875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.05232610600069165, + "epoch": 0.208, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37754666805267334, + "learning_rate": 1.9348256763960142e-06, + "loss": -0.0148, + "num_tokens": 914371.0, + "reward": 3.1689558029174805, + "reward_std": 2.3899450302124023, + "rewards/fitness_reward/mean": 3.684023380279541, + "rewards/fitness_reward/std": 2.4902243614196777, + "rewards/kidney_reward/mean": -0.3063453733921051, + "rewards/kidney_reward/std": 0.674054741859436, + "rewards/length2tails_reward/mean": 0.25765612721443176, + "rewards/length2tails_reward/std": 0.423554390668869, + "rewards/thermo_reward/mean": -0.8526173830032349, + "rewards/thermo_reward/std": 1.3145358562469482, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 268.25, + "completions/mean_terminated_length": 268.25, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "entropy": 0.11395578144583851, + "epoch": 0.21, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8413267135620117, + "learning_rate": 1.9323238012155122e-06, + "loss": 0.0022, + "num_tokens": 922987.0, + "reward": 3.0292320251464844, + "reward_std": 3.132319211959839, + "rewards/fitness_reward/mean": 3.6769983768463135, + "rewards/fitness_reward/std": 3.4672458171844482, + "rewards/kidney_reward/mean": -0.5183227062225342, + "rewards/kidney_reward/std": 0.7655011415481567, + "rewards/length2tails_reward/mean": 0.44852757453918457, + "rewards/length2tails_reward/std": 0.4676968455314636, + "rewards/thermo_reward/mean": -1.001473307609558, + "rewards/thermo_reward/std": 1.6753126382827759, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 264.5625, + "completions/mean_terminated_length": 264.5625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.0989903915906325, + "epoch": 0.212, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1482203006744385, + "learning_rate": 1.929776485888251e-06, + "loss": -0.028, + "num_tokens": 931485.0, + "reward": 3.722275733947754, + "reward_std": 2.6445722579956055, + "rewards/fitness_reward/mean": 4.1574249267578125, + "rewards/fitness_reward/std": 2.748945713043213, + "rewards/kidney_reward/mean": -0.23157793283462524, + "rewards/kidney_reward/std": 0.9468393325805664, + "rewards/length2tails_reward/mean": 0.36526206135749817, + "rewards/length2tails_reward/std": 0.4607754945755005, + "rewards/thermo_reward/mean": -0.8213506937026978, + "rewards/thermo_reward/std": 1.4860751628875732, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 267.46875, + "completions/mean_terminated_length": 267.46875, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.07739041280001402, + "epoch": 0.214, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2668060064315796, + "learning_rate": 1.9271838545667875e-06, + "loss": -0.0093, + "num_tokens": 940076.0, + "reward": 3.640249252319336, + "reward_std": 2.7999203205108643, + "rewards/fitness_reward/mean": 3.994453191757202, + "rewards/fitness_reward/std": 3.2945973873138428, + "rewards/kidney_reward/mean": -0.3617490231990814, + "rewards/kidney_reward/std": 0.8847697973251343, + "rewards/length2tails_reward/mean": 0.5171585083007812, + "rewards/length2tails_reward/std": 0.4940597116947174, + "rewards/thermo_reward/mean": -0.6052376627922058, + "rewards/thermo_reward/std": 2.0294535160064697, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 274.25, + "completions/mean_terminated_length": 274.25, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1099214032292366, + "epoch": 0.216, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6436166763305664, + "learning_rate": 1.9245460336123133e-06, + "loss": 0.0649, + "num_tokens": 948884.0, + "reward": 2.887429714202881, + "reward_std": 3.679654121398926, + "rewards/fitness_reward/mean": 3.3409667015075684, + "rewards/fitness_reward/std": 3.7627742290496826, + "rewards/kidney_reward/mean": -0.4285740852355957, + "rewards/kidney_reward/std": 1.042330265045166, + "rewards/length2tails_reward/mean": 0.3566879630088806, + "rewards/length2tails_reward/std": 0.4686858654022217, + "rewards/thermo_reward/mean": -0.6568437218666077, + "rewards/thermo_reward/std": 1.446584939956665, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 261.15625, + "completions/mean_terminated_length": 261.15625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.0422589861555025, + "epoch": 0.218, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6566778421401978, + "learning_rate": 1.9218631515885003e-06, + "loss": -0.0857, + "num_tokens": 957273.0, + "reward": 3.377675771713257, + "reward_std": 2.2366034984588623, + "rewards/fitness_reward/mean": 3.9846253395080566, + "rewards/fitness_reward/std": 2.6026394367218018, + "rewards/kidney_reward/mean": -0.4901122450828552, + "rewards/kidney_reward/std": 0.698078989982605, + "rewards/length2tails_reward/mean": 0.3232583999633789, + "rewards/length2tails_reward/std": 0.4532923698425293, + "rewards/thermo_reward/mean": -0.8854160308837891, + "rewards/thermo_reward/std": 1.4513849020004272, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 266.34375, + "completions/mean_terminated_length": 266.34375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "entropy": 0.06348483904730529, + "epoch": 0.22, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5598238706588745, + "learning_rate": 1.9191353392552343e-06, + "loss": -0.0301, + "num_tokens": 965828.0, + "reward": 3.7526564598083496, + "reward_std": 2.3632280826568604, + "rewards/fitness_reward/mean": 4.508893966674805, + "rewards/fitness_reward/std": 2.7085962295532227, + "rewards/kidney_reward/mean": -0.6108442544937134, + "rewards/kidney_reward/std": 0.7660795450210571, + "rewards/length2tails_reward/mean": 0.46491798758506775, + "rewards/length2tails_reward/std": 0.47420939803123474, + "rewards/thermo_reward/mean": -1.1340885162353516, + "rewards/thermo_reward/std": 1.5848442316055298, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 265.4375, + "completions/mean_terminated_length": 265.4375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.0810333862900734, + "epoch": 0.222, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3541920185089111, + "learning_rate": 1.9163627295622395e-06, + "loss": -0.0234, + "num_tokens": 974354.0, + "reward": 3.9480810165405273, + "reward_std": 2.8221023082733154, + "rewards/fitness_reward/mean": 4.40368127822876, + "rewards/fitness_reward/std": 3.02516770362854, + "rewards/kidney_reward/mean": -0.20891384780406952, + "rewards/kidney_reward/std": 1.0420880317687988, + "rewards/length2tails_reward/mean": 0.4156648516654968, + "rewards/length2tails_reward/std": 0.43952322006225586, + "rewards/thermo_reward/mean": -0.9101189374923706, + "rewards/thermo_reward/std": 1.2393132448196411, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 267.0, + "completions/mean_terminated_length": 267.0, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.06256379652768373, + "epoch": 0.224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2238101363182068, + "learning_rate": 1.9135454576426007e-06, + "loss": -0.0033, + "num_tokens": 982930.0, + "reward": 4.236385822296143, + "reward_std": 2.577726125717163, + "rewards/fitness_reward/mean": 4.564560890197754, + "rewards/fitness_reward/std": 2.516674280166626, + "rewards/kidney_reward/mean": -0.3167581856250763, + "rewards/kidney_reward/std": 0.9142901301383972, + "rewards/length2tails_reward/mean": 0.3586871922016144, + "rewards/length2tails_reward/std": 0.43271785974502563, + "rewards/thermo_reward/mean": -0.5189354419708252, + "rewards/thermo_reward/std": 1.6014158725738525, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 259.0625, + "completions/mean_terminated_length": 259.0625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.05426312144845724, + "epoch": 0.226, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21272876858711243, + "learning_rate": 1.910683660806177e-06, + "loss": -0.1044, + "num_tokens": 991252.0, + "reward": 4.512127876281738, + "reward_std": 2.468585729598999, + "rewards/fitness_reward/mean": 4.623218536376953, + "rewards/fitness_reward/std": 2.2350494861602783, + "rewards/kidney_reward/mean": -0.07495585083961487, + "rewards/kidney_reward/std": 1.085909128189087, + "rewards/length2tails_reward/mean": 0.3621150851249695, + "rewards/length2tails_reward/std": 0.4594513177871704, + "rewards/thermo_reward/mean": -0.3282831311225891, + "rewards/thermo_reward/std": 1.6514654159545898, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 267.125, + "completions/mean_terminated_length": 267.125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.05443617841228843, + "epoch": 0.228, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22862917184829712, + "learning_rate": 1.9077774785329087e-06, + "loss": -0.0068, + "num_tokens": 999832.0, + "reward": 4.045974254608154, + "reward_std": 1.8630603551864624, + "rewards/fitness_reward/mean": 4.666137218475342, + "rewards/fitness_reward/std": 2.078634738922119, + "rewards/kidney_reward/mean": -0.4568588137626648, + "rewards/kidney_reward/std": 0.887822687625885, + "rewards/length2tails_reward/mean": 0.3842611312866211, + "rewards/length2tails_reward/std": 0.46035540103912354, + "rewards/thermo_reward/mean": -0.9755982160568237, + "rewards/thermo_reward/std": 1.595231533050537, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 276.46875, + "completions/mean_terminated_length": 276.46875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.10052089486271143, + "epoch": 0.23, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.551025629043579, + "learning_rate": 1.9048270524660196e-06, + "loss": 0.0763, + "num_tokens": 1008711.0, + "reward": 3.8763861656188965, + "reward_std": 2.691915512084961, + "rewards/fitness_reward/mean": 4.211784362792969, + "rewards/fitness_reward/std": 3.2631149291992188, + "rewards/kidney_reward/mean": -0.20681864023208618, + "rewards/kidney_reward/std": 1.1009536981582642, + "rewards/length2tails_reward/mean": 0.44117099046707153, + "rewards/length2tails_reward/std": 0.4576721787452698, + "rewards/thermo_reward/mean": -0.6845625638961792, + "rewards/thermo_reward/std": 1.8933459520339966, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 277.09375, + "completions/mean_terminated_length": 277.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11264282860793173, + "epoch": 0.232, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.240277051925659, + "learning_rate": 1.9018325264051138e-06, + "loss": 0.1024, + "num_tokens": 1017610.0, + "reward": 4.128618240356445, + "reward_std": 2.4607045650482178, + "rewards/fitness_reward/mean": 4.626686096191406, + "rewards/fitness_reward/std": 2.66855525970459, + "rewards/kidney_reward/mean": -0.41873425245285034, + "rewards/kidney_reward/std": 0.8528873920440674, + "rewards/length2tails_reward/mean": 0.45648080110549927, + "rewards/length2tails_reward/std": 0.4659040570259094, + "rewards/thermo_reward/mean": -0.8056415319442749, + "rewards/thermo_reward/std": 1.8150216341018677, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 267.96875, + "completions/mean_terminated_length": 267.96875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.07115628337487578, + "epoch": 0.234, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28056812286376953, + "learning_rate": 1.8987940462991669e-06, + "loss": -0.0076, + "num_tokens": 1026217.0, + "reward": 5.146751403808594, + "reward_std": 1.9194636344909668, + "rewards/fitness_reward/mean": 5.458418846130371, + "rewards/fitness_reward/std": 1.5903440713882446, + "rewards/kidney_reward/mean": -0.3146427869796753, + "rewards/kidney_reward/std": 0.9314286708831787, + "rewards/length2tails_reward/mean": 0.4887458086013794, + "rewards/length2tails_reward/std": 0.445161372423172, + "rewards/thermo_reward/mean": -0.5530648827552795, + "rewards/thermo_reward/std": 1.9085291624069214, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 288.03125, + "completions/mean_terminated_length": 288.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10206129332073033, + "epoch": 0.236, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.308597087860107, + "learning_rate": 1.8957117602394128e-06, + "loss": 0.1213, + "num_tokens": 1035466.0, + "reward": 3.710597038269043, + "reward_std": 2.7380971908569336, + "rewards/fitness_reward/mean": 3.901604175567627, + "rewards/fitness_reward/std": 2.8767783641815186, + "rewards/kidney_reward/mean": -0.2230178266763687, + "rewards/kidney_reward/std": 1.0350183248519897, + "rewards/length2tails_reward/mean": 0.439441978931427, + "rewards/length2tails_reward/std": 0.4717496633529663, + "rewards/thermo_reward/mean": -0.37871700525283813, + "rewards/thermo_reward/std": 1.4095770120620728, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 272.71875, + "completions/mean_terminated_length": 272.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07676034257747233, + "epoch": 0.238, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9029649496078491, + "learning_rate": 1.8925858184521255e-06, + "loss": 0.0311, + "num_tokens": 1044225.0, + "reward": 3.1038806438446045, + "reward_std": 2.6872596740722656, + "rewards/fitness_reward/mean": 3.926145315170288, + "rewards/fitness_reward/std": 3.1735939979553223, + "rewards/kidney_reward/mean": -0.5126339197158813, + "rewards/kidney_reward/std": 0.9371849298477173, + "rewards/length2tails_reward/mean": 0.5384362936019897, + "rewards/length2tails_reward/std": 0.48361003398895264, + "rewards/thermo_reward/mean": -1.401113510131836, + "rewards/thermo_reward/std": 1.7026857137680054, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 262.03125, + "completions/mean_terminated_length": 262.03125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.06442607054486871, + "epoch": 0.24, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43014267086982727, + "learning_rate": 1.8894163732912974e-06, + "loss": -0.0554, + "num_tokens": 1052642.0, + "reward": 4.679306983947754, + "reward_std": 2.9649219512939453, + "rewards/fitness_reward/mean": 4.555464744567871, + "rewards/fitness_reward/std": 2.9455044269561768, + "rewards/kidney_reward/mean": 0.28108876943588257, + "rewards/kidney_reward/std": 1.0552774667739868, + "rewards/length2tails_reward/mean": 0.4020635783672333, + "rewards/length2tails_reward/std": 0.4390629231929779, + "rewards/thermo_reward/mean": -0.23443603515625, + "rewards/thermo_reward/std": 1.6590263843536377, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 274.34375, + "completions/mean_terminated_length": 274.34375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.12703936896286905, + "epoch": 0.242, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.771202802658081, + "learning_rate": 1.8862035792312146e-06, + "loss": 0.0538, + "num_tokens": 1061453.0, + "reward": 3.4370691776275635, + "reward_std": 3.001307725906372, + "rewards/fitness_reward/mean": 3.9468846321105957, + "rewards/fitness_reward/std": 3.393200397491455, + "rewards/kidney_reward/mean": -0.46353626251220703, + "rewards/kidney_reward/std": 0.7753000259399414, + "rewards/length2tails_reward/mean": 0.4690501093864441, + "rewards/length2tails_reward/std": 0.4708133935928345, + "rewards/thermo_reward/mean": -0.7906193733215332, + "rewards/thermo_reward/std": 1.7162280082702637, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 264.1875, + "completions/mean_terminated_length": 264.1875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.05585645651444793, + "epoch": 0.244, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5052928924560547, + "learning_rate": 1.8829475928589268e-06, + "loss": -0.0554, + "num_tokens": 1069939.0, + "reward": 3.7392311096191406, + "reward_std": 2.379016399383545, + "rewards/fitness_reward/mean": 4.264631271362305, + "rewards/fitness_reward/std": 2.363891839981079, + "rewards/kidney_reward/mean": -0.2556886672973633, + "rewards/kidney_reward/std": 0.9878360629081726, + "rewards/length2tails_reward/mean": 0.39914143085479736, + "rewards/length2tails_reward/std": 0.47185075283050537, + "rewards/thermo_reward/mean": -0.9946828484535217, + "rewards/thermo_reward/std": 1.5393421649932861, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 278.78125, + "completions/mean_terminated_length": 278.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.06387731153517962, + "epoch": 0.246, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9780254364013672, + "learning_rate": 1.8796485728666165e-06, + "loss": 0.1166, + "num_tokens": 1078892.0, + "reward": 3.3900699615478516, + "reward_std": 2.7406327724456787, + "rewards/fitness_reward/mean": 3.765007495880127, + "rewards/fitness_reward/std": 2.5442514419555664, + "rewards/kidney_reward/mean": -0.046911612153053284, + "rewards/kidney_reward/std": 0.9505102634429932, + "rewards/length2tails_reward/mean": 0.27233248949050903, + "rewards/length2tails_reward/std": 0.4196857810020447, + "rewards/thermo_reward/mean": -0.8391291499137878, + "rewards/thermo_reward/std": 1.3835790157318115, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 263.4375, + "completions/mean_terminated_length": 263.4375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.13032073702197522, + "epoch": 0.248, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5727119445800781, + "learning_rate": 1.8763066800438634e-06, + "loss": -0.0356, + "num_tokens": 1087354.0, + "reward": 3.966728448867798, + "reward_std": 3.162716865539551, + "rewards/fitness_reward/mean": 4.33012580871582, + "rewards/fitness_reward/std": 3.2471702098846436, + "rewards/kidney_reward/mean": 0.0831863209605217, + "rewards/kidney_reward/std": 1.002663254737854, + "rewards/length2tails_reward/mean": 0.3769824504852295, + "rewards/length2tails_reward/std": 0.4535328149795532, + "rewards/thermo_reward/mean": -0.9984723329544067, + "rewards/thermo_reward/std": 1.4239492416381836, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 282.9375, + "completions/mean_terminated_length": 282.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09651394619140774, + "epoch": 0.25, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.227329969406128, + "learning_rate": 1.8729220772698095e-06, + "loss": 0.0957, + "num_tokens": 1096440.0, + "reward": 2.99495005607605, + "reward_std": 3.3334567546844482, + "rewards/fitness_reward/mean": 3.3388710021972656, + "rewards/fitness_reward/std": 3.4859747886657715, + "rewards/kidney_reward/mean": -0.3218488097190857, + "rewards/kidney_reward/std": 0.9957481622695923, + "rewards/length2tails_reward/mean": 0.4028223156929016, + "rewards/length2tails_reward/std": 0.45646458864212036, + "rewards/thermo_reward/mean": -0.5674041509628296, + "rewards/thermo_reward/std": 1.813733696937561, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.10553285828791559, + "epoch": 0.252, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9625844359397888, + "learning_rate": 1.869494929505219e-06, + "loss": 0.0207, + "num_tokens": 1105150.0, + "reward": 3.427802085876465, + "reward_std": 3.8064815998077393, + "rewards/fitness_reward/mean": 3.734769344329834, + "rewards/fitness_reward/std": 4.140002727508545, + "rewards/kidney_reward/mean": -0.16808955371379852, + "rewards/kidney_reward/std": 0.9854453802108765, + "rewards/length2tails_reward/mean": 0.5585049986839294, + "rewards/length2tails_reward/std": 0.47287943959236145, + "rewards/thermo_reward/mean": -0.7250969409942627, + "rewards/thermo_reward/std": 1.93299400806427, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08411063288804144, + "epoch": 0.254, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2926709651947021, + "learning_rate": 1.8660254037844386e-06, + "loss": 0.0231, + "num_tokens": 1113843.0, + "reward": 3.6425962448120117, + "reward_std": 3.0378100872039795, + "rewards/fitness_reward/mean": 4.0191650390625, + "rewards/fitness_reward/std": 2.870126724243164, + "rewards/kidney_reward/mean": -0.3228492736816406, + "rewards/kidney_reward/std": 0.7792383432388306, + "rewards/length2tails_reward/mean": 0.3848269581794739, + "rewards/length2tails_reward/std": 0.45583122968673706, + "rewards/thermo_reward/mean": -0.6227012872695923, + "rewards/thermo_reward/std": 1.4705400466918945, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 268.21875, + "completions/mean_terminated_length": 268.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.06420792755670846, + "epoch": 0.256, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7759255766868591, + "learning_rate": 1.8625136692072574e-06, + "loss": -0.0121, + "num_tokens": 1122458.0, + "reward": 3.8680875301361084, + "reward_std": 2.33729887008667, + "rewards/fitness_reward/mean": 4.139684677124023, + "rewards/fitness_reward/std": 2.4506237506866455, + "rewards/kidney_reward/mean": -0.3782632350921631, + "rewards/kidney_reward/std": 0.7467220425605774, + "rewards/length2tails_reward/mean": 0.386510968208313, + "rewards/length2tails_reward/std": 0.4685826301574707, + "rewards/thermo_reward/mean": -0.3581863045692444, + "rewards/thermo_reward/std": 1.5898200273513794, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 267.75, + "completions/mean_terminated_length": 267.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.06495336792431772, + "epoch": 0.258, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24202904105186462, + "learning_rate": 1.8589598969306644e-06, + "loss": -0.0058, + "num_tokens": 1131058.0, + "reward": 4.778224945068359, + "reward_std": 1.8929619789123535, + "rewards/fitness_reward/mean": 5.180815696716309, + "rewards/fitness_reward/std": 2.0846951007843018, + "rewards/kidney_reward/mean": -0.29253044724464417, + "rewards/kidney_reward/std": 1.0237658023834229, + "rewards/length2tails_reward/mean": 0.3840765953063965, + "rewards/length2tails_reward/std": 0.4389142692089081, + "rewards/thermo_reward/mean": -0.7046886682510376, + "rewards/thermo_reward/std": 1.6220309734344482, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 276.5, + "completions/mean_terminated_length": 276.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09890062315389514, + "epoch": 0.26, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5762393474578857, + "learning_rate": 1.8553642601605066e-06, + "loss": 0.0766, + "num_tokens": 1139938.0, + "reward": 4.35146951675415, + "reward_std": 3.159916877746582, + "rewards/fitness_reward/mean": 4.681078910827637, + "rewards/fitness_reward/std": 3.1895546913146973, + "rewards/kidney_reward/mean": -0.3678491711616516, + "rewards/kidney_reward/std": 1.1440036296844482, + "rewards/length2tails_reward/mean": 0.49155309796333313, + "rewards/length2tails_reward/std": 0.44114190340042114, + "rewards/thermo_reward/mean": -0.5371465086936951, + "rewards/thermo_reward/std": 1.8674601316452026, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 277.125, + "completions/mean_terminated_length": 261.7419128417969, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "entropy": 0.10243750689551234, + "epoch": 0.262, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1447086334228516, + "learning_rate": 1.8517269341430474e-06, + "loss": -0.1169, + "num_tokens": 1148838.0, + "reward": 4.371088027954102, + "reward_std": 3.315836191177368, + "rewards/fitness_reward/mean": 4.446979999542236, + "rewards/fitness_reward/std": 3.301693916320801, + "rewards/kidney_reward/mean": 0.21159544587135315, + "rewards/kidney_reward/std": 1.2276256084442139, + "rewards/length2tails_reward/mean": 0.4857953190803528, + "rewards/length2tails_reward/std": 0.47748568654060364, + "rewards/thermo_reward/mean": -0.6062768697738647, + "rewards/thermo_reward/std": 1.8472959995269775, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 266.25, + "completions/mean_terminated_length": 266.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.042758471332490444, + "epoch": 0.264, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35708656907081604, + "learning_rate": 1.8480480961564257e-06, + "loss": -0.0114, + "num_tokens": 1157390.0, + "reward": 4.053591251373291, + "reward_std": 1.8370122909545898, + "rewards/fitness_reward/mean": 4.428528308868408, + "rewards/fitness_reward/std": 1.5903440713882446, + "rewards/kidney_reward/mean": -0.29007411003112793, + "rewards/kidney_reward/std": 0.7652667164802551, + "rewards/length2tails_reward/mean": 0.23366816341876984, + "rewards/length2tails_reward/std": 0.39625921845436096, + "rewards/thermo_reward/mean": -0.5766348838806152, + "rewards/thermo_reward/std": 1.304244041442871, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 268.78125, + "completions/mean_terminated_length": 268.78125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.15004280488938093, + "epoch": 0.266, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6409659385681152, + "learning_rate": 1.844327925502015e-06, + "loss": -0.0015, + "num_tokens": 1166023.0, + "reward": 3.3718714714050293, + "reward_std": 4.05106258392334, + "rewards/fitness_reward/mean": 3.524176836013794, + "rewards/fitness_reward/std": 4.102492332458496, + "rewards/kidney_reward/mean": -0.29524558782577515, + "rewards/kidney_reward/std": 0.9443924427032471, + "rewards/length2tails_reward/mean": 0.5598483681678772, + "rewards/length2tails_reward/std": 0.4729120135307312, + "rewards/thermo_reward/mean": -0.289289653301239, + "rewards/thermo_reward/std": 1.761705994606018, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 277.25, + "completions/mean_terminated_length": 277.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08702222676947713, + "epoch": 0.268, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.362683892250061, + "learning_rate": 1.8405666034956842e-06, + "loss": 0.0883, + "num_tokens": 1174927.0, + "reward": 4.026025295257568, + "reward_std": 2.786008834838867, + "rewards/fitness_reward/mean": 4.502394676208496, + "rewards/fitness_reward/std": 3.0564589500427246, + "rewards/kidney_reward/mean": -0.4614133834838867, + "rewards/kidney_reward/std": 1.0194756984710693, + "rewards/length2tails_reward/mean": 0.4920497536659241, + "rewards/length2tails_reward/std": 0.47884926199913025, + "rewards/thermo_reward/mean": -0.7373506426811218, + "rewards/thermo_reward/std": 1.7863677740097046, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 267.5, + "completions/mean_terminated_length": 267.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.06600146123673767, + "epoch": 0.27, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3214907646179199, + "learning_rate": 1.8367643134589616e-06, + "loss": -0.009, + "num_tokens": 1183519.0, + "reward": 4.572535514831543, + "reward_std": 1.9083198308944702, + "rewards/fitness_reward/mean": 5.0464630126953125, + "rewards/fitness_reward/std": 1.6709182262420654, + "rewards/kidney_reward/mean": -0.2989814281463623, + "rewards/kidney_reward/std": 0.8420076370239258, + "rewards/length2tails_reward/mean": 0.4243019223213196, + "rewards/length2tails_reward/std": 0.45737841725349426, + "rewards/thermo_reward/mean": -0.8610237836837769, + "rewards/thermo_reward/std": 1.6364610195159912, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 281.875, + "completions/mean_terminated_length": 281.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.139061838388443, + "epoch": 0.272, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1695799827575684, + "learning_rate": 1.8329212407100993e-06, + "loss": 0.1079, + "num_tokens": 1192571.0, + "reward": 3.5707364082336426, + "reward_std": 2.9870543479919434, + "rewards/fitness_reward/mean": 3.8914880752563477, + "rewards/fitness_reward/std": 3.2452211380004883, + "rewards/kidney_reward/mean": -0.30540257692337036, + "rewards/kidney_reward/std": 0.9802507162094116, + "rewards/length2tails_reward/mean": 0.4223281145095825, + "rewards/length2tails_reward/std": 0.48747193813323975, + "rewards/thermo_reward/mean": -0.5472647547721863, + "rewards/thermo_reward/std": 1.4930212497711182, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 274.875, + "completions/mean_terminated_length": 274.875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.1304667112417519, + "epoch": 0.274, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.339874029159546, + "learning_rate": 1.8290375725550415e-06, + "loss": 0.0335, + "num_tokens": 1201399.0, + "reward": 4.265954494476318, + "reward_std": 3.219212293624878, + "rewards/fitness_reward/mean": 4.625532150268555, + "rewards/fitness_reward/std": 3.3417165279388428, + "rewards/kidney_reward/mean": -0.1432976871728897, + "rewards/kidney_reward/std": 0.9689578413963318, + "rewards/length2tails_reward/mean": 0.6148507595062256, + "rewards/length2tails_reward/std": 0.451657772064209, + "rewards/thermo_reward/mean": -0.8832824230194092, + "rewards/thermo_reward/std": 1.83414626121521, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 265.28125, + "completions/mean_terminated_length": 265.28125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 0.0816122842952609, + "epoch": 0.276, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1885217428207397, + "learning_rate": 1.825113498278295e-06, + "loss": -0.0397, + "num_tokens": 1209920.0, + "reward": 4.002777576446533, + "reward_std": 2.7675578594207764, + "rewards/fitness_reward/mean": 4.709897994995117, + "rewards/fitness_reward/std": 2.7510030269622803, + "rewards/kidney_reward/mean": -0.6250013113021851, + "rewards/kidney_reward/std": 0.8238852024078369, + "rewards/length2tails_reward/mean": 0.4693373739719391, + "rewards/length2tails_reward/std": 0.46592721343040466, + "rewards/thermo_reward/mean": -1.0239073038101196, + "rewards/thermo_reward/std": 1.7034920454025269, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 267.46875, + "completions/mean_terminated_length": 267.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.06394899217411876, + "epoch": 0.278, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2774324417114258, + "learning_rate": 1.821149209133704e-06, + "loss": -0.0098, + "num_tokens": 1218511.0, + "reward": 3.903625249862671, + "reward_std": 2.363471031188965, + "rewards/fitness_reward/mean": 4.4604902267456055, + "rewards/fitness_reward/std": 2.4980661869049072, + "rewards/kidney_reward/mean": -0.4132443368434906, + "rewards/kidney_reward/std": 0.9047765731811523, + "rewards/length2tails_reward/mean": 0.3875090479850769, + "rewards/length2tails_reward/std": 0.4659252464771271, + "rewards/thermo_reward/mean": -0.89424067735672, + "rewards/thermo_reward/std": 1.5600018501281738, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 267.5625, + "completions/mean_terminated_length": 267.5625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.07024233182892203, + "epoch": 0.28, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.926374614238739, + "learning_rate": 1.8171448983351283e-06, + "loss": -0.0145, + "num_tokens": 1227105.0, + "reward": 3.698070764541626, + "reward_std": 3.252227783203125, + "rewards/fitness_reward/mean": 4.266557216644287, + "rewards/fitness_reward/std": 3.184821605682373, + "rewards/kidney_reward/mean": -0.3036004900932312, + "rewards/kidney_reward/std": 0.8457627296447754, + "rewards/length2tails_reward/mean": 0.45677104592323303, + "rewards/length2tails_reward/std": 0.4728664755821228, + "rewards/thermo_reward/mean": -1.061758279800415, + "rewards/thermo_reward/std": 1.7555971145629883, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07606290956027806, + "epoch": 0.282, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6766843795776367, + "learning_rate": 1.8131007610470274e-06, + "loss": 0.0345, + "num_tokens": 1235833.0, + "reward": 3.8650505542755127, + "reward_std": 2.416300058364868, + "rewards/fitness_reward/mean": 4.350872039794922, + "rewards/fitness_reward/std": 2.4987759590148926, + "rewards/kidney_reward/mean": -0.1330445110797882, + "rewards/kidney_reward/std": 1.0574065446853638, + "rewards/length2tails_reward/mean": 0.4554171562194824, + "rewards/length2tails_reward/std": 0.472525954246521, + "rewards/thermo_reward/mean": -1.0663063526153564, + "rewards/thermo_reward/std": 1.5161410570144653, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 268.8125, + "completions/mean_terminated_length": 268.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08262498816475272, + "epoch": 0.284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5688827633857727, + "learning_rate": 1.8090169943749474e-06, + "loss": -0.0083, + "num_tokens": 1244467.0, + "reward": 3.845224380493164, + "reward_std": 3.0756232738494873, + "rewards/fitness_reward/mean": 4.202630996704102, + "rewards/fitness_reward/std": 3.3465781211853027, + "rewards/kidney_reward/mean": -0.0376429483294487, + "rewards/kidney_reward/std": 1.187543272972107, + "rewards/length2tails_reward/mean": 0.5421406626701355, + "rewards/length2tails_reward/std": 0.4655161201953888, + "rewards/thermo_reward/mean": -0.9482405185699463, + "rewards/thermo_reward/std": 1.8914945125579834, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 267.75, + "completions/mean_terminated_length": 267.75, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.09104419802315533, + "epoch": 0.286, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3129907846450806, + "learning_rate": 1.804893797355914e-06, + "loss": -0.0001, + "num_tokens": 1253067.0, + "reward": 3.577446937561035, + "reward_std": 3.234891891479492, + "rewards/fitness_reward/mean": 4.251026153564453, + "rewards/fitness_reward/std": 3.157855749130249, + "rewards/kidney_reward/mean": -0.5096219182014465, + "rewards/kidney_reward/std": 0.8204793334007263, + "rewards/length2tails_reward/mean": 0.4574005901813507, + "rewards/length2tails_reward/std": 0.46461042761802673, + "rewards/thermo_reward/mean": -1.066237449645996, + "rewards/thermo_reward/std": 1.6232463121414185, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 276.25, + "completions/mean_terminated_length": 276.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12595916108693928, + "epoch": 0.288, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8075847625732422, + "learning_rate": 1.8007313709487333e-06, + "loss": 0.0794, + "num_tokens": 1261939.0, + "reward": 3.491788387298584, + "reward_std": 2.7610220909118652, + "rewards/fitness_reward/mean": 3.9053263664245605, + "rewards/fitness_reward/std": 3.2118325233459473, + "rewards/kidney_reward/mean": -0.27789413928985596, + "rewards/kidney_reward/std": 0.9338050484657288, + "rewards/length2tails_reward/mean": 0.3519634008407593, + "rewards/length2tails_reward/std": 0.45090824365615845, + "rewards/thermo_reward/mean": -0.7251632213592529, + "rewards/thermo_reward/std": 1.7529805898666382, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 278.8125, + "completions/mean_terminated_length": 278.8125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09955365816131234, + "epoch": 0.29, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4168295860290527, + "learning_rate": 1.7965299180241961e-06, + "loss": 0.1004, + "num_tokens": 1270893.0, + "reward": 4.013087272644043, + "reward_std": 3.1201953887939453, + "rewards/fitness_reward/mean": 4.355756759643555, + "rewards/fitness_reward/std": 3.185384750366211, + "rewards/kidney_reward/mean": -0.24855157732963562, + "rewards/kidney_reward/std": 1.0533267259597778, + "rewards/length2tails_reward/mean": 0.4537443518638611, + "rewards/length2tails_reward/std": 0.4465175271034241, + "rewards/thermo_reward/mean": -0.6636590957641602, + "rewards/thermo_reward/std": 1.7202668190002441, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 262.1875, + "completions/mean_terminated_length": 262.1875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.08948546904139221, + "epoch": 0.292, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4365017414093018, + "learning_rate": 1.7922896433551906e-06, + "loss": -0.068, + "num_tokens": 1279315.0, + "reward": 4.147869110107422, + "reward_std": 2.6149790287017822, + "rewards/fitness_reward/mean": 4.61093282699585, + "rewards/fitness_reward/std": 3.051393985748291, + "rewards/kidney_reward/mean": -0.5923123359680176, + "rewards/kidney_reward/std": 0.7838603258132935, + "rewards/length2tails_reward/mean": 0.4657912850379944, + "rewards/length2tails_reward/std": 0.4592147767543793, + "rewards/thermo_reward/mean": -0.5667106509208679, + "rewards/thermo_reward/std": 1.8495798110961914, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 263.40625, + "completions/mean_terminated_length": 263.40625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.07509045209735632, + "epoch": 0.294, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3135392665863037, + "learning_rate": 1.7880107536067217e-06, + "loss": -0.0448, + "num_tokens": 1287776.0, + "reward": 4.450174331665039, + "reward_std": 3.310900926589966, + "rewards/fitness_reward/mean": 4.654674530029297, + "rewards/fitness_reward/std": 2.906294345855713, + "rewards/kidney_reward/mean": -0.15413539111614227, + "rewards/kidney_reward/std": 1.2075201272964478, + "rewards/length2tails_reward/mean": 0.4313182830810547, + "rewards/length2tails_reward/std": 0.43745866417884827, + "rewards/thermo_reward/mean": -0.47052454948425293, + "rewards/thermo_reward/std": 1.5457390546798706, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 274.78125, + "completions/mean_terminated_length": 274.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08302459074184299, + "epoch": 0.296, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.052696704864502, + "learning_rate": 1.7836934573258397e-06, + "loss": 0.081, + "num_tokens": 1296601.0, + "reward": 3.958801746368408, + "reward_std": 2.6489288806915283, + "rewards/fitness_reward/mean": 4.402061462402344, + "rewards/fitness_reward/std": 2.685471773147583, + "rewards/kidney_reward/mean": -0.41507062315940857, + "rewards/kidney_reward/std": 0.9083747863769531, + "rewards/length2tails_reward/mean": 0.41056132316589355, + "rewards/length2tails_reward/std": 0.4427841007709503, + "rewards/thermo_reward/mean": -0.6767306923866272, + "rewards/thermo_reward/std": 1.6160317659378052, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 269.21875, + "completions/mean_terminated_length": 269.21875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09352858271449804, + "epoch": 0.298, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9183940291404724, + "learning_rate": 1.7793379649314742e-06, + "loss": 0.0026, + "num_tokens": 1305248.0, + "reward": 4.4203948974609375, + "reward_std": 3.2280428409576416, + "rewards/fitness_reward/mean": 4.428203105926514, + "rewards/fitness_reward/std": 2.9869213104248047, + "rewards/kidney_reward/mean": -0.04570910334587097, + "rewards/kidney_reward/std": 1.0263012647628784, + "rewards/length2tails_reward/mean": 0.49454542994499207, + "rewards/length2tails_reward/std": 0.4444493353366852, + "rewards/thermo_reward/mean": -0.21718043088912964, + "rewards/thermo_reward/std": 1.7211302518844604, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 268.78125, + "completions/mean_terminated_length": 268.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.06938954535871744, + "epoch": 0.3, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5907869935035706, + "learning_rate": 1.7749444887041795e-06, + "loss": -0.0089, + "num_tokens": 1313881.0, + "reward": 4.830990791320801, + "reward_std": 2.054767608642578, + "rewards/fitness_reward/mean": 5.171717166900635, + "rewards/fitness_reward/std": 2.118720531463623, + "rewards/kidney_reward/mean": -0.4066958427429199, + "rewards/kidney_reward/std": 1.0702879428863525, + "rewards/length2tails_reward/mean": 0.4780086874961853, + "rewards/length2tails_reward/std": 0.4582548439502716, + "rewards/thermo_reward/mean": -0.5137604475021362, + "rewards/thermo_reward/std": 1.755508542060852, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 266.9375, + "completions/mean_terminated_length": 266.9375, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.08493705908767879, + "epoch": 0.302, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0603394508361816, + "learning_rate": 1.7705132427757892e-06, + "loss": -0.0185, + "num_tokens": 1322455.0, + "reward": 4.334216117858887, + "reward_std": 2.548267126083374, + "rewards/fitness_reward/mean": 4.720700263977051, + "rewards/fitness_reward/std": 2.269113302230835, + "rewards/kidney_reward/mean": -0.40105870366096497, + "rewards/kidney_reward/std": 1.1038024425506592, + "rewards/length2tails_reward/mean": 0.4668967127799988, + "rewards/length2tails_reward/std": 0.43700388073921204, + "rewards/thermo_reward/mean": -0.6053584814071655, + "rewards/thermo_reward/std": 1.6193071603775024, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 273.6875, + "completions/mean_terminated_length": 273.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08874433115124702, + "epoch": 0.304, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35046184062957764, + "learning_rate": 1.766044443118978e-06, + "loss": -0.0129, + "num_tokens": 1331245.0, + "reward": 4.575962066650391, + "reward_std": 2.263967752456665, + "rewards/fitness_reward/mean": 5.18062162399292, + "rewards/fitness_reward/std": 2.085413932800293, + "rewards/kidney_reward/mean": -0.4388875961303711, + "rewards/kidney_reward/std": 0.8940988183021545, + "rewards/length2tails_reward/mean": 0.6060910224914551, + "rewards/length2tails_reward/std": 0.45901593565940857, + "rewards/thermo_reward/mean": -1.0734763145446777, + "rewards/thermo_reward/std": 1.7066905498504639, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 267.75, + "completions/mean_terminated_length": 267.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.06172634055837989, + "epoch": 0.306, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3126675486564636, + "learning_rate": 1.7615383075367368e-06, + "loss": -0.0105, + "num_tokens": 1339845.0, + "reward": 4.250274658203125, + "reward_std": 2.5619237422943115, + "rewards/fitness_reward/mean": 4.572265625, + "rewards/fitness_reward/std": 2.491471529006958, + "rewards/kidney_reward/mean": -0.27898108959198, + "rewards/kidney_reward/std": 0.7466369271278381, + "rewards/length2tails_reward/mean": 0.35668960213661194, + "rewards/length2tails_reward/std": 0.44767114520072937, + "rewards/thermo_reward/mean": -0.5433446764945984, + "rewards/thermo_reward/std": 1.584446907043457, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 267.71875, + "completions/mean_terminated_length": 267.71875, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.09085593721829355, + "epoch": 0.308, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.448189616203308, + "learning_rate": 1.7569950556517563e-06, + "loss": -0.015, + "num_tokens": 1348444.0, + "reward": 4.617559432983398, + "reward_std": 2.328192710876465, + "rewards/fitness_reward/mean": 5.206052780151367, + "rewards/fitness_reward/std": 2.386223793029785, + "rewards/kidney_reward/mean": -0.29924219846725464, + "rewards/kidney_reward/std": 0.9480889439582825, + "rewards/length2tails_reward/mean": 0.5548272132873535, + "rewards/length2tails_reward/std": 0.45655137300491333, + "rewards/thermo_reward/mean": -1.1551580429077148, + "rewards/thermo_reward/std": 1.9274123907089233, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 267.6875, + "completions/mean_terminated_length": 267.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.05919750058092177, + "epoch": 0.31, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24001403152942657, + "learning_rate": 1.7524149088957242e-06, + "loss": -0.006, + "num_tokens": 1357042.0, + "reward": 4.215601444244385, + "reward_std": 1.6457915306091309, + "rewards/fitness_reward/mean": 5.0464630126953125, + "rewards/fitness_reward/std": 1.6709182262420654, + "rewards/kidney_reward/mean": -0.6109973192214966, + "rewards/kidney_reward/std": 0.6350767016410828, + "rewards/length2tails_reward/mean": 0.4455958604812622, + "rewards/length2tails_reward/std": 0.45376908779144287, + "rewards/thermo_reward/mean": -1.2735230922698975, + "rewards/thermo_reward/std": 1.7035574913024902, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 266.75, + "completions/mean_terminated_length": 266.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.048335404484532773, + "epoch": 0.312, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4476269781589508, + "learning_rate": 1.747798090498532e-06, + "loss": -0.006, + "num_tokens": 1365610.0, + "reward": 3.9586968421936035, + "reward_std": 2.452965021133423, + "rewards/fitness_reward/mean": 4.494963645935059, + "rewards/fitness_reward/std": 2.396350145339966, + "rewards/kidney_reward/mean": -0.33061426877975464, + "rewards/kidney_reward/std": 0.932668924331665, + "rewards/length2tails_reward/mean": 0.33582741022109985, + "rewards/length2tails_reward/std": 0.45137956738471985, + "rewards/thermo_reward/mean": -0.9098325371742249, + "rewards/thermo_reward/std": 1.6104899644851685, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 289.40625, + "completions/mean_terminated_length": 289.40625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.1205522520467639, + "epoch": 0.314, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.124131917953491, + "learning_rate": 1.743144825477394e-06, + "loss": 0.151, + "num_tokens": 1374903.0, + "reward": 3.7194204330444336, + "reward_std": 3.6462106704711914, + "rewards/fitness_reward/mean": 3.876713752746582, + "rewards/fitness_reward/std": 3.5486321449279785, + "rewards/kidney_reward/mean": -0.2034071534872055, + "rewards/kidney_reward/std": 1.0929036140441895, + "rewards/length2tails_reward/mean": 0.5232511758804321, + "rewards/length2tails_reward/std": 0.4814155101776123, + "rewards/thermo_reward/mean": -0.37280526757240295, + "rewards/thermo_reward/std": 1.645125389099121, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 280.0, + "completions/mean_terminated_length": 280.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13657673075795174, + "epoch": 0.316, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0489912033081055, + "learning_rate": 1.738455340625884e-06, + "loss": 0.1117, + "num_tokens": 1383895.0, + "reward": 3.96601939201355, + "reward_std": 3.3946123123168945, + "rewards/fitness_reward/mean": 4.056211471557617, + "rewards/fitness_reward/std": 3.094337224960327, + "rewards/kidney_reward/mean": -0.0832638144493103, + "rewards/kidney_reward/std": 1.1776015758514404, + "rewards/length2tails_reward/mean": 0.4087567627429962, + "rewards/length2tails_reward/std": 0.4787772297859192, + "rewards/thermo_reward/mean": -0.3014984726905823, + "rewards/thermo_reward/std": 1.4508898258209229, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 268.1875, + "completions/mean_terminated_length": 268.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.06550772534683347, + "epoch": 0.318, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18433576822280884, + "learning_rate": 1.7337298645028762e-06, + "loss": -0.0032, + "num_tokens": 1392509.0, + "reward": 4.696930885314941, + "reward_std": 2.494917392730713, + "rewards/fitness_reward/mean": 5.274045944213867, + "rewards/fitness_reward/std": 2.107184410095215, + "rewards/kidney_reward/mean": -0.14291353523731232, + "rewards/kidney_reward/std": 1.0486446619033813, + "rewards/length2tails_reward/mean": 0.46765953302383423, + "rewards/length2tails_reward/std": 0.4551301896572113, + "rewards/thermo_reward/mean": -1.2451454401016235, + "rewards/thermo_reward/std": 1.6586072444915771, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 280.125, + "completions/mean_terminated_length": 280.125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.1450829952955246, + "epoch": 0.32, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8760439157485962, + "learning_rate": 1.7289686274214115e-06, + "loss": 0.0762, + "num_tokens": 1401505.0, + "reward": 3.5760703086853027, + "reward_std": 3.456336498260498, + "rewards/fitness_reward/mean": 3.7789759635925293, + "rewards/fitness_reward/std": 3.8132376670837402, + "rewards/kidney_reward/mean": -0.31448692083358765, + "rewards/kidney_reward/std": 1.1399399042129517, + "rewards/length2tails_reward/mean": 0.5606054663658142, + "rewards/length2tails_reward/std": 0.44976624846458435, + "rewards/thermo_reward/mean": -0.37162667512893677, + "rewards/thermo_reward/std": 1.8894951343536377, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 276.4375, + "completions/mean_terminated_length": 276.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08699913625605404, + "epoch": 0.322, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078630208969116, + "learning_rate": 1.7241718614374676e-06, + "loss": 0.0915, + "num_tokens": 1410383.0, + "reward": 4.635051727294922, + "reward_std": 2.282209873199463, + "rewards/fitness_reward/mean": 4.993934631347656, + "rewards/fitness_reward/std": 2.4263358116149902, + "rewards/kidney_reward/mean": -0.27896836400032043, + "rewards/kidney_reward/std": 1.1161572933197021, + "rewards/length2tails_reward/mean": 0.5413184762001038, + "rewards/length2tails_reward/std": 0.4588867127895355, + "rewards/thermo_reward/mean": -0.7094570398330688, + "rewards/thermo_reward/std": 1.8480961322784424, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 277.53125, + "completions/mean_terminated_length": 277.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11324500991031528, + "epoch": 0.324, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1256017684936523, + "learning_rate": 1.719339800338651e-06, + "loss": 0.0631, + "num_tokens": 1419296.0, + "reward": 3.5607268810272217, + "reward_std": 3.390486240386963, + "rewards/fitness_reward/mean": 3.993180274963379, + "rewards/fitness_reward/std": 3.614488124847412, + "rewards/kidney_reward/mean": -0.2305232584476471, + "rewards/kidney_reward/std": 1.0963412523269653, + "rewards/length2tails_reward/mean": 0.5686094760894775, + "rewards/length2tails_reward/std": 0.4640563726425171, + "rewards/thermo_reward/mean": -0.9186879396438599, + "rewards/thermo_reward/std": 1.859251856803894, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09377507958561182, + "epoch": 0.326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5230877995491028, + "learning_rate": 1.7144726796328032e-06, + "loss": -0.0152, + "num_tokens": 1427960.0, + "reward": 5.218613624572754, + "reward_std": 2.639878273010254, + "rewards/fitness_reward/mean": 4.980596542358398, + "rewards/fitness_reward/std": 2.531390428543091, + "rewards/kidney_reward/mean": 0.03574337065219879, + "rewards/kidney_reward/std": 1.0980381965637207, + "rewards/length2tails_reward/mean": 0.499345064163208, + "rewards/length2tails_reward/std": 0.45680156350135803, + "rewards/thermo_reward/mean": 0.19061768054962158, + "rewards/thermo_reward/std": 1.5000156164169312, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 267.4375, + "completions/mean_terminated_length": 267.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0813692519441247, + "epoch": 0.328, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6022782325744629, + "learning_rate": 1.7095707365365209e-06, + "loss": -0.0071, + "num_tokens": 1436550.0, + "reward": 4.692296981811523, + "reward_std": 2.3082761764526367, + "rewards/fitness_reward/mean": 4.976771831512451, + "rewards/fitness_reward/std": 2.091456174850464, + "rewards/kidney_reward/mean": -0.1753438264131546, + "rewards/kidney_reward/std": 1.1265095472335815, + "rewards/length2tails_reward/mean": 0.45045024156570435, + "rewards/length2tails_reward/std": 0.4337252676486969, + "rewards/thermo_reward/mean": -0.618831217288971, + "rewards/thermo_reward/std": 1.7613445520401, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 266.46875, + "completions/mean_terminated_length": 266.46875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.052766332402825356, + "epoch": 0.33, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45419779419898987, + "learning_rate": 1.7046342099635947e-06, + "loss": -0.0043, + "num_tokens": 1445109.0, + "reward": 3.656085252761841, + "reward_std": 2.750291109085083, + "rewards/fitness_reward/mean": 4.10206413269043, + "rewards/fitness_reward/std": 2.6468820571899414, + "rewards/kidney_reward/mean": -0.2564317584037781, + "rewards/kidney_reward/std": 1.0081008672714233, + "rewards/length2tails_reward/mean": 0.31663069128990173, + "rewards/length2tails_reward/std": 0.41682401299476624, + "rewards/thermo_reward/mean": -0.7938418984413147, + "rewards/thermo_reward/std": 1.628943681716919, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.14256688253954053, + "epoch": 0.332, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8583552837371826, + "learning_rate": 1.6996633405133653e-06, + "loss": 0.0395, + "num_tokens": 1453807.0, + "reward": 3.794830799102783, + "reward_std": 3.588675022125244, + "rewards/fitness_reward/mean": 3.962114095687866, + "rewards/fitness_reward/std": 3.6419920921325684, + "rewards/kidney_reward/mean": -0.18258120119571686, + "rewards/kidney_reward/std": 1.058857798576355, + "rewards/length2tails_reward/mean": 0.43951690196990967, + "rewards/length2tails_reward/std": 0.4417087137699127, + "rewards/thermo_reward/mean": -0.37174350023269653, + "rewards/thermo_reward/std": 1.7920690774917603, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 266.875, + "completions/mean_terminated_length": 266.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0489120373968035, + "epoch": 0.334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3171038031578064, + "learning_rate": 1.6946583704589972e-06, + "loss": -0.0062, + "num_tokens": 1462379.0, + "reward": 4.280658721923828, + "reward_std": 1.986229419708252, + "rewards/fitness_reward/mean": 4.737495422363281, + "rewards/fitness_reward/std": 1.6610602140426636, + "rewards/kidney_reward/mean": -0.3628132939338684, + "rewards/kidney_reward/std": 0.8554661870002747, + "rewards/length2tails_reward/mean": 0.30981114506721497, + "rewards/length2tails_reward/std": 0.4345083236694336, + "rewards/thermo_reward/mean": -0.7057662010192871, + "rewards/thermo_reward/std": 1.439083456993103, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 277.875, + "completions/mean_terminated_length": 277.875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.09779051598161459, + "epoch": 0.336, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.927237868309021, + "learning_rate": 1.6896195437356697e-06, + "loss": 0.0958, + "num_tokens": 1471303.0, + "reward": 4.192678451538086, + "reward_std": 2.79734206199646, + "rewards/fitness_reward/mean": 4.619560241699219, + "rewards/fitness_reward/std": 2.6883301734924316, + "rewards/kidney_reward/mean": -0.31013429164886475, + "rewards/kidney_reward/std": 1.0193806886672974, + "rewards/length2tails_reward/mean": 0.5570563077926636, + "rewards/length2tails_reward/std": 0.45604071021080017, + "rewards/thermo_reward/mean": -0.8221579194068909, + "rewards/thermo_reward/std": 1.5529719591140747, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 266.8125, + "completions/mean_terminated_length": 266.8125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "entropy": 0.0863744979724288, + "epoch": 0.338, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.816534399986267, + "learning_rate": 1.6845471059286886e-06, + "loss": -0.0229, + "num_tokens": 1479873.0, + "reward": 3.920487880706787, + "reward_std": 2.4281582832336426, + "rewards/fitness_reward/mean": 4.564079284667969, + "rewards/fitness_reward/std": 2.8623640537261963, + "rewards/kidney_reward/mean": -0.3698844611644745, + "rewards/kidney_reward/std": 0.9845880270004272, + "rewards/length2tails_reward/mean": 0.5217670202255249, + "rewards/length2tails_reward/std": 0.46941322088241577, + "rewards/thermo_reward/mean": -1.1781814098358154, + "rewards/thermo_reward/std": 1.6795637607574463, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09404147509485483, + "epoch": 0.34, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7808253169059753, + "learning_rate": 1.6794413042615166e-06, + "loss": -0.0213, + "num_tokens": 1488634.0, + "reward": 5.1834716796875, + "reward_std": 1.9101296663284302, + "rewards/fitness_reward/mean": 5.6643967628479, + "rewards/fitness_reward/std": 1.505463719367981, + "rewards/kidney_reward/mean": -0.19529178738594055, + "rewards/kidney_reward/std": 1.164594054222107, + "rewards/length2tails_reward/mean": 0.4930678606033325, + "rewards/length2tails_reward/std": 0.46066921949386597, + "rewards/thermo_reward/mean": -1.0130927562713623, + "rewards/thermo_reward/std": 1.7528146505355835, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 278.5625, + "completions/mean_terminated_length": 278.5625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13403520546853542, + "epoch": 0.342, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6008855104446411, + "learning_rate": 1.6743023875837233e-06, + "loss": 0.0825, + "num_tokens": 1497580.0, + "reward": 4.644342422485352, + "reward_std": 2.5929994583129883, + "rewards/fitness_reward/mean": 5.213815689086914, + "rewards/fitness_reward/std": 2.7655608654022217, + "rewards/kidney_reward/mean": -0.3567451238632202, + "rewards/kidney_reward/std": 1.0828732252120972, + "rewards/length2tails_reward/mean": 0.6193132400512695, + "rewards/length2tails_reward/std": 0.4164426624774933, + "rewards/thermo_reward/mean": -1.0918560028076172, + "rewards/thermo_reward/std": 2.0735950469970703, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 264.8125, + "completions/mean_terminated_length": 264.8125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.08579262811690569, + "epoch": 0.344, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9887391328811646, + "learning_rate": 1.669130606358858e-06, + "loss": -0.0684, + "num_tokens": 1506086.0, + "reward": 4.761046409606934, + "reward_std": 2.4325408935546875, + "rewards/fitness_reward/mean": 5.2099928855896, + "rewards/fitness_reward/std": 2.369246482849121, + "rewards/kidney_reward/mean": -0.16196635365486145, + "rewards/kidney_reward/std": 1.1331666707992554, + "rewards/length2tails_reward/mean": 0.6166989803314209, + "rewards/length2tails_reward/std": 0.4662693738937378, + "rewards/thermo_reward/mean": -1.0442757606506348, + "rewards/thermo_reward/std": 1.8124314546585083, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 282.65625, + "completions/mean_terminated_length": 282.65625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12472502840682864, + "epoch": 0.346, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4417450428009033, + "learning_rate": 1.6639262126522415e-06, + "loss": 0.1348, + "num_tokens": 1515163.0, + "reward": 3.654130458831787, + "reward_std": 2.815221071243286, + "rewards/fitness_reward/mean": 4.169241905212402, + "rewards/fitness_reward/std": 3.082991361618042, + "rewards/kidney_reward/mean": -0.44877898693084717, + "rewards/kidney_reward/std": 0.953915536403656, + "rewards/length2tails_reward/mean": 0.46090492606163025, + "rewards/length2tails_reward/std": 0.4796352982521057, + "rewards/thermo_reward/mean": -0.8118960857391357, + "rewards/thermo_reward/std": 1.7251765727996826, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 268.46875, + "completions/mean_terminated_length": 268.46875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.09075398044660687, + "epoch": 0.348, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7016935348510742, + "learning_rate": 1.6586894601186803e-06, + "loss": -0.0006, + "num_tokens": 1523786.0, + "reward": 4.55905818939209, + "reward_std": 2.30279541015625, + "rewards/fitness_reward/mean": 5.1573991775512695, + "rewards/fitness_reward/std": 2.173652410507202, + "rewards/kidney_reward/mean": -0.48578259348869324, + "rewards/kidney_reward/std": 1.0091580152511597, + "rewards/length2tails_reward/mean": 0.5137011408805847, + "rewards/length2tails_reward/std": 0.456093966960907, + "rewards/thermo_reward/mean": -0.967749834060669, + "rewards/thermo_reward/std": 1.7618026733398438, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 275.3125, + "completions/mean_terminated_length": 275.3125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.10816240543499589, + "epoch": 0.35, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0683670043945312, + "learning_rate": 1.6534206039901055e-06, + "loss": 0.0612, + "num_tokens": 1532628.0, + "reward": 4.2663373947143555, + "reward_std": 3.062441825866699, + "rewards/fitness_reward/mean": 4.755006790161133, + "rewards/fitness_reward/std": 2.9233791828155518, + "rewards/kidney_reward/mean": -0.5185624361038208, + "rewards/kidney_reward/std": 0.9748953580856323, + "rewards/length2tails_reward/mean": 0.5505917072296143, + "rewards/length2tails_reward/std": 0.47932368516921997, + "rewards/thermo_reward/mean": -0.7340719699859619, + "rewards/thermo_reward/std": 1.7369695901870728, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 262.96875, + "completions/mean_terminated_length": 262.96875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.07547976076602936, + "epoch": 0.352, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7118106484413147, + "learning_rate": 1.6481199010631309e-06, + "loss": -0.0746, + "num_tokens": 1541075.0, + "reward": 4.428964614868164, + "reward_std": 2.574803113937378, + "rewards/fitness_reward/mean": 4.689699172973633, + "rewards/fitness_reward/std": 2.8076272010803223, + "rewards/kidney_reward/mean": -0.3593696355819702, + "rewards/kidney_reward/std": 0.9871580600738525, + "rewards/length2tails_reward/mean": 0.5770972967147827, + "rewards/length2tails_reward/std": 0.4420401155948639, + "rewards/thermo_reward/mean": -0.4506470859050751, + "rewards/thermo_reward/std": 1.959010362625122, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 273.21875, + "completions/mean_terminated_length": 273.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11087105982005596, + "epoch": 0.354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9738964438438416, + "learning_rate": 1.6427876096865393e-06, + "loss": 0.0369, + "num_tokens": 1549850.0, + "reward": 4.867439270019531, + "reward_std": 3.0394482612609863, + "rewards/fitness_reward/mean": 5.090075492858887, + "rewards/fitness_reward/std": 2.84049391746521, + "rewards/kidney_reward/mean": -0.08992902934551239, + "rewards/kidney_reward/std": 1.4132450819015503, + "rewards/length2tails_reward/mean": 0.609131395816803, + "rewards/length2tails_reward/std": 0.42378756403923035, + "rewards/thermo_reward/mean": -0.6599090099334717, + "rewards/thermo_reward/std": 1.848639965057373, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 266.78125, + "completions/mean_terminated_length": 266.78125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.1331650954671204, + "epoch": 0.356, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5233911275863647, + "learning_rate": 1.6374239897486897e-06, + "loss": -0.0134, + "num_tokens": 1558419.0, + "reward": 4.055957317352295, + "reward_std": 3.2039151191711426, + "rewards/fitness_reward/mean": 4.328873634338379, + "rewards/fitness_reward/std": 3.307469606399536, + "rewards/kidney_reward/mean": -0.20703302323818207, + "rewards/kidney_reward/std": 1.0310391187667847, + "rewards/length2tails_reward/mean": 0.4950253665447235, + "rewards/length2tails_reward/std": 0.4483602046966553, + "rewards/thermo_reward/mean": -0.5863116979598999, + "rewards/thermo_reward/std": 1.6916553974151611, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 270.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1012274301610887, + "epoch": 0.358, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4355965554714203, + "learning_rate": 1.6320293026648508e-06, + "loss": -0.0079, + "num_tokens": 1567093.0, + "reward": 5.45328426361084, + "reward_std": 1.7587991952896118, + "rewards/fitness_reward/mean": 5.767385959625244, + "rewards/fitness_reward/std": 1.449892282485962, + "rewards/kidney_reward/mean": -0.20507873594760895, + "rewards/kidney_reward/std": 1.233489990234375, + "rewards/length2tails_reward/mean": 0.6022515892982483, + "rewards/length2tails_reward/std": 0.4060637056827545, + "rewards/thermo_reward/mean": -0.7242498397827148, + "rewards/thermo_reward/std": 1.6031584739685059, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.0, + "completions/mean_terminated_length": 269.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08426529681310058, + "epoch": 0.36, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.54383385181427, + "learning_rate": 1.6266038113644605e-06, + "loss": -0.0065, + "num_tokens": 1575733.0, + "reward": 5.659174919128418, + "reward_std": 1.9989067316055298, + "rewards/fitness_reward/mean": 5.87037467956543, + "rewards/fitness_reward/std": 1.3842169046401978, + "rewards/kidney_reward/mean": -0.31498047709465027, + "rewards/kidney_reward/std": 1.2313039302825928, + "rewards/length2tails_reward/mean": 0.599600613117218, + "rewards/length2tails_reward/std": 0.42347168922424316, + "rewards/thermo_reward/mean": -0.40721940994262695, + "rewards/thermo_reward/std": 1.898787021636963, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 275.84375, + "completions/mean_terminated_length": 275.84375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.10651430813595653, + "epoch": 0.362, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1690618991851807, + "learning_rate": 1.6211477802783102e-06, + "loss": 0.085, + "num_tokens": 1584592.0, + "reward": 4.864093780517578, + "reward_std": 2.611569881439209, + "rewards/fitness_reward/mean": 5.151647567749023, + "rewards/fitness_reward/std": 2.1961724758148193, + "rewards/kidney_reward/mean": -0.056402117013931274, + "rewards/kidney_reward/std": 1.097105622291565, + "rewards/length2tails_reward/mean": 0.5109585523605347, + "rewards/length2tails_reward/std": 0.44378817081451416, + "rewards/thermo_reward/mean": -0.7741846442222595, + "rewards/thermo_reward/std": 1.9916399717330933, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 261.0, + "completions/mean_terminated_length": 261.0, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.09026870923116803, + "epoch": 0.364, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0212987661361694, + "learning_rate": 1.615661475325658e-06, + "loss": -0.0984, + "num_tokens": 1592976.0, + "reward": 4.4314727783203125, + "reward_std": 3.0072882175445557, + "rewards/fitness_reward/mean": 5.038105487823486, + "rewards/fitness_reward/std": 3.0175745487213135, + "rewards/kidney_reward/mean": -0.34879419207572937, + "rewards/kidney_reward/std": 1.0839778184890747, + "rewards/length2tails_reward/mean": 0.6546170711517334, + "rewards/length2tails_reward/std": 0.42878323793411255, + "rewards/thermo_reward/mean": -1.191779613494873, + "rewards/thermo_reward/std": 1.9129706621170044, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 279.40625, + "completions/mean_terminated_length": 279.40625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1478779586032033, + "epoch": 0.366, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.895674228668213, + "learning_rate": 1.6101451639012677e-06, + "loss": 0.0838, + "num_tokens": 1601949.0, + "reward": 4.561385154724121, + "reward_std": 3.6965315341949463, + "rewards/fitness_reward/mean": 4.620615005493164, + "rewards/fitness_reward/std": 3.3410534858703613, + "rewards/kidney_reward/mean": -0.08034095913171768, + "rewards/kidney_reward/std": 1.157870888710022, + "rewards/length2tails_reward/mean": 0.5722231864929199, + "rewards/length2tails_reward/std": 0.4358516335487366, + "rewards/thermo_reward/mean": -0.3242303729057312, + "rewards/thermo_reward/std": 1.781525731086731, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 270.0625, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "entropy": 0.1301782624796033, + "epoch": 0.368, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1593161821365356, + "learning_rate": 1.604599114862375e-06, + "loss": 0.0082, + "num_tokens": 1610623.0, + "reward": 4.000077247619629, + "reward_std": 3.2169172763824463, + "rewards/fitness_reward/mean": 4.352954864501953, + "rewards/fitness_reward/std": 3.510301113128662, + "rewards/kidney_reward/mean": -0.22353899478912354, + "rewards/kidney_reward/std": 1.2535496950149536, + "rewards/length2tails_reward/mean": 0.5596377849578857, + "rewards/length2tails_reward/std": 0.44628483057022095, + "rewards/thermo_reward/mean": -0.7620350122451782, + "rewards/thermo_reward/std": 1.802309274673462, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 269.1875, + "completions/mean_terminated_length": 269.1875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.0893715862184763, + "epoch": 0.37, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2550925016403198, + "learning_rate": 1.5990235985155857e-06, + "loss": 0.0254, + "num_tokens": 1619269.0, + "reward": 4.491425037384033, + "reward_std": 2.9865565299987793, + "rewards/fitness_reward/mean": 4.715381145477295, + "rewards/fitness_reward/std": 3.102266788482666, + "rewards/kidney_reward/mean": -0.037865400314331055, + "rewards/kidney_reward/std": 1.1800258159637451, + "rewards/length2tails_reward/mean": 0.448363333940506, + "rewards/length2tails_reward/std": 0.4457945227622986, + "rewards/thermo_reward/mean": -0.6342282891273499, + "rewards/thermo_reward/std": 1.8643159866333008, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 275.625, + "completions/mean_terminated_length": 275.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12593226647004485, + "epoch": 0.372, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3675857782363892, + "learning_rate": 1.5934188866037015e-06, + "loss": 0.0445, + "num_tokens": 1628121.0, + "reward": 3.7773280143737793, + "reward_std": 3.876007318496704, + "rewards/fitness_reward/mean": 4.293154716491699, + "rewards/fitness_reward/std": 3.9880714416503906, + "rewards/kidney_reward/mean": -0.42351239919662476, + "rewards/kidney_reward/std": 1.1867891550064087, + "rewards/length2tails_reward/mean": 0.6898555159568787, + "rewards/length2tails_reward/std": 0.38954636454582214, + "rewards/thermo_reward/mean": -0.9530682563781738, + "rewards/thermo_reward/std": 2.0106310844421387, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 263.15625, + "completions/mean_terminated_length": 263.15625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.14193418761715293, + "epoch": 0.374, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.44107985496521, + "learning_rate": 1.587785252292473e-06, + "loss": -0.0755, + "num_tokens": 1636574.0, + "reward": 5.446175575256348, + "reward_std": 2.6817963123321533, + "rewards/fitness_reward/mean": 5.443154811859131, + "rewards/fitness_reward/std": 2.656590461730957, + "rewards/kidney_reward/mean": -0.061875101178884506, + "rewards/kidney_reward/std": 1.2702064514160156, + "rewards/length2tails_reward/mean": 0.6279922723770142, + "rewards/length2tails_reward/std": 0.4024559259414673, + "rewards/thermo_reward/mean": -0.2460794299840927, + "rewards/thermo_reward/std": 2.1052651405334473, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1061564115807414, + "epoch": 0.376, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2134290933609009, + "learning_rate": 1.5821229701572893e-06, + "loss": 0.0174, + "num_tokens": 1645268.0, + "reward": 4.806356430053711, + "reward_std": 2.3874363899230957, + "rewards/fitness_reward/mean": 5.02333927154541, + "rewards/fitness_reward/std": 2.3042352199554443, + "rewards/kidney_reward/mean": -0.09063389897346497, + "rewards/kidney_reward/std": 1.093129277229309, + "rewards/length2tails_reward/mean": 0.48920953273773193, + "rewards/length2tails_reward/std": 0.44646406173706055, + "rewards/thermo_reward/mean": -0.5879369974136353, + "rewards/thermo_reward/std": 1.769646167755127, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11166581884026527, + "epoch": 0.378, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3294186294078827, + "learning_rate": 1.5764323161697932e-06, + "loss": -0.0034, + "num_tokens": 1653945.0, + "reward": 6.120540618896484, + "reward_std": 1.7913304567337036, + "rewards/fitness_reward/mean": 6.076353073120117, + "rewards/fitness_reward/std": 1.2157715559005737, + "rewards/kidney_reward/mean": 0.07283775508403778, + "rewards/kidney_reward/std": 1.3584492206573486, + "rewards/length2tails_reward/mean": 0.6484661102294922, + "rewards/length2tails_reward/std": 0.38605931401252747, + "rewards/thermo_reward/mean": -0.30869537591934204, + "rewards/thermo_reward/std": 1.9456088542938232, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11868252046406269, + "epoch": 0.38, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6086766719818115, + "learning_rate": 1.5707135676844319e-06, + "loss": 0.0242, + "num_tokens": 1662654.0, + "reward": 5.367982387542725, + "reward_std": 2.7623069286346436, + "rewards/fitness_reward/mean": 5.724161148071289, + "rewards/fitness_reward/std": 2.252316951751709, + "rewards/kidney_reward/mean": -0.4771338403224945, + "rewards/kidney_reward/std": 1.0981281995773315, + "rewards/length2tails_reward/mean": 0.6890479326248169, + "rewards/length2tails_reward/std": 0.36876070499420166, + "rewards/thermo_reward/mean": -0.5797474384307861, + "rewards/thermo_reward/std": 1.96684730052948, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.375, + "completions/mean_terminated_length": 269.375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.10897703189402819, + "epoch": 0.382, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7039817571640015, + "learning_rate": 1.564967003424938e-06, + "loss": -0.0034, + "num_tokens": 1671306.0, + "reward": 4.808743476867676, + "reward_std": 3.007719039916992, + "rewards/fitness_reward/mean": 4.748951435089111, + "rewards/fitness_reward/std": 2.986243724822998, + "rewards/kidney_reward/mean": -0.17581090331077576, + "rewards/kidney_reward/std": 1.232522964477539, + "rewards/length2tails_reward/mean": 0.5685861110687256, + "rewards/length2tails_reward/std": 0.4413212239742279, + "rewards/thermo_reward/mean": 0.011101648211479187, + "rewards/thermo_reward/std": 1.8753315210342407, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 273.84375, + "completions/mean_terminated_length": 273.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11015730071812868, + "epoch": 0.384, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9565957188606262, + "learning_rate": 1.5591929034707466e-06, + "loss": 0.027, + "num_tokens": 1680101.0, + "reward": 5.647148132324219, + "reward_std": 2.985407829284668, + "rewards/fitness_reward/mean": 5.495434761047363, + "rewards/fitness_reward/std": 2.468876838684082, + "rewards/kidney_reward/mean": 0.06246180832386017, + "rewards/kidney_reward/std": 1.225472331047058, + "rewards/length2tails_reward/mean": 0.7170220613479614, + "rewards/length2tails_reward/std": 0.3801810145378113, + "rewards/thermo_reward/mean": -0.11754542589187622, + "rewards/thermo_reward/std": 1.9760782718658447, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.6875, + "completions/mean_terminated_length": 269.6875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.12005980499088764, + "epoch": 0.386, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6544195413589478, + "learning_rate": 1.553391549243344e-06, + "loss": -0.0182, + "num_tokens": 1688763.0, + "reward": 4.954888820648193, + "reward_std": 2.4991729259490967, + "rewards/fitness_reward/mean": 5.40878963470459, + "rewards/fitness_reward/std": 2.3679397106170654, + "rewards/kidney_reward/mean": -0.49122127890586853, + "rewards/kidney_reward/std": 1.202668309211731, + "rewards/length2tails_reward/mean": 0.6947495937347412, + "rewards/length2tails_reward/std": 0.41318637132644653, + "rewards/thermo_reward/mean": -0.7639557123184204, + "rewards/thermo_reward/std": 1.8023275136947632, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 274.21875, + "completions/mean_terminated_length": 274.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12109254207462072, + "epoch": 0.388, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2701301574707031, + "learning_rate": 1.5475632234925502e-06, + "loss": 0.042, + "num_tokens": 1697570.0, + "reward": 4.846124649047852, + "reward_std": 2.819533348083496, + "rewards/fitness_reward/mean": 5.228612899780273, + "rewards/fitness_reward/std": 2.733485221862793, + "rewards/kidney_reward/mean": -0.49513113498687744, + "rewards/kidney_reward/std": 1.1379722356796265, + "rewards/length2tails_reward/mean": 0.7269919514656067, + "rewards/length2tails_reward/std": 0.378961980342865, + "rewards/thermo_reward/mean": -0.6333409547805786, + "rewards/thermo_reward/std": 1.9405333995819092, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 263.8125, + "completions/mean_terminated_length": 263.8125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.10591288423165679, + "epoch": 0.39, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9085234999656677, + "learning_rate": 1.54170821028274e-06, + "loss": -0.0855, + "num_tokens": 1706044.0, + "reward": 5.504973411560059, + "reward_std": 2.731473684310913, + "rewards/fitness_reward/mean": 5.504761219024658, + "rewards/fitness_reward/std": 2.3768563270568848, + "rewards/kidney_reward/mean": -0.12942233681678772, + "rewards/kidney_reward/std": 1.1659127473831177, + "rewards/length2tails_reward/mean": 0.6184656023979187, + "rewards/length2tails_reward/std": 0.4209434986114502, + "rewards/thermo_reward/mean": -0.17938566207885742, + "rewards/thermo_reward/std": 1.864436388015747, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11573971435427666, + "epoch": 0.392, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7765392065048218, + "learning_rate": 1.5358267949789964e-06, + "loss": 0.0344, + "num_tokens": 1714808.0, + "reward": 5.148538112640381, + "reward_std": 2.2965903282165527, + "rewards/fitness_reward/mean": 5.610849857330322, + "rewards/fitness_reward/std": 2.3347136974334717, + "rewards/kidney_reward/mean": -0.3197159171104431, + "rewards/kidney_reward/std": 1.0197172164916992, + "rewards/length2tails_reward/mean": 0.6769318580627441, + "rewards/length2tails_reward/std": 0.43633151054382324, + "rewards/thermo_reward/mean": -0.9433727860450745, + "rewards/thermo_reward/std": 2.0293006896972656, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 275.0625, + "completions/mean_terminated_length": 275.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11778477020561695, + "epoch": 0.394, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.153994560241699, + "learning_rate": 1.5299192642332049e-06, + "loss": 0.0619, + "num_tokens": 1723642.0, + "reward": 5.358760356903076, + "reward_std": 2.5633435249328613, + "rewards/fitness_reward/mean": 5.500892639160156, + "rewards/fitness_reward/std": 2.3947198390960693, + "rewards/kidney_reward/mean": -0.1386294811964035, + "rewards/kidney_reward/std": 1.0449038743972778, + "rewards/length2tails_reward/mean": 0.6533316373825073, + "rewards/length2tails_reward/std": 0.40731319785118103, + "rewards/thermo_reward/mean": -0.47230052947998047, + "rewards/thermo_reward/std": 2.0186452865600586, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12082445435225964, + "epoch": 0.396, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.474387526512146, + "learning_rate": 1.5239859059700793e-06, + "loss": -0.0076, + "num_tokens": 1732328.0, + "reward": 5.87947940826416, + "reward_std": 1.8399745225906372, + "rewards/fitness_reward/mean": 5.973363876342773, + "rewards/fitness_reward/std": 1.3069151639938354, + "rewards/kidney_reward/mean": -0.43369150161743164, + "rewards/kidney_reward/std": 1.1159157752990723, + "rewards/length2tails_reward/mean": 0.746300220489502, + "rewards/length2tails_reward/std": 0.3608926236629486, + "rewards/thermo_reward/mean": -0.1272280216217041, + "rewards/thermo_reward/std": 1.7771265506744385, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 269.71875, + "completions/mean_terminated_length": 269.71875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "entropy": 0.11528191063553095, + "epoch": 0.398, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6634423136711121, + "learning_rate": 1.5180270093731302e-06, + "loss": -0.0124, + "num_tokens": 1740991.0, + "reward": 5.016768932342529, + "reward_std": 2.8234593868255615, + "rewards/fitness_reward/mean": 5.254176616668701, + "rewards/fitness_reward/std": 2.630495309829712, + "rewards/kidney_reward/mean": -0.429448664188385, + "rewards/kidney_reward/std": 1.11064612865448, + "rewards/length2tails_reward/mean": 0.6609193086624146, + "rewards/length2tails_reward/std": 0.4230346083641052, + "rewards/thermo_reward/mean": -0.375826895236969, + "rewards/thermo_reward/std": 1.7914979457855225, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 278.34375, + "completions/mean_terminated_length": 278.34375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.14113888517022133, + "epoch": 0.4, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484531164169312, + "learning_rate": 1.5120428648705715e-06, + "loss": 0.0702, + "num_tokens": 1749930.0, + "reward": 4.45833683013916, + "reward_std": 3.1881518363952637, + "rewards/fitness_reward/mean": 5.084860324859619, + "rewards/fitness_reward/std": 3.2290408611297607, + "rewards/kidney_reward/mean": -0.6342756748199463, + "rewards/kidney_reward/std": 1.010218620300293, + "rewards/length2tails_reward/mean": 0.7552772164344788, + "rewards/length2tails_reward/std": 0.3683543801307678, + "rewards/thermo_reward/mean": -0.9964100122451782, + "rewards/thermo_reward/std": 2.1216630935668945, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 266.65625, + "completions/mean_terminated_length": 266.65625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.12928149849176407, + "epoch": 0.402, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6312239170074463, + "learning_rate": 1.5060337641211636e-06, + "loss": -0.0221, + "num_tokens": 1758495.0, + "reward": 5.229574203491211, + "reward_std": 3.2260067462921143, + "rewards/fitness_reward/mean": 5.103326797485352, + "rewards/fitness_reward/std": 3.447283983230591, + "rewards/kidney_reward/mean": -0.06352463364601135, + "rewards/kidney_reward/std": 1.290382742881775, + "rewards/length2tails_reward/mean": 0.6177878379821777, + "rewards/length2tails_reward/std": 0.41295352578163147, + "rewards/thermo_reward/mean": 0.007125034928321838, + "rewards/thermo_reward/std": 2.120759963989258, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 280.5625, + "completions/mean_terminated_length": 280.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14953106082975864, + "epoch": 0.404, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.925811529159546, + "learning_rate": 1.5e-06, + "loss": 0.1222, + "num_tokens": 1767505.0, + "reward": 5.001443386077881, + "reward_std": 3.1893393993377686, + "rewards/fitness_reward/mean": 5.0646443367004395, + "rewards/fitness_reward/std": 2.9247078895568848, + "rewards/kidney_reward/mean": -0.13537967205047607, + "rewards/kidney_reward/std": 1.2401924133300781, + "rewards/length2tails_reward/mean": 0.6247571706771851, + "rewards/length2tails_reward/std": 0.42491886019706726, + "rewards/thermo_reward/mean": -0.30340126156806946, + "rewards/thermo_reward/std": 1.8612476587295532, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 265.9375, + "completions/mean_terminated_length": 265.9375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.12770347204059362, + "epoch": 0.406, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9480125904083252, + "learning_rate": 1.4939418665842309e-06, + "loss": -0.012, + "num_tokens": 1776047.0, + "reward": 5.106375694274902, + "reward_std": 3.3707869052886963, + "rewards/fitness_reward/mean": 5.213399887084961, + "rewards/fitness_reward/std": 3.1237714290618896, + "rewards/kidney_reward/mean": 0.16533440351486206, + "rewards/kidney_reward/std": 1.2188705205917358, + "rewards/length2tails_reward/mean": 0.6536604166030884, + "rewards/length2tails_reward/std": 0.41311925649642944, + "rewards/thermo_reward/mean": -0.7062134742736816, + "rewards/thermo_reward/std": 1.8557100296020508, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 268.84375, + "completions/mean_terminated_length": 268.84375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.11901164799928665, + "epoch": 0.408, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2556489706039429, + "learning_rate": 1.4878596591387326e-06, + "loss": -0.0105, + "num_tokens": 1784682.0, + "reward": 5.204383850097656, + "reward_std": 3.6209285259246826, + "rewards/fitness_reward/mean": 5.13774299621582, + "rewards/fitness_reward/std": 3.0786664485931396, + "rewards/kidney_reward/mean": 0.12757733464241028, + "rewards/kidney_reward/std": 1.3261386156082153, + "rewards/length2tails_reward/mean": 0.7046136856079102, + "rewards/length2tails_reward/std": 0.3748266100883484, + "rewards/thermo_reward/mean": -0.34660279750823975, + "rewards/thermo_reward/std": 1.6802020072937012, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 283.0, + "completions/mean_terminated_length": 267.80645751953125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "entropy": 0.1570515912026167, + "epoch": 0.41, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4501383304595947, + "learning_rate": 1.4817536741017151e-06, + "loss": 0.1255, + "num_tokens": 1793770.0, + "reward": 5.006608009338379, + "reward_std": 3.3003995418548584, + "rewards/fitness_reward/mean": 5.15645170211792, + "rewards/fitness_reward/std": 3.2961273193359375, + "rewards/kidney_reward/mean": -0.27005457878112793, + "rewards/kidney_reward/std": 1.1989458799362183, + "rewards/length2tails_reward/mean": 0.7012870907783508, + "rewards/length2tails_reward/std": 0.39138534665107727, + "rewards/thermo_reward/mean": -0.38027605414390564, + "rewards/thermo_reward/std": 1.9341998100280762, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 280.4375, + "completions/mean_terminated_length": 280.4375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.17629681713879108, + "epoch": 0.412, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0244832038879395, + "learning_rate": 1.4756242090702753e-06, + "loss": -0.0026, + "num_tokens": 1802776.0, + "reward": 5.560153484344482, + "reward_std": 2.769331693649292, + "rewards/fitness_reward/mean": 5.607787132263184, + "rewards/fitness_reward/std": 2.7524917125701904, + "rewards/kidney_reward/mean": -0.1659838706254959, + "rewards/kidney_reward/std": 1.1878471374511719, + "rewards/length2tails_reward/mean": 0.7757689952850342, + "rewards/length2tails_reward/std": 0.3364262878894806, + "rewards/thermo_reward/mean": -0.3171682059764862, + "rewards/thermo_reward/std": 2.1596550941467285, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.18510614708065987, + "epoch": 0.414, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8747198581695557, + "learning_rate": 1.4694715627858908e-06, + "loss": -0.0064, + "num_tokens": 1811491.0, + "reward": 4.97410774230957, + "reward_std": 4.071346759796143, + "rewards/fitness_reward/mean": 4.744723320007324, + "rewards/fitness_reward/std": 3.860710620880127, + "rewards/kidney_reward/mean": -0.04718928039073944, + "rewards/kidney_reward/std": 1.0953369140625, + "rewards/length2tails_reward/mean": 0.7307334542274475, + "rewards/length2tails_reward/std": 0.36576011776924133, + "rewards/thermo_reward/mean": 0.14059212803840637, + "rewards/thermo_reward/std": 1.8796796798706055, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "entropy": 0.17540221381932497, + "epoch": 0.416, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8250596523284912, + "learning_rate": 1.4632960351198617e-06, + "loss": -0.0062, + "num_tokens": 1820202.0, + "reward": 4.9364848136901855, + "reward_std": 3.8868162631988525, + "rewards/fitness_reward/mean": 4.679059028625488, + "rewards/fitness_reward/std": 3.7941133975982666, + "rewards/kidney_reward/mean": 0.12615957856178284, + "rewards/kidney_reward/std": 1.3088197708129883, + "rewards/length2tails_reward/mean": 0.7231643199920654, + "rewards/length2tails_reward/std": 0.3762193024158478, + "rewards/thermo_reward/mean": 0.027110159397125244, + "rewards/thermo_reward/std": 2.1192996501922607, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 274.5625, + "completions/mean_terminated_length": 274.5625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.14119845256209373, + "epoch": 0.418, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1828947067260742, + "learning_rate": 1.4570979270586943e-06, + "loss": -0.0941, + "num_tokens": 1829020.0, + "reward": 4.36683988571167, + "reward_std": 3.606301784515381, + "rewards/fitness_reward/mean": 4.932483673095703, + "rewards/fitness_reward/std": 3.6595046520233154, + "rewards/kidney_reward/mean": -0.5763438940048218, + "rewards/kidney_reward/std": 0.9190284013748169, + "rewards/length2tails_reward/mean": 0.7905447483062744, + "rewards/length2tails_reward/std": 0.33445608615875244, + "rewards/thermo_reward/mean": -0.9502164125442505, + "rewards/thermo_reward/std": 2.0955722332000732, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 279.1875, + "completions/mean_terminated_length": 279.1875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.18222328554838896, + "epoch": 0.42, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.100304126739502, + "learning_rate": 1.4508775406894306e-06, + "loss": 0.0794, + "num_tokens": 1837986.0, + "reward": 5.177816867828369, + "reward_std": 3.4072232246398926, + "rewards/fitness_reward/mean": 5.094880104064941, + "rewards/fitness_reward/std": 3.1885640621185303, + "rewards/kidney_reward/mean": -0.011289328336715698, + "rewards/kidney_reward/std": 1.2078779935836792, + "rewards/length2tails_reward/mean": 0.6524442434310913, + "rewards/length2tails_reward/std": 0.4188677966594696, + "rewards/thermo_reward/mean": -0.14905908703804016, + "rewards/thermo_reward/std": 1.923143982887268, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.13147749193012714, + "epoch": 0.422, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3564049005508423, + "learning_rate": 1.4446351791849273e-06, + "loss": 0.0129, + "num_tokens": 1846750.0, + "reward": 5.809633255004883, + "reward_std": 3.2329182624816895, + "rewards/fitness_reward/mean": 5.548314094543457, + "rewards/fitness_reward/std": 2.9709882736206055, + "rewards/kidney_reward/mean": 0.1898169368505478, + "rewards/kidney_reward/std": 1.201176643371582, + "rewards/length2tails_reward/mean": 0.7714184522628784, + "rewards/length2tails_reward/std": 0.33874425292015076, + "rewards/thermo_reward/mean": -0.05288762226700783, + "rewards/thermo_reward/std": 1.9934961795806885, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 280.5625, + "completions/mean_terminated_length": 280.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.148380596190691, + "epoch": 0.424, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1136276721954346, + "learning_rate": 1.4383711467890773e-06, + "loss": 0.1043, + "num_tokens": 1855760.0, + "reward": 5.165239334106445, + "reward_std": 3.7728660106658936, + "rewards/fitness_reward/mean": 5.120419502258301, + "rewards/fitness_reward/std": 3.3934967517852783, + "rewards/kidney_reward/mean": -0.13617633283138275, + "rewards/kidney_reward/std": 1.219999074935913, + "rewards/length2tails_reward/mean": 0.6721340417861938, + "rewards/length2tails_reward/std": 0.37673434615135193, + "rewards/thermo_reward/mean": -0.11025162041187286, + "rewards/thermo_reward/std": 2.06036639213562, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 279.40625, + "completions/mean_terminated_length": 279.40625, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "entropy": 0.14004177413880825, + "epoch": 0.426, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.625136375427246, + "learning_rate": 1.4320857488019824e-06, + "loss": 0.094, + "num_tokens": 1864733.0, + "reward": 5.282578468322754, + "reward_std": 2.975006103515625, + "rewards/fitness_reward/mean": 5.515386581420898, + "rewards/fitness_reward/std": 3.1072726249694824, + "rewards/kidney_reward/mean": 0.1614631861448288, + "rewards/kidney_reward/std": 1.4302942752838135, + "rewards/length2tails_reward/mean": 0.7769113779067993, + "rewards/length2tails_reward/std": 0.3295917212963104, + "rewards/thermo_reward/mean": -1.015535831451416, + "rewards/thermo_reward/std": 2.2133193016052246, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.1491750106215477, + "epoch": 0.428, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8770372867584229, + "learning_rate": 1.4257792915650725e-06, + "loss": -0.0188, + "num_tokens": 1873420.0, + "reward": 5.447704315185547, + "reward_std": 3.252995491027832, + "rewards/fitness_reward/mean": 5.594566345214844, + "rewards/fitness_reward/std": 2.7997264862060547, + "rewards/kidney_reward/mean": -0.48993390798568726, + "rewards/kidney_reward/std": 1.1592376232147217, + "rewards/length2tails_reward/mean": 0.8639081716537476, + "rewards/length2tails_reward/std": 0.25261324644088745, + "rewards/thermo_reward/mean": -0.23574352264404297, + "rewards/thermo_reward/std": 2.2791671752929688, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 266.28125, + "completions/mean_terminated_length": 266.28125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.16167325340211391, + "epoch": 0.43, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9747185707092285, + "learning_rate": 1.419452082446177e-06, + "loss": -0.0616, + "num_tokens": 1881973.0, + "reward": 5.3051886558532715, + "reward_std": 2.940993070602417, + "rewards/fitness_reward/mean": 5.305299282073975, + "rewards/fitness_reward/std": 2.8063652515411377, + "rewards/kidney_reward/mean": -0.20494236052036285, + "rewards/kidney_reward/std": 1.165364384651184, + "rewards/length2tails_reward/mean": 0.7368552684783936, + "rewards/length2tails_reward/std": 0.37866973876953125, + "rewards/thermo_reward/mean": -0.1637059599161148, + "rewards/thermo_reward/std": 1.9145219326019287, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12187883397564292, + "epoch": 0.432, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6405536532402039, + "learning_rate": 1.4131044298245418e-06, + "loss": -0.0104, + "num_tokens": 1890686.0, + "reward": 5.89649772644043, + "reward_std": 2.0160536766052246, + "rewards/fitness_reward/mean": 5.973363876342773, + "rewards/fitness_reward/std": 1.3069151639938354, + "rewards/kidney_reward/mean": -0.33626803755760193, + "rewards/kidney_reward/std": 1.403552532196045, + "rewards/length2tails_reward/mean": 0.6964526176452637, + "rewards/length2tails_reward/std": 0.41551831364631653, + "rewards/thermo_reward/mean": -0.1656903624534607, + "rewards/thermo_reward/std": 1.8359384536743164, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 267.8125, + "completions/mean_terminated_length": 267.8125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.13484436459839344, + "epoch": 0.434, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.467594861984253, + "learning_rate": 1.4067366430758004e-06, + "loss": -0.045, + "num_tokens": 1899288.0, + "reward": 4.882638454437256, + "reward_std": 2.991011619567871, + "rewards/fitness_reward/mean": 5.251405715942383, + "rewards/fitness_reward/std": 2.9786839485168457, + "rewards/kidney_reward/mean": -0.5040518045425415, + "rewards/kidney_reward/std": 1.2910178899765015, + "rewards/length2tails_reward/mean": 0.7873901128768921, + "rewards/length2tails_reward/std": 0.35044264793395996, + "rewards/thermo_reward/mean": -0.6271776556968689, + "rewards/thermo_reward/std": 2.0984861850738525, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 270.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12841098196804523, + "epoch": 0.436, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4370866119861603, + "learning_rate": 1.400349032556895e-06, + "loss": 0.0072, + "num_tokens": 1907962.0, + "reward": 5.410283088684082, + "reward_std": 3.0423872470855713, + "rewards/fitness_reward/mean": 5.468869209289551, + "rewards/fitness_reward/std": 2.5450143814086914, + "rewards/kidney_reward/mean": -0.1920437067747116, + "rewards/kidney_reward/std": 1.4910067319869995, + "rewards/length2tails_reward/mean": 0.681010365486145, + "rewards/length2tails_reward/std": 0.38825103640556335, + "rewards/thermo_reward/mean": -0.26563379168510437, + "rewards/thermo_reward/std": 2.002378225326538, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.16766507271677256, + "epoch": 0.438, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.009949207305908, + "learning_rate": 1.393941909590951e-06, + "loss": 0.0082, + "num_tokens": 1916716.0, + "reward": 4.8232502937316895, + "reward_std": 3.339430809020996, + "rewards/fitness_reward/mean": 5.122108459472656, + "rewards/fitness_reward/std": 3.3906185626983643, + "rewards/kidney_reward/mean": -0.31029796600341797, + "rewards/kidney_reward/std": 1.0983688831329346, + "rewards/length2tails_reward/mean": 0.7626557350158691, + "rewards/length2tails_reward/std": 0.3781687319278717, + "rewards/thermo_reward/mean": -0.6687458753585815, + "rewards/thermo_reward/std": 2.1708645820617676, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 267.59375, + "completions/mean_terminated_length": 267.59375, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.10749762738123536, + "epoch": 0.44, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5059139728546143, + "learning_rate": 1.3875155864521028e-06, + "loss": -0.0014, + "num_tokens": 1925311.0, + "reward": 6.180752277374268, + "reward_std": 1.7418967485427856, + "rewards/fitness_reward/mean": 6.076353073120117, + "rewards/fitness_reward/std": 1.2157716751098633, + "rewards/kidney_reward/mean": 0.008736655116081238, + "rewards/kidney_reward/std": 1.3900187015533447, + "rewards/length2tails_reward/mean": 0.5785348415374756, + "rewards/length2tails_reward/std": 0.3666441738605499, + "rewards/thermo_reward/mean": -0.0892057865858078, + "rewards/thermo_reward/std": 1.8389132022857666, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12798354495316744, + "epoch": 0.442, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3278595209121704, + "learning_rate": 1.3810703763502743e-06, + "loss": -0.0031, + "num_tokens": 1934037.0, + "reward": 6.395440101623535, + "reward_std": 1.592313289642334, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.09271426498889923, + "rewards/kidney_reward/std": 1.35648775100708, + "rewards/length2tails_reward/mean": 0.8260397911071777, + "rewards/length2tails_reward/std": 0.28667116165161133, + "rewards/thermo_reward/mean": -0.30006542801856995, + "rewards/thermo_reward/std": 2.1212196350097656, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 276.78125, + "completions/mean_terminated_length": 276.78125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1318447468802333, + "epoch": 0.444, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2908531427383423, + "learning_rate": 1.374606593415912e-06, + "loss": 0.0577, + "num_tokens": 1942926.0, + "reward": 5.608564853668213, + "reward_std": 2.8474864959716797, + "rewards/fitness_reward/mean": 5.65217399597168, + "rewards/fitness_reward/std": 2.6082570552825928, + "rewards/kidney_reward/mean": -0.11263138800859451, + "rewards/kidney_reward/std": 1.3512107133865356, + "rewards/length2tails_reward/mean": 0.7171779870986938, + "rewards/length2tails_reward/std": 0.3670293688774109, + "rewards/thermo_reward/mean": -0.3331752121448517, + "rewards/thermo_reward/std": 2.078946828842163, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11585133709013462, + "epoch": 0.446, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5567060708999634, + "learning_rate": 1.3681245526846781e-06, + "loss": -0.0016, + "num_tokens": 1951603.0, + "reward": 5.846627235412598, + "reward_std": 1.9089988470077515, + "rewards/fitness_reward/mean": 6.076353073120117, + "rewards/fitness_reward/std": 1.2157716751098633, + "rewards/kidney_reward/mean": -0.44368448853492737, + "rewards/kidney_reward/std": 1.363287329673767, + "rewards/length2tails_reward/mean": 0.7362430095672607, + "rewards/length2tails_reward/std": 0.3501988351345062, + "rewards/thermo_reward/mean": -0.3838888108730316, + "rewards/thermo_reward/std": 2.1188266277313232, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12520943023264408, + "epoch": 0.448, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4506056010723114, + "learning_rate": 1.361624570082092e-06, + "loss": -0.0028, + "num_tokens": 1960293.0, + "reward": 5.522380828857422, + "reward_std": 2.2656264305114746, + "rewards/fitness_reward/mean": 5.861753940582275, + "rewards/fitness_reward/std": 2.045424222946167, + "rewards/kidney_reward/mean": -0.05382596701383591, + "rewards/kidney_reward/std": 1.3001813888549805, + "rewards/length2tails_reward/mean": 0.7590430974960327, + "rewards/length2tails_reward/std": 0.3836628794670105, + "rewards/thermo_reward/mean": -1.0044409036636353, + "rewards/thermo_reward/std": 2.1063218116760254, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1355925602838397, + "epoch": 0.45, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0922815799713135, + "learning_rate": 1.3551069624081371e-06, + "loss": 0.0038, + "num_tokens": 1969001.0, + "reward": 5.617181777954102, + "reward_std": 2.6733498573303223, + "rewards/fitness_reward/mean": 5.599064826965332, + "rewards/fitness_reward/std": 2.7911274433135986, + "rewards/kidney_reward/mean": -0.12046325206756592, + "rewards/kidney_reward/std": 1.403606653213501, + "rewards/length2tails_reward/mean": 0.7542349100112915, + "rewards/length2tails_reward/std": 0.3374869227409363, + "rewards/thermo_reward/mean": -0.2204195111989975, + "rewards/thermo_reward/std": 1.9917058944702148, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.71875, + "completions/mean_terminated_length": 269.71875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "entropy": 0.15132870338857174, + "epoch": 0.452, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.819502592086792, + "learning_rate": 1.3485720473218152e-06, + "loss": -0.0074, + "num_tokens": 1977664.0, + "reward": 5.580442428588867, + "reward_std": 2.7787983417510986, + "rewards/fitness_reward/mean": 5.590456962585449, + "rewards/fitness_reward/std": 2.43131947517395, + "rewards/kidney_reward/mean": -0.43889474868774414, + "rewards/kidney_reward/std": 1.0456558465957642, + "rewards/length2tails_reward/mean": 0.7070517539978027, + "rewards/length2tails_reward/std": 0.37388545274734497, + "rewards/thermo_reward/mean": 0.06533941626548767, + "rewards/thermo_reward/std": 2.0557384490966797, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.03125, + "completions/mean_terminated_length": 270.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1192570636048913, + "epoch": 0.454, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5030652284622192, + "learning_rate": 1.3420201433256689e-06, + "loss": -0.0049, + "num_tokens": 1986337.0, + "reward": 5.5395050048828125, + "reward_std": 1.7637563943862915, + "rewards/fitness_reward/mean": 5.87037467956543, + "rewards/fitness_reward/std": 1.3842169046401978, + "rewards/kidney_reward/mean": -0.41122961044311523, + "rewards/kidney_reward/std": 1.297377109527588, + "rewards/length2tails_reward/mean": 0.7080308794975281, + "rewards/length2tails_reward/std": 0.37696659564971924, + "rewards/thermo_reward/mean": -0.6045256853103638, + "rewards/thermo_reward/std": 1.8804250955581665, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 279.78125, + "completions/mean_terminated_length": 279.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1760839968919754, + "epoch": 0.456, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4869167804718018, + "learning_rate": 1.3354515697502551e-06, + "loss": 0.0889, + "num_tokens": 1995322.0, + "reward": 5.507205963134766, + "reward_std": 3.225966691970825, + "rewards/fitness_reward/mean": 5.797521591186523, + "rewards/fitness_reward/std": 2.754859685897827, + "rewards/kidney_reward/mean": -0.3719463348388672, + "rewards/kidney_reward/std": 1.0769273042678833, + "rewards/length2tails_reward/mean": 0.8598864078521729, + "rewards/length2tails_reward/std": 0.22975370287895203, + "rewards/thermo_reward/mean": -0.6386279463768005, + "rewards/thermo_reward/std": 2.108335494995117, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 291.34375, + "completions/mean_terminated_length": 291.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.17017631325870752, + "epoch": 0.458, + "frac_reward_zero_std": 0.0, + "grad_norm": NaN, + "learning_rate": 1.3288666467385831e-06, + "loss": 0.16, + "num_tokens": 2004677.0, + "reward": 5.238361358642578, + "reward_std": 3.569972276687622, + "rewards/fitness_reward/mean": 5.240414142608643, + "rewards/fitness_reward/std": 3.045217752456665, + "rewards/kidney_reward/mean": -0.010104184970259666, + "rewards/kidney_reward/std": 1.4684005975723267, + "rewards/length2tails_reward/mean": 0.8191745281219482, + "rewards/length2tails_reward/std": 0.2940421402454376, + "rewards/thermo_reward/mean": -0.40358853340148926, + "rewards/thermo_reward/std": 2.2815184593200684, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.875, + "completions/mean_terminated_length": 270.875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13159669190645218, + "epoch": 0.46, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3491942584514618, + "learning_rate": 1.3288666467385831e-06, + "loss": 0.0035, + "num_tokens": 2013377.0, + "reward": 6.155538082122803, + "reward_std": 1.6413859128952026, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.5473537445068359, + "rewards/kidney_reward/std": 1.161616563796997, + "rewards/length2tails_reward/mean": 0.8180912733078003, + "rewards/length2tails_reward/std": 0.2914470434188843, + "rewards/thermo_reward/mean": -0.3212563991546631, + "rewards/thermo_reward/std": 2.1364221572875977, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 276.0, + "completions/mean_terminated_length": 276.0, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.18014958128333092, + "epoch": 0.462, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8534668684005737, + "learning_rate": 1.3222656952305111e-06, + "loss": 0.0644, + "num_tokens": 2022241.0, + "reward": 4.984916687011719, + "reward_std": 3.7312705516815186, + "rewards/fitness_reward/mean": 5.267140865325928, + "rewards/fitness_reward/std": 3.2565789222717285, + "rewards/kidney_reward/mean": -0.46993765234947205, + "rewards/kidney_reward/std": 1.2665092945098877, + "rewards/length2tails_reward/mean": 0.7773940563201904, + "rewards/length2tails_reward/std": 0.2976566553115845, + "rewards/thermo_reward/mean": -0.48320838809013367, + "rewards/thermo_reward/std": 1.9547299146652222, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13126275781542063, + "epoch": 0.464, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4601180851459503, + "learning_rate": 1.3156490369471024e-06, + "loss": 0.0114, + "num_tokens": 2030975.0, + "reward": 6.162424087524414, + "reward_std": 1.498247742652893, + "rewards/fitness_reward/mean": 6.179342269897461, + "rewards/fitness_reward/std": 1.1073734760284424, + "rewards/kidney_reward/mean": -0.6099071502685547, + "rewards/kidney_reward/std": 1.1115127801895142, + "rewards/length2tails_reward/mean": 0.7627843618392944, + "rewards/length2tails_reward/std": 0.28889045119285583, + "rewards/thermo_reward/mean": 0.19467884302139282, + "rewards/thermo_reward/std": 2.1010055541992188, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 297.09375, + "completions/mean_terminated_length": 282.3548278808594, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.18137010838836432, + "epoch": 0.466, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2529149055480957, + "learning_rate": 1.3090169943749473e-06, + "loss": 0.1627, + "num_tokens": 2040514.0, + "reward": 5.853922367095947, + "reward_std": 3.2280406951904297, + "rewards/fitness_reward/mean": 5.682772636413574, + "rewards/fitness_reward/std": 2.829298257827759, + "rewards/kidney_reward/mean": -0.005696475505828857, + "rewards/kidney_reward/std": 1.4939913749694824, + "rewards/length2tails_reward/mean": 0.833533763885498, + "rewards/length2tails_reward/std": 0.2704388499259949, + "rewards/thermo_reward/mean": -0.06877095997333527, + "rewards/thermo_reward/std": 1.9118250608444214, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.5625, + "completions/mean_terminated_length": 269.5625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.13818693347275257, + "epoch": 0.468, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.66115403175354, + "learning_rate": 1.3023698907504446e-06, + "loss": -0.0141, + "num_tokens": 2049172.0, + "reward": 5.821286201477051, + "reward_std": 2.581409215927124, + "rewards/fitness_reward/mean": 5.822164535522461, + "rewards/fitness_reward/std": 2.2371702194213867, + "rewards/kidney_reward/mean": -0.13880418241024017, + "rewards/kidney_reward/std": 1.2737301588058472, + "rewards/length2tails_reward/mean": 0.7588136196136475, + "rewards/length2tails_reward/std": 0.34267228841781616, + "rewards/thermo_reward/mean": -0.2423594444990158, + "rewards/thermo_reward/std": 1.951093316078186, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 273.875, + "completions/mean_terminated_length": 273.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1545769814401865, + "epoch": 0.47, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.714938998222351, + "learning_rate": 1.2957080500440467e-06, + "loss": 0.056, + "num_tokens": 2057968.0, + "reward": 5.967822074890137, + "reward_std": 3.091534376144409, + "rewards/fitness_reward/mean": 5.846134662628174, + "rewards/fitness_reward/std": 2.5937700271606445, + "rewards/kidney_reward/mean": 0.10892915725708008, + "rewards/kidney_reward/std": 1.498690128326416, + "rewards/length2tails_reward/mean": 0.7655143737792969, + "rewards/length2tails_reward/std": 0.3138628900051117, + "rewards/thermo_reward/mean": -0.2483123540878296, + "rewards/thermo_reward/std": 2.0395705699920654, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10781193990260363, + "epoch": 0.472, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.49131742119789124, + "learning_rate": 1.2890317969444716e-06, + "loss": -0.0025, + "num_tokens": 2066652.0, + "reward": 5.7716875076293945, + "reward_std": 2.1354894638061523, + "rewards/fitness_reward/mean": 6.086244583129883, + "rewards/fitness_reward/std": 1.8417463302612305, + "rewards/kidney_reward/mean": -0.2309199869632721, + "rewards/kidney_reward/std": 1.3746833801269531, + "rewards/length2tails_reward/mean": 0.8065834045410156, + "rewards/length2tails_reward/std": 0.27748697996139526, + "rewards/thermo_reward/mean": -0.8014854192733765, + "rewards/thermo_reward/std": 2.1019039154052734, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 279.75, + "completions/mean_terminated_length": 279.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14223221316933632, + "epoch": 0.474, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.983418583869934, + "learning_rate": 1.2823414568428766e-06, + "loss": 0.114, + "num_tokens": 2075636.0, + "reward": 6.0223798751831055, + "reward_std": 2.5816240310668945, + "rewards/fitness_reward/mean": 6.011655807495117, + "rewards/fitness_reward/std": 2.2284603118896484, + "rewards/kidney_reward/mean": -0.22647760808467865, + "rewards/kidney_reward/std": 1.2773698568344116, + "rewards/length2tails_reward/mean": 0.8090081214904785, + "rewards/length2tails_reward/std": 0.32567527890205383, + "rewards/thermo_reward/mean": -0.15657764673233032, + "rewards/thermo_reward/std": 2.0863871574401855, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14971571415662766, + "epoch": 0.476, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5143944025039673, + "learning_rate": 1.275637355816999e-06, + "loss": 0.0018, + "num_tokens": 2084375.0, + "reward": 6.407557487487793, + "reward_std": 2.3935375213623047, + "rewards/fitness_reward/mean": 6.113137245178223, + "rewards/fitness_reward/std": 2.182605266571045, + "rewards/kidney_reward/mean": -0.023072291165590286, + "rewards/kidney_reward/std": 1.416988492012024, + "rewards/length2tails_reward/mean": 0.8817094564437866, + "rewards/length2tails_reward/std": 0.18478648364543915, + "rewards/thermo_reward/mean": 0.17105701565742493, + "rewards/thermo_reward/std": 1.9090884923934937, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 280.6875, + "completions/mean_terminated_length": 280.6875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14836317393928766, + "epoch": 0.478, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8189572095870972, + "learning_rate": 1.2689198206152656e-06, + "loss": 0.1236, + "num_tokens": 2093389.0, + "reward": 5.878040313720703, + "reward_std": 2.821176528930664, + "rewards/fitness_reward/mean": 5.928430080413818, + "rewards/fitness_reward/std": 2.644589424133301, + "rewards/kidney_reward/mean": -0.15635544061660767, + "rewards/kidney_reward/std": 1.3480225801467896, + "rewards/length2tails_reward/mean": 0.8759337067604065, + "rewards/length2tails_reward/std": 0.20841535925865173, + "rewards/thermo_reward/mean": -0.3823922872543335, + "rewards/thermo_reward/std": 2.18420147895813, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1411373158916831, + "epoch": 0.48, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6222948431968689, + "learning_rate": 1.2621891786408648e-06, + "loss": -0.0029, + "num_tokens": 2102134.0, + "reward": 6.234904766082764, + "reward_std": 1.6686646938323975, + "rewards/fitness_reward/mean": 6.179342269897461, + "rewards/fitness_reward/std": 1.1073734760284424, + "rewards/kidney_reward/mean": -0.16512131690979004, + "rewards/kidney_reward/std": 1.1555894613265991, + "rewards/length2tails_reward/mean": 0.8744947910308838, + "rewards/length2tails_reward/std": 0.242760568857193, + "rewards/thermo_reward/mean": -0.16100062429904938, + "rewards/thermo_reward/std": 2.1503944396972656, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 292.1875, + "completions/mean_terminated_length": 277.2903137207031, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.171529040671885, + "epoch": 0.482, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.054434776306152, + "learning_rate": 1.2554457579357905e-06, + "loss": 0.2517, + "num_tokens": 2111516.0, + "reward": 5.733481407165527, + "reward_std": 2.8645641803741455, + "rewards/fitness_reward/mean": 5.781833648681641, + "rewards/fitness_reward/std": 2.8185718059539795, + "rewards/kidney_reward/mean": -0.21167702972888947, + "rewards/kidney_reward/std": 1.3747280836105347, + "rewards/length2tails_reward/mean": 0.8289727568626404, + "rewards/length2tails_reward/std": 0.2659308910369873, + "rewards/thermo_reward/mean": -0.2995145916938782, + "rewards/thermo_reward/std": 2.0944507122039795, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 283.125, + "completions/mean_terminated_length": 283.125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1622915817424655, + "epoch": 0.484, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.460477590560913, + "learning_rate": 1.2486898871648551e-06, + "loss": 0.1185, + "num_tokens": 2120608.0, + "reward": 5.520163536071777, + "reward_std": 3.4349632263183594, + "rewards/fitness_reward/mean": 5.4481120109558105, + "rewards/fitness_reward/std": 3.304231643676758, + "rewards/kidney_reward/mean": -0.07317894697189331, + "rewards/kidney_reward/std": 1.4033994674682617, + "rewards/length2tails_reward/mean": 0.8261253833770752, + "rewards/length2tails_reward/std": 0.265546977519989, + "rewards/thermo_reward/mean": -0.1957801878452301, + "rewards/thermo_reward/std": 2.2139532566070557, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1435080012306571, + "epoch": 0.486, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4143276810646057, + "learning_rate": 1.2419218955996676e-06, + "loss": 0.002, + "num_tokens": 2129322.0, + "reward": 5.902777671813965, + "reward_std": 3.1238839626312256, + "rewards/fitness_reward/mean": 5.9073591232299805, + "rewards/fitness_reward/std": 2.3329126834869385, + "rewards/kidney_reward/mean": -0.12808813154697418, + "rewards/kidney_reward/std": 1.4451590776443481, + "rewards/length2tails_reward/mean": 0.8207823038101196, + "rewards/length2tails_reward/std": 0.26687997579574585, + "rewards/thermo_reward/mean": -0.2914661467075348, + "rewards/thermo_reward/std": 2.187708854675293, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13894751016050577, + "epoch": 0.488, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44220390915870667, + "learning_rate": 1.23514211310259e-06, + "loss": 0.0021, + "num_tokens": 2138040.0, + "reward": 6.006943225860596, + "reward_std": 2.443209648132324, + "rewards/fitness_reward/mean": 5.999410629272461, + "rewards/fitness_reward/std": 1.8257852792739868, + "rewards/kidney_reward/mean": -0.348245769739151, + "rewards/kidney_reward/std": 1.2318572998046875, + "rewards/length2tails_reward/mean": 0.8122061491012573, + "rewards/length2tails_reward/std": 0.276017963886261, + "rewards/thermo_reward/mean": -0.042792417109012604, + "rewards/thermo_reward/std": 2.094027280807495, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 266.3125, + "completions/mean_terminated_length": 266.3125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.17298025824129581, + "epoch": 0.49, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7220354080200195, + "learning_rate": 1.2283508701106558e-06, + "loss": -0.0307, + "num_tokens": 2146594.0, + "reward": 4.9828290939331055, + "reward_std": 3.9370384216308594, + "rewards/fitness_reward/mean": 4.851231098175049, + "rewards/fitness_reward/std": 3.5942587852478027, + "rewards/kidney_reward/mean": -0.17364293336868286, + "rewards/kidney_reward/std": 1.478359580039978, + "rewards/length2tails_reward/mean": 0.7703011631965637, + "rewards/length2tails_reward/std": 0.34750691056251526, + "rewards/thermo_reward/mean": 0.05168786644935608, + "rewards/thermo_reward/std": 2.085308074951172, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.15128333121538162, + "epoch": 0.492, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6833405494689941, + "learning_rate": 1.2215484976194675e-06, + "loss": -0.0132, + "num_tokens": 2155308.0, + "reward": 6.069390296936035, + "reward_std": 2.2282562255859375, + "rewards/fitness_reward/mean": 5.655192852020264, + "rewards/fitness_reward/std": 2.1310982704162598, + "rewards/kidney_reward/mean": 0.05876028537750244, + "rewards/kidney_reward/std": 1.2751644849777222, + "rewards/length2tails_reward/mean": 0.8252370357513428, + "rewards/length2tails_reward/std": 0.31697702407836914, + "rewards/thermo_reward/mean": 0.35701656341552734, + "rewards/thermo_reward/std": 1.7305238246917725, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 276.0625, + "completions/mean_terminated_length": 276.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14280968811362982, + "epoch": 0.494, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45008811354637146, + "learning_rate": 1.2147353271670632e-06, + "loss": 0.0031, + "num_tokens": 2164174.0, + "reward": 6.298008918762207, + "reward_std": 1.5095510482788086, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.48818615078926086, + "rewards/kidney_reward/std": 1.0899704694747925, + "rewards/length2tails_reward/mean": 0.8062364459037781, + "rewards/length2tails_reward/std": 0.30648234486579895, + "rewards/thermo_reward/mean": -0.08955463021993637, + "rewards/thermo_reward/std": 2.0243654251098633, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 273.09375, + "completions/mean_terminated_length": 273.09375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.1534036574885249, + "epoch": 0.496, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.114922285079956, + "learning_rate": 1.207911690817759e-06, + "loss": 0.0352, + "num_tokens": 2172945.0, + "reward": 5.256867408752441, + "reward_std": 3.398367404937744, + "rewards/fitness_reward/mean": 5.528585433959961, + "rewards/fitness_reward/std": 2.7048287391662598, + "rewards/kidney_reward/mean": -0.4768460988998413, + "rewards/kidney_reward/std": 1.2065118551254272, + "rewards/length2tails_reward/mean": 0.8892650008201599, + "rewards/length2tails_reward/std": 0.20420534908771515, + "rewards/thermo_reward/mean": -0.5112224221229553, + "rewards/thermo_reward/std": 2.252676010131836, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 277.9375, + "completions/mean_terminated_length": 277.9375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 0.1748004462569952, + "epoch": 0.498, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6925712823867798, + "learning_rate": 1.2010779211459648e-06, + "loss": 0.0527, + "num_tokens": 2181871.0, + "reward": 5.066378593444824, + "reward_std": 3.6943271160125732, + "rewards/fitness_reward/mean": 5.181694507598877, + "rewards/fitness_reward/std": 3.5308194160461426, + "rewards/kidney_reward/mean": 0.019244499504566193, + "rewards/kidney_reward/std": 1.2347512245178223, + "rewards/length2tails_reward/mean": 0.8258422613143921, + "rewards/length2tails_reward/std": 0.28488725423812866, + "rewards/thermo_reward/mean": -0.6627975702285767, + "rewards/thermo_reward/std": 1.9828616380691528, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15635219123214483, + "epoch": 0.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4665292501449585, + "learning_rate": 1.194234351219972e-06, + "loss": 0.0, + "num_tokens": 2190591.0, + "reward": 6.17600154876709, + "reward_std": 2.4434690475463867, + "rewards/fitness_reward/mean": 5.8430070877075195, + "rewards/fitness_reward/std": 2.135439872741699, + "rewards/kidney_reward/mean": 0.01794009655714035, + "rewards/kidney_reward/std": 1.4815893173217773, + "rewards/length2tails_reward/mean": 0.8287307620048523, + "rewards/length2tails_reward/std": 0.2827480435371399, + "rewards/thermo_reward/mean": 0.23368358612060547, + "rewards/thermo_reward/std": 2.0299649238586426, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 269.15625, + "completions/mean_terminated_length": 269.15625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.19010232388973236, + "epoch": 0.502, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.568342685699463, + "learning_rate": 1.1873813145857248e-06, + "loss": -0.0351, + "num_tokens": 2199236.0, + "reward": 5.8709235191345215, + "reward_std": 3.2817866802215576, + "rewards/fitness_reward/mean": 5.546199798583984, + "rewards/fitness_reward/std": 3.0127499103546143, + "rewards/kidney_reward/mean": 0.23835378885269165, + "rewards/kidney_reward/std": 1.2311129570007324, + "rewards/length2tails_reward/mean": 0.8606287837028503, + "rewards/length2tails_reward/std": 0.23748965561389923, + "rewards/thermo_reward/mean": -0.019220426678657532, + "rewards/thermo_reward/std": 1.8968557119369507, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 267.96875, + "completions/mean_terminated_length": 267.96875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.18050073459744453, + "epoch": 0.504, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.935668468475342, + "learning_rate": 1.18051914525056e-06, + "loss": -0.044, + "num_tokens": 2207843.0, + "reward": 5.597233772277832, + "reward_std": 2.9492597579956055, + "rewards/fitness_reward/mean": 5.869199752807617, + "rewards/fitness_reward/std": 2.4897172451019287, + "rewards/kidney_reward/mean": -0.33863043785095215, + "rewards/kidney_reward/std": 1.3294682502746582, + "rewards/length2tails_reward/mean": 0.8189811706542969, + "rewards/length2tails_reward/std": 0.30258670449256897, + "rewards/thermo_reward/mean": -0.6147923469543457, + "rewards/thermo_reward/std": 1.957696557044983, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14694790355861187, + "epoch": 0.506, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0416942834854126, + "learning_rate": 1.1736481776669305e-06, + "loss": 0.0087, + "num_tokens": 2216584.0, + "reward": 6.303023815155029, + "reward_std": 2.348665952682495, + "rewards/fitness_reward/mean": 6.197032451629639, + "rewards/fitness_reward/std": 1.7298640012741089, + "rewards/kidney_reward/mean": -0.2510518729686737, + "rewards/kidney_reward/std": 1.2121409177780151, + "rewards/length2tails_reward/mean": 0.8488996028900146, + "rewards/length2tails_reward/std": 0.2167581021785736, + "rewards/thermo_reward/mean": 0.03858397901058197, + "rewards/thermo_reward/std": 2.0483031272888184, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 266.40625, + "completions/mean_terminated_length": 266.40625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.16056162863969803, + "epoch": 0.508, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8536911010742188, + "learning_rate": 1.1667687467161023e-06, + "loss": -0.0806, + "num_tokens": 2225141.0, + "reward": 6.463656902313232, + "reward_std": 2.581038236618042, + "rewards/fitness_reward/mean": 6.238892555236816, + "rewards/fitness_reward/std": 1.9935071468353271, + "rewards/kidney_reward/mean": 0.04028014838695526, + "rewards/kidney_reward/std": 1.5900325775146484, + "rewards/length2tails_reward/mean": 0.8447715044021606, + "rewards/length2tails_reward/std": 0.26232168078422546, + "rewards/thermo_reward/mean": -0.013136669993400574, + "rewards/thermo_reward/std": 2.1490957736968994, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 263.625, + "completions/mean_terminated_length": 263.625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.13892832025885582, + "epoch": 0.51, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7996954321861267, + "learning_rate": 1.1598811876918349e-06, + "loss": -0.0899, + "num_tokens": 2233609.0, + "reward": 5.907028675079346, + "reward_std": 3.229994058609009, + "rewards/fitness_reward/mean": 5.730376243591309, + "rewards/fitness_reward/std": 2.654407262802124, + "rewards/kidney_reward/mean": -0.02263645827770233, + "rewards/kidney_reward/std": 1.5903600454330444, + "rewards/length2tails_reward/mean": 0.7845340371131897, + "rewards/length2tails_reward/std": 0.3012000024318695, + "rewards/thermo_reward/mean": -0.016326233744621277, + "rewards/thermo_reward/std": 2.06636643409729, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12667525000870228, + "epoch": 0.512, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5889103412628174, + "learning_rate": 1.1529858362840382e-06, + "loss": -0.0069, + "num_tokens": 2242297.0, + "reward": 6.215231895446777, + "reward_std": 1.858242154121399, + "rewards/fitness_reward/mean": 6.076353073120117, + "rewards/fitness_reward/std": 1.2157716751098633, + "rewards/kidney_reward/mean": 0.13728436827659607, + "rewards/kidney_reward/std": 1.205039381980896, + "rewards/length2tails_reward/mean": 0.724676787853241, + "rewards/length2tails_reward/std": 0.3713679313659668, + "rewards/thermo_reward/mean": -0.22186486423015594, + "rewards/thermo_reward/std": 2.0663976669311523, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 273.1875, + "completions/mean_terminated_length": 273.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.146596341393888, + "epoch": 0.514, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9780179858207703, + "learning_rate": 1.1460830285624116e-06, + "loss": 0.0013, + "num_tokens": 2251071.0, + "reward": 6.679516792297363, + "reward_std": 1.516502022743225, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.2098180651664734, + "rewards/kidney_reward/std": 1.3393386602401733, + "rewards/length2tails_reward/mean": 0.8087002635002136, + "rewards/length2tails_reward/std": 0.3076275885105133, + "rewards/thermo_reward/mean": -0.2317536324262619, + "rewards/thermo_reward/std": 2.1532649993896484, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 263.53125, + "completions/mean_terminated_length": 263.53125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.16931257583200932, + "epoch": 0.516, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3027231693267822, + "learning_rate": 1.1391731009600653e-06, + "loss": -0.1214, + "num_tokens": 2259536.0, + "reward": 5.622883319854736, + "reward_std": 3.0197651386260986, + "rewards/fitness_reward/mean": 5.9119553565979, + "rewards/fitness_reward/std": 2.6751906871795654, + "rewards/kidney_reward/mean": -0.5463274121284485, + "rewards/kidney_reward/std": 1.3963298797607422, + "rewards/length2tails_reward/mean": 0.9041286110877991, + "rewards/length2tails_reward/std": 0.17764364182949066, + "rewards/thermo_reward/mean": -0.48388081789016724, + "rewards/thermo_reward/std": 2.3388588428497314, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 280.84375, + "completions/mean_terminated_length": 280.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.16965920012444258, + "epoch": 0.518, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7378101348876953, + "learning_rate": 1.1322563902571225e-06, + "loss": 0.0972, + "num_tokens": 2268555.0, + "reward": 5.932882785797119, + "reward_std": 2.845289468765259, + "rewards/fitness_reward/mean": 5.962001800537109, + "rewards/fitness_reward/std": 2.4900543689727783, + "rewards/kidney_reward/mean": -0.1587393879890442, + "rewards/kidney_reward/std": 1.3558075428009033, + "rewards/length2tails_reward/mean": 0.868542492389679, + "rewards/length2tails_reward/std": 0.2091815173625946, + "rewards/thermo_reward/mean": -0.3337695002555847, + "rewards/thermo_reward/std": 2.0171165466308594, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.14380551129579544, + "epoch": 0.52, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4390329420566559, + "learning_rate": 1.1253332335643042e-06, + "loss": 0.0044, + "num_tokens": 2277321.0, + "reward": 6.579376220703125, + "reward_std": 1.263524055480957, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.2627079486846924, + "rewards/kidney_reward/std": 1.264919638633728, + "rewards/length2tails_reward/mean": 0.883327305316925, + "rewards/length2tails_reward/std": 0.17075875401496887, + "rewards/thermo_reward/mean": -0.2027987837791443, + "rewards/thermo_reward/std": 2.105802536010742, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 278.125, + "completions/mean_terminated_length": 278.125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.14420662447810173, + "epoch": 0.522, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7461501955986023, + "learning_rate": 1.1184039683065012e-06, + "loss": 0.0868, + "num_tokens": 2286253.0, + "reward": 6.388462066650391, + "reward_std": 2.1272552013397217, + "rewards/fitness_reward/mean": 5.992526054382324, + "rewards/fitness_reward/std": 1.858837366104126, + "rewards/kidney_reward/mean": 0.113833948969841, + "rewards/kidney_reward/std": 1.5173331499099731, + "rewards/length2tails_reward/mean": 0.8281090259552002, + "rewards/length2tails_reward/std": 0.2703193128108978, + "rewards/thermo_reward/mean": 0.26398321986198425, + "rewards/thermo_reward/std": 1.9258671998977661, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 277.59375, + "completions/mean_terminated_length": 277.59375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1499346848577261, + "epoch": 0.524, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4616916179656982, + "learning_rate": 1.1114689322063254e-06, + "loss": 0.1001, + "num_tokens": 2295168.0, + "reward": 6.525530815124512, + "reward_std": 2.2752037048339844, + "rewards/fitness_reward/mean": 6.118172645568848, + "rewards/fitness_reward/std": 2.155168056488037, + "rewards/kidney_reward/mean": 0.012175392359495163, + "rewards/kidney_reward/std": 1.2831966876983643, + "rewards/length2tails_reward/mean": 0.8546762466430664, + "rewards/length2tails_reward/std": 0.2366992086172104, + "rewards/thermo_reward/mean": 0.37520337104797363, + "rewards/thermo_reward/std": 1.903710126876831, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 265.0, + "completions/mean_terminated_length": 265.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "entropy": 0.18243697751313448, + "epoch": 0.526, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7324366569519043, + "learning_rate": 1.1045284632676535e-06, + "loss": -0.1355, + "num_tokens": 2303680.0, + "reward": 5.568639755249023, + "reward_std": 3.6000919342041016, + "rewards/fitness_reward/mean": 5.43031120300293, + "rewards/fitness_reward/std": 3.362344741821289, + "rewards/kidney_reward/mean": 0.006486307829618454, + "rewards/kidney_reward/std": 1.351361870765686, + "rewards/length2tails_reward/mean": 0.8496532440185547, + "rewards/length2tails_reward/std": 0.27389219403266907, + "rewards/thermo_reward/mean": -0.1546546071767807, + "rewards/thermo_reward/std": 2.102027416229248, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 262.78125, + "completions/mean_terminated_length": 262.78125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.15076796151697636, + "epoch": 0.528, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.42953622341156, + "learning_rate": 1.0975828997591495e-06, + "loss": -0.1263, + "num_tokens": 2312121.0, + "reward": 5.996249198913574, + "reward_std": 2.5402796268463135, + "rewards/fitness_reward/mean": 6.155496120452881, + "rewards/fitness_reward/std": 1.952719807624817, + "rewards/kidney_reward/mean": -0.03896676003932953, + "rewards/kidney_reward/std": 1.3960011005401611, + "rewards/length2tails_reward/mean": 0.6987098455429077, + "rewards/length2tails_reward/std": 0.34228160977363586, + "rewards/thermo_reward/mean": -0.6288823485374451, + "rewards/thermo_reward/std": 2.154966354370117, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.1448302799835801, + "epoch": 0.53, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9759800434112549, + "learning_rate": 1.0906325801977803e-06, + "loss": -0.0063, + "num_tokens": 2320804.0, + "reward": 6.080069541931152, + "reward_std": 3.0260355472564697, + "rewards/fitness_reward/mean": 5.839569091796875, + "rewards/fitness_reward/std": 2.6037561893463135, + "rewards/kidney_reward/mean": 0.0014719441533088684, + "rewards/kidney_reward/std": 1.2305020093917847, + "rewards/length2tails_reward/mean": 0.7766636610031128, + "rewards/length2tails_reward/std": 0.2810504734516144, + "rewards/thermo_reward/mean": 0.09119720011949539, + "rewards/thermo_reward/std": 2.016065835952759, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11841264460235834, + "epoch": 0.532, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6071414351463318, + "learning_rate": 1.0836778433323157e-06, + "loss": 0.0077, + "num_tokens": 2329484.0, + "reward": 5.9090495109558105, + "reward_std": 2.454660415649414, + "rewards/fitness_reward/mean": 5.994655609130859, + "rewards/fitness_reward/std": 2.3544161319732666, + "rewards/kidney_reward/mean": -0.20192506909370422, + "rewards/kidney_reward/std": 1.1880277395248413, + "rewards/length2tails_reward/mean": 0.7841193675994873, + "rewards/length2tails_reward/std": 0.2957149147987366, + "rewards/thermo_reward/mean": -0.36134740710258484, + "rewards/thermo_reward/std": 2.038168430328369, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 285.1875, + "completions/mean_terminated_length": 270.06451416015625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.17456878162920475, + "epoch": 0.534, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8888232707977295, + "learning_rate": 1.0767190281268186e-06, + "loss": 0.1326, + "num_tokens": 2338642.0, + "reward": 5.616744041442871, + "reward_std": 3.449208974838257, + "rewards/fitness_reward/mean": 5.386440277099609, + "rewards/fitness_reward/std": 3.242466449737549, + "rewards/kidney_reward/mean": 0.167307510972023, + "rewards/kidney_reward/std": 1.3582977056503296, + "rewards/length2tails_reward/mean": 0.775815486907959, + "rewards/length2tails_reward/std": 0.2952445447444916, + "rewards/thermo_reward/mean": -0.09460898488759995, + "rewards/thermo_reward/std": 2.12760853767395, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 273.78125, + "completions/mean_terminated_length": 273.78125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12719732522964478, + "epoch": 0.536, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46400102972984314, + "learning_rate": 1.069756473744125e-06, + "loss": -0.0049, + "num_tokens": 2347435.0, + "reward": 5.965997219085693, + "reward_std": 1.212018370628357, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.5127175450325012, + "rewards/kidney_reward/std": 1.2551615238189697, + "rewards/length2tails_reward/mean": 0.8376985192298889, + "rewards/length2tails_reward/std": 0.2924360930919647, + "rewards/thermo_reward/mean": -0.7447768449783325, + "rewards/thermo_reward/std": 2.066016912460327, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 279.6875, + "completions/mean_terminated_length": 279.6875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14143760781735182, + "epoch": 0.538, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2774434089660645, + "learning_rate": 1.0627905195293135e-06, + "loss": 0.0847, + "num_tokens": 2356417.0, + "reward": 4.858714580535889, + "reward_std": 4.066253662109375, + "rewards/fitness_reward/mean": 4.707261085510254, + "rewards/fitness_reward/std": 3.9718406200408936, + "rewards/kidney_reward/mean": -0.04432570934295654, + "rewards/kidney_reward/std": 1.2094591856002808, + "rewards/length2tails_reward/mean": 0.773606538772583, + "rewards/length2tails_reward/std": 0.2912238538265228, + "rewards/thermo_reward/mean": -0.03957007825374603, + "rewards/thermo_reward/std": 1.5913993120193481, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 273.625, + "completions/mean_terminated_length": 273.625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.1580907767638564, + "epoch": 0.54, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.421672821044922, + "learning_rate": 1.055821504993164e-06, + "loss": 0.0416, + "num_tokens": 2365205.0, + "reward": 5.788364410400391, + "reward_std": 2.8610990047454834, + "rewards/fitness_reward/mean": 5.786640644073486, + "rewards/fitness_reward/std": 2.7975590229034424, + "rewards/kidney_reward/mean": -0.1281442642211914, + "rewards/kidney_reward/std": 1.226645827293396, + "rewards/length2tails_reward/mean": 0.7794501781463623, + "rewards/length2tails_reward/std": 0.33686086535453796, + "rewards/thermo_reward/mean": -0.25813406705856323, + "rewards/thermo_reward/std": 2.3839523792266846, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 273.875, + "completions/mean_terminated_length": 273.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.15838243439793587, + "epoch": 0.542, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9077131748199463, + "learning_rate": 1.0488497697956134e-06, + "loss": 0.0283, + "num_tokens": 2374001.0, + "reward": 5.788833141326904, + "reward_std": 2.9266490936279297, + "rewards/fitness_reward/mean": 5.630908966064453, + "rewards/fitness_reward/std": 2.692647933959961, + "rewards/kidney_reward/mean": -0.3564684987068176, + "rewards/kidney_reward/std": 1.5307029485702515, + "rewards/length2tails_reward/mean": 0.8538310527801514, + "rewards/length2tails_reward/std": 0.21145357191562653, + "rewards/thermo_reward/mean": 0.2454012632369995, + "rewards/thermo_reward/std": 1.9245532751083374, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 281.9375, + "completions/mean_terminated_length": 281.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1725459909066558, + "epoch": 0.544, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.738239288330078, + "learning_rate": 1.0418756537291995e-06, + "loss": 0.1011, + "num_tokens": 2383055.0, + "reward": 6.081518173217773, + "reward_std": 2.6257402896881104, + "rewards/fitness_reward/mean": 5.872672080993652, + "rewards/fitness_reward/std": 2.471569776535034, + "rewards/kidney_reward/mean": 0.218740314245224, + "rewards/kidney_reward/std": 1.32453191280365, + "rewards/length2tails_reward/mean": 0.7809317111968994, + "rewards/length2tails_reward/std": 0.2966778874397278, + "rewards/thermo_reward/mean": -0.19151431322097778, + "rewards/thermo_reward/std": 1.905684232711792, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14585830736905336, + "epoch": 0.546, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9640302658081055, + "learning_rate": 1.034899496702501e-06, + "loss": 0.0022, + "num_tokens": 2391748.0, + "reward": 5.990623474121094, + "reward_std": 2.397167921066284, + "rewards/fitness_reward/mean": 5.930706977844238, + "rewards/fitness_reward/std": 2.2439072132110596, + "rewards/kidney_reward/mean": -0.07944446057081223, + "rewards/kidney_reward/std": 1.4068588018417358, + "rewards/length2tails_reward/mean": 0.8241256475448608, + "rewards/length2tails_reward/std": 0.23539581894874573, + "rewards/thermo_reward/mean": -0.21278566122055054, + "rewards/thermo_reward/std": 2.2069005966186523, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 0.15625540539622307, + "epoch": 0.548, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5687947273254395, + "learning_rate": 1.0279216387235689e-06, + "loss": 0.0167, + "num_tokens": 2400459.0, + "reward": 5.703569412231445, + "reward_std": 2.96517014503479, + "rewards/fitness_reward/mean": 5.671274185180664, + "rewards/fitness_reward/std": 2.8722054958343506, + "rewards/kidney_reward/mean": 0.06752986460924149, + "rewards/kidney_reward/std": 1.258401870727539, + "rewards/length2tails_reward/mean": 0.7752214670181274, + "rewards/length2tails_reward/std": 0.32070493698120117, + "rewards/thermo_reward/mean": -0.39054977893829346, + "rewards/thermo_reward/std": 2.105935573577881, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1423500245437026, + "epoch": 0.55, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6806622743606567, + "learning_rate": 1.020942419883357e-06, + "loss": 0.0033, + "num_tokens": 2409169.0, + "reward": 6.5700788497924805, + "reward_std": 1.7746098041534424, + "rewards/fitness_reward/mean": 6.296484470367432, + "rewards/fitness_reward/std": 1.6677184104919434, + "rewards/kidney_reward/mean": -0.19006654620170593, + "rewards/kidney_reward/std": 1.2162213325500488, + "rewards/length2tails_reward/mean": 0.8175604939460754, + "rewards/length2tails_reward/std": 0.2305397242307663, + "rewards/thermo_reward/mean": 0.32847630977630615, + "rewards/thermo_reward/std": 1.8085222244262695, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 280.28125, + "completions/mean_terminated_length": 280.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14487546402961016, + "epoch": 0.552, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8299687504768372, + "learning_rate": 1.0139621803391454e-06, + "loss": 0.0467, + "num_tokens": 2418170.0, + "reward": 6.545259952545166, + "reward_std": 1.3277207612991333, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.17356184124946594, + "rewards/kidney_reward/std": 1.263597846031189, + "rewards/length2tails_reward/mean": 0.8215415477752686, + "rewards/length2tails_reward/std": 0.2258286327123642, + "rewards/thermo_reward/mean": -0.3292858600616455, + "rewards/thermo_reward/std": 2.039425849914551, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1272141383960843, + "epoch": 0.554, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7546288371086121, + "learning_rate": 1.0069812602979615e-06, + "loss": 0.0003, + "num_tokens": 2426857.0, + "reward": 5.912240028381348, + "reward_std": 2.3872926235198975, + "rewards/fitness_reward/mean": 5.988445281982422, + "rewards/fitness_reward/std": 1.878536343574524, + "rewards/kidney_reward/mean": -0.03585119545459747, + "rewards/kidney_reward/std": 1.2344835996627808, + "rewards/length2tails_reward/mean": 0.7913399934768677, + "rewards/length2tails_reward/std": 0.2957223057746887, + "rewards/thermo_reward/mean": -0.512228786945343, + "rewards/thermo_reward/std": 2.159263849258423, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 279.125, + "completions/mean_terminated_length": 279.125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.17997045908123255, + "epoch": 0.556, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.979198694229126, + "learning_rate": 1e-06, + "loss": 0.0132, + "num_tokens": 2435821.0, + "reward": 5.03554630279541, + "reward_std": 3.796201229095459, + "rewards/fitness_reward/mean": 4.998668193817139, + "rewards/fitness_reward/std": 3.7583067417144775, + "rewards/kidney_reward/mean": 0.06615039706230164, + "rewards/kidney_reward/std": 1.4136227369308472, + "rewards/length2tails_reward/mean": 0.7691352367401123, + "rewards/length2tails_reward/std": 0.32233336567878723, + "rewards/thermo_reward/mean": -0.376961886882782, + "rewards/thermo_reward/std": 1.9719232320785522, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14837611094117165, + "epoch": 0.558, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6687464118003845, + "learning_rate": 9.930187397020384e-07, + "loss": -0.0023, + "num_tokens": 2444545.0, + "reward": 5.772210121154785, + "reward_std": 2.6880130767822266, + "rewards/fitness_reward/mean": 5.614042282104492, + "rewards/fitness_reward/std": 2.792747974395752, + "rewards/kidney_reward/mean": -0.2538801431655884, + "rewards/kidney_reward/std": 1.353262186050415, + "rewards/length2tails_reward/mean": 0.8017355799674988, + "rewards/length2tails_reward/std": 0.2695324122905731, + "rewards/thermo_reward/mean": 0.1693476289510727, + "rewards/thermo_reward/std": 2.0165786743164062, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.21875, + "completions/mean_terminated_length": 269.21875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.15388427395373583, + "epoch": 0.56, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8253613710403442, + "learning_rate": 9.860378196608547e-07, + "loss": -0.0131, + "num_tokens": 2453192.0, + "reward": 6.641109466552734, + "reward_std": 2.0452044010162354, + "rewards/fitness_reward/mean": 6.2334065437316895, + "rewards/fitness_reward/std": 2.024541139602661, + "rewards/kidney_reward/mean": -0.07310893386602402, + "rewards/kidney_reward/std": 1.2692768573760986, + "rewards/length2tails_reward/mean": 0.7746356725692749, + "rewards/length2tails_reward/std": 0.2963363826274872, + "rewards/thermo_reward/mean": 0.5011963844299316, + "rewards/thermo_reward/std": 1.5752555131912231, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 275.09375, + "completions/mean_terminated_length": 275.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.16355876345187426, + "epoch": 0.562, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3657917976379395, + "learning_rate": 9.790575801166431e-07, + "loss": 0.0445, + "num_tokens": 2462027.0, + "reward": 5.599199295043945, + "reward_std": 3.717531204223633, + "rewards/fitness_reward/mean": 5.3801445960998535, + "rewards/fitness_reward/std": 3.222134828567505, + "rewards/kidney_reward/mean": 0.3526960015296936, + "rewards/kidney_reward/std": 1.5738078355789185, + "rewards/length2tails_reward/mean": 0.8324109315872192, + "rewards/length2tails_reward/std": 0.26209557056427, + "rewards/thermo_reward/mean": -0.3307921588420868, + "rewards/thermo_reward/std": 2.23809814453125, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.0, + "completions/mean_terminated_length": 269.0, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.12554469238966703, + "epoch": 0.564, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.652640700340271, + "learning_rate": 9.720783612764313e-07, + "loss": -0.0163, + "num_tokens": 2470667.0, + "reward": 6.005587100982666, + "reward_std": 2.7134058475494385, + "rewards/fitness_reward/mean": 6.015375137329102, + "rewards/fitness_reward/std": 2.2088727951049805, + "rewards/kidney_reward/mean": -0.1422906517982483, + "rewards/kidney_reward/std": 1.4053009748458862, + "rewards/length2tails_reward/mean": 0.7117120623588562, + "rewards/length2tails_reward/std": 0.341067373752594, + "rewards/thermo_reward/mean": -0.23314118385314941, + "rewards/thermo_reward/std": 1.9345332384109497, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14547014515846968, + "epoch": 0.566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6224685907363892, + "learning_rate": 9.651005032974993e-07, + "loss": 0.0016, + "num_tokens": 2479362.0, + "reward": 6.701007843017578, + "reward_std": 1.1542237997055054, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.002499748021364212, + "rewards/kidney_reward/std": 1.3801093101501465, + "rewards/length2tails_reward/mean": 0.792314887046814, + "rewards/length2tails_reward/std": 0.26681551337242126, + "rewards/thermo_reward/mean": 0.2327187955379486, + "rewards/thermo_reward/std": 1.6495410203933716, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.15400028508156538, + "epoch": 0.568, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3945049047470093, + "learning_rate": 9.581243462708005e-07, + "loss": 0.0046, + "num_tokens": 2488097.0, + "reward": 5.999971389770508, + "reward_std": 2.1510202884674072, + "rewards/fitness_reward/mean": 5.867298603057861, + "rewards/fitness_reward/std": 2.019099235534668, + "rewards/kidney_reward/mean": -0.004271138459444046, + "rewards/kidney_reward/std": 1.3482393026351929, + "rewards/length2tails_reward/mean": 0.7591778039932251, + "rewards/length2tails_reward/std": 0.32747241854667664, + "rewards/thermo_reward/mean": -0.10997310280799866, + "rewards/thermo_reward/std": 2.031148672103882, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13353883754462004, + "epoch": 0.57, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4228142499923706, + "learning_rate": 9.511502302043867e-07, + "loss": 0.0021, + "num_tokens": 2496780.0, + "reward": 6.48953104019165, + "reward_std": 1.3167486190795898, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.1205441802740097, + "rewards/kidney_reward/std": 1.3729273080825806, + "rewards/length2tails_reward/mean": 0.7895287275314331, + "rewards/length2tails_reward/std": 0.30240556597709656, + "rewards/thermo_reward/mean": -0.2717765271663666, + "rewards/thermo_reward/std": 1.9287211894989014, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 273.6875, + "completions/mean_terminated_length": 273.6875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.17670563887804747, + "epoch": 0.572, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1856436729431152, + "learning_rate": 9.441784950068361e-07, + "loss": 0.022, + "num_tokens": 2505570.0, + "reward": 5.552561283111572, + "reward_std": 3.0569052696228027, + "rewards/fitness_reward/mean": 5.5452375411987305, + "rewards/fitness_reward/std": 3.0043234825134277, + "rewards/kidney_reward/mean": -0.028089947998523712, + "rewards/kidney_reward/std": 1.2665034532546997, + "rewards/length2tails_reward/mean": 0.8060954809188843, + "rewards/length2tails_reward/std": 0.26724883913993835, + "rewards/thermo_reward/mean": -0.36030930280685425, + "rewards/thermo_reward/std": 2.0128753185272217, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.13899183459579945, + "epoch": 0.574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8525024056434631, + "learning_rate": 9.372094804706866e-07, + "loss": -0.0047, + "num_tokens": 2514271.0, + "reward": 5.807683944702148, + "reward_std": 2.6605942249298096, + "rewards/fitness_reward/mean": 5.804408073425293, + "rewards/fitness_reward/std": 2.3758177757263184, + "rewards/kidney_reward/mean": 0.0376010537147522, + "rewards/kidney_reward/std": 1.4945979118347168, + "rewards/length2tails_reward/mean": 0.8038283586502075, + "rewards/length2tails_reward/std": 0.29997193813323975, + "rewards/thermo_reward/mean": -0.4329639673233032, + "rewards/thermo_reward/std": 2.102201223373413, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 273.15625, + "completions/mean_terminated_length": 273.15625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13818454928696156, + "epoch": 0.576, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0629791021347046, + "learning_rate": 9.302435262558747e-07, + "loss": 0.0278, + "num_tokens": 2523044.0, + "reward": 5.959643363952637, + "reward_std": 2.1231353282928467, + "rewards/fitness_reward/mean": 5.888861656188965, + "rewards/fitness_reward/std": 1.9181643724441528, + "rewards/kidney_reward/mean": -0.033825114369392395, + "rewards/kidney_reward/std": 1.4832185506820679, + "rewards/length2tails_reward/mean": 0.8217208981513977, + "rewards/length2tails_reward/std": 0.28788596391677856, + "rewards/thermo_reward/mean": -0.23547188937664032, + "rewards/thermo_reward/std": 2.246788501739502, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 258.8125, + "completions/mean_terminated_length": 258.8125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.1594973113387823, + "epoch": 0.578, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1755261421203613, + "learning_rate": 9.232809718731813e-07, + "loss": -0.1287, + "num_tokens": 2531358.0, + "reward": 5.4473443031311035, + "reward_std": 3.200383186340332, + "rewards/fitness_reward/mean": 5.256423473358154, + "rewards/fitness_reward/std": 3.2978856563568115, + "rewards/kidney_reward/mean": 0.06129808351397514, + "rewards/kidney_reward/std": 1.3483083248138428, + "rewards/length2tails_reward/mean": 0.8028110265731812, + "rewards/length2tails_reward/std": 0.3241569399833679, + "rewards/thermo_reward/mean": -0.08086204528808594, + "rewards/thermo_reward/std": 2.1148059368133545, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.4375, + "completions/mean_terminated_length": 269.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12140040285885334, + "epoch": 0.58, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4034607410430908, + "learning_rate": 9.163221566676847e-07, + "loss": -0.0016, + "num_tokens": 2540012.0, + "reward": 6.423543930053711, + "reward_std": 1.8661997318267822, + "rewards/fitness_reward/mean": 6.188730716705322, + "rewards/fitness_reward/std": 1.7741566896438599, + "rewards/kidney_reward/mean": -0.32657331228256226, + "rewards/kidney_reward/std": 1.2123817205429077, + "rewards/length2tails_reward/mean": 0.7576315402984619, + "rewards/length2tails_reward/std": 0.2643510401248932, + "rewards/thermo_reward/mean": 0.4173838794231415, + "rewards/thermo_reward/std": 1.7833548784255981, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 277.9375, + "completions/mean_terminated_length": 277.9375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14943666849285364, + "epoch": 0.582, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.043025255203247, + "learning_rate": 9.093674198022199e-07, + "loss": 0.0795, + "num_tokens": 2548938.0, + "reward": 5.3123087882995605, + "reward_std": 3.6125266551971436, + "rewards/fitness_reward/mean": 5.243748664855957, + "rewards/fitness_reward/std": 3.3299062252044678, + "rewards/kidney_reward/mean": -0.48788362741470337, + "rewards/kidney_reward/std": 1.26163649559021, + "rewards/length2tails_reward/mean": 0.8052570223808289, + "rewards/length2tails_reward/std": 0.28455302119255066, + "rewards/thermo_reward/mean": 0.22237543761730194, + "rewards/thermo_reward/std": 1.7955067157745361, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.0625, + "completions/mean_terminated_length": 269.0625, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "entropy": 0.1679877294227481, + "epoch": 0.584, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9829885959625244, + "learning_rate": 9.024171002408506e-07, + "loss": -0.0103, + "num_tokens": 2557580.0, + "reward": 5.355369567871094, + "reward_std": 3.582345724105835, + "rewards/fitness_reward/mean": 5.362346649169922, + "rewards/fitness_reward/std": 3.2837119102478027, + "rewards/kidney_reward/mean": -0.1609899401664734, + "rewards/kidney_reward/std": 1.333462119102478, + "rewards/length2tails_reward/mean": 0.744073748588562, + "rewards/length2tails_reward/std": 0.31323665380477905, + "rewards/thermo_reward/mean": -0.2250012755393982, + "rewards/thermo_reward/std": 2.1173300743103027, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 267.40625, + "completions/mean_terminated_length": 267.40625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.13892160076647997, + "epoch": 0.586, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8492016792297363, + "learning_rate": 8.954715367323466e-07, + "loss": -0.1088, + "num_tokens": 2566169.0, + "reward": 5.543354034423828, + "reward_std": 2.9506261348724365, + "rewards/fitness_reward/mean": 5.78358268737793, + "rewards/fitness_reward/std": 2.8111298084259033, + "rewards/kidney_reward/mean": -0.7873251438140869, + "rewards/kidney_reward/std": 1.2815355062484741, + "rewards/length2tails_reward/mean": 0.8598273992538452, + "rewards/length2tails_reward/std": 0.22865496575832367, + "rewards/thermo_reward/mean": -0.12304553389549255, + "rewards/thermo_reward/std": 2.137643814086914, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 282.40625, + "completions/mean_terminated_length": 282.40625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.16803938429802656, + "epoch": 0.588, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.047661304473877, + "learning_rate": 8.885310677936746e-07, + "loss": 0.077, + "num_tokens": 2575238.0, + "reward": 5.033448696136475, + "reward_std": 3.4656853675842285, + "rewards/fitness_reward/mean": 4.970721244812012, + "rewards/fitness_reward/std": 3.5634567737579346, + "rewards/kidney_reward/mean": 0.032870735973119736, + "rewards/kidney_reward/std": 1.3879930973052979, + "rewards/length2tails_reward/mean": 0.7979098558425903, + "rewards/length2tails_reward/std": 0.3307546377182007, + "rewards/thermo_reward/mean": -0.3063697814941406, + "rewards/thermo_reward/std": 2.1015470027923584, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.128823634237051, + "epoch": 0.59, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35255303978919983, + "learning_rate": 8.81596031693499e-07, + "loss": -0.001, + "num_tokens": 2583933.0, + "reward": 6.1219048500061035, + "reward_std": 2.2153446674346924, + "rewards/fitness_reward/mean": 6.1872687339782715, + "rewards/fitness_reward/std": 1.7819706201553345, + "rewards/kidney_reward/mean": -0.0763387680053711, + "rewards/kidney_reward/std": 1.3682671785354614, + "rewards/length2tails_reward/mean": 0.7683770060539246, + "rewards/length2tails_reward/std": 0.31339552998542786, + "rewards/thermo_reward/mean": -0.4385773539543152, + "rewards/thermo_reward/std": 2.0758681297302246, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "entropy": 0.12887464184314013, + "epoch": 0.592, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7549186944961548, + "learning_rate": 8.746667664356955e-07, + "loss": -0.0288, + "num_tokens": 2592597.0, + "reward": 5.737493515014648, + "reward_std": 2.3524863719940186, + "rewards/fitness_reward/mean": 6.003878116607666, + "rewards/fitness_reward/std": 2.2695066928863525, + "rewards/kidney_reward/mean": -0.4935506284236908, + "rewards/kidney_reward/std": 1.170538067817688, + "rewards/length2tails_reward/mean": 0.8755277395248413, + "rewards/length2tails_reward/std": 0.22547662258148193, + "rewards/thermo_reward/mean": -0.4769827723503113, + "rewards/thermo_reward/std": 2.1505541801452637, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.1875, + "completions/mean_terminated_length": 269.1875, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.13547687977552414, + "epoch": 0.594, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8880061507225037, + "learning_rate": 8.677436097428774e-07, + "loss": 0.0043, + "num_tokens": 2601243.0, + "reward": 6.46945858001709, + "reward_std": 1.442918062210083, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": 0.06878862529993057, + "rewards/kidney_reward/std": 1.2600641250610352, + "rewards/length2tails_reward/mean": 0.7656205892562866, + "rewards/length2tails_reward/std": 0.2914549708366394, + "rewards/thermo_reward/mean": -0.07734374701976776, + "rewards/thermo_reward/std": 2.0001652240753174, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.1765116062015295, + "epoch": 0.596, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8891026973724365, + "learning_rate": 8.608268990399348e-07, + "loss": 0.0201, + "num_tokens": 2609981.0, + "reward": 6.024785995483398, + "reward_std": 3.543193817138672, + "rewards/fitness_reward/mean": 5.517086982727051, + "rewards/fitness_reward/std": 3.1167924404144287, + "rewards/kidney_reward/mean": -0.03205416351556778, + "rewards/kidney_reward/std": 1.2035332918167114, + "rewards/length2tails_reward/mean": 0.8274141550064087, + "rewards/length2tails_reward/std": 0.23523229360580444, + "rewards/thermo_reward/mean": 0.6337454319000244, + "rewards/thermo_reward/std": 1.706175446510315, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 278.6875, + "completions/mean_terminated_length": 278.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.15879554394632578, + "epoch": 0.598, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7817070484161377, + "learning_rate": 8.539169714375885e-07, + "loss": 0.1088, + "num_tokens": 2618931.0, + "reward": 6.548776626586914, + "reward_std": 2.27897310256958, + "rewards/fitness_reward/mean": 6.193077087402344, + "rewards/fitness_reward/std": 1.7509506940841675, + "rewards/kidney_reward/mean": 0.06910586357116699, + "rewards/kidney_reward/std": 1.4774768352508545, + "rewards/length2tails_reward/mean": 0.8031742572784424, + "rewards/length2tails_reward/std": 0.27509331703186035, + "rewards/thermo_reward/mean": 0.24070586264133453, + "rewards/thermo_reward/std": 1.9352507591247559, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13626266457140446, + "epoch": 0.6, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3759794235229492, + "learning_rate": 8.47014163715962e-07, + "loss": 0.0122, + "num_tokens": 2627639.0, + "reward": 6.452775001525879, + "reward_std": 2.030623197555542, + "rewards/fitness_reward/mean": 6.20017147064209, + "rewards/fitness_reward/std": 1.7131540775299072, + "rewards/kidney_reward/mean": -0.03645924851298332, + "rewards/kidney_reward/std": 1.2895108461380005, + "rewards/length2tails_reward/mean": 0.7727314233779907, + "rewards/length2tails_reward/std": 0.3181847929954529, + "rewards/thermo_reward/mean": 0.15530048310756683, + "rewards/thermo_reward/std": 1.8542375564575195, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 268.84375, + "completions/mean_terminated_length": 268.84375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.15447265189141035, + "epoch": 0.602, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0929714441299438, + "learning_rate": 8.401188123081652e-07, + "loss": -0.044, + "num_tokens": 2636274.0, + "reward": 5.9095001220703125, + "reward_std": 2.3706412315368652, + "rewards/fitness_reward/mean": 5.9298272132873535, + "rewards/fitness_reward/std": 2.1688692569732666, + "rewards/kidney_reward/mean": -0.1626996546983719, + "rewards/kidney_reward/std": 1.2477351427078247, + "rewards/length2tails_reward/mean": 0.8582019805908203, + "rewards/length2tails_reward/std": 0.24992448091506958, + "rewards/thermo_reward/mean": -0.30705544352531433, + "rewards/thermo_reward/std": 2.1065003871917725, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.28125, + "completions/mean_terminated_length": 270.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12860836926847696, + "epoch": 0.604, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2735062837600708, + "learning_rate": 8.332312532838978e-07, + "loss": 0.0021, + "num_tokens": 2644955.0, + "reward": 6.560126304626465, + "reward_std": 1.2234059572219849, + "rewards/fitness_reward/mean": 6.2823309898376465, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": 0.23910409212112427, + "rewards/kidney_reward/std": 1.4007219076156616, + "rewards/length2tails_reward/mean": 0.7388077974319458, + "rewards/length2tails_reward/std": 0.28337112069129944, + "rewards/thermo_reward/mean": -0.052918002009391785, + "rewards/thermo_reward/std": 1.9731281995773315, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 264.90625, + "completions/mean_terminated_length": 264.90625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.1408443758264184, + "epoch": 0.606, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8262622952461243, + "learning_rate": 8.263518223330696e-07, + "loss": -0.0483, + "num_tokens": 2653464.0, + "reward": 5.640756130218506, + "reward_std": 3.0255091190338135, + "rewards/fitness_reward/mean": 5.2705078125, + "rewards/fitness_reward/std": 2.9505343437194824, + "rewards/kidney_reward/mean": 0.08989809453487396, + "rewards/kidney_reward/std": 1.3180209398269653, + "rewards/length2tails_reward/mean": 0.7812709212303162, + "rewards/length2tails_reward/std": 0.2842097282409668, + "rewards/thermo_reward/mean": 0.2599630355834961, + "rewards/thermo_reward/std": 1.8476253747940063, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.03125, + "completions/mean_terminated_length": 270.03125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13957977015525103, + "epoch": 0.608, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1814037561416626, + "learning_rate": 8.194808547494401e-07, + "loss": -0.002, + "num_tokens": 2662137.0, + "reward": 5.991512775421143, + "reward_std": 2.895458459854126, + "rewards/fitness_reward/mean": 5.794535160064697, + "rewards/fitness_reward/std": 2.769155263900757, + "rewards/kidney_reward/mean": -0.18175917863845825, + "rewards/kidney_reward/std": 1.2326058149337769, + "rewards/length2tails_reward/mean": 0.7655566930770874, + "rewards/length2tails_reward/std": 0.3070470690727234, + "rewards/thermo_reward/mean": 0.1929364800453186, + "rewards/thermo_reward/std": 1.990727424621582, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.16747733019292355, + "epoch": 0.61, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.186681032180786, + "learning_rate": 8.126186854142751e-07, + "loss": -0.0209, + "num_tokens": 2670825.0, + "reward": 5.879621505737305, + "reward_std": 3.4141082763671875, + "rewards/fitness_reward/mean": 5.552954196929932, + "rewards/fitness_reward/std": 3.2934844493865967, + "rewards/kidney_reward/mean": -0.05889531224966049, + "rewards/kidney_reward/std": 1.345290184020996, + "rewards/length2tails_reward/mean": 0.8045759201049805, + "rewards/length2tails_reward/std": 0.27133816480636597, + "rewards/thermo_reward/mean": 0.30994102358818054, + "rewards/thermo_reward/std": 1.9813530445098877, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 264.5625, + "completions/mean_terminated_length": 264.5625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.14618254080414772, + "epoch": 0.612, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9095790386199951, + "learning_rate": 8.057656487800282e-07, + "loss": -0.0528, + "num_tokens": 2679323.0, + "reward": 5.235918998718262, + "reward_std": 3.7534329891204834, + "rewards/fitness_reward/mean": 5.308850288391113, + "rewards/fitness_reward/std": 3.433398962020874, + "rewards/kidney_reward/mean": -0.2223380208015442, + "rewards/kidney_reward/std": 1.5016045570373535, + "rewards/length2tails_reward/mean": 0.8398911952972412, + "rewards/length2tails_reward/std": 0.2600333094596863, + "rewards/thermo_reward/mean": -0.3434699773788452, + "rewards/thermo_reward/std": 2.2440428733825684, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 267.0, + "completions/mean_terminated_length": 267.0, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.17721352633088827, + "epoch": 0.614, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4808382987976074, + "learning_rate": 7.989220788540355e-07, + "loss": -0.0362, + "num_tokens": 2687899.0, + "reward": 6.460134983062744, + "reward_std": 2.5526137351989746, + "rewards/fitness_reward/mean": 5.929336071014404, + "rewards/fitness_reward/std": 2.6182987689971924, + "rewards/kidney_reward/mean": 0.3675292432308197, + "rewards/kidney_reward/std": 1.189038634300232, + "rewards/length2tails_reward/mean": 0.8435029983520508, + "rewards/length2tails_reward/std": 0.240304633975029, + "rewards/thermo_reward/mean": 0.27231690287590027, + "rewards/thermo_reward/std": 1.809454321861267, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.0, + "completions/mean_terminated_length": 269.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12361082248389721, + "epoch": 0.616, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5217465162277222, + "learning_rate": 7.920883091822408e-07, + "loss": -0.0029, + "num_tokens": 2696539.0, + "reward": 5.749210357666016, + "reward_std": 2.427088737487793, + "rewards/fitness_reward/mean": 5.804673671722412, + "rewards/fitness_reward/std": 2.3728742599487305, + "rewards/kidney_reward/mean": -0.2303834855556488, + "rewards/kidney_reward/std": 1.2232918739318848, + "rewards/length2tails_reward/mean": 0.7257441282272339, + "rewards/length2tails_reward/std": 0.32820820808410645, + "rewards/thermo_reward/mean": -0.24341507256031036, + "rewards/thermo_reward/std": 1.9496954679489136, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.1583899725228548, + "epoch": 0.618, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7271563410758972, + "learning_rate": 7.852646728329367e-07, + "loss": 0.0016, + "num_tokens": 2705282.0, + "reward": 6.917350769042969, + "reward_std": 2.0332014560699463, + "rewards/fitness_reward/mean": 6.211165428161621, + "rewards/fitness_reward/std": 1.6548012495040894, + "rewards/kidney_reward/mean": 0.4039209187030792, + "rewards/kidney_reward/std": 1.3662333488464355, + "rewards/length2tails_reward/mean": 0.8635237216949463, + "rewards/length2tails_reward/std": 0.18739113211631775, + "rewards/thermo_reward/mean": 0.5766885280609131, + "rewards/thermo_reward/std": 1.5489522218704224, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14540214743465185, + "epoch": 0.62, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.38060182332992554, + "learning_rate": 7.784515023805327e-07, + "loss": -0.0004, + "num_tokens": 2714001.0, + "reward": 5.810445308685303, + "reward_std": 3.080669641494751, + "rewards/fitness_reward/mean": 5.936678886413574, + "rewards/fitness_reward/std": 2.600179672241211, + "rewards/kidney_reward/mean": -0.6011654138565063, + "rewards/kidney_reward/std": 1.3190113306045532, + "rewards/length2tails_reward/mean": 0.8296725153923035, + "rewards/length2tails_reward/std": 0.23379944264888763, + "rewards/thermo_reward/mean": -0.06613816320896149, + "rewards/thermo_reward/std": 1.96107816696167, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12451317626982927, + "epoch": 0.622, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5161978602409363, + "learning_rate": 7.716491298893441e-07, + "loss": -0.0002, + "num_tokens": 2722678.0, + "reward": 6.549075126647949, + "reward_std": 1.461757779121399, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.6216371059417725, + "rewards/kidney_reward/std": 1.185909390449524, + "rewards/length2tails_reward/mean": 0.756341814994812, + "rewards/length2tails_reward/std": 0.2793433666229248, + "rewards/thermo_reward/mean": 0.5709758996963501, + "rewards/thermo_reward/std": 1.6642720699310303, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.65625, + "completions/mean_terminated_length": 269.65625, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.14000983629375696, + "epoch": 0.624, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.64600670337677, + "learning_rate": 7.648578868974099e-07, + "loss": -0.0092, + "num_tokens": 2731339.0, + "reward": 5.813279628753662, + "reward_std": 2.7142364978790283, + "rewards/fitness_reward/mean": 5.919684410095215, + "rewards/fitness_reward/std": 2.6460466384887695, + "rewards/kidney_reward/mean": -0.23450270295143127, + "rewards/kidney_reward/std": 1.137681484222412, + "rewards/length2tails_reward/mean": 0.783237636089325, + "rewards/length2tails_reward/std": 0.29252496361732483, + "rewards/thermo_reward/mean": -0.36992621421813965, + "rewards/thermo_reward/std": 1.9649900197982788, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 282.8125, + "completions/mean_terminated_length": 267.6128845214844, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "entropy": 0.14463838562369347, + "epoch": 0.626, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.689681529998779, + "learning_rate": 7.580781044003324e-07, + "loss": 0.1848, + "num_tokens": 2740421.0, + "reward": 6.179220199584961, + "reward_std": 3.108948230743408, + "rewards/fitness_reward/mean": 5.872677803039551, + "rewards/fitness_reward/std": 2.8283746242523193, + "rewards/kidney_reward/mean": 0.012635260820388794, + "rewards/kidney_reward/std": 1.3495537042617798, + "rewards/length2tails_reward/mean": 0.7865300178527832, + "rewards/length2tails_reward/std": 0.28116706013679504, + "rewards/thermo_reward/mean": 0.20718349516391754, + "rewards/thermo_reward/std": 1.7808927297592163, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 280.46875, + "completions/mean_terminated_length": 280.46875, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.17828217148780823, + "epoch": 0.628, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1162149906158447, + "learning_rate": 7.513101128351453e-07, + "loss": 0.0979, + "num_tokens": 2749428.0, + "reward": 5.4172563552856445, + "reward_std": 3.199577569961548, + "rewards/fitness_reward/mean": 5.461665630340576, + "rewards/fitness_reward/std": 2.932718276977539, + "rewards/kidney_reward/mean": -0.5087636709213257, + "rewards/kidney_reward/std": 1.2203556299209595, + "rewards/length2tails_reward/mean": 0.780242919921875, + "rewards/length2tails_reward/std": 0.3494933247566223, + "rewards/thermo_reward/mean": 0.02982361614704132, + "rewards/thermo_reward/std": 2.083951711654663, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 275.78125, + "completions/mean_terminated_length": 275.78125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.261963858269155, + "epoch": 0.63, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.851346731185913, + "learning_rate": 7.445542420642096e-07, + "loss": 0.0294, + "num_tokens": 2758285.0, + "reward": 5.509346008300781, + "reward_std": 4.53924560546875, + "rewards/fitness_reward/mean": 4.731644630432129, + "rewards/fitness_reward/std": 4.147398948669434, + "rewards/kidney_reward/mean": 0.5509114861488342, + "rewards/kidney_reward/std": 1.3219982385635376, + "rewards/length2tails_reward/mean": 0.8193047046661377, + "rewards/length2tails_reward/std": 0.25560668110847473, + "rewards/thermo_reward/mean": 0.5948399901390076, + "rewards/thermo_reward/std": 1.9399741888046265, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.84375, + "completions/mean_terminated_length": 269.84375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.1475894758477807, + "epoch": 0.632, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0615999698638916, + "learning_rate": 7.378108213591354e-07, + "loss": -0.0026, + "num_tokens": 2766952.0, + "reward": 6.231350898742676, + "reward_std": 2.787792921066284, + "rewards/fitness_reward/mean": 5.961513996124268, + "rewards/fitness_reward/std": 2.4785311222076416, + "rewards/kidney_reward/mean": 0.45111918449401855, + "rewards/kidney_reward/std": 1.5606229305267334, + "rewards/length2tails_reward/mean": 0.7947746515274048, + "rewards/length2tails_reward/std": 0.26889491081237793, + "rewards/thermo_reward/mean": -0.30883270502090454, + "rewards/thermo_reward/std": 2.0008814334869385, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13641659449785948, + "epoch": 0.634, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.543372631072998, + "learning_rate": 7.310801793847343e-07, + "loss": 0.0021, + "num_tokens": 2775656.0, + "reward": 6.5946245193481445, + "reward_std": 1.5121303796768188, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.062211520969867706, + "rewards/kidney_reward/std": 1.4607387781143188, + "rewards/length2tails_reward/mean": 0.799560546875, + "rewards/length2tails_reward/std": 0.27528977394104004, + "rewards/thermo_reward/mean": -0.3309165835380554, + "rewards/thermo_reward/std": 2.116473913192749, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 276.15625, + "completions/mean_terminated_length": 276.15625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14614746067672968, + "epoch": 0.636, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.1665120124816895, + "learning_rate": 7.243626441830009e-07, + "loss": 0.0701, + "num_tokens": 2784525.0, + "reward": 6.4105424880981445, + "reward_std": 2.441462278366089, + "rewards/fitness_reward/mean": 6.148130416870117, + "rewards/fitness_reward/std": 1.9925299882888794, + "rewards/kidney_reward/mean": 0.13602781295776367, + "rewards/kidney_reward/std": 1.4797165393829346, + "rewards/length2tails_reward/mean": 0.8131794929504395, + "rewards/length2tails_reward/std": 0.2802060544490814, + "rewards/thermo_reward/mean": -0.017794162034988403, + "rewards/thermo_reward/std": 2.036123275756836, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 278.25, + "completions/mean_terminated_length": 278.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1566220736131072, + "epoch": 0.638, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6108603477478027, + "learning_rate": 7.176585431571233e-07, + "loss": 0.0987, + "num_tokens": 2793461.0, + "reward": 6.976290225982666, + "reward_std": 2.466521978378296, + "rewards/fitness_reward/mean": 6.239953994750977, + "rewards/fitness_reward/std": 1.987504005432129, + "rewards/kidney_reward/mean": 0.2159939557313919, + "rewards/kidney_reward/std": 1.3283164501190186, + "rewards/length2tails_reward/mean": 0.8266392350196838, + "rewards/length2tails_reward/std": 0.24396318197250366, + "rewards/thermo_reward/mean": 0.8433594107627869, + "rewards/thermo_reward/std": 1.7143508195877075, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.625, + "completions/mean_terminated_length": 269.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1250614272430539, + "epoch": 0.64, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5148164629936218, + "learning_rate": 7.109682030555282e-07, + "loss": 0.0044, + "num_tokens": 2802121.0, + "reward": 6.539069175720215, + "reward_std": 1.664305329322815, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": -0.029780671000480652, + "rewards/kidney_reward/std": 1.4029945135116577, + "rewards/length2tails_reward/mean": 0.7379071712493896, + "rewards/length2tails_reward/std": 0.3075510859489441, + "rewards/thermo_reward/mean": 0.17430394887924194, + "rewards/thermo_reward/std": 1.9058327674865723, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.1713531417772174, + "epoch": 0.642, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.541980028152466, + "learning_rate": 7.042919499559536e-07, + "loss": 0.0256, + "num_tokens": 2810877.0, + "reward": 6.0339860916137695, + "reward_std": 2.8362503051757812, + "rewards/fitness_reward/mean": 5.7353620529174805, + "rewards/fitness_reward/std": 2.6534714698791504, + "rewards/kidney_reward/mean": -0.24237263202667236, + "rewards/kidney_reward/std": 1.372509479522705, + "rewards/length2tails_reward/mean": 0.8674904704093933, + "rewards/length2tails_reward/std": 0.18604622781276703, + "rewards/thermo_reward/mean": 0.40587544441223145, + "rewards/thermo_reward/std": 1.596633791923523, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 281.9375, + "completions/mean_terminated_length": 281.9375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1418048208579421, + "epoch": 0.644, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.55742609500885, + "learning_rate": 6.976301092495556e-07, + "loss": 0.025, + "num_tokens": 2819931.0, + "reward": 6.955789566040039, + "reward_std": 1.3362483978271484, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.1048911064863205, + "rewards/kidney_reward/std": 1.2173396348953247, + "rewards/length2tails_reward/mean": 0.7285305261611938, + "rewards/length2tails_reward/std": 0.3037022650241852, + "rewards/thermo_reward/mean": 0.25982531905174255, + "rewards/thermo_reward/std": 2.0253074169158936, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 267.40625, + "completions/mean_terminated_length": 267.40625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.1485171476379037, + "epoch": 0.646, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6387808322906494, + "learning_rate": 6.909830056250526e-07, + "loss": -0.0624, + "num_tokens": 2828520.0, + "reward": 5.970319747924805, + "reward_std": 2.572580099105835, + "rewards/fitness_reward/mean": 5.823781967163086, + "rewards/fitness_reward/std": 2.2292187213897705, + "rewards/kidney_reward/mean": 0.08642309904098511, + "rewards/kidney_reward/std": 1.3586586713790894, + "rewards/length2tails_reward/mean": 0.8334128856658936, + "rewards/length2tails_reward/std": 0.27404242753982544, + "rewards/thermo_reward/mean": -0.21005354821681976, + "rewards/thermo_reward/std": 2.043229579925537, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1293806629255414, + "epoch": 0.648, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7523674368858337, + "learning_rate": 6.843509630528976e-07, + "loss": 0.0026, + "num_tokens": 2837222.0, + "reward": 6.185390472412109, + "reward_std": 2.2776901721954346, + "rewards/fitness_reward/mean": 6.226982116699219, + "rewards/fitness_reward/std": 2.060880661010742, + "rewards/kidney_reward/mean": -0.24002555012702942, + "rewards/kidney_reward/std": 1.4971431493759155, + "rewards/length2tails_reward/mean": 0.8265526294708252, + "rewards/length2tails_reward/std": 0.18807940185070038, + "rewards/thermo_reward/mean": -0.2564352750778198, + "rewards/thermo_reward/std": 2.1351709365844727, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 265.71875, + "completions/mean_terminated_length": 265.71875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.14506491273641586, + "epoch": 0.65, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8249183297157288, + "learning_rate": 6.77734304769489e-07, + "loss": -0.0628, + "num_tokens": 2845757.0, + "reward": 5.609094619750977, + "reward_std": 3.602327346801758, + "rewards/fitness_reward/mean": 5.562655448913574, + "rewards/fitness_reward/std": 3.26778507232666, + "rewards/kidney_reward/mean": -0.17317365109920502, + "rewards/kidney_reward/std": 1.4439454078674316, + "rewards/length2tails_reward/mean": 0.8142207860946655, + "rewards/length2tails_reward/std": 0.2130184769630432, + "rewards/thermo_reward/mean": -0.14105884730815887, + "rewards/thermo_reward/std": 2.0546224117279053, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 270.21875, + "completions/mean_terminated_length": 270.21875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12599319033324718, + "epoch": 0.652, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3221113383769989, + "learning_rate": 6.711333532614167e-07, + "loss": -0.0001, + "num_tokens": 2854436.0, + "reward": 6.323266983032227, + "reward_std": 2.3031466007232666, + "rewards/fitness_reward/mean": 6.206177711486816, + "rewards/fitness_reward/std": 1.681239366531372, + "rewards/kidney_reward/mean": -0.11081783473491669, + "rewards/kidney_reward/std": 1.1887176036834717, + "rewards/length2tails_reward/mean": 0.8017334938049316, + "rewards/length2tails_reward/std": 0.22630544006824493, + "rewards/thermo_reward/mean": -0.05587046593427658, + "rewards/thermo_reward/std": 1.9467616081237793, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 273.4375, + "completions/mean_terminated_length": 273.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14385926723480225, + "epoch": 0.654, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.863189458847046, + "learning_rate": 6.645484302497451e-07, + "loss": 0.0393, + "num_tokens": 2863218.0, + "reward": 5.9234395027160645, + "reward_std": 2.1091227531433105, + "rewards/fitness_reward/mean": 6.135817527770996, + "rewards/fitness_reward/std": 2.0592422485351562, + "rewards/kidney_reward/mean": -0.16652746498584747, + "rewards/kidney_reward/std": 1.1908307075500488, + "rewards/length2tails_reward/mean": 0.8032562732696533, + "rewards/length2tails_reward/std": 0.25795942544937134, + "rewards/thermo_reward/mean": -0.6598567962646484, + "rewards/thermo_reward/std": 2.0198681354522705, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 275.4375, + "completions/mean_terminated_length": 275.4375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14171724021434784, + "epoch": 0.656, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0005085468292236, + "learning_rate": 6.579798566743313e-07, + "loss": 0.0739, + "num_tokens": 2872064.0, + "reward": 6.158988952636719, + "reward_std": 2.413141965866089, + "rewards/fitness_reward/mean": 6.1257853507995605, + "rewards/fitness_reward/std": 2.113737106323242, + "rewards/kidney_reward/mean": -0.23006314039230347, + "rewards/kidney_reward/std": 1.3204492330551147, + "rewards/length2tails_reward/mean": 0.7789495587348938, + "rewards/length2tails_reward/std": 0.28807762265205383, + "rewards/thermo_reward/mean": -0.09300464391708374, + "rewards/thermo_reward/std": 1.988651156425476, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.875, + "completions/mean_terminated_length": 270.875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1332209836691618, + "epoch": 0.658, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4491146206855774, + "learning_rate": 6.51427952678185e-07, + "loss": 0.0021, + "num_tokens": 2880764.0, + "reward": 6.950940132141113, + "reward_std": 1.3148001432418823, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.06448443233966827, + "rewards/kidney_reward/std": 1.3716617822647095, + "rewards/length2tails_reward/mean": 0.7650790214538574, + "rewards/length2tails_reward/std": 0.2712799906730652, + "rewards/thermo_reward/mean": 0.40122920274734497, + "rewards/thermo_reward/std": 1.916085958480835, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 278.5, + "completions/mean_terminated_length": 278.5, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.19188074674457312, + "epoch": 0.66, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4808740615844727, + "learning_rate": 6.448930375918631e-07, + "loss": 0.0101, + "num_tokens": 2889708.0, + "reward": 6.363583564758301, + "reward_std": 3.193633556365967, + "rewards/fitness_reward/mean": 5.766847133636475, + "rewards/fitness_reward/std": 2.871739625930786, + "rewards/kidney_reward/mean": 0.1560065597295761, + "rewards/kidney_reward/std": 1.4915741682052612, + "rewards/length2tails_reward/mean": 0.7530208826065063, + "rewards/length2tails_reward/std": 0.30450528860092163, + "rewards/thermo_reward/mean": 0.6609548926353455, + "rewards/thermo_reward/std": 1.5844378471374512, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1405778331682086, + "epoch": 0.662, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5762265920639038, + "learning_rate": 6.383754299179078e-07, + "loss": -0.0001, + "num_tokens": 2898380.0, + "reward": 5.537961483001709, + "reward_std": 3.1746127605438232, + "rewards/fitness_reward/mean": 5.317624568939209, + "rewards/fitness_reward/std": 3.140000104904175, + "rewards/kidney_reward/mean": 0.0035296976566314697, + "rewards/kidney_reward/std": 1.4678502082824707, + "rewards/length2tails_reward/mean": 0.7269901037216187, + "rewards/length2tails_reward/std": 0.33388474583625793, + "rewards/thermo_reward/mean": 0.07364928722381592, + "rewards/thermo_reward/std": 2.0159270763397217, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 270.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.123139763250947, + "epoch": 0.664, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4227535724639893, + "learning_rate": 6.31875447315322e-07, + "loss": -0.004, + "num_tokens": 2907054.0, + "reward": 6.01489782333374, + "reward_std": 2.012277603149414, + "rewards/fitness_reward/mean": 6.004064559936523, + "rewards/fitness_reward/std": 1.8035767078399658, + "rewards/kidney_reward/mean": -0.061723992228507996, + "rewards/kidney_reward/std": 1.3236358165740967, + "rewards/length2tails_reward/mean": 0.7410252690315247, + "rewards/length2tails_reward/std": 0.33866503834724426, + "rewards/thermo_reward/mean": -0.28712227940559387, + "rewards/thermo_reward/std": 1.983262062072754, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 275.40625, + "completions/mean_terminated_length": 275.40625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14197211991995573, + "epoch": 0.666, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8882272243499756, + "learning_rate": 6.253934065840879e-07, + "loss": 0.0632, + "num_tokens": 2915899.0, + "reward": 5.8689494132995605, + "reward_std": 2.980410575866699, + "rewards/fitness_reward/mean": 5.711335182189941, + "rewards/fitness_reward/std": 2.7298057079315186, + "rewards/kidney_reward/mean": -0.27478981018066406, + "rewards/kidney_reward/std": 1.34125816822052, + "rewards/length2tails_reward/mean": 0.7590400576591492, + "rewards/length2tails_reward/std": 0.3217729926109314, + "rewards/thermo_reward/mean": 0.21049821376800537, + "rewards/thermo_reward/std": 1.6486570835113525, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 273.28125, + "completions/mean_terminated_length": 273.28125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.16120561491698027, + "epoch": 0.668, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4252558946609497, + "learning_rate": 6.189296236497259e-07, + "loss": 0.0085, + "num_tokens": 2924676.0, + "reward": 6.017755508422852, + "reward_std": 2.7611100673675537, + "rewards/fitness_reward/mean": 5.929872512817383, + "rewards/fitness_reward/std": 2.1686391830444336, + "rewards/kidney_reward/mean": -0.19728940725326538, + "rewards/kidney_reward/std": 1.508063793182373, + "rewards/length2tails_reward/mean": 0.9154133200645447, + "rewards/length2tails_reward/std": 0.13163615763187408, + "rewards/thermo_reward/mean": -0.08465149998664856, + "rewards/thermo_reward/std": 2.1873550415039062, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13629860151559114, + "epoch": 0.67, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6146358847618103, + "learning_rate": 6.124844135478971e-07, + "loss": -0.0018, + "num_tokens": 2933409.0, + "reward": 6.3639020919799805, + "reward_std": 1.8899461030960083, + "rewards/fitness_reward/mean": 6.188002586364746, + "rewards/fitness_reward/std": 1.7780461311340332, + "rewards/kidney_reward/mean": 0.26472312211990356, + "rewards/kidney_reward/std": 1.3054981231689453, + "rewards/length2tails_reward/mean": 0.8750399351119995, + "rewards/length2tails_reward/std": 0.20164364576339722, + "rewards/thermo_reward/mean": -0.3504437208175659, + "rewards/thermo_reward/std": 2.15000581741333, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1350244265049696, + "epoch": 0.672, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46959754824638367, + "learning_rate": 6.060580904090489e-07, + "loss": -0.0032, + "num_tokens": 2942130.0, + "reward": 6.7464599609375, + "reward_std": 1.4381029605865479, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.09605660289525986, + "rewards/kidney_reward/std": 1.3236219882965088, + "rewards/length2tails_reward/mean": 0.8153715133666992, + "rewards/length2tails_reward/std": 0.2927325665950775, + "rewards/thermo_reward/mean": 0.012558825314044952, + "rewards/thermo_reward/std": 1.980684757232666, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 278.84375, + "completions/mean_terminated_length": 278.84375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.1624396527186036, + "epoch": 0.674, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9301204681396484, + "learning_rate": 5.996509674431051e-07, + "loss": -0.0045, + "num_tokens": 2951085.0, + "reward": 6.039576530456543, + "reward_std": 2.4619998931884766, + "rewards/fitness_reward/mean": 6.033344268798828, + "rewards/fitness_reward/std": 2.1146256923675537, + "rewards/kidney_reward/mean": -0.047126639634370804, + "rewards/kidney_reward/std": 1.3129688501358032, + "rewards/length2tails_reward/mean": 0.8374546766281128, + "rewards/length2tails_reward/std": 0.27305513620376587, + "rewards/thermo_reward/mean": -0.35913562774658203, + "rewards/thermo_reward/std": 2.1510767936706543, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12986998166888952, + "epoch": 0.676, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7267351150512695, + "learning_rate": 5.932633569241999e-07, + "loss": -0.004, + "num_tokens": 2959786.0, + "reward": 6.215915679931641, + "reward_std": 1.7919178009033203, + "rewards/fitness_reward/mean": 6.293497085571289, + "rewards/fitness_reward/std": 1.6846182346343994, + "rewards/kidney_reward/mean": -0.48145976662635803, + "rewards/kidney_reward/std": 1.4974082708358765, + "rewards/length2tails_reward/mean": 0.7296417951583862, + "rewards/length2tails_reward/std": 0.3265739381313324, + "rewards/thermo_reward/mean": -0.03852371871471405, + "rewards/thermo_reward/std": 1.8306382894515991, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 277.6875, + "completions/mean_terminated_length": 277.6875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14375455770641565, + "epoch": 0.678, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5397517681121826, + "learning_rate": 5.868955701754583e-07, + "loss": 0.0027, + "num_tokens": 2968704.0, + "reward": 6.217046737670898, + "reward_std": 3.013193130493164, + "rewards/fitness_reward/mean": 5.526139259338379, + "rewards/fitness_reward/std": 3.077094078063965, + "rewards/kidney_reward/mean": 0.12864208221435547, + "rewards/kidney_reward/std": 1.2796517610549927, + "rewards/length2tails_reward/mean": 0.8049825429916382, + "rewards/length2tails_reward/std": 0.2577892541885376, + "rewards/thermo_reward/mean": 0.8506805896759033, + "rewards/thermo_reward/std": 1.3328129053115845, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 282.25, + "completions/mean_terminated_length": 282.25, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14949253015220165, + "epoch": 0.68, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.905804395675659, + "learning_rate": 5.805479175538228e-07, + "loss": 0.1501, + "num_tokens": 2977768.0, + "reward": 6.654813766479492, + "reward_std": 2.559910774230957, + "rewards/fitness_reward/mean": 5.95985221862793, + "rewards/fitness_reward/std": 2.4967284202575684, + "rewards/kidney_reward/mean": 0.1843625009059906, + "rewards/kidney_reward/std": 1.3213741779327393, + "rewards/length2tails_reward/mean": 0.7851734161376953, + "rewards/length2tails_reward/std": 0.31144633889198303, + "rewards/thermo_reward/mean": 0.8129734992980957, + "rewards/thermo_reward/std": 1.5214051008224487, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 276.46875, + "completions/mean_terminated_length": 276.46875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.1573938773944974, + "epoch": 0.682, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2604691982269287, + "learning_rate": 5.742207084349273e-07, + "loss": 0.0695, + "num_tokens": 2986647.0, + "reward": 6.545932769775391, + "reward_std": 2.2556028366088867, + "rewards/fitness_reward/mean": 6.107923984527588, + "rewards/fitness_reward/std": 2.2110416889190674, + "rewards/kidney_reward/mean": -0.17903482913970947, + "rewards/kidney_reward/std": 1.1938183307647705, + "rewards/length2tails_reward/mean": 0.9086741805076599, + "rewards/length2tails_reward/std": 0.1183965727686882, + "rewards/thermo_reward/mean": 0.6007147431373596, + "rewards/thermo_reward/std": 1.7732858657836914, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 282.0625, + "completions/mean_terminated_length": 282.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.16670695785433054, + "epoch": 0.684, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8591322898864746, + "learning_rate": 5.679142511980175e-07, + "loss": 0.1724, + "num_tokens": 2995705.0, + "reward": 6.364426612854004, + "reward_std": 2.4766600131988525, + "rewards/fitness_reward/mean": 6.218327522277832, + "rewards/fitness_reward/std": 2.109841823577881, + "rewards/kidney_reward/mean": -0.07648028433322906, + "rewards/kidney_reward/std": 1.3622918128967285, + "rewards/length2tails_reward/mean": 0.8389837741851807, + "rewards/length2tails_reward/std": 0.25265058875083923, + "rewards/thermo_reward/mean": -0.05081367492675781, + "rewards/thermo_reward/std": 2.0749711990356445, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 290.4375, + "completions/mean_terminated_length": 290.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.16736070439219475, + "epoch": 0.686, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8917258381843567, + "learning_rate": 5.616288532109224e-07, + "loss": 0.022, + "num_tokens": 3005031.0, + "reward": 5.868127346038818, + "reward_std": 2.222395420074463, + "rewards/fitness_reward/mean": 5.939970970153809, + "rewards/fitness_reward/std": 2.117759943008423, + "rewards/kidney_reward/mean": -0.30231744050979614, + "rewards/kidney_reward/std": 1.301476240158081, + "rewards/length2tails_reward/mean": 0.8077414035797119, + "rewards/length2tails_reward/std": 0.26545435190200806, + "rewards/thermo_reward/mean": -0.24524012207984924, + "rewards/thermo_reward/std": 2.0206568241119385, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 266.65625, + "completions/mean_terminated_length": 266.65625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.13543427735567093, + "epoch": 0.688, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4829248189926147, + "learning_rate": 5.553648208150728e-07, + "loss": -0.0313, + "num_tokens": 3013596.0, + "reward": 5.708004951477051, + "reward_std": 3.3088643550872803, + "rewards/fitness_reward/mean": 5.3620171546936035, + "rewards/fitness_reward/std": 3.26958966255188, + "rewards/kidney_reward/mean": 0.10526783764362335, + "rewards/kidney_reward/std": 1.2809340953826904, + "rewards/length2tails_reward/mean": 0.6367802023887634, + "rewards/length2tails_reward/std": 0.39182284474372864, + "rewards/thermo_reward/mean": 0.2683173418045044, + "rewards/thermo_reward/std": 1.797824740409851, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 274.96875, + "completions/mean_terminated_length": 274.96875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.14805607683956623, + "epoch": 0.69, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5612566471099854, + "learning_rate": 5.491224593105694e-07, + "loss": -0.0167, + "num_tokens": 3022427.0, + "reward": 6.30277156829834, + "reward_std": 2.0895543098449707, + "rewards/fitness_reward/mean": 6.204276084899902, + "rewards/fitness_reward/std": 1.6913362741470337, + "rewards/kidney_reward/mean": -0.19850590825080872, + "rewards/kidney_reward/std": 1.399698257446289, + "rewards/length2tails_reward/mean": 0.760990560054779, + "rewards/length2tails_reward/std": 0.2998768985271454, + "rewards/thermo_reward/mean": 0.015001252293586731, + "rewards/thermo_reward/std": 2.113967180252075, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15043269284069538, + "epoch": 0.692, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.009800672531128, + "learning_rate": 5.42902072941306e-07, + "loss": 0.0068, + "num_tokens": 3031164.0, + "reward": 6.181963920593262, + "reward_std": 2.685715675354004, + "rewards/fitness_reward/mean": 5.919095039367676, + "rewards/fitness_reward/std": 2.649914264678955, + "rewards/kidney_reward/mean": -0.24342183768749237, + "rewards/kidney_reward/std": 1.2877310514450073, + "rewards/length2tails_reward/mean": 0.8092514872550964, + "rewards/length2tails_reward/std": 0.2687968909740448, + "rewards/thermo_reward/mean": 0.36453354358673096, + "rewards/thermo_reward/std": 1.8220114707946777, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1562238810583949, + "epoch": 0.694, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.349087715148926, + "learning_rate": 5.367039648801385e-07, + "loss": 0.0074, + "num_tokens": 3039895.0, + "reward": 5.90168571472168, + "reward_std": 2.823669195175171, + "rewards/fitness_reward/mean": 5.8231048583984375, + "rewards/fitness_reward/std": 2.693870782852173, + "rewards/kidney_reward/mean": -0.2964228391647339, + "rewards/kidney_reward/std": 1.2563472986221313, + "rewards/length2tails_reward/mean": 0.7685627937316895, + "rewards/length2tails_reward/std": 0.33195197582244873, + "rewards/thermo_reward/mean": 0.06930221617221832, + "rewards/thermo_reward/std": 2.0728096961975098, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.5, + "completions/mean_terminated_length": 269.5, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.14733695331960917, + "epoch": 0.696, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.797904372215271, + "learning_rate": 5.305284372141095e-07, + "loss": -0.0051, + "num_tokens": 3048551.0, + "reward": 6.2037858963012695, + "reward_std": 2.359827756881714, + "rewards/fitness_reward/mean": 6.003262996673584, + "rewards/fitness_reward/std": 2.3144755363464355, + "rewards/kidney_reward/mean": 0.26831483840942383, + "rewards/kidney_reward/std": 1.3605223894119263, + "rewards/length2tails_reward/mean": 0.7673937082290649, + "rewards/length2tails_reward/std": 0.29162222146987915, + "rewards/thermo_reward/mean": -0.2509657144546509, + "rewards/thermo_reward/std": 1.9805631637573242, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 280.0625, + "completions/mean_terminated_length": 280.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15698255505412817, + "epoch": 0.698, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9059972763061523, + "learning_rate": 5.243757909297246e-07, + "loss": 0.1457, + "num_tokens": 3057545.0, + "reward": 6.684830665588379, + "reward_std": 2.255929470062256, + "rewards/fitness_reward/mean": 6.252006530761719, + "rewards/fitness_reward/std": 1.9193217754364014, + "rewards/kidney_reward/mean": 0.38714808225631714, + "rewards/kidney_reward/std": 1.3580050468444824, + "rewards/length2tails_reward/mean": 0.7600257396697998, + "rewards/length2tails_reward/std": 0.33787864446640015, + "rewards/thermo_reward/mean": 0.0984865352511406, + "rewards/thermo_reward/std": 2.025115489959717, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1372505398467183, + "epoch": 0.7, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.41981613636016846, + "learning_rate": 5.182463258982846e-07, + "loss": 0.0016, + "num_tokens": 3066232.0, + "reward": 6.245865821838379, + "reward_std": 1.3084803819656372, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.4447236657142639, + "rewards/kidney_reward/std": 1.3307983875274658, + "rewards/length2tails_reward/mean": 0.7939730286598206, + "rewards/length2tails_reward/std": 0.2816673517227173, + "rewards/thermo_reward/mean": -0.2311713695526123, + "rewards/thermo_reward/std": 1.8473246097564697, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.177174954675138, + "epoch": 0.702, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8986427783966064, + "learning_rate": 5.121403408612671e-07, + "loss": -0.0039, + "num_tokens": 3074995.0, + "reward": 5.444323539733887, + "reward_std": 3.1555228233337402, + "rewards/fitness_reward/mean": 5.578423976898193, + "rewards/fitness_reward/std": 3.2130587100982666, + "rewards/kidney_reward/mean": -0.5759067535400391, + "rewards/kidney_reward/std": 1.0323132276535034, + "rewards/length2tails_reward/mean": 0.8635549545288086, + "rewards/length2tails_reward/std": 0.2309083193540573, + "rewards/thermo_reward/mean": -0.12407190352678299, + "rewards/thermo_reward/std": 2.0656275749206543, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 279.03125, + "completions/mean_terminated_length": 279.03125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1382511956617236, + "epoch": 0.704, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28194135427474976, + "learning_rate": 5.060581334157692e-07, + "loss": 0.0046, + "num_tokens": 3083956.0, + "reward": 6.333901882171631, + "reward_std": 1.6132252216339111, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": -0.1382146179676056, + "rewards/kidney_reward/std": 1.2211270332336426, + "rewards/length2tails_reward/mean": 0.7396640777587891, + "rewards/length2tails_reward/std": 0.34703871607780457, + "rewards/thermo_reward/mean": -0.12847588956356049, + "rewards/thermo_reward/std": 1.7511825561523438, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 266.625, + "completions/mean_terminated_length": 266.625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 0.17926898691803217, + "epoch": 0.706, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.676492214202881, + "learning_rate": 5.000000000000002e-07, + "loss": -0.0297, + "num_tokens": 3092520.0, + "reward": 6.2205400466918945, + "reward_std": 3.0040314197540283, + "rewards/fitness_reward/mean": 5.88077449798584, + "rewards/fitness_reward/std": 2.7958946228027344, + "rewards/kidney_reward/mean": 0.011723548173904419, + "rewards/kidney_reward/std": 1.3006480932235718, + "rewards/length2tails_reward/mean": 0.6432532668113708, + "rewards/length2tails_reward/std": 0.3428514003753662, + "rewards/thermo_reward/mean": 0.34617942571640015, + "rewards/thermo_reward/std": 1.814773678779602, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.21875, + "completions/mean_terminated_length": 270.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13824842125177383, + "epoch": 0.708, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6739991903305054, + "learning_rate": 4.939662358788364e-07, + "loss": 0.0083, + "num_tokens": 3101199.0, + "reward": 7.00583553314209, + "reward_std": 1.7048717737197876, + "rewards/fitness_reward/mean": 6.30168342590332, + "rewards/fitness_reward/std": 1.6383068561553955, + "rewards/kidney_reward/mean": -0.24639709293842316, + "rewards/kidney_reward/std": 1.4407916069030762, + "rewards/length2tails_reward/mean": 0.7735517621040344, + "rewards/length2tails_reward/std": 0.20475666224956512, + "rewards/thermo_reward/mean": 1.2679250240325928, + "rewards/thermo_reward/std": 1.1704320907592773, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1290579428896308, + "epoch": 0.71, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.526790976524353, + "learning_rate": 4.879571351294286e-07, + "loss": -0.001, + "num_tokens": 3109921.0, + "reward": 6.389036655426025, + "reward_std": 1.4514721632003784, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.377413809299469, + "rewards/kidney_reward/std": 1.091863989830017, + "rewards/length2tails_reward/mean": 0.8268851041793823, + "rewards/length2tails_reward/std": 0.26292723417282104, + "rewards/thermo_reward/mean": -0.23457345366477966, + "rewards/thermo_reward/std": 2.134856700897217, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 269.8125, + "completions/mean_terminated_length": 269.8125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.13011477049440145, + "epoch": 0.712, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.507643461227417, + "learning_rate": 4.819729906268699e-07, + "loss": -0.0111, + "num_tokens": 3118587.0, + "reward": 6.725968837738037, + "reward_std": 2.484793186187744, + "rewards/fitness_reward/mean": 6.2299323081970215, + "rewards/fitness_reward/std": 2.0441930294036865, + "rewards/kidney_reward/mean": 0.29642677307128906, + "rewards/kidney_reward/std": 1.4404501914978027, + "rewards/length2tails_reward/mean": 0.8288535475730896, + "rewards/length2tails_reward/std": 0.2590929865837097, + "rewards/thermo_reward/mean": 0.2812193036079407, + "rewards/thermo_reward/std": 1.9625108242034912, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 273.40625, + "completions/mean_terminated_length": 273.40625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.1574456738308072, + "epoch": 0.714, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.192049503326416, + "learning_rate": 4.76014094029921e-07, + "loss": 0.0221, + "num_tokens": 3127368.0, + "reward": 5.9911274909973145, + "reward_std": 3.1412923336029053, + "rewards/fitness_reward/mean": 5.862482070922852, + "rewards/fitness_reward/std": 2.868088960647583, + "rewards/kidney_reward/mean": -0.2520975172519684, + "rewards/kidney_reward/std": 1.3299691677093506, + "rewards/length2tails_reward/mean": 0.8562111258506775, + "rewards/length2tails_reward/std": 0.2258239984512329, + "rewards/thermo_reward/mean": 0.08128249645233154, + "rewards/thermo_reward/std": 1.9431575536727905, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 277.53125, + "completions/mean_terminated_length": 277.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1446541864424944, + "epoch": 0.716, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.883546829223633, + "learning_rate": 4.700807357667952e-07, + "loss": 0.0751, + "num_tokens": 3136281.0, + "reward": 6.001834869384766, + "reward_std": 2.6639564037323, + "rewards/fitness_reward/mean": 5.9749755859375, + "rewards/fitness_reward/std": 2.4309489727020264, + "rewards/kidney_reward/mean": -0.41944435238838196, + "rewards/kidney_reward/std": 1.414008378982544, + "rewards/length2tails_reward/mean": 0.8256665468215942, + "rewards/length2tails_reward/std": 0.2446584701538086, + "rewards/thermo_reward/mean": 0.06032890826463699, + "rewards/thermo_reward/std": 1.9380780458450317, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 0.1585037438198924, + "epoch": 0.718, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5597310066223145, + "learning_rate": 4.641732050210031e-07, + "loss": -0.0062, + "num_tokens": 3144952.0, + "reward": 5.679251670837402, + "reward_std": 3.744248151779175, + "rewards/fitness_reward/mean": 5.753543853759766, + "rewards/fitness_reward/std": 2.92307186126709, + "rewards/kidney_reward/mean": 0.015622451901435852, + "rewards/kidney_reward/std": 1.5301791429519653, + "rewards/length2tails_reward/mean": 0.7561187148094177, + "rewards/length2tails_reward/std": 0.3244445323944092, + "rewards/thermo_reward/mean": -0.5422676205635071, + "rewards/thermo_reward/std": 2.146629810333252, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.625, + "completions/mean_terminated_length": 270.625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14853220619261265, + "epoch": 0.72, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22003161907196045, + "learning_rate": 4.5829178971726023e-07, + "loss": 0.0001, + "num_tokens": 3153644.0, + "reward": 6.998676300048828, + "reward_std": 1.162030816078186, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.1410554051399231, + "rewards/kidney_reward/std": 1.3983737230300903, + "rewards/length2tails_reward/mean": 0.8080669641494751, + "rewards/length2tails_reward/std": 0.23570886254310608, + "rewards/thermo_reward/mean": 0.2696669101715088, + "rewards/thermo_reward/std": 1.6834814548492432, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.12786609400063753, + "epoch": 0.722, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.924838662147522, + "learning_rate": 4.524367765074498e-07, + "loss": -0.0082, + "num_tokens": 3162341.0, + "reward": 5.59239387512207, + "reward_std": 3.2712528705596924, + "rewards/fitness_reward/mean": 5.735169410705566, + "rewards/fitness_reward/std": 2.9949986934661865, + "rewards/kidney_reward/mean": -0.12405352294445038, + "rewards/kidney_reward/std": 1.2700321674346924, + "rewards/length2tails_reward/mean": 0.8160407543182373, + "rewards/length2tails_reward/std": 0.25917312502861023, + "rewards/thermo_reward/mean": -0.5695180892944336, + "rewards/thermo_reward/std": 1.9807058572769165, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1366393668577075, + "epoch": 0.724, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4617418348789215, + "learning_rate": 4.46608450756656e-07, + "loss": 0.0093, + "num_tokens": 3171036.0, + "reward": 6.625120162963867, + "reward_std": 1.5828049182891846, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.11791878938674927, + "rewards/kidney_reward/std": 1.4317349195480347, + "rewards/length2tails_reward/mean": 0.8119157552719116, + "rewards/length2tails_reward/std": 0.22039470076560974, + "rewards/thermo_reward/mean": -0.014416981488466263, + "rewards/thermo_reward/std": 1.7909175157546997, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 262.875, + "completions/mean_terminated_length": 262.875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.13373654335737228, + "epoch": 0.726, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0905619859695435, + "learning_rate": 4.408070965292533e-07, + "loss": -0.0696, + "num_tokens": 3179480.0, + "reward": 5.402747631072998, + "reward_std": 3.670199155807495, + "rewards/fitness_reward/mean": 5.362958908081055, + "rewards/fitness_reward/std": 3.5691020488739014, + "rewards/kidney_reward/mean": -0.31657668948173523, + "rewards/kidney_reward/std": 1.1466575860977173, + "rewards/length2tails_reward/mean": 0.8078165054321289, + "rewards/length2tails_reward/std": 0.234373077750206, + "rewards/thermo_reward/mean": -0.007755070924758911, + "rewards/thermo_reward/std": 1.9276933670043945, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.8125, + "completions/mean_terminated_length": 269.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11955827847123146, + "epoch": 0.728, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9351580142974854, + "learning_rate": 4.350329965750621e-07, + "loss": -0.0021, + "num_tokens": 3188146.0, + "reward": 6.064743995666504, + "reward_std": 1.4324169158935547, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.6712437868118286, + "rewards/kidney_reward/std": 1.1112077236175537, + "rewards/length2tails_reward/mean": 0.7376230359077454, + "rewards/length2tails_reward/std": 0.32265642285346985, + "rewards/thermo_reward/mean": -0.3387202024459839, + "rewards/thermo_reward/std": 2.0547823905944824, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 274.5625, + "completions/mean_terminated_length": 274.5625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14470462873578072, + "epoch": 0.73, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8790856003761292, + "learning_rate": 4.292864323155684e-07, + "loss": 0.0258, + "num_tokens": 3196964.0, + "reward": 6.670239448547363, + "reward_std": 1.5476105213165283, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.13486161828041077, + "rewards/kidney_reward/std": 1.4245083332061768, + "rewards/length2tails_reward/mean": 0.7783125638961792, + "rewards/length2tails_reward/std": 0.2922722101211548, + "rewards/thermo_reward/mean": 0.10956567525863647, + "rewards/thermo_reward/std": 2.0721426010131836, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 276.5625, + "completions/mean_terminated_length": 276.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1494081998243928, + "epoch": 0.732, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.519752025604248, + "learning_rate": 4.235676838302068e-07, + "loss": 0.1007, + "num_tokens": 3205846.0, + "reward": 6.206664562225342, + "reward_std": 2.2861545085906982, + "rewards/fitness_reward/mean": 6.025056838989258, + "rewards/fitness_reward/std": 2.1580092906951904, + "rewards/kidney_reward/mean": -0.3018892705440521, + "rewards/kidney_reward/std": 1.277760624885559, + "rewards/length2tails_reward/mean": 0.7835294008255005, + "rewards/length2tails_reward/std": 0.27964353561401367, + "rewards/thermo_reward/mean": 0.2733404040336609, + "rewards/thermo_reward/std": 1.9387136697769165, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.0, + "completions/mean_terminated_length": 269.0, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.12632002774626017, + "epoch": 0.734, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7130632400512695, + "learning_rate": 4.1787702984271065e-07, + "loss": -0.0029, + "num_tokens": 3214486.0, + "reward": 6.317443370819092, + "reward_std": 2.8365418910980225, + "rewards/fitness_reward/mean": 5.861456871032715, + "rewards/fitness_reward/std": 2.524892807006836, + "rewards/kidney_reward/mean": 0.6378341913223267, + "rewards/kidney_reward/std": 1.403192400932312, + "rewards/length2tails_reward/mean": 0.7120263576507568, + "rewards/length2tails_reward/std": 0.33472928404808044, + "rewards/thermo_reward/mean": -0.08187372982501984, + "rewards/thermo_reward/std": 1.9261077642440796, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11630972567945719, + "epoch": 0.736, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0709792375564575, + "learning_rate": 4.1221474770752696e-07, + "loss": 0.0438, + "num_tokens": 3223250.0, + "reward": 6.468087196350098, + "reward_std": 2.417856454849243, + "rewards/fitness_reward/mean": 6.030662536621094, + "rewards/fitness_reward/std": 2.1286466121673584, + "rewards/kidney_reward/mean": 0.2095390260219574, + "rewards/kidney_reward/std": 1.2892457246780396, + "rewards/length2tails_reward/mean": 0.7105699777603149, + "rewards/length2tails_reward/std": 0.3106299340724945, + "rewards/thermo_reward/mean": 0.3100256323814392, + "rewards/thermo_reward/std": 1.8363381624221802, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 273.53125, + "completions/mean_terminated_length": 273.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1587747959420085, + "epoch": 0.738, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5487451553344727, + "learning_rate": 4.0658111339629865e-07, + "loss": 0.0362, + "num_tokens": 3232035.0, + "reward": 6.189671516418457, + "reward_std": 2.8210692405700684, + "rewards/fitness_reward/mean": 5.8552961349487305, + "rewards/fitness_reward/std": 2.550931930541992, + "rewards/kidney_reward/mean": -0.002792835235595703, + "rewards/kidney_reward/std": 1.2694661617279053, + "rewards/length2tails_reward/mean": 0.7921147346496582, + "rewards/length2tails_reward/std": 0.24774512648582458, + "rewards/thermo_reward/mean": 0.27548542618751526, + "rewards/thermo_reward/std": 1.771033525466919, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 275.84375, + "completions/mean_terminated_length": 275.84375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.14940356370061636, + "epoch": 0.74, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5113012790679932, + "learning_rate": 4.0097640148441423e-07, + "loss": 0.0281, + "num_tokens": 3240894.0, + "reward": 5.955854415893555, + "reward_std": 3.596100091934204, + "rewards/fitness_reward/mean": 5.546163558959961, + "rewards/fitness_reward/std": 3.3078951835632324, + "rewards/kidney_reward/mean": 0.030694488435983658, + "rewards/kidney_reward/std": 1.2919914722442627, + "rewards/length2tails_reward/mean": 0.7410632371902466, + "rewards/length2tails_reward/std": 0.31063371896743774, + "rewards/thermo_reward/mean": 0.41815584897994995, + "rewards/thermo_reward/std": 1.7940014600753784, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 282.15625, + "completions/mean_terminated_length": 282.15625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.18381345830857754, + "epoch": 0.742, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0370407104492188, + "learning_rate": 3.9540088513762516e-07, + "loss": 0.103, + "num_tokens": 3249955.0, + "reward": 4.9476799964904785, + "reward_std": 3.550820827484131, + "rewards/fitness_reward/mean": 5.036832332611084, + "rewards/fitness_reward/std": 3.907076120376587, + "rewards/kidney_reward/mean": -0.40763381123542786, + "rewards/kidney_reward/std": 1.3715983629226685, + "rewards/length2tails_reward/mean": 0.8609472513198853, + "rewards/length2tails_reward/std": 0.2464415431022644, + "rewards/thermo_reward/mean": -0.20114412903785706, + "rewards/thermo_reward/std": 2.1375131607055664, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 269.53125, + "completions/mean_terminated_length": 269.53125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1285235472023487, + "epoch": 0.744, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.580424964427948, + "learning_rate": 3.8985483609873236e-07, + "loss": 0.0101, + "num_tokens": 3258612.0, + "reward": 6.584819316864014, + "reward_std": 1.7636280059814453, + "rewards/fitness_reward/mean": 6.2823309898376465, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": 0.24670317769050598, + "rewards/kidney_reward/std": 1.4007341861724854, + "rewards/length2tails_reward/mean": 0.6969554424285889, + "rewards/length2tails_reward/std": 0.3184390068054199, + "rewards/thermo_reward/mean": 0.00979556143283844, + "rewards/thermo_reward/std": 2.098444700241089, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 269.25, + "completions/mean_terminated_length": 269.25, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.1606553541496396, + "epoch": 0.746, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34814190864563, + "learning_rate": 3.843385246743417e-07, + "loss": -0.006, + "num_tokens": 3267260.0, + "reward": 5.879519939422607, + "reward_std": 3.6545047760009766, + "rewards/fitness_reward/mean": 5.286998271942139, + "rewards/fitness_reward/std": 3.4983344078063965, + "rewards/kidney_reward/mean": 0.09259563684463501, + "rewards/kidney_reward/std": 1.2750647068023682, + "rewards/length2tails_reward/mean": 0.7489024996757507, + "rewards/length2tails_reward/std": 0.29031893610954285, + "rewards/thermo_reward/mean": 0.7179964780807495, + "rewards/thermo_reward/std": 1.3379138708114624, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13412442337721586, + "epoch": 0.748, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4042009711265564, + "learning_rate": 3.788522197216897e-07, + "loss": -0.0003, + "num_tokens": 3275978.0, + "reward": 6.59251594543457, + "reward_std": 1.5096096992492676, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.1605461984872818, + "rewards/kidney_reward/std": 1.2568833827972412, + "rewards/length2tails_reward/mean": 0.8646030426025391, + "rewards/length2tails_reward/std": 0.2068766951560974, + "rewards/thermo_reward/mean": 0.14263558387756348, + "rewards/thermo_reward/std": 1.953226923942566, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 282.03125, + "completions/mean_terminated_length": 282.03125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.17999477125704288, + "epoch": 0.75, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6332778930664062, + "learning_rate": 3.7339618863553976e-07, + "loss": 0.1251, + "num_tokens": 3285035.0, + "reward": 6.153130054473877, + "reward_std": 3.041475534439087, + "rewards/fitness_reward/mean": 5.8121442794799805, + "rewards/fitness_reward/std": 2.697699546813965, + "rewards/kidney_reward/mean": 0.07375882565975189, + "rewards/kidney_reward/std": 1.289566159248352, + "rewards/length2tails_reward/mean": 0.8197746276855469, + "rewards/length2tails_reward/std": 0.26130786538124084, + "rewards/thermo_reward/mean": 0.19832536578178406, + "rewards/thermo_reward/std": 1.8575512170791626, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13965389877557755, + "epoch": 0.752, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7165160775184631, + "learning_rate": 3.679706973351491e-07, + "loss": 0.0033, + "num_tokens": 3293718.0, + "reward": 5.530478477478027, + "reward_std": 2.9073171615600586, + "rewards/fitness_reward/mean": 5.87794828414917, + "rewards/fitness_reward/std": 2.8070197105407715, + "rewards/kidney_reward/mean": -0.1837131381034851, + "rewards/kidney_reward/std": 1.370072603225708, + "rewards/length2tails_reward/mean": 0.7681834697723389, + "rewards/length2tails_reward/std": 0.33813896775245667, + "rewards/thermo_reward/mean": -0.8953179121017456, + "rewards/thermo_reward/std": 1.9587002992630005, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12181603256613016, + "epoch": 0.754, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47920435667037964, + "learning_rate": 3.625760102513102e-07, + "loss": -0.0005, + "num_tokens": 3302416.0, + "reward": 5.9532647132873535, + "reward_std": 2.242292642593384, + "rewards/fitness_reward/mean": 6.098564147949219, + "rewards/fitness_reward/std": 1.779414176940918, + "rewards/kidney_reward/mean": -0.08578507602214813, + "rewards/kidney_reward/std": 1.4096269607543945, + "rewards/length2tails_reward/mean": 0.7863004803657532, + "rewards/length2tails_reward/std": 0.3195054829120636, + "rewards/thermo_reward/mean": -0.5979645252227783, + "rewards/thermo_reward/std": 1.8952313661575317, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 273.5, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.15209331456571817, + "epoch": 0.756, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3993508815765381, + "learning_rate": 3.5721239031346063e-07, + "loss": 0.004, + "num_tokens": 3311200.0, + "reward": 6.885627746582031, + "reward_std": 1.4935102462768555, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.03521929681301117, + "rewards/kidney_reward/std": 1.4940382242202759, + "rewards/length2tails_reward/mean": 0.8114314079284668, + "rewards/length2tails_reward/std": 0.22135481238365173, + "rewards/thermo_reward/mean": 0.42414015531539917, + "rewards/thermo_reward/std": 1.7999176979064941, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.53125, + "completions/mean_terminated_length": 269.53125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.11956506315618753, + "epoch": 0.758, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5052749514579773, + "learning_rate": 3.518800989368691e-07, + "loss": -0.0054, + "num_tokens": 3319857.0, + "reward": 6.8014068603515625, + "reward_std": 1.4542784690856934, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.09797079861164093, + "rewards/kidney_reward/std": 1.2850091457366943, + "rewards/length2tails_reward/mean": 0.735588014125824, + "rewards/length2tails_reward/std": 0.2957015931606293, + "rewards/thermo_reward/mean": 0.356372594833374, + "rewards/thermo_reward/std": 1.8807733058929443, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 263.03125, + "completions/mean_terminated_length": 263.03125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 0.13626226875931025, + "epoch": 0.76, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4649805426597595, + "learning_rate": 3.465793960098945e-07, + "loss": -0.1314, + "num_tokens": 3328306.0, + "reward": 6.379859924316406, + "reward_std": 2.2455568313598633, + "rewards/fitness_reward/mean": 6.234853744506836, + "rewards/fitness_reward/std": 2.0163543224334717, + "rewards/kidney_reward/mean": -0.25092077255249023, + "rewards/kidney_reward/std": 1.4039207696914673, + "rewards/length2tails_reward/mean": 0.8077023029327393, + "rewards/length2tails_reward/std": 0.21923108398914337, + "rewards/thermo_reward/mean": 0.13708209991455078, + "rewards/thermo_reward/std": 1.9737155437469482, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 286.875, + "completions/mean_terminated_length": 286.875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.15023626573383808, + "epoch": 0.762, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.262868881225586, + "learning_rate": 3.4131053988131944e-07, + "loss": 0.1763, + "num_tokens": 3337518.0, + "reward": 6.012147903442383, + "reward_std": 2.5567662715911865, + "rewards/fitness_reward/mean": 5.7760162353515625, + "rewards/fitness_reward/std": 2.488330602645874, + "rewards/kidney_reward/mean": 0.29532673954963684, + "rewards/kidney_reward/std": 1.3476054668426514, + "rewards/length2tails_reward/mean": 0.8109618425369263, + "rewards/length2tails_reward/std": 0.2880936563014984, + "rewards/thermo_reward/mean": -0.22854575514793396, + "rewards/thermo_reward/std": 2.024935007095337, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 266.65625, + "completions/mean_terminated_length": 266.65625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.19405508507043123, + "epoch": 0.764, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8079236149787903, + "learning_rate": 3.3607378734775837e-07, + "loss": -0.0455, + "num_tokens": 3346083.0, + "reward": 6.148028373718262, + "reward_std": 2.541191339492798, + "rewards/fitness_reward/mean": 5.762939453125, + "rewards/fitness_reward/std": 2.530473470687866, + "rewards/kidney_reward/mean": -0.24977697432041168, + "rewards/kidney_reward/std": 1.4287378787994385, + "rewards/length2tails_reward/mean": 0.8520157337188721, + "rewards/length2tails_reward/std": 0.20463883876800537, + "rewards/thermo_reward/mean": 0.5939469337463379, + "rewards/thermo_reward/std": 1.7890620231628418, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 265.0, + "completions/mean_terminated_length": 265.0, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.1499976934865117, + "epoch": 0.766, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8777826428413391, + "learning_rate": 3.308693936411421e-07, + "loss": -0.1306, + "num_tokens": 3354595.0, + "reward": 6.628800392150879, + "reward_std": 2.6668899059295654, + "rewards/fitness_reward/mean": 6.241194725036621, + "rewards/fitness_reward/std": 1.9804837703704834, + "rewards/kidney_reward/mean": 0.6248332262039185, + "rewards/kidney_reward/std": 1.411794900894165, + "rewards/length2tails_reward/mean": 0.7870715856552124, + "rewards/length2tails_reward/std": 0.24466806650161743, + "rewards/thermo_reward/mean": -0.2431577444076538, + "rewards/thermo_reward/std": 1.7822825908660889, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 265.9375, + "completions/mean_terminated_length": 265.9375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.14082825370132923, + "epoch": 0.768, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2032965421676636, + "learning_rate": 3.256976124162769e-07, + "loss": -0.0621, + "num_tokens": 3363137.0, + "reward": 6.046353340148926, + "reward_std": 3.363234519958496, + "rewards/fitness_reward/mean": 5.761190414428711, + "rewards/fitness_reward/std": 2.893301486968994, + "rewards/kidney_reward/mean": -0.21231193840503693, + "rewards/kidney_reward/std": 1.2381178140640259, + "rewards/length2tails_reward/mean": 0.7668105363845825, + "rewards/length2tails_reward/std": 0.3085991144180298, + "rewards/thermo_reward/mean": 0.39923232793807983, + "rewards/thermo_reward/std": 1.8276801109313965, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.3125, + "completions/mean_terminated_length": 270.3125, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.13472178112715483, + "epoch": 0.77, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.536989450454712, + "learning_rate": 3.205586957384837e-07, + "loss": -0.0119, + "num_tokens": 3371819.0, + "reward": 7.058485984802246, + "reward_std": 2.452873945236206, + "rewards/fitness_reward/mean": 6.2134294509887695, + "rewards/fitness_reward/std": 2.1375479698181152, + "rewards/kidney_reward/mean": 0.4976133704185486, + "rewards/kidney_reward/std": 1.3583341836929321, + "rewards/length2tails_reward/mean": 0.7949094176292419, + "rewards/length2tails_reward/std": 0.30168935656547546, + "rewards/thermo_reward/mean": 0.7950452566146851, + "rewards/thermo_reward/std": 1.6629666090011597, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 273.96875, + "completions/mean_terminated_length": 273.96875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.16182382125407457, + "epoch": 0.772, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.567378282546997, + "learning_rate": 3.154528940713113e-07, + "loss": 0.0299, + "num_tokens": 3380618.0, + "reward": 6.462051868438721, + "reward_std": 2.986990451812744, + "rewards/fitness_reward/mean": 5.870641708374023, + "rewards/fitness_reward/std": 2.8402304649353027, + "rewards/kidney_reward/mean": 0.1701788753271103, + "rewards/kidney_reward/std": 1.3231279850006104, + "rewards/length2tails_reward/mean": 0.7905040979385376, + "rewards/length2tails_reward/std": 0.29682648181915283, + "rewards/thermo_reward/mean": 0.6173891425132751, + "rewards/thermo_reward/std": 1.581768274307251, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 279.5625, + "completions/mean_terminated_length": 279.5625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.1641734791919589, + "epoch": 0.774, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5476036071777344, + "learning_rate": 3.103804562643302e-07, + "loss": 0.0692, + "num_tokens": 3389596.0, + "reward": 5.573647499084473, + "reward_std": 4.065038204193115, + "rewards/fitness_reward/mean": 5.163810729980469, + "rewards/fitness_reward/std": 3.8489232063293457, + "rewards/kidney_reward/mean": -0.38959816098213196, + "rewards/kidney_reward/std": 1.1808894872665405, + "rewards/length2tails_reward/mean": 0.9047496318817139, + "rewards/length2tails_reward/std": 0.14699599146842957, + "rewards/thermo_reward/mean": 0.7568966150283813, + "rewards/thermo_reward/std": 1.7143446207046509, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12199778575450182, + "epoch": 0.776, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44223248958587646, + "learning_rate": 3.0534162954100263e-07, + "loss": 0.0034, + "num_tokens": 3398291.0, + "reward": 5.6734089851379395, + "reward_std": 2.7685587406158447, + "rewards/fitness_reward/mean": 5.9041361808776855, + "rewards/fitness_reward/std": 2.3458354473114014, + "rewards/kidney_reward/mean": -0.28289884328842163, + "rewards/kidney_reward/std": 1.5721582174301147, + "rewards/length2tails_reward/mean": 0.7498540878295898, + "rewards/length2tails_reward/std": 0.33232712745666504, + "rewards/thermo_reward/mean": -0.5534823536872864, + "rewards/thermo_reward/std": 2.1388351917266846, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 285.09375, + "completions/mean_terminated_length": 285.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1719574062153697, + "epoch": 0.778, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5087924003601074, + "learning_rate": 3.0033665948663446e-07, + "loss": 0.0468, + "num_tokens": 3407446.0, + "reward": 5.829162120819092, + "reward_std": 2.9063830375671387, + "rewards/fitness_reward/mean": 5.848479270935059, + "rewards/fitness_reward/std": 2.5653743743896484, + "rewards/kidney_reward/mean": -0.4199415445327759, + "rewards/kidney_reward/std": 1.430213451385498, + "rewards/length2tails_reward/mean": 0.8508896827697754, + "rewards/length2tails_reward/std": 0.21593841910362244, + "rewards/thermo_reward/mean": -0.04413709044456482, + "rewards/thermo_reward/std": 2.0255866050720215, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 269.6875, + "completions/mean_terminated_length": 269.6875, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "entropy": 0.13086803443729877, + "epoch": 0.78, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5239703059196472, + "learning_rate": 2.9536579003640527e-07, + "loss": 0.014, + "num_tokens": 3416108.0, + "reward": 6.765872955322266, + "reward_std": 1.354516863822937, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.16182968020439148, + "rewards/kidney_reward/std": 1.341886281967163, + "rewards/length2tails_reward/mean": 0.7907084226608276, + "rewards/length2tails_reward/std": 0.25706955790519714, + "rewards/thermo_reward/mean": -0.20803500711917877, + "rewards/thermo_reward/std": 2.00783109664917, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 274.8125, + "completions/mean_terminated_length": 274.8125, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.13017123751342297, + "epoch": 0.782, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.406864881515503, + "learning_rate": 2.904292634634793e-07, + "loss": 0.0429, + "num_tokens": 3424934.0, + "reward": 5.226607799530029, + "reward_std": 3.46649432182312, + "rewards/fitness_reward/mean": 5.334480285644531, + "rewards/fitness_reward/std": 3.353198289871216, + "rewards/kidney_reward/mean": -0.2561941146850586, + "rewards/kidney_reward/std": 1.300828456878662, + "rewards/length2tails_reward/mean": 0.7608662247657776, + "rewards/length2tails_reward/std": 0.3362342417240143, + "rewards/thermo_reward/mean": -0.3399842083454132, + "rewards/thermo_reward/std": 1.951754093170166, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.34375, + "completions/mean_terminated_length": 269.34375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13058719597756863, + "epoch": 0.784, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5075511932373047, + "learning_rate": 2.8552732036719684e-07, + "loss": 0.0045, + "num_tokens": 3433585.0, + "reward": 6.7212090492248535, + "reward_std": 1.4675315618515015, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.09525611996650696, + "rewards/kidney_reward/std": 1.6818774938583374, + "rewards/length2tails_reward/mean": 0.720190167427063, + "rewards/length2tails_reward/std": 0.320951372385025, + "rewards/thermo_reward/mean": -0.19552983343601227, + "rewards/thermo_reward/std": 1.9775919914245605, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 269.40625, + "completions/mean_terminated_length": 269.40625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.12077728472650051, + "epoch": 0.786, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.332504004240036, + "learning_rate": 2.8066019966134904e-07, + "loss": -0.0, + "num_tokens": 3442238.0, + "reward": 6.944338798522949, + "reward_std": 1.4273179769515991, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.1750543713569641, + "rewards/kidney_reward/std": 1.3548657894134521, + "rewards/length2tails_reward/mean": 0.7342120409011841, + "rewards/length2tails_reward/std": 0.28480300307273865, + "rewards/thermo_reward/mean": 0.369899183511734, + "rewards/thermo_reward/std": 1.6564077138900757, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 277.9375, + "completions/mean_terminated_length": 277.9375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.1527182124555111, + "epoch": 0.788, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.405423164367676, + "learning_rate": 2.758281385625327e-07, + "loss": 0.0836, + "num_tokens": 3451164.0, + "reward": 6.0525336265563965, + "reward_std": 2.963632583618164, + "rewards/fitness_reward/mean": 5.772695541381836, + "rewards/fitness_reward/std": 2.850486993789673, + "rewards/kidney_reward/mean": -0.15535932779312134, + "rewards/kidney_reward/std": 1.2924243211746216, + "rewards/length2tails_reward/mean": 0.7899340987205505, + "rewards/length2tails_reward/std": 0.22973912954330444, + "rewards/thermo_reward/mean": 0.3200690448284149, + "rewards/thermo_reward/std": 1.8255976438522339, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 258.84375, + "completions/mean_terminated_length": 258.84375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "entropy": 0.14717209991067648, + "epoch": 0.79, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2791216373443604, + "learning_rate": 2.7103137257858863e-07, + "loss": -0.1568, + "num_tokens": 3459479.0, + "reward": 6.065085411071777, + "reward_std": 3.130974769592285, + "rewards/fitness_reward/mean": 5.851991653442383, + "rewards/fitness_reward/std": 2.909266948699951, + "rewards/kidney_reward/mean": 0.005277007818222046, + "rewards/kidney_reward/std": 1.342024564743042, + "rewards/length2tails_reward/mean": 0.7929951548576355, + "rewards/length2tails_reward/std": 0.24119798839092255, + "rewards/thermo_reward/mean": 0.024412035942077637, + "rewards/thermo_reward/std": 1.9675092697143555, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 281.5, + "completions/mean_terminated_length": 281.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.136367273516953, + "epoch": 0.792, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6745145916938782, + "learning_rate": 2.662701354971235e-07, + "loss": -0.0171, + "num_tokens": 3468519.0, + "reward": 6.343623161315918, + "reward_std": 1.3072080612182617, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.27624326944351196, + "rewards/kidney_reward/std": 1.1385921239852905, + "rewards/length2tails_reward/mean": 0.847183883190155, + "rewards/length2tails_reward/std": 0.2894735038280487, + "rewards/thermo_reward/mean": -0.23074282705783844, + "rewards/thermo_reward/std": 2.1387784481048584, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 275.71875, + "completions/mean_terminated_length": 275.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.17113555409014225, + "epoch": 0.794, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1915013790130615, + "learning_rate": 2.615446593741161e-07, + "loss": 0.0208, + "num_tokens": 3477374.0, + "reward": 7.070910453796387, + "reward_std": 1.5382869243621826, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.2943302392959595, + "rewards/kidney_reward/std": 1.565941333770752, + "rewards/length2tails_reward/mean": 0.8764208555221558, + "rewards/length2tails_reward/std": 0.21900318562984467, + "rewards/thermo_reward/mean": 0.43266230821609497, + "rewards/thermo_reward/std": 1.9067713022232056, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 268.25, + "completions/mean_terminated_length": 268.25, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.14445937052369118, + "epoch": 0.796, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3606030941009521, + "learning_rate": 2.568551745226056e-07, + "loss": -0.0315, + "num_tokens": 3485990.0, + "reward": 6.238191604614258, + "reward_std": 2.30902361869812, + "rewards/fitness_reward/mean": 6.231675148010254, + "rewards/fitness_reward/std": 2.0343332290649414, + "rewards/kidney_reward/mean": -0.2886021137237549, + "rewards/kidney_reward/std": 1.3605780601501465, + "rewards/length2tails_reward/mean": 0.777554452419281, + "rewards/length2tails_reward/std": 0.2704724073410034, + "rewards/thermo_reward/mean": -0.08714352548122406, + "rewards/thermo_reward/std": 1.8821381330490112, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14623204432427883, + "epoch": 0.798, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5222352743148804, + "learning_rate": 2.5220190950146827e-07, + "loss": 0.0007, + "num_tokens": 3494746.0, + "reward": 6.362916946411133, + "reward_std": 2.5356452465057373, + "rewards/fitness_reward/mean": 6.1964030265808105, + "rewards/fitness_reward/std": 1.7332172393798828, + "rewards/kidney_reward/mean": 0.039355117827653885, + "rewards/kidney_reward/std": 1.2564246654510498, + "rewards/length2tails_reward/mean": 0.846625804901123, + "rewards/length2tails_reward/std": 0.27086248993873596, + "rewards/thermo_reward/mean": -0.12964069843292236, + "rewards/thermo_reward/std": 2.138139486312866, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 270.03125, + "completions/mean_terminated_length": 270.03125, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.1653392855077982, + "epoch": 0.8, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1060235500335693, + "learning_rate": 2.4758509110427573e-07, + "loss": 0.0005, + "num_tokens": 3503419.0, + "reward": 5.930359840393066, + "reward_std": 3.656620979309082, + "rewards/fitness_reward/mean": 5.391902446746826, + "rewards/fitness_reward/std": 3.479548931121826, + "rewards/kidney_reward/mean": 0.09498319774866104, + "rewards/kidney_reward/std": 1.3903887271881104, + "rewards/length2tails_reward/mean": 0.817430317401886, + "rewards/length2tails_reward/std": 0.23707036674022675, + "rewards/thermo_reward/mean": 0.5732157230377197, + "rewards/thermo_reward/std": 1.8132132291793823, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 275.90625, + "completions/mean_terminated_length": 275.90625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13901573978364468, + "epoch": 0.802, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8850361704826355, + "learning_rate": 2.430049443482437e-07, + "loss": 0.0059, + "num_tokens": 3512280.0, + "reward": 6.054816722869873, + "reward_std": 2.607663154602051, + "rewards/fitness_reward/mean": 6.111670017242432, + "rewards/fitness_reward/std": 2.1906065940856934, + "rewards/kidney_reward/mean": -0.3588145971298218, + "rewards/kidney_reward/std": 1.2051351070404053, + "rewards/length2tails_reward/mean": 0.7842074632644653, + "rewards/length2tails_reward/std": 0.32192572951316833, + "rewards/thermo_reward/mean": -0.1469959169626236, + "rewards/thermo_reward/std": 2.0782878398895264, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12296220194548368, + "epoch": 0.804, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.912596583366394, + "learning_rate": 2.384616924632634e-07, + "loss": 0.0053, + "num_tokens": 3520975.0, + "reward": 6.582754135131836, + "reward_std": 1.5355113744735718, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.06718573719263077, + "rewards/kidney_reward/std": 1.3254574537277222, + "rewards/length2tails_reward/mean": 0.7945810556411743, + "rewards/length2tails_reward/std": 0.2715069651603699, + "rewards/thermo_reward/mean": 0.06476283073425293, + "rewards/thermo_reward/std": 1.9420396089553833, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 279.1875, + "completions/mean_terminated_length": 279.1875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.14521227590739727, + "epoch": 0.806, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1336517333984375, + "learning_rate": 2.339555568810221e-07, + "loss": 0.0963, + "num_tokens": 3529941.0, + "reward": 5.5686492919921875, + "reward_std": 3.8480067253112793, + "rewards/fitness_reward/mean": 5.522449493408203, + "rewards/fitness_reward/std": 3.398782968521118, + "rewards/kidney_reward/mean": -0.2542445957660675, + "rewards/kidney_reward/std": 1.4892573356628418, + "rewards/length2tails_reward/mean": 0.8749173283576965, + "rewards/length2tails_reward/std": 0.15462607145309448, + "rewards/thermo_reward/mean": -0.09081444144248962, + "rewards/thermo_reward/std": 1.7850987911224365, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 264.40625, + "completions/mean_terminated_length": 264.40625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.1401141695678234, + "epoch": 0.808, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9227191209793091, + "learning_rate": 2.2948675722421085e-07, + "loss": -0.1072, + "num_tokens": 3538434.0, + "reward": 6.522500038146973, + "reward_std": 3.04559588432312, + "rewards/fitness_reward/mean": 5.933748245239258, + "rewards/fitness_reward/std": 2.5987942218780518, + "rewards/kidney_reward/mean": 0.2467244416475296, + "rewards/kidney_reward/std": 1.4387593269348145, + "rewards/length2tails_reward/mean": 0.7946808934211731, + "rewards/length2tails_reward/std": 0.28842639923095703, + "rewards/thermo_reward/mean": 0.5334376096725464, + "rewards/thermo_reward/std": 1.7454192638397217, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1552875665947795, + "epoch": 0.81, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8183427453041077, + "learning_rate": 2.2505551129582046e-07, + "loss": 0.0279, + "num_tokens": 3547206.0, + "reward": 6.446796417236328, + "reward_std": 3.105996608734131, + "rewards/fitness_reward/mean": 5.8662495613098145, + "rewards/fitness_reward/std": 2.4916253089904785, + "rewards/kidney_reward/mean": 0.27844473719596863, + "rewards/kidney_reward/std": 1.3675066232681274, + "rewards/length2tails_reward/mean": 0.8420401811599731, + "rewards/length2tails_reward/std": 0.23425650596618652, + "rewards/thermo_reward/mean": 0.46162861585617065, + "rewards/thermo_reward/std": 1.8574618101119995, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 276.5, + "completions/mean_terminated_length": 276.5, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13848723284900188, + "epoch": 0.812, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2599036693572998, + "learning_rate": 2.2066203506852564e-07, + "loss": 0.0547, + "num_tokens": 3556086.0, + "reward": 6.115935802459717, + "reward_std": 2.4688777923583984, + "rewards/fitness_reward/mean": 5.774024486541748, + "rewards/fitness_reward/std": 2.4868690967559814, + "rewards/kidney_reward/mean": 0.08403107523918152, + "rewards/kidney_reward/std": 1.442877173423767, + "rewards/length2tails_reward/mean": 0.7783209085464478, + "rewards/length2tails_reward/std": 0.3148162364959717, + "rewards/thermo_reward/mean": 0.21063126623630524, + "rewards/thermo_reward/std": 1.992857575416565, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1279955366626382, + "epoch": 0.814, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7296998500823975, + "learning_rate": 2.1630654267416026e-07, + "loss": 0.0067, + "num_tokens": 3564817.0, + "reward": 5.637965202331543, + "reward_std": 3.0680017471313477, + "rewards/fitness_reward/mean": 5.852019309997559, + "rewards/fitness_reward/std": 2.9153757095336914, + "rewards/kidney_reward/mean": -0.3093605637550354, + "rewards/kidney_reward/std": 1.314058780670166, + "rewards/length2tails_reward/mean": 0.8313709497451782, + "rewards/length2tails_reward/std": 0.2533099949359894, + "rewards/thermo_reward/mean": -0.5344333648681641, + "rewards/thermo_reward/std": 2.188082695007324, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 266.8125, + "completions/mean_terminated_length": 266.8125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.14575813338160515, + "epoch": 0.816, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0772520303726196, + "learning_rate": 2.1198924639327808e-07, + "loss": -0.0441, + "num_tokens": 3573387.0, + "reward": 6.009997367858887, + "reward_std": 2.9660511016845703, + "rewards/fitness_reward/mean": 5.847411155700684, + "rewards/fitness_reward/std": 2.5736851692199707, + "rewards/kidney_reward/mean": -0.3084346354007721, + "rewards/kidney_reward/std": 1.4720526933670044, + "rewards/length2tails_reward/mean": 0.8826963901519775, + "rewards/length2tails_reward/std": 0.2083863615989685, + "rewards/thermo_reward/mean": 0.19225841760635376, + "rewards/thermo_reward/std": 1.89582359790802, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 267.78125, + "completions/mean_terminated_length": 267.78125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.1622565258294344, + "epoch": 0.818, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0969433784484863, + "learning_rate": 2.077103566448094e-07, + "loss": -0.0479, + "num_tokens": 3581988.0, + "reward": 6.01772403717041, + "reward_std": 3.2350380420684814, + "rewards/fitness_reward/mean": 5.869783401489258, + "rewards/fitness_reward/std": 2.8406243324279785, + "rewards/kidney_reward/mean": 0.10627858340740204, + "rewards/kidney_reward/std": 1.4346028566360474, + "rewards/length2tails_reward/mean": 0.8081914186477661, + "rewards/length2tails_reward/std": 0.26256439089775085, + "rewards/thermo_reward/mean": -0.21449324488639832, + "rewards/thermo_reward/std": 2.0515637397766113, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.12715268693864346, + "epoch": 0.82, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2996414303779602, + "learning_rate": 2.0347008197580372e-07, + "loss": 0.0048, + "num_tokens": 3590660.0, + "reward": 6.63413667678833, + "reward_std": 1.4439584016799927, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.29404324293136597, + "rewards/kidney_reward/std": 1.3528603315353394, + "rewards/length2tails_reward/mean": 0.74732905626297, + "rewards/length2tails_reward/std": 0.3145018517971039, + "rewards/thermo_reward/mean": -0.37605229020118713, + "rewards/thermo_reward/std": 2.162081241607666, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 278.21875, + "completions/mean_terminated_length": 278.21875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.19567361939698458, + "epoch": 0.822, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5846195220947266, + "learning_rate": 1.9926862905126663e-07, + "loss": 0.0862, + "num_tokens": 3599595.0, + "reward": 6.507219314575195, + "reward_std": 2.83516788482666, + "rewards/fitness_reward/mean": 5.774267196655273, + "rewards/fitness_reward/std": 2.844433546066284, + "rewards/kidney_reward/mean": 0.39035794138908386, + "rewards/kidney_reward/std": 1.258296012878418, + "rewards/length2tails_reward/mean": 0.7463816404342651, + "rewards/length2tails_reward/std": 0.27894169092178345, + "rewards/thermo_reward/mean": 0.702355146408081, + "rewards/thermo_reward/std": 1.6959781646728516, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 273.53125, + "completions/mean_terminated_length": 273.53125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13486727699637413, + "epoch": 0.824, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4812967777252197, + "learning_rate": 1.9510620264408594e-07, + "loss": 0.0737, + "num_tokens": 3608380.0, + "reward": 6.2867326736450195, + "reward_std": 2.4341838359832764, + "rewards/fitness_reward/mean": 6.129810333251953, + "rewards/fitness_reward/std": 2.0918588638305664, + "rewards/kidney_reward/mean": -0.017805874347686768, + "rewards/kidney_reward/std": 1.2995150089263916, + "rewards/length2tails_reward/mean": 0.6775492429733276, + "rewards/length2tails_reward/std": 0.3680454194545746, + "rewards/thermo_reward/mean": -0.007123976945877075, + "rewards/thermo_reward/std": 1.8101736307144165, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12293026130646467, + "epoch": 0.826, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8246999382972717, + "learning_rate": 1.9098300562505264e-07, + "loss": -0.0029, + "num_tokens": 3617104.0, + "reward": 6.208252906799316, + "reward_std": 1.9879117012023926, + "rewards/fitness_reward/mean": 6.179342269897461, + "rewards/fitness_reward/std": 1.1073734760284424, + "rewards/kidney_reward/mean": 0.3767698407173157, + "rewards/kidney_reward/std": 1.368463397026062, + "rewards/length2tails_reward/mean": 0.8074749708175659, + "rewards/length2tails_reward/std": 0.30307063460350037, + "rewards/thermo_reward/mean": -0.7226856350898743, + "rewards/thermo_reward/std": 2.1550796031951904, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 274.21875, + "completions/mean_terminated_length": 274.21875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14080187398940325, + "epoch": 0.828, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4719889461994171, + "learning_rate": 1.8689923895297244e-07, + "loss": 0.0025, + "num_tokens": 3625911.0, + "reward": 6.903785705566406, + "reward_std": 1.76779305934906, + "rewards/fitness_reward/mean": 6.2823309898376465, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": 0.25635695457458496, + "rewards/kidney_reward/std": 1.4129506349563599, + "rewards/length2tails_reward/mean": 0.7412567734718323, + "rewards/length2tails_reward/std": 0.2820318043231964, + "rewards/thermo_reward/mean": 0.6159244775772095, + "rewards/thermo_reward/std": 1.7575407028198242, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14221271220594645, + "epoch": 0.83, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3563969135284424, + "learning_rate": 1.828551016648715e-07, + "loss": -0.0005, + "num_tokens": 3634626.0, + "reward": 6.651411056518555, + "reward_std": 2.018911838531494, + "rewards/fitness_reward/mean": 6.2043962478637695, + "rewards/fitness_reward/std": 1.6906987428665161, + "rewards/kidney_reward/mean": 0.12795323133468628, + "rewards/kidney_reward/std": 1.3059492111206055, + "rewards/length2tails_reward/mean": 0.8251470327377319, + "rewards/length2tails_reward/std": 0.25611862540245056, + "rewards/thermo_reward/mean": 0.3535034656524658, + "rewards/thermo_reward/std": 1.824897289276123, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12413936480879784, + "epoch": 0.832, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3519450426101685, + "learning_rate": 1.7885079086629596e-07, + "loss": -0.0, + "num_tokens": 3643309.0, + "reward": 6.099939346313477, + "reward_std": 2.4669342041015625, + "rewards/fitness_reward/mean": 6.108650207519531, + "rewards/fitness_reward/std": 2.2070798873901367, + "rewards/kidney_reward/mean": -0.1856500655412674, + "rewards/kidney_reward/std": 1.055111289024353, + "rewards/length2tails_reward/mean": 0.8407902717590332, + "rewards/length2tails_reward/std": 0.207689568400383, + "rewards/thermo_reward/mean": -0.2521669864654541, + "rewards/thermo_reward/std": 1.982387661933899, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13434513751417398, + "epoch": 0.834, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5159134268760681, + "learning_rate": 1.7488650172170493e-07, + "loss": 0.0032, + "num_tokens": 3651993.0, + "reward": 6.955452919006348, + "reward_std": 1.0383720397949219, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.2530665397644043, + "rewards/kidney_reward/std": 1.161150336265564, + "rewards/length2tails_reward/mean": 0.7715202569961548, + "rewards/length2tails_reward/std": 0.2859238386154175, + "rewards/thermo_reward/mean": 0.5956157445907593, + "rewards/thermo_reward/std": 1.7789171934127808, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.21875, + "completions/mean_terminated_length": 269.21875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.12625315971672535, + "epoch": 0.836, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3186589479446411, + "learning_rate": 1.7096242744495838e-07, + "loss": -0.0, + "num_tokens": 3660640.0, + "reward": 6.766183853149414, + "reward_std": 1.2357929944992065, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.09876012057065964, + "rewards/kidney_reward/std": 1.3812872171401978, + "rewards/length2tails_reward/mean": 0.683132529258728, + "rewards/length2tails_reward/std": 0.3223157525062561, + "rewards/thermo_reward/mean": 0.10696518421173096, + "rewards/thermo_reward/std": 1.9921863079071045, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.1875, + "completions/mean_terminated_length": 270.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13841083087027073, + "epoch": 0.838, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7905070185661316, + "learning_rate": 1.6707875928990056e-07, + "loss": 0.0008, + "num_tokens": 3669318.0, + "reward": 6.6396074295043945, + "reward_std": 1.6409484148025513, + "rewards/fitness_reward/mean": 6.316043376922607, + "rewards/fitness_reward/std": 1.5570749044418335, + "rewards/kidney_reward/mean": -0.39684462547302246, + "rewards/kidney_reward/std": 1.2976994514465332, + "rewards/length2tails_reward/mean": 0.8042482137680054, + "rewards/length2tails_reward/std": 0.1898868829011917, + "rewards/thermo_reward/mean": 0.6418485045433044, + "rewards/thermo_reward/std": 1.550958514213562, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 273.78125, + "completions/mean_terminated_length": 273.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14957595337182283, + "epoch": 0.84, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3791979551315308, + "learning_rate": 1.6323568654103837e-07, + "loss": 0.0222, + "num_tokens": 3678111.0, + "reward": 5.700117588043213, + "reward_std": 3.136606454849243, + "rewards/fitness_reward/mean": 5.612381935119629, + "rewards/fitness_reward/std": 3.1102912425994873, + "rewards/kidney_reward/mean": 0.09526537358760834, + "rewards/kidney_reward/std": 1.4866007566452026, + "rewards/length2tails_reward/mean": 0.8351494073867798, + "rewards/length2tails_reward/std": 0.23856548964977264, + "rewards/thermo_reward/mean": -0.3373691439628601, + "rewards/thermo_reward/std": 2.122860908508301, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13295627105981112, + "epoch": 0.842, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.530700445175171, + "learning_rate": 1.5943339650431574e-07, + "loss": 0.0591, + "num_tokens": 3686866.0, + "reward": 6.56540584564209, + "reward_std": 2.4985995292663574, + "rewards/fitness_reward/mean": 6.224242210388184, + "rewards/fitness_reward/std": 2.076380491256714, + "rewards/kidney_reward/mean": 0.10974864661693573, + "rewards/kidney_reward/std": 1.4650508165359497, + "rewards/length2tails_reward/mean": 0.7296964526176453, + "rewards/length2tails_reward/std": 0.291138231754303, + "rewards/thermo_reward/mean": 0.20773005485534668, + "rewards/thermo_reward/std": 1.9620627164840698, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.6875, + "completions/mean_terminated_length": 269.6875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.11998305935412645, + "epoch": 0.844, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5405923128128052, + "learning_rate": 1.5567207449798515e-07, + "loss": -0.0009, + "num_tokens": 3695528.0, + "reward": 6.607175827026367, + "reward_std": 1.4843957424163818, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.03256216645240784, + "rewards/kidney_reward/std": 1.2762181758880615, + "rewards/length2tails_reward/mean": 0.7198657393455505, + "rewards/length2tails_reward/std": 0.3216610252857208, + "rewards/thermo_reward/mean": -0.08963754773139954, + "rewards/thermo_reward/std": 2.1419501304626465, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 280.5, + "completions/mean_terminated_length": 280.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.17389845848083496, + "epoch": 0.846, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9274736642837524, + "learning_rate": 1.5195190384357404e-07, + "loss": 0.086, + "num_tokens": 3704536.0, + "reward": 5.957474231719971, + "reward_std": 2.9784088134765625, + "rewards/fitness_reward/mean": 5.606304168701172, + "rewards/fitness_reward/std": 3.129211187362671, + "rewards/kidney_reward/mean": -0.2096947282552719, + "rewards/kidney_reward/std": 1.2081297636032104, + "rewards/length2tails_reward/mean": 0.8495426774024963, + "rewards/length2tails_reward/std": 0.19200514256954193, + "rewards/thermo_reward/mean": 0.4872628450393677, + "rewards/thermo_reward/std": 1.6905598640441895, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 280.1875, + "completions/mean_terminated_length": 280.1875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.14840978104621172, + "epoch": 0.848, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.710753321647644, + "learning_rate": 1.4827306585695233e-07, + "loss": 0.0106, + "num_tokens": 3713534.0, + "reward": 6.249197006225586, + "reward_std": 2.3880364894866943, + "rewards/fitness_reward/mean": 6.113107681274414, + "rewards/fitness_reward/std": 2.1827657222747803, + "rewards/kidney_reward/mean": 0.1490403264760971, + "rewards/kidney_reward/std": 1.3688663244247437, + "rewards/length2tails_reward/mean": 0.7044593691825867, + "rewards/length2tails_reward/std": 0.3274818956851959, + "rewards/thermo_reward/mean": -0.2290917932987213, + "rewards/thermo_reward/std": 1.8039517402648926, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.875, + "completions/mean_terminated_length": 269.875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1391929117962718, + "epoch": 0.85, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7323298454284668, + "learning_rate": 1.446357398394934e-07, + "loss": 0.0019, + "num_tokens": 3722202.0, + "reward": 6.236641883850098, + "reward_std": 3.057166814804077, + "rewards/fitness_reward/mean": 5.855730056762695, + "rewards/fitness_reward/std": 2.538175344467163, + "rewards/kidney_reward/mean": -0.0438118651509285, + "rewards/kidney_reward/std": 1.222165584564209, + "rewards/length2tails_reward/mean": 0.7789819240570068, + "rewards/length2tails_reward/std": 0.28580957651138306, + "rewards/thermo_reward/mean": 0.41614454984664917, + "rewards/thermo_reward/std": 1.7913116216659546, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 268.46875, + "completions/mean_terminated_length": 268.46875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.15740068443119526, + "epoch": 0.852, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7037556171417236, + "learning_rate": 1.4104010306933555e-07, + "loss": -0.0547, + "num_tokens": 3730825.0, + "reward": 6.620261192321777, + "reward_std": 2.26141357421875, + "rewards/fitness_reward/mean": 6.2323808670043945, + "rewards/fitness_reward/std": 2.030343770980835, + "rewards/kidney_reward/mean": -0.044921278953552246, + "rewards/kidney_reward/std": 1.3782581090927124, + "rewards/length2tails_reward/mean": 0.8388746380805969, + "rewards/length2tails_reward/std": 0.23595750331878662, + "rewards/thermo_reward/mean": 0.4012451767921448, + "rewards/thermo_reward/std": 1.9409761428833008, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 274.59375, + "completions/mean_terminated_length": 274.59375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15207398403435946, + "epoch": 0.854, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2341874837875366, + "learning_rate": 1.3748633079274253e-07, + "loss": -0.0011, + "num_tokens": 3739644.0, + "reward": 6.3321919441223145, + "reward_std": 2.150449275970459, + "rewards/fitness_reward/mean": 6.1863789558410645, + "rewards/fitness_reward/std": 1.7867270708084106, + "rewards/kidney_reward/mean": 0.05245739221572876, + "rewards/kidney_reward/std": 1.3610572814941406, + "rewards/length2tails_reward/mean": 0.8321821093559265, + "rewards/length2tails_reward/std": 0.24851974844932556, + "rewards/thermo_reward/mean": -0.17692288756370544, + "rewards/thermo_reward/std": 2.0968687534332275, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11885499861091375, + "epoch": 0.856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3401205241680145, + "learning_rate": 1.3397459621556128e-07, + "loss": -0.0018, + "num_tokens": 3748321.0, + "reward": 6.152368545532227, + "reward_std": 1.571397066116333, + "rewards/fitness_reward/mean": 6.179342269897461, + "rewards/fitness_reward/std": 1.1073734760284424, + "rewards/kidney_reward/mean": -0.011298850178718567, + "rewards/kidney_reward/std": 1.3151267766952515, + "rewards/length2tails_reward/mean": 0.7107293009757996, + "rewards/length2tails_reward/std": 0.36373162269592285, + "rewards/thermo_reward/mean": -0.3980119228363037, + "rewards/thermo_reward/std": 2.023512363433838, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 265.8125, + "completions/mean_terminated_length": 265.8125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.12992818094789982, + "epoch": 0.858, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.096072793006897, + "learning_rate": 1.30505070494781e-07, + "loss": -0.0726, + "num_tokens": 3756859.0, + "reward": 5.837516784667969, + "reward_std": 2.693760633468628, + "rewards/fitness_reward/mean": 5.796808242797852, + "rewards/fitness_reward/std": 2.362955331802368, + "rewards/kidney_reward/mean": -0.5988284945487976, + "rewards/kidney_reward/std": 1.1490215063095093, + "rewards/length2tails_reward/mean": 0.7456122636795044, + "rewards/length2tails_reward/std": 0.315470427274704, + "rewards/thermo_reward/mean": 0.30743902921676636, + "rewards/thermo_reward/std": 1.7514585256576538, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 278.9375, + "completions/mean_terminated_length": 278.9375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1424407884478569, + "epoch": 0.86, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0920772552490234, + "learning_rate": 1.2707792273019047e-07, + "loss": 0.1266, + "num_tokens": 3765817.0, + "reward": 6.199623107910156, + "reward_std": 2.7672059535980225, + "rewards/fitness_reward/mean": 5.834479808807373, + "rewards/fitness_reward/std": 2.6360130310058594, + "rewards/kidney_reward/mean": 0.03671124577522278, + "rewards/kidney_reward/std": 1.3138879537582397, + "rewards/length2tails_reward/mean": 0.7895512580871582, + "rewards/length2tails_reward/std": 0.25905853509902954, + "rewards/thermo_reward/mean": 0.2987987995147705, + "rewards/thermo_reward/std": 1.9619476795196533, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13846470788121223, + "epoch": 0.862, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2143913507461548, + "learning_rate": 1.2369331995613663e-07, + "loss": -0.0064, + "num_tokens": 3774514.0, + "reward": 6.380249977111816, + "reward_std": 1.4921387434005737, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.5315516591072083, + "rewards/kidney_reward/std": 1.4306668043136597, + "rewards/length2tails_reward/mean": 0.7784464359283447, + "rewards/length2tails_reward/std": 0.31882283091545105, + "rewards/thermo_reward/mean": 0.13218817114830017, + "rewards/thermo_reward/std": 1.781838059425354, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1336890598759055, + "epoch": 0.864, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4450879693031311, + "learning_rate": 1.2035142713338363e-07, + "loss": -0.0037, + "num_tokens": 3783198.0, + "reward": 6.036025524139404, + "reward_std": 2.70249605178833, + "rewards/fitness_reward/mean": 5.733524322509766, + "rewards/fitness_reward/std": 2.6453893184661865, + "rewards/kidney_reward/mean": -0.06320416927337646, + "rewards/kidney_reward/std": 1.3764325380325317, + "rewards/length2tails_reward/mean": 0.6983875036239624, + "rewards/length2tails_reward/std": 0.3537623882293701, + "rewards/thermo_reward/mean": 0.31901222467422485, + "rewards/thermo_reward/std": 1.663163185119629, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 262.09375, + "completions/mean_terminated_length": 262.09375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.14609050378203392, + "epoch": 0.866, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9099003076553345, + "learning_rate": 1.1705240714107301e-07, + "loss": -0.0978, + "num_tokens": 3791617.0, + "reward": 6.100383758544922, + "reward_std": 2.9282727241516113, + "rewards/fitness_reward/mean": 5.8608198165893555, + "rewards/fitness_reward/std": 2.8744940757751465, + "rewards/kidney_reward/mean": 0.06438690423965454, + "rewards/kidney_reward/std": 1.2841964960098267, + "rewards/length2tails_reward/mean": 0.7604968547821045, + "rewards/length2tails_reward/std": 0.29300975799560547, + "rewards/thermo_reward/mean": 0.03449193388223648, + "rewards/thermo_reward/std": 2.0563175678253174, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 276.96875, + "completions/mean_terminated_length": 276.96875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14405822847038507, + "epoch": 0.868, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8847280740737915, + "learning_rate": 1.1379642076878526e-07, + "loss": 0.0476, + "num_tokens": 3800512.0, + "reward": 5.500676155090332, + "reward_std": 2.674482822418213, + "rewards/fitness_reward/mean": 5.6174211502075195, + "rewards/fitness_reward/std": 2.727896213531494, + "rewards/kidney_reward/mean": -0.15667006373405457, + "rewards/kidney_reward/std": 1.4270250797271729, + "rewards/length2tails_reward/mean": 0.7182776927947998, + "rewards/length2tails_reward/std": 0.3376780152320862, + "rewards/thermo_reward/mean": -0.43595871329307556, + "rewards/thermo_reward/std": 2.140556812286377, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.14862105157226324, + "epoch": 0.87, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.339766025543213, + "learning_rate": 1.1058362670870247e-07, + "loss": -0.0016, + "num_tokens": 3809215.0, + "reward": 5.193296432495117, + "reward_std": 3.8562912940979004, + "rewards/fitness_reward/mean": 5.236565589904785, + "rewards/fitness_reward/std": 3.654449701309204, + "rewards/kidney_reward/mean": -0.010497570037841797, + "rewards/kidney_reward/std": 1.438853144645691, + "rewards/length2tails_reward/mean": 0.7963101267814636, + "rewards/length2tails_reward/std": 0.26542168855667114, + "rewards/thermo_reward/mean": -0.4741969704627991, + "rewards/thermo_reward/std": 2.142672061920166, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 282.46875, + "completions/mean_terminated_length": 282.46875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.15248755365610123, + "epoch": 0.872, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1925106048583984, + "learning_rate": 1.074141815478744e-07, + "loss": 0.1164, + "num_tokens": 3818286.0, + "reward": 5.293362140655518, + "reward_std": 3.658482551574707, + "rewards/fitness_reward/mean": 5.286756992340088, + "rewards/fitness_reward/std": 3.4996767044067383, + "rewards/kidney_reward/mean": -0.034492507576942444, + "rewards/kidney_reward/std": 1.3296836614608765, + "rewards/length2tails_reward/mean": 0.8726150989532471, + "rewards/length2tails_reward/std": 0.24276185035705566, + "rewards/thermo_reward/mean": -0.3886043429374695, + "rewards/thermo_reward/std": 1.8644394874572754, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 269.5, + "completions/mean_terminated_length": 269.5, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "entropy": 0.13518050219863653, + "epoch": 0.874, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6139440536499023, + "learning_rate": 1.0428823976058709e-07, + "loss": -0.0233, + "num_tokens": 3826942.0, + "reward": 6.495221138000488, + "reward_std": 2.355567216873169, + "rewards/fitness_reward/mean": 6.006479263305664, + "rewards/fitness_reward/std": 2.2557687759399414, + "rewards/kidney_reward/mean": 0.07885207235813141, + "rewards/kidney_reward/std": 1.2913696765899658, + "rewards/length2tails_reward/mean": 0.7283536195755005, + "rewards/length2tails_reward/std": 0.3239452540874481, + "rewards/thermo_reward/mean": 0.5344558954238892, + "rewards/thermo_reward/std": 1.7547905445098877, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11761743109673262, + "epoch": 0.876, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33359822630882263, + "learning_rate": 1.0120595370083318e-07, + "loss": 0.0066, + "num_tokens": 3835650.0, + "reward": 6.277066230773926, + "reward_std": 1.4569121599197388, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.38881829380989075, + "rewards/kidney_reward/std": 1.234520673751831, + "rewards/length2tails_reward/mean": 0.8334996700286865, + "rewards/length2tails_reward/std": 0.22490544617176056, + "rewards/thermo_reward/mean": -0.45041751861572266, + "rewards/thermo_reward/std": 2.2308945655822754, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14700100850313902, + "epoch": 0.878, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8419653177261353, + "learning_rate": 9.81674735948863e-08, + "loss": 0.0156, + "num_tokens": 3844410.0, + "reward": 6.325191020965576, + "reward_std": 2.3530921936035156, + "rewards/fitness_reward/mean": 6.140649795532227, + "rewards/fitness_reward/std": 2.0330381393432617, + "rewards/kidney_reward/mean": -0.03410997986793518, + "rewards/kidney_reward/std": 1.2392289638519287, + "rewards/length2tails_reward/mean": 0.8348275423049927, + "rewards/length2tails_reward/std": 0.2512573301792145, + "rewards/thermo_reward/mean": -0.014221221208572388, + "rewards/thermo_reward/std": 1.937234878540039, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 261.28125, + "completions/mean_terminated_length": 261.28125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.14893424790352583, + "epoch": 0.88, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.867492437362671, + "learning_rate": 9.517294753398064e-08, + "loss": -0.1308, + "num_tokens": 3852803.0, + "reward": 5.713663101196289, + "reward_std": 3.0692226886749268, + "rewards/fitness_reward/mean": 5.540594100952148, + "rewards/fitness_reward/std": 3.004016637802124, + "rewards/kidney_reward/mean": -0.04113885760307312, + "rewards/kidney_reward/std": 1.2972465753555298, + "rewards/length2tails_reward/mean": 0.8506132960319519, + "rewards/length2tails_reward/std": 0.2541244924068451, + "rewards/thermo_reward/mean": -0.038030415773391724, + "rewards/thermo_reward/std": 1.9679735898971558, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 282.1875, + "completions/mean_terminated_length": 282.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.154700574465096, + "epoch": 0.882, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5670086741447449, + "learning_rate": 9.222252146709142e-08, + "loss": -0.0254, + "num_tokens": 3861865.0, + "reward": 6.069726943969727, + "reward_std": 2.668290138244629, + "rewards/fitness_reward/mean": 6.112736701965332, + "rewards/fitness_reward/std": 2.184788465499878, + "rewards/kidney_reward/mean": -0.46153780817985535, + "rewards/kidney_reward/std": 1.1795583963394165, + "rewards/length2tails_reward/mean": 0.8094608187675476, + "rewards/length2tails_reward/std": 0.25654107332229614, + "rewards/thermo_reward/mean": -0.029212698340415955, + "rewards/thermo_reward/std": 1.948235273361206, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 275.59375, + "completions/mean_terminated_length": 275.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1533412327989936, + "epoch": 0.884, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6889277696609497, + "learning_rate": 8.931633919382298e-08, + "loss": 0.0657, + "num_tokens": 3870716.0, + "reward": 6.111047267913818, + "reward_std": 2.321079730987549, + "rewards/fitness_reward/mean": 6.138444900512695, + "rewards/fitness_reward/std": 2.044992208480835, + "rewards/kidney_reward/mean": -0.30765989422798157, + "rewards/kidney_reward/std": 1.4663869142532349, + "rewards/length2tails_reward/mean": 0.8048216104507446, + "rewards/length2tails_reward/std": 0.2839038372039795, + "rewards/thermo_reward/mean": -0.1495458483695984, + "rewards/thermo_reward/std": 2.2613861560821533, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 284.5, + "completions/mean_terminated_length": 284.5, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14972092770040035, + "epoch": 0.886, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.146921634674072, + "learning_rate": 8.645454235739902e-08, + "loss": 0.1805, + "num_tokens": 3879852.0, + "reward": 6.23973274230957, + "reward_std": 3.0467097759246826, + "rewards/fitness_reward/mean": 5.754415035247803, + "rewards/fitness_reward/std": 2.9195473194122314, + "rewards/kidney_reward/mean": 0.18438826501369476, + "rewards/kidney_reward/std": 1.3486359119415283, + "rewards/length2tails_reward/mean": 0.715106725692749, + "rewards/length2tails_reward/std": 0.3136950433254242, + "rewards/thermo_reward/mean": 0.4286932945251465, + "rewards/thermo_reward/std": 1.8948018550872803, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 269.65625, + "completions/mean_terminated_length": 269.65625, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.16300893109291792, + "epoch": 0.888, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.321374535560608, + "learning_rate": 8.363727043776036e-08, + "loss": -0.0074, + "num_tokens": 3888513.0, + "reward": 5.419971466064453, + "reward_std": 3.4583020210266113, + "rewards/fitness_reward/mean": 5.180810928344727, + "rewards/fitness_reward/std": 3.550640344619751, + "rewards/kidney_reward/mean": 0.14285969734191895, + "rewards/kidney_reward/std": 1.2835968732833862, + "rewards/length2tails_reward/mean": 0.7409695386886597, + "rewards/length2tails_reward/std": 0.32659363746643066, + "rewards/thermo_reward/mean": -0.03502354770898819, + "rewards/thermo_reward/std": 1.8435451984405518, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13684740848839283, + "epoch": 0.89, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5912813544273376, + "learning_rate": 8.086466074476562e-08, + "loss": 0.0048, + "num_tokens": 3897199.0, + "reward": 6.76379919052124, + "reward_std": 2.2146174907684326, + "rewards/fitness_reward/mean": 6.315851211547852, + "rewards/fitness_reward/std": 1.5581613779067993, + "rewards/kidney_reward/mean": 0.2498674839735031, + "rewards/kidney_reward/std": 1.1828155517578125, + "rewards/length2tails_reward/mean": 0.7916619181632996, + "rewards/length2tails_reward/std": 0.26749032735824585, + "rewards/thermo_reward/mean": 0.2501968443393707, + "rewards/thermo_reward/std": 1.9461218118667603, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 267.15625, + "completions/mean_terminated_length": 267.15625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.14623074512928724, + "epoch": 0.892, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.829219102859497, + "learning_rate": 7.813684841149959e-08, + "loss": -0.0739, + "num_tokens": 3905780.0, + "reward": 6.439332008361816, + "reward_std": 2.433130979537964, + "rewards/fitness_reward/mean": 6.125979423522949, + "rewards/fitness_reward/std": 2.1126816272735596, + "rewards/kidney_reward/mean": -0.09445090591907501, + "rewards/kidney_reward/std": 1.271247148513794, + "rewards/length2tails_reward/mean": 0.8165769577026367, + "rewards/length2tails_reward/std": 0.28393563628196716, + "rewards/thermo_reward/mean": 0.3128669559955597, + "rewards/thermo_reward/std": 1.8542346954345703, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 273.0, + "completions/mean_terminated_length": 273.0, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.15820543095469475, + "epoch": 0.894, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0217692852020264, + "learning_rate": 7.545396638768697e-08, + "loss": 0.016, + "num_tokens": 3914548.0, + "reward": 6.446044445037842, + "reward_std": 2.6220896244049072, + "rewards/fitness_reward/mean": 6.2497239112854, + "rewards/fitness_reward/std": 1.9322361946105957, + "rewards/kidney_reward/mean": -0.11043130606412888, + "rewards/kidney_reward/std": 1.3969236612319946, + "rewards/length2tails_reward/mean": 0.8614251017570496, + "rewards/length2tails_reward/std": 0.22737865149974823, + "rewards/thermo_reward/mean": 0.07236060500144958, + "rewards/thermo_reward/std": 2.22379732131958, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 263.5625, + "completions/mean_terminated_length": 263.5625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.1258983640000224, + "epoch": 0.896, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8359396457672119, + "learning_rate": 7.281614543321269e-08, + "loss": -0.0726, + "num_tokens": 3923014.0, + "reward": 6.619543552398682, + "reward_std": 1.9513977766036987, + "rewards/fitness_reward/mean": 6.315708637237549, + "rewards/fitness_reward/std": 1.5589702129364014, + "rewards/kidney_reward/mean": 0.054669540375471115, + "rewards/kidney_reward/std": 1.4212532043457031, + "rewards/length2tails_reward/mean": 0.7442126274108887, + "rewards/length2tails_reward/std": 0.2968263030052185, + "rewards/thermo_reward/mean": 0.18089357018470764, + "rewards/thermo_reward/std": 1.9039746522903442, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 273.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.15910894237458706, + "epoch": 0.898, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.005949020385742, + "learning_rate": 7.022351411174865e-08, + "loss": 0.0452, + "num_tokens": 3931794.0, + "reward": 6.604391098022461, + "reward_std": 2.1950721740722656, + "rewards/fitness_reward/mean": 6.22106409072876, + "rewards/fitness_reward/std": 2.0943596363067627, + "rewards/kidney_reward/mean": 0.1045638918876648, + "rewards/kidney_reward/std": 1.3927233219146729, + "rewards/length2tails_reward/mean": 0.8200255632400513, + "rewards/length2tails_reward/std": 0.20488189160823822, + "rewards/thermo_reward/mean": 0.25207698345184326, + "rewards/thermo_reward/std": 1.908453106880188, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12655364908277988, + "epoch": 0.9, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5396273732185364, + "learning_rate": 6.767619878448783e-08, + "loss": 0.0036, + "num_tokens": 3940539.0, + "reward": 5.361003398895264, + "reward_std": 3.229443311691284, + "rewards/fitness_reward/mean": 5.443634986877441, + "rewards/fitness_reward/std": 3.0867321491241455, + "rewards/kidney_reward/mean": -0.2784079611301422, + "rewards/kidney_reward/std": 1.3394166231155396, + "rewards/length2tails_reward/mean": 0.8541355133056641, + "rewards/length2tails_reward/std": 0.22073553502559662, + "rewards/thermo_reward/mean": -0.3139229416847229, + "rewards/thermo_reward/std": 2.3062264919281006, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 276.1875, + "completions/mean_terminated_length": 276.1875, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.1407141126692295, + "epoch": 0.902, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4678702354431152, + "learning_rate": 6.517432360398556e-08, + "loss": 0.0686, + "num_tokens": 3949409.0, + "reward": 6.1275634765625, + "reward_std": 2.8987269401550293, + "rewards/fitness_reward/mean": 5.816352367401123, + "rewards/fitness_reward/std": 2.6810781955718994, + "rewards/kidney_reward/mean": 0.1129336804151535, + "rewards/kidney_reward/std": 1.4291346073150635, + "rewards/length2tails_reward/mean": 0.7642084360122681, + "rewards/length2tails_reward/std": 0.3605248034000397, + "rewards/thermo_reward/mean": 0.1273842751979828, + "rewards/thermo_reward/std": 1.9623202085494995, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.1197723550722003, + "epoch": 0.904, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2243266105651855, + "learning_rate": 6.271801050810855e-08, + "loss": -0.0072, + "num_tokens": 3958099.0, + "reward": 5.805145263671875, + "reward_std": 2.994102954864502, + "rewards/fitness_reward/mean": 5.523082733154297, + "rewards/fitness_reward/std": 2.7376046180725098, + "rewards/kidney_reward/mean": 0.11638177931308746, + "rewards/kidney_reward/std": 1.1877796649932861, + "rewards/length2tails_reward/mean": 0.7494679689407349, + "rewards/length2tails_reward/std": 0.3458668291568756, + "rewards/thermo_reward/mean": 0.07301057130098343, + "rewards/thermo_reward/std": 2.142275094985962, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1348592070862651, + "epoch": 0.906, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5708919167518616, + "learning_rate": 6.030737921409168e-08, + "loss": -0.0035, + "num_tokens": 3966770.0, + "reward": 6.293191909790039, + "reward_std": 2.521247148513794, + "rewards/fitness_reward/mean": 5.784482002258301, + "rewards/fitness_reward/std": 2.4514005184173584, + "rewards/kidney_reward/mean": 0.3663383722305298, + "rewards/kidney_reward/std": 1.3017356395721436, + "rewards/length2tails_reward/mean": 0.7055273652076721, + "rewards/length2tails_reward/std": 0.33975639939308167, + "rewards/thermo_reward/mean": 0.29831749200820923, + "rewards/thermo_reward/std": 1.8889341354370117, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 273.09375, + "completions/mean_terminated_length": 273.09375, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.1793599370867014, + "epoch": 0.908, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.875615358352661, + "learning_rate": 5.794254721270331e-08, + "loss": 0.0356, + "num_tokens": 3975541.0, + "reward": 5.444811820983887, + "reward_std": 3.4904708862304688, + "rewards/fitness_reward/mean": 5.315083980560303, + "rewards/fitness_reward/std": 3.412541627883911, + "rewards/kidney_reward/mean": -0.10441018640995026, + "rewards/kidney_reward/std": 1.3502939939498901, + "rewards/length2tails_reward/mean": 0.8567314147949219, + "rewards/length2tails_reward/std": 0.21395272016525269, + "rewards/thermo_reward/mean": -0.06450112909078598, + "rewards/thermo_reward/std": 2.0255086421966553, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.1371596548706293, + "epoch": 0.91, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32508590817451477, + "learning_rate": 5.5623629762519e-08, + "loss": 0.0036, + "num_tokens": 3984298.0, + "reward": 5.952658653259277, + "reward_std": 2.358738899230957, + "rewards/fitness_reward/mean": 5.95086145401001, + "rewards/fitness_reward/std": 2.06325626373291, + "rewards/kidney_reward/mean": -0.1831625998020172, + "rewards/kidney_reward/std": 1.493977427482605, + "rewards/length2tails_reward/mean": 0.8945037126541138, + "rewards/length2tails_reward/std": 0.19499589502811432, + "rewards/thermo_reward/mean": -0.2604953646659851, + "rewards/thermo_reward/std": 2.373155117034912, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 279.28125, + "completions/mean_terminated_length": 279.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1629155706614256, + "epoch": 0.912, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5588083267211914, + "learning_rate": 5.335073988430372e-08, + "loss": 0.085, + "num_tokens": 3993267.0, + "reward": 6.2450714111328125, + "reward_std": 3.284686803817749, + "rewards/fitness_reward/mean": 5.599737167358398, + "rewards/fitness_reward/std": 3.141002655029297, + "rewards/kidney_reward/mean": -0.018673259764909744, + "rewards/kidney_reward/std": 1.2502866983413696, + "rewards/length2tails_reward/mean": 0.840927004814148, + "rewards/length2tails_reward/std": 0.2280232310295105, + "rewards/thermo_reward/mean": 0.8888781666755676, + "rewards/thermo_reward/std": 1.6391123533248901, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.16157941427081823, + "epoch": 0.914, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3679721355438232, + "learning_rate": 5.1123988355503465e-08, + "loss": -0.0221, + "num_tokens": 4001997.0, + "reward": 5.608067512512207, + "reward_std": 2.661470651626587, + "rewards/fitness_reward/mean": 5.589663505554199, + "rewards/fitness_reward/std": 2.8181302547454834, + "rewards/kidney_reward/mean": -0.5036675930023193, + "rewards/kidney_reward/std": 1.1448392868041992, + "rewards/length2tails_reward/mean": 0.848773181438446, + "rewards/length2tails_reward/std": 0.24732929468154907, + "rewards/thermo_reward/mean": 0.11608822643756866, + "rewards/thermo_reward/std": 2.154752731323242, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 276.3125, + "completions/mean_terminated_length": 276.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15516785997897387, + "epoch": 0.916, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.663477659225464, + "learning_rate": 4.8943483704846465e-08, + "loss": 0.0742, + "num_tokens": 4010871.0, + "reward": 6.066410064697266, + "reward_std": 3.334627151489258, + "rewards/fitness_reward/mean": 5.877503395080566, + "rewards/fitness_reward/std": 2.8087968826293945, + "rewards/kidney_reward/mean": 0.10705171525478363, + "rewards/kidney_reward/std": 1.444384217262268, + "rewards/length2tails_reward/mean": 0.7966011762619019, + "rewards/length2tails_reward/std": 0.29769471287727356, + "rewards/thermo_reward/mean": -0.12753897905349731, + "rewards/thermo_reward/std": 1.8615918159484863, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.1699973875656724, + "epoch": 0.918, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2838830947875977, + "learning_rate": 4.6809332207053074e-08, + "loss": 0.0194, + "num_tokens": 4019618.0, + "reward": 6.185791015625, + "reward_std": 2.2091588973999023, + "rewards/fitness_reward/mean": 6.132357597351074, + "rewards/fitness_reward/std": 2.0780248641967773, + "rewards/kidney_reward/mean": -0.12923404574394226, + "rewards/kidney_reward/std": 1.3461557626724243, + "rewards/length2tails_reward/mean": 0.8275649547576904, + "rewards/length2tails_reward/std": 0.27842438220977783, + "rewards/thermo_reward/mean": -0.17768177390098572, + "rewards/thermo_reward/std": 1.9903466701507568, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 263.46875, + "completions/mean_terminated_length": 263.46875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.1407340606674552, + "epoch": 0.92, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4002125263214111, + "learning_rate": 4.472163787765637e-08, + "loss": -0.1189, + "num_tokens": 4028081.0, + "reward": 6.638701915740967, + "reward_std": 2.2502400875091553, + "rewards/fitness_reward/mean": 6.228815078735352, + "rewards/fitness_reward/std": 2.0505142211914062, + "rewards/kidney_reward/mean": 0.14867182075977325, + "rewards/kidney_reward/std": 1.1441434621810913, + "rewards/length2tails_reward/mean": 0.7031423449516296, + "rewards/length2tails_reward/std": 0.3281797170639038, + "rewards/thermo_reward/mean": 0.31953129172325134, + "rewards/thermo_reward/std": 1.8455839157104492, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14164727926254272, + "epoch": 0.922, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1973295211791992, + "learning_rate": 4.2680502467932754e-08, + "loss": 0.0036, + "num_tokens": 4036780.0, + "reward": 6.230764389038086, + "reward_std": 2.1518642902374268, + "rewards/fitness_reward/mean": 6.048417568206787, + "rewards/fitness_reward/std": 2.0361173152923584, + "rewards/kidney_reward/mean": -0.2080516517162323, + "rewards/kidney_reward/std": 1.3549437522888184, + "rewards/length2tails_reward/mean": 0.8079833984375, + "rewards/length2tails_reward/std": 0.26826247572898865, + "rewards/thermo_reward/mean": 0.16875341534614563, + "rewards/thermo_reward/std": 1.9367470741271973, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 277.03125, + "completions/mean_terminated_length": 277.03125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.16016131080687046, + "epoch": 0.924, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6919748783111572, + "learning_rate": 4.0686025459942486e-08, + "loss": 0.0673, + "num_tokens": 4045677.0, + "reward": 5.369567394256592, + "reward_std": 3.270564317703247, + "rewards/fitness_reward/mean": 5.362431526184082, + "rewards/fitness_reward/std": 2.935969114303589, + "rewards/kidney_reward/mean": -0.11173248291015625, + "rewards/kidney_reward/std": 1.241031527519226, + "rewards/length2tails_reward/mean": 0.8059428334236145, + "rewards/length2tails_reward/std": 0.319585919380188, + "rewards/thermo_reward/mean": -0.27696692943573, + "rewards/thermo_reward/std": 2.009323835372925, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13632575143128633, + "epoch": 0.926, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3928329646587372, + "learning_rate": 3.87383040616811e-08, + "loss": 0.0068, + "num_tokens": 4054401.0, + "reward": 6.620509147644043, + "reward_std": 1.183544635772705, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.14931367337703705, + "rewards/kidney_reward/std": 1.3510866165161133, + "rewards/length2tails_reward/mean": 0.7820310592651367, + "rewards/length2tails_reward/std": 0.24996426701545715, + "rewards/thermo_reward/mean": 0.22867608070373535, + "rewards/thermo_reward/std": 1.7496261596679688, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.40625, + "completions/mean_terminated_length": 270.40625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13270280417054892, + "epoch": 0.928, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5173652768135071, + "learning_rate": 3.6837433202341894e-08, + "loss": 0.001, + "num_tokens": 4063086.0, + "reward": 6.274802207946777, + "reward_std": 1.3407738208770752, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.46396902203559875, + "rewards/kidney_reward/std": 1.417636513710022, + "rewards/length2tails_reward/mean": 0.7583062052726746, + "rewards/length2tails_reward/std": 0.28290626406669617, + "rewards/thermo_reward/mean": -0.3421969711780548, + "rewards/thermo_reward/std": 1.9546772241592407, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1353480676189065, + "epoch": 0.93, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43471759557724, + "learning_rate": 3.4983505527688584e-08, + "loss": 0.0053, + "num_tokens": 4071795.0, + "reward": 6.438246250152588, + "reward_std": 1.6006256341934204, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.1561225801706314, + "rewards/kidney_reward/std": 1.3050709962844849, + "rewards/length2tails_reward/mean": 0.7958537340164185, + "rewards/length2tails_reward/std": 0.2570348083972931, + "rewards/thermo_reward/mean": -0.4481971561908722, + "rewards/thermo_reward/std": 1.9269288778305054, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 264.40625, + "completions/mean_terminated_length": 264.40625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.21487415861338377, + "epoch": 0.932, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.117380380630493, + "learning_rate": 3.317661139554062e-08, + "loss": -0.0721, + "num_tokens": 4080288.0, + "reward": 6.310624599456787, + "reward_std": 3.2768564224243164, + "rewards/fitness_reward/mean": 5.566933631896973, + "rewards/fitness_reward/std": 3.2391488552093506, + "rewards/kidney_reward/mean": 0.5489071011543274, + "rewards/kidney_reward/std": 1.3483208417892456, + "rewards/length2tails_reward/mean": 0.8428280353546143, + "rewards/length2tails_reward/std": 0.20550084114074707, + "rewards/thermo_reward/mean": 0.5170604586601257, + "rewards/thermo_reward/std": 1.59054434299469, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 280.9375, + "completions/mean_terminated_length": 280.9375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13101665768772364, + "epoch": 0.934, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.661982297897339, + "learning_rate": 3.141683887136892e-08, + "loss": 0.125, + "num_tokens": 4089310.0, + "reward": 5.751834869384766, + "reward_std": 3.421877145767212, + "rewards/fitness_reward/mean": 5.474588394165039, + "rewards/fitness_reward/std": 3.233222007751465, + "rewards/kidney_reward/mean": 0.019836775958538055, + "rewards/kidney_reward/std": 1.2497594356536865, + "rewards/length2tails_reward/mean": 0.6600824594497681, + "rewards/length2tails_reward/std": 0.35533779859542847, + "rewards/thermo_reward/mean": 0.20461499691009521, + "rewards/thermo_reward/std": 1.853097677230835, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13067772425711155, + "epoch": 0.936, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5119224786758423, + "learning_rate": 2.9704273724003526e-08, + "loss": 0.0031, + "num_tokens": 4098003.0, + "reward": 6.646775245666504, + "reward_std": 1.3293075561523438, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.031656183302402496, + "rewards/kidney_reward/std": 1.3796350955963135, + "rewards/length2tails_reward/mean": 0.7634539604187012, + "rewards/length2tails_reward/std": 0.3263379633426666, + "rewards/thermo_reward/mean": -0.3024289608001709, + "rewards/thermo_reward/std": 2.1008477210998535, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13748420868068933, + "epoch": 0.938, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.524431586265564, + "learning_rate": 2.8038999421453823e-08, + "loss": 0.0001, + "num_tokens": 4106742.0, + "reward": 6.620138168334961, + "reward_std": 1.317707896232605, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.27108389139175415, + "rewards/kidney_reward/std": 1.3115421533584595, + "rewards/length2tails_reward/mean": 0.8660446405410767, + "rewards/length2tails_reward/std": 0.2033243179321289, + "rewards/thermo_reward/mean": -0.10425892472267151, + "rewards/thermo_reward/std": 2.0403130054473877, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 269.65625, + "completions/mean_terminated_length": 269.65625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "entropy": 0.14136488363146782, + "epoch": 0.94, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.863619863986969, + "learning_rate": 2.642109712683971e-08, + "loss": 0.0127, + "num_tokens": 4115403.0, + "reward": 6.52547550201416, + "reward_std": 1.087640643119812, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.13498719036579132, + "rewards/kidney_reward/std": 1.1925514936447144, + "rewards/length2tails_reward/mean": 0.7910594940185547, + "rewards/length2tails_reward/std": 0.2733539044857025, + "rewards/thermo_reward/mean": -0.18620994687080383, + "rewards/thermo_reward/std": 1.8779382705688477, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 281.53125, + "completions/mean_terminated_length": 281.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14799009263515472, + "epoch": 0.942, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4865119457244873, + "learning_rate": 2.4850645694436734e-08, + "loss": 0.1213, + "num_tokens": 4124444.0, + "reward": 5.619724273681641, + "reward_std": 3.355104684829712, + "rewards/fitness_reward/mean": 5.491335868835449, + "rewards/fitness_reward/std": 3.193152666091919, + "rewards/kidney_reward/mean": 0.20664328336715698, + "rewards/kidney_reward/std": 1.4795762300491333, + "rewards/length2tails_reward/mean": 0.7287927865982056, + "rewards/length2tails_reward/std": 0.34719234704971313, + "rewards/thermo_reward/mean": -0.3142632246017456, + "rewards/thermo_reward/std": 1.987789273262024, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 276.125, + "completions/mean_terminated_length": 276.125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1640459280461073, + "epoch": 0.944, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27881982922554016, + "learning_rate": 2.332772166583208e-08, + "loss": -0.0022, + "num_tokens": 4133312.0, + "reward": 6.9147844314575195, + "reward_std": 1.7263871431350708, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.43178486824035645, + "rewards/kidney_reward/std": 1.449816107749939, + "rewards/length2tails_reward/mean": 0.8127265572547913, + "rewards/length2tails_reward/std": 0.30783113837242126, + "rewards/thermo_reward/mean": 0.014802634716033936, + "rewards/thermo_reward/std": 2.0539700984954834, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 285.46875, + "completions/mean_terminated_length": 270.3548278808594, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.16371324006468058, + "epoch": 0.946, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.666799545288086, + "learning_rate": 2.185239926619431e-08, + "loss": 0.2289, + "num_tokens": 4142479.0, + "reward": 6.779679298400879, + "reward_std": 2.5289268493652344, + "rewards/fitness_reward/mean": 6.130068778991699, + "rewards/fitness_reward/std": 2.0904550552368164, + "rewards/kidney_reward/mean": 0.32335078716278076, + "rewards/kidney_reward/std": 1.342889666557312, + "rewards/length2tails_reward/mean": 0.7477067112922668, + "rewards/length2tails_reward/std": 0.2941748797893524, + "rewards/thermo_reward/mean": 0.6020166277885437, + "rewards/thermo_reward/std": 1.8339732885360718, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.65625, + "completions/mean_terminated_length": 269.65625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12790144886821508, + "epoch": 0.948, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6886346340179443, + "learning_rate": 2.0424750400655943e-08, + "loss": 0.0028, + "num_tokens": 4151140.0, + "reward": 6.2106170654296875, + "reward_std": 3.0731492042541504, + "rewards/fitness_reward/mean": 5.7802581787109375, + "rewards/fitness_reward/std": 2.463535785675049, + "rewards/kidney_reward/mean": -0.24863553047180176, + "rewards/kidney_reward/std": 1.4447722434997559, + "rewards/length2tails_reward/mean": 0.7356371283531189, + "rewards/length2tails_reward/std": 0.3002653121948242, + "rewards/thermo_reward/mean": 0.7415350079536438, + "rewards/thermo_reward/std": 1.7750520706176758, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 268.84375, + "completions/mean_terminated_length": 268.84375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12629235815256834, + "epoch": 0.95, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.221573829650879, + "learning_rate": 1.9044844650808467e-08, + "loss": -0.0024, + "num_tokens": 4159775.0, + "reward": 6.558210372924805, + "reward_std": 2.4178450107574463, + "rewards/fitness_reward/mean": 6.1215362548828125, + "rewards/fitness_reward/std": 2.136852741241455, + "rewards/kidney_reward/mean": 0.2523971498012543, + "rewards/kidney_reward/std": 1.3830926418304443, + "rewards/length2tails_reward/mean": 0.7020688056945801, + "rewards/length2tails_reward/std": 0.31335434317588806, + "rewards/thermo_reward/mean": 0.2699163258075714, + "rewards/thermo_reward/std": 1.4773310422897339, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.53125, + "completions/mean_terminated_length": 270.53125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13433454185724258, + "epoch": 0.952, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0128010511398315, + "learning_rate": 1.771274927131139e-08, + "loss": 0.0026, + "num_tokens": 4168464.0, + "reward": 6.293290138244629, + "reward_std": 2.1775317192077637, + "rewards/fitness_reward/mean": 6.202960968017578, + "rewards/fitness_reward/std": 1.6983211040496826, + "rewards/kidney_reward/mean": -0.2153567522764206, + "rewards/kidney_reward/std": 1.2575360536575317, + "rewards/length2tails_reward/mean": 0.7736026048660278, + "rewards/length2tails_reward/std": 0.294264554977417, + "rewards/thermo_reward/mean": 0.00921345129609108, + "rewards/thermo_reward/std": 2.212282180786133, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.40625, + "completions/mean_terminated_length": 270.40625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1272476715967059, + "epoch": 0.954, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5237146019935608, + "learning_rate": 1.6428529186614193e-08, + "loss": -0.0039, + "num_tokens": 4177149.0, + "reward": 6.367445468902588, + "reward_std": 1.6114425659179688, + "rewards/fitness_reward/mean": 6.179342269897461, + "rewards/fitness_reward/std": 1.1073734760284424, + "rewards/kidney_reward/mean": 0.08609558641910553, + "rewards/kidney_reward/std": 1.3017667531967163, + "rewards/length2tails_reward/mean": 0.7121409177780151, + "rewards/length2tails_reward/std": 0.31732919812202454, + "rewards/thermo_reward/mean": -0.06595921516418457, + "rewards/thermo_reward/std": 1.9046858549118042, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12140200845897198, + "epoch": 0.956, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39394357800483704, + "learning_rate": 1.519224698779198e-08, + "loss": 0.005, + "num_tokens": 4185839.0, + "reward": 6.108905792236328, + "reward_std": 2.085566759109497, + "rewards/fitness_reward/mean": 6.2113847732543945, + "rewards/fitness_reward/std": 1.6536391973495483, + "rewards/kidney_reward/mean": -0.22255373001098633, + "rewards/kidney_reward/std": 1.3791135549545288, + "rewards/length2tails_reward/mean": 0.8077136278152466, + "rewards/length2tails_reward/std": 0.24216946959495544, + "rewards/thermo_reward/mean": -0.3862607479095459, + "rewards/thermo_reward/std": 2.2672908306121826, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.0, + "completions/mean_terminated_length": 269.0, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.1554853916168213, + "epoch": 0.958, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6892180442810059, + "learning_rate": 1.4003962929495127e-08, + "loss": -0.0541, + "num_tokens": 4194479.0, + "reward": 6.385907173156738, + "reward_std": 2.320415496826172, + "rewards/fitness_reward/mean": 6.11521577835083, + "rewards/fitness_reward/std": 2.1712775230407715, + "rewards/kidney_reward/mean": 0.04682411253452301, + "rewards/kidney_reward/std": 1.4674209356307983, + "rewards/length2tails_reward/mean": 0.8595852851867676, + "rewards/length2tails_reward/std": 0.2050134241580963, + "rewards/thermo_reward/mean": 0.06476667523384094, + "rewards/thermo_reward/std": 1.9430291652679443, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 284.5, + "completions/mean_terminated_length": 284.5, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.14631187915802002, + "epoch": 0.96, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1183857917785645, + "learning_rate": 1.2863734927012093e-08, + "loss": 0.1397, + "num_tokens": 4203615.0, + "reward": 5.392303943634033, + "reward_std": 3.975283622741699, + "rewards/fitness_reward/mean": 5.271598815917969, + "rewards/fitness_reward/std": 3.5448451042175293, + "rewards/kidney_reward/mean": -0.32605940103530884, + "rewards/kidney_reward/std": 1.1550776958465576, + "rewards/length2tails_reward/mean": 0.8411507606506348, + "rewards/length2tails_reward/std": 0.26199305057525635, + "rewards/thermo_reward/mean": 0.1468944400548935, + "rewards/thermo_reward/std": 1.9205420017242432, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 261.84375, + "completions/mean_terminated_length": 261.84375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.16815311275422573, + "epoch": 0.962, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7249035835266113, + "learning_rate": 1.1771618553447215e-08, + "loss": -0.1241, + "num_tokens": 4212026.0, + "reward": 6.322161674499512, + "reward_std": 3.3183627128601074, + "rewards/fitness_reward/mean": 5.872105121612549, + "rewards/fitness_reward/std": 2.830014228820801, + "rewards/kidney_reward/mean": 0.09821343421936035, + "rewards/kidney_reward/std": 1.3843051195144653, + "rewards/length2tails_reward/mean": 0.8109922409057617, + "rewards/length2tails_reward/std": 0.29384127259254456, + "rewards/thermo_reward/mean": 0.39640364050865173, + "rewards/thermo_reward/std": 1.6114957332611084, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12907341960817575, + "epoch": 0.964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.533166766166687, + "learning_rate": 1.0727667037011667e-08, + "loss": 0.0007, + "num_tokens": 4220765.0, + "reward": 6.261979579925537, + "reward_std": 1.1733015775680542, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.2737240493297577, + "rewards/kidney_reward/std": 1.2405914068222046, + "rewards/length2tails_reward/mean": 0.8256810307502747, + "rewards/length2tails_reward/std": 0.2820776402950287, + "rewards/thermo_reward/mean": -0.5917750597000122, + "rewards/thermo_reward/std": 1.9592158794403076, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.21401519421488047, + "epoch": 0.966, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6284871101379395, + "learning_rate": 9.731931258429638e-09, + "loss": -0.01, + "num_tokens": 4229477.0, + "reward": 6.693005561828613, + "reward_std": 2.3508384227752686, + "rewards/fitness_reward/mean": 6.221892356872559, + "rewards/fitness_reward/std": 2.0896761417388916, + "rewards/kidney_reward/mean": 0.1586083471775055, + "rewards/kidney_reward/std": 1.414463996887207, + "rewards/length2tails_reward/mean": 0.8126205205917358, + "rewards/length2tails_reward/std": 0.2758491039276123, + "rewards/thermo_reward/mean": 0.3773079514503479, + "rewards/thermo_reward/std": 2.071603298187256, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 282.96875, + "completions/mean_terminated_length": 282.96875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.17435009684413671, + "epoch": 0.968, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2569141387939453, + "learning_rate": 8.784459748458317e-09, + "loss": 0.0154, + "num_tokens": 4238564.0, + "reward": 5.633315086364746, + "reward_std": 3.1612935066223145, + "rewards/fitness_reward/mean": 5.347982883453369, + "rewards/fitness_reward/std": 3.3307552337646484, + "rewards/kidney_reward/mean": -0.03157354146242142, + "rewards/kidney_reward/std": 1.3760062456130981, + "rewards/length2tails_reward/mean": 0.7512047290802002, + "rewards/length2tails_reward/std": 0.30314382910728455, + "rewards/thermo_reward/mean": 0.22663527727127075, + "rewards/thermo_reward/std": 1.7661378383636475, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 279.15625, + "completions/mean_terminated_length": 279.15625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.17211697157472372, + "epoch": 0.97, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.859675884246826, + "learning_rate": 7.885298685522235e-09, + "loss": 0.1062, + "num_tokens": 4247529.0, + "reward": 6.191662788391113, + "reward_std": 3.2506253719329834, + "rewards/fitness_reward/mean": 5.795844078063965, + "rewards/fitness_reward/std": 2.7598862648010254, + "rewards/kidney_reward/mean": 0.04356342554092407, + "rewards/kidney_reward/std": 1.4145551919937134, + "rewards/length2tails_reward/mean": 0.8112553954124451, + "rewards/length2tails_reward/std": 0.27985844016075134, + "rewards/thermo_reward/mean": 0.3424462676048279, + "rewards/thermo_reward/std": 1.8757723569869995, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14102749805897474, + "epoch": 0.972, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0287251472473145, + "learning_rate": 7.034491893463057e-09, + "loss": 0.0043, + "num_tokens": 4256244.0, + "reward": 6.852804183959961, + "reward_std": 1.6488749980926514, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.1278020739555359, + "rewards/kidney_reward/std": 1.3883448839187622, + "rewards/length2tails_reward/mean": 0.7885364890098572, + "rewards/length2tails_reward/std": 0.3108876943588257, + "rewards/thermo_reward/mean": 0.41289806365966797, + "rewards/thermo_reward/std": 1.8513990640640259, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.14220308419317007, + "epoch": 0.974, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5835313200950623, + "learning_rate": 6.23208083940363e-09, + "loss": 0.0032, + "num_tokens": 4264972.0, + "reward": 6.557872772216797, + "reward_std": 1.528399109840393, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.31340882182121277, + "rewards/kidney_reward/std": 1.2353618144989014, + "rewards/length2tails_reward/mean": 0.8489131331443787, + "rewards/length2tails_reward/std": 0.19918689131736755, + "rewards/thermo_reward/mean": 0.028078734874725342, + "rewards/thermo_reward/std": 1.8716614246368408, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13235425390303135, + "epoch": 0.976, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6584662795066833, + "learning_rate": 5.47810463172671e-09, + "loss": 0.0031, + "num_tokens": 4273681.0, + "reward": 6.674538612365723, + "reward_std": 1.3704478740692139, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.00790829211473465, + "rewards/kidney_reward/std": 1.332300066947937, + "rewards/length2tails_reward/mean": 0.802370548248291, + "rewards/length2tails_reward/std": 0.25463488698005676, + "rewards/thermo_reward/mean": -0.2426123023033142, + "rewards/thermo_reward/std": 2.2439165115356445, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.12326779402792454, + "epoch": 0.978, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37450870871543884, + "learning_rate": 4.772600018168815e-09, + "loss": 0.0022, + "num_tokens": 4282389.0, + "reward": 6.011283874511719, + "reward_std": 1.5877715349197388, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.4957900643348694, + "rewards/kidney_reward/std": 1.1033035516738892, + "rewards/length2tails_reward/mean": 0.7796015739440918, + "rewards/length2tails_reward/std": 0.30093348026275635, + "rewards/thermo_reward/mean": -0.6420827507972717, + "rewards/thermo_reward/std": 2.05932354927063, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1382999373599887, + "epoch": 0.98, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1265020370483398, + "learning_rate": 4.115601384029666e-09, + "loss": -0.0023, + "num_tokens": 4291113.0, + "reward": 7.089540958404541, + "reward_std": 1.1871302127838135, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.15937963128089905, + "rewards/kidney_reward/std": 1.3904200792312622, + "rewards/length2tails_reward/mean": 0.8089946508407593, + "rewards/length2tails_reward/std": 0.25792431831359863, + "rewards/thermo_reward/mean": 0.43260854482650757, + "rewards/thermo_reward/std": 1.7895396947860718, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 277.03125, + "completions/mean_terminated_length": 277.03125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.13852917868644, + "epoch": 0.982, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1202490329742432, + "learning_rate": 3.5071407504956294e-09, + "loss": 0.078, + "num_tokens": 4300010.0, + "reward": 6.318650722503662, + "reward_std": 1.5724629163742065, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.29085487127304077, + "rewards/kidney_reward/std": 1.2370598316192627, + "rewards/length2tails_reward/mean": 0.7769042253494263, + "rewards/length2tails_reward/std": 0.30063578486442566, + "rewards/thermo_reward/mean": -0.4369143545627594, + "rewards/thermo_reward/std": 2.1379871368408203, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14235816057771444, + "epoch": 0.984, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0515778064727783, + "learning_rate": 2.947247773079753e-09, + "loss": 0.0048, + "num_tokens": 4308707.0, + "reward": 6.565581798553467, + "reward_std": 2.419142007827759, + "rewards/fitness_reward/mean": 6.200839042663574, + "rewards/fitness_reward/std": 2.2087697982788086, + "rewards/kidney_reward/mean": -0.203052818775177, + "rewards/kidney_reward/std": 1.2326304912567139, + "rewards/length2tails_reward/mean": 0.8171231150627136, + "rewards/length2tails_reward/std": 0.23487749695777893, + "rewards/thermo_reward/mean": 0.5239765048027039, + "rewards/thermo_reward/std": 1.8486820459365845, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.78125, + "completions/mean_terminated_length": 269.78125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12068606168031693, + "epoch": 0.986, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6563858389854431, + "learning_rate": 2.435949740175802e-09, + "loss": -0.0026, + "num_tokens": 4317372.0, + "reward": 6.417881011962891, + "reward_std": 2.5073564052581787, + "rewards/fitness_reward/mean": 6.17474365234375, + "rewards/fitness_reward/std": 1.8490768671035767, + "rewards/kidney_reward/mean": -0.0659375786781311, + "rewards/kidney_reward/std": 1.3378238677978516, + "rewards/length2tails_reward/mean": 0.7271683216094971, + "rewards/length2tails_reward/std": 0.32772964239120483, + "rewards/thermo_reward/mean": 0.1886269599199295, + "rewards/thermo_reward/std": 2.022970199584961, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.0, + "completions/max_terminated_length": 648.0, + "completions/mean_length": 286.78125, + "completions/mean_terminated_length": 286.78125, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.18350693583488464, + "epoch": 0.988, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7464146614074707, + "learning_rate": 1.973271571728441e-09, + "loss": 0.0335, + "num_tokens": 4326581.0, + "reward": 6.034966468811035, + "reward_std": 2.6427571773529053, + "rewards/fitness_reward/mean": 5.688358783721924, + "rewards/fitness_reward/std": 2.8081235885620117, + "rewards/kidney_reward/mean": 0.0014158524572849274, + "rewards/kidney_reward/std": 1.2659722566604614, + "rewards/length2tails_reward/mean": 0.8527138829231262, + "rewards/length2tails_reward/std": 0.21939502656459808, + "rewards/thermo_reward/mean": 0.26544201374053955, + "rewards/thermo_reward/std": 1.916554570198059, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 268.96875, + "completions/mean_terminated_length": 268.96875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.13475322257727385, + "epoch": 0.99, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8051790595054626, + "learning_rate": 1.559235818018978e-09, + "loss": 0.0115, + "num_tokens": 4335220.0, + "reward": 5.8860626220703125, + "reward_std": 3.1279499530792236, + "rewards/fitness_reward/mean": 5.766759872436523, + "rewards/fitness_reward/std": 2.515184164047241, + "rewards/kidney_reward/mean": 0.12276924401521683, + "rewards/kidney_reward/std": 1.2952481508255005, + "rewards/length2tails_reward/mean": 0.7117717266082764, + "rewards/length2tails_reward/std": 0.3357178866863251, + "rewards/thermo_reward/mean": -0.24005013704299927, + "rewards/thermo_reward/std": 2.0688977241516113, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 742.0, + "completions/max_terminated_length": 742.0, + "completions/mean_length": 285.3125, + "completions/mean_terminated_length": 285.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.18036719225347042, + "epoch": 0.992, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.536488056182861, + "learning_rate": 1.193862658566025e-09, + "loss": 0.1906, + "num_tokens": 4344382.0, + "reward": 6.254662990570068, + "reward_std": 2.8364017009735107, + "rewards/fitness_reward/mean": 5.82073974609375, + "rewards/fitness_reward/std": 2.669975996017456, + "rewards/kidney_reward/mean": 0.12117721140384674, + "rewards/kidney_reward/std": 1.3783304691314697, + "rewards/length2tails_reward/mean": 0.8040266633033752, + "rewards/length2tails_reward/std": 0.24874736368656158, + "rewards/thermo_reward/mean": 0.34465593099594116, + "rewards/thermo_reward/std": 1.8117518424987793, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 265.4375, + "completions/mean_terminated_length": 265.4375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.14075111038982868, + "epoch": 0.994, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8132911920547485, + "learning_rate": 8.771699011416167e-10, + "loss": -0.0563, + "num_tokens": 4352908.0, + "reward": 5.731771469116211, + "reward_std": 2.852626085281372, + "rewards/fitness_reward/mean": 5.548877239227295, + "rewards/fitness_reward/std": 2.9979569911956787, + "rewards/kidney_reward/mean": -0.32727035880088806, + "rewards/kidney_reward/std": 1.3308184146881104, + "rewards/length2tails_reward/mean": 0.8396377563476562, + "rewards/length2tails_reward/std": 0.1741836816072464, + "rewards/thermo_reward/mean": 0.27323901653289795, + "rewards/thermo_reward/std": 1.6997170448303223, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 265.9375, + "completions/mean_terminated_length": 265.9375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.13753656949847937, + "epoch": 0.996, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.766315758228302, + "learning_rate": 6.091729809042379e-10, + "loss": -0.0831, + "num_tokens": 4361450.0, + "reward": 5.796799182891846, + "reward_std": 3.151379108428955, + "rewards/fitness_reward/mean": 5.714599609375, + "rewards/fitness_reward/std": 2.720996141433716, + "rewards/kidney_reward/mean": -0.09053881466388702, + "rewards/kidney_reward/std": 1.1235448122024536, + "rewards/length2tails_reward/mean": 0.7273579835891724, + "rewards/length2tails_reward/std": 0.32541486620903015, + "rewards/thermo_reward/mean": -0.10874120146036148, + "rewards/thermo_reward/std": 1.9039497375488281, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1267131119966507, + "epoch": 0.998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5692701935768127, + "learning_rate": 3.8988495964564774e-10, + "loss": 0.0018, + "num_tokens": 4370168.0, + "reward": 6.738133430480957, + "reward_std": 1.3152570724487305, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.08398180454969406, + "rewards/kidney_reward/std": 1.4386367797851562, + "rewards/length2tails_reward/mean": 0.7613844871520996, + "rewards/length2tails_reward/std": 0.31253892183303833, + "rewards/thermo_reward/mean": -0.1710038036108017, + "rewards/thermo_reward/std": 1.9834370613098145, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14032446220517159, + "epoch": 1.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2252322435379028, + "learning_rate": 2.1931652515450038e-10, + "loss": 0.0056, + "num_tokens": 4378889.0, + "reward": 6.141080379486084, + "reward_std": 3.0671908855438232, + "rewards/fitness_reward/mean": 5.957330703735352, + "rewards/fitness_reward/std": 2.511507272720337, + "rewards/kidney_reward/mean": 0.047492511570453644, + "rewards/kidney_reward/std": 1.287542462348938, + "rewards/length2tails_reward/mean": 0.7975193858146667, + "rewards/length2tails_reward/std": 0.21932071447372437, + "rewards/thermo_reward/mean": -0.07875257730484009, + "rewards/thermo_reward/std": 1.930476427078247, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13029385171830654, + "epoch": 1.002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3931049108505249, + "learning_rate": 9.747599069576118e-11, + "loss": 0.0044, + "num_tokens": 4387576.0, + "reward": 6.128358840942383, + "reward_std": 1.9481713771820068, + "rewards/fitness_reward/mean": 6.2005510330200195, + "rewards/fitness_reward/std": 1.7111358642578125, + "rewards/kidney_reward/mean": -0.4570838212966919, + "rewards/kidney_reward/std": 1.3160635232925415, + "rewards/length2tails_reward/mean": 0.7353918552398682, + "rewards/length2tails_reward/std": 0.2832469344139099, + "rewards/thermo_reward/mean": -0.05499591678380966, + "rewards/thermo_reward/std": 2.082115411758423, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.15063100401312113, + "epoch": 1.004, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4766063690185547, + "learning_rate": 1.085874532688217e-06, + "loss": 0.0044, + "num_tokens": 4396313.0, + "reward": 4.716043472290039, + "reward_std": 3.9207024574279785, + "rewards/fitness_reward/mean": 4.780799388885498, + "rewards/fitness_reward/std": 3.55592679977417, + "rewards/kidney_reward/mean": -0.35647231340408325, + "rewards/kidney_reward/std": 1.4802333116531372, + "rewards/length2tails_reward/mean": 0.8220291137695312, + "rewards/length2tails_reward/std": 0.2834490239620209, + "rewards/thermo_reward/mean": -0.18405494093894958, + "rewards/thermo_reward/std": 1.9247404336929321, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 269.65625, + "completions/mean_terminated_length": 269.65625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.14282104652374983, + "epoch": 1.006, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4218590259552002, + "learning_rate": 1.0825793454723324e-06, + "loss": -0.0044, + "num_tokens": 4404974.0, + "reward": 6.322770595550537, + "reward_std": 3.294572591781616, + "rewards/fitness_reward/mean": 5.877994537353516, + "rewards/fitness_reward/std": 2.809220314025879, + "rewards/kidney_reward/mean": 0.34341520071029663, + "rewards/kidney_reward/std": 1.2038322687149048, + "rewards/length2tails_reward/mean": 0.7419826984405518, + "rewards/length2tails_reward/std": 0.3042587339878082, + "rewards/thermo_reward/mean": 0.175145223736763, + "rewards/thermo_reward/std": 1.9156057834625244, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 275.75, + "completions/mean_terminated_length": 275.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1312598967924714, + "epoch": 1.008, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2247414588928223, + "learning_rate": 1.0792832551819557e-06, + "loss": 0.0793, + "num_tokens": 4413830.0, + "reward": 6.414349555969238, + "reward_std": 2.865602731704712, + "rewards/fitness_reward/mean": 5.952852725982666, + "rewards/fitness_reward/std": 2.535936117172241, + "rewards/kidney_reward/mean": -0.17726193368434906, + "rewards/kidney_reward/std": 1.3616160154342651, + "rewards/length2tails_reward/mean": 0.8348301649093628, + "rewards/length2tails_reward/std": 0.1951935738325119, + "rewards/thermo_reward/mean": 0.6828402280807495, + "rewards/thermo_reward/std": 1.7432595491409302, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14658197574317455, + "epoch": 1.01, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0982699394226074, + "learning_rate": 1.075986297862603e-06, + "loss": 0.0101, + "num_tokens": 4422552.0, + "reward": 6.486241340637207, + "reward_std": 2.30737042427063, + "rewards/fitness_reward/mean": 6.141730785369873, + "rewards/fitness_reward/std": 2.027179718017578, + "rewards/kidney_reward/mean": -0.10469349473714828, + "rewards/kidney_reward/std": 1.3239145278930664, + "rewards/length2tails_reward/mean": 0.8329644203186035, + "rewards/length2tails_reward/std": 0.21248126029968262, + "rewards/thermo_reward/mean": 0.3772319257259369, + "rewards/thermo_reward/std": 1.8914566040039062, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.6875, + "completions/mean_terminated_length": 269.6875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.11312571819871664, + "epoch": 1.012, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7219551205635071, + "learning_rate": 1.072688509569271e-06, + "loss": 0.0041, + "num_tokens": 4431214.0, + "reward": 6.577698707580566, + "reward_std": 1.5953855514526367, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.1282057762145996, + "rewards/kidney_reward/std": 1.2744837999343872, + "rewards/length2tails_reward/mean": 0.756378173828125, + "rewards/length2tails_reward/std": 0.29190853238105774, + "rewards/thermo_reward/mean": 0.1347734034061432, + "rewards/thermo_reward/std": 2.067352771759033, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.16683596931397915, + "epoch": 1.014, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5193722248077393, + "learning_rate": 1.069389926366044e-06, + "loss": -0.0412, + "num_tokens": 4439909.0, + "reward": 6.593597412109375, + "reward_std": 2.396681070327759, + "rewards/fitness_reward/mean": 6.228442668914795, + "rewards/fitness_reward/std": 2.0526208877563477, + "rewards/kidney_reward/mean": 0.09212029725313187, + "rewards/kidney_reward/std": 1.4154157638549805, + "rewards/length2tails_reward/mean": 0.8067541122436523, + "rewards/length2tails_reward/std": 0.2526581585407257, + "rewards/thermo_reward/mean": 0.2348119616508484, + "rewards/thermo_reward/std": 1.8269776105880737, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.03125, + "completions/mean_terminated_length": 269.03125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1140651274472475, + "epoch": 1.016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35990557074546814, + "learning_rate": 1.0660905843256994e-06, + "loss": -0.0031, + "num_tokens": 4448550.0, + "reward": 6.444252014160156, + "reward_std": 2.1594624519348145, + "rewards/fitness_reward/mean": 6.186591148376465, + "rewards/fitness_reward/std": 1.7855913639068604, + "rewards/kidney_reward/mean": -0.002717338502407074, + "rewards/kidney_reward/std": 1.3338731527328491, + "rewards/length2tails_reward/mean": 0.676758885383606, + "rewards/length2tails_reward/std": 0.3374778926372528, + "rewards/thermo_reward/mean": 0.17965897917747498, + "rewards/thermo_reward/std": 1.7784078121185303, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "entropy": 0.12541080079972744, + "epoch": 1.018, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.49342793226242065, + "learning_rate": 1.0627905195293135e-06, + "loss": 0.0019, + "num_tokens": 4457243.0, + "reward": 6.073275566101074, + "reward_std": 2.286477565765381, + "rewards/fitness_reward/mean": 6.070756912231445, + "rewards/fitness_reward/std": 1.9208258390426636, + "rewards/kidney_reward/mean": -0.2497643232345581, + "rewards/kidney_reward/std": 1.2507424354553223, + "rewards/length2tails_reward/mean": 0.7881720066070557, + "rewards/length2tails_reward/std": 0.31101441383361816, + "rewards/thermo_reward/mean": -0.1392844319343567, + "rewards/thermo_reward/std": 1.982621669769287, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14774498343467712, + "epoch": 1.02, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9633992910385132, + "learning_rate": 1.0594897680658656e-06, + "loss": 0.0001, + "num_tokens": 4465974.0, + "reward": 6.762842178344727, + "reward_std": 1.1644959449768066, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.21087446808815002, + "rewards/kidney_reward/std": 1.2558995485305786, + "rewards/length2tails_reward/mean": 0.8480589389801025, + "rewards/length2tails_reward/std": 0.2005181908607483, + "rewards/thermo_reward/mean": -0.29181528091430664, + "rewards/thermo_reward/std": 1.9807835817337036, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.90625, + "completions/mean_terminated_length": 269.90625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1343985851854086, + "epoch": 1.022, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6261811852455139, + "learning_rate": 1.0561883660318454e-06, + "loss": 0.0058, + "num_tokens": 4474643.0, + "reward": 6.389760971069336, + "reward_std": 1.3629364967346191, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.35103434324264526, + "rewards/kidney_reward/std": 1.2251182794570923, + "rewards/length2tails_reward/mean": 0.7605469822883606, + "rewards/length2tails_reward/std": 0.31118449568748474, + "rewards/thermo_reward/mean": -0.020357713103294373, + "rewards/thermo_reward/std": 1.7070509195327759, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.21875, + "completions/mean_terminated_length": 270.21875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12923681642860174, + "epoch": 1.024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.38580459356307983, + "learning_rate": 1.0528863495308566e-06, + "loss": 0.0014, + "num_tokens": 4483322.0, + "reward": 6.612669944763184, + "reward_std": 1.37649405002594, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.18318305909633636, + "rewards/kidney_reward/std": 1.299351453781128, + "rewards/length2tails_reward/mean": 0.7847663760185242, + "rewards/length2tails_reward/std": 0.2517183721065521, + "rewards/thermo_reward/mean": 0.03952169418334961, + "rewards/thermo_reward/std": 1.960716724395752, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.84375, + "completions/mean_terminated_length": 269.84375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13614719919860363, + "epoch": 1.026, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7320016622543335, + "learning_rate": 1.0495837546732222e-06, + "loss": -0.0041, + "num_tokens": 4491989.0, + "reward": 6.7402777671813965, + "reward_std": 1.829291820526123, + "rewards/fitness_reward/mean": 6.315629005432129, + "rewards/fitness_reward/std": 1.5594186782836914, + "rewards/kidney_reward/mean": -0.052485376596450806, + "rewards/kidney_reward/std": 1.2512917518615723, + "rewards/length2tails_reward/mean": 0.7529121041297913, + "rewards/length2tails_reward/std": 0.30625954270362854, + "rewards/thermo_reward/mean": 0.5253260135650635, + "rewards/thermo_reward/std": 1.8666858673095703, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.03125, + "completions/mean_terminated_length": 271.03125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1361381532624364, + "epoch": 1.028, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5114327669143677, + "learning_rate": 1.046280617575591e-06, + "loss": -0.002, + "num_tokens": 4500694.0, + "reward": 7.205974578857422, + "reward_std": 1.038799524307251, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.4273352324962616, + "rewards/kidney_reward/std": 1.343811273574829, + "rewards/length2tails_reward/mean": 0.7962641716003418, + "rewards/length2tails_reward/std": 0.2505396902561188, + "rewards/thermo_reward/mean": 0.40388602018356323, + "rewards/thermo_reward/std": 1.614760160446167, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 274.4375, + "completions/mean_terminated_length": 274.4375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14021822530776262, + "epoch": 1.03, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4638788402080536, + "learning_rate": 1.0429769743605405e-06, + "loss": -0.0207, + "num_tokens": 4509508.0, + "reward": 6.43010139465332, + "reward_std": 1.6429603099822998, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": -0.2095562219619751, + "rewards/kidney_reward/std": 1.3042500019073486, + "rewards/length2tails_reward/mean": 0.7541611194610596, + "rewards/length2tails_reward/std": 0.33757731318473816, + "rewards/thermo_reward/mean": 0.12801618874073029, + "rewards/thermo_reward/std": 1.8921420574188232, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 276.71875, + "completions/mean_terminated_length": 276.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13349637668579817, + "epoch": 1.032, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7291481494903564, + "learning_rate": 1.0396728611561843e-06, + "loss": 0.1033, + "num_tokens": 4518395.0, + "reward": 6.790717601776123, + "reward_std": 2.053311586380005, + "rewards/fitness_reward/mean": 6.252190589904785, + "rewards/fitness_reward/std": 1.9182831048965454, + "rewards/kidney_reward/mean": 0.17156369984149933, + "rewards/kidney_reward/std": 1.4212603569030762, + "rewards/length2tails_reward/mean": 0.7817354202270508, + "rewards/length2tails_reward/std": 0.28353890776634216, + "rewards/thermo_reward/mean": 0.5146225690841675, + "rewards/thermo_reward/std": 1.6317241191864014, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.03125, + "completions/mean_terminated_length": 270.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12114311382174492, + "epoch": 1.034, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6436499357223511, + "learning_rate": 1.0363683140957744e-06, + "loss": 0.0067, + "num_tokens": 4527068.0, + "reward": 6.9150238037109375, + "reward_std": 1.4282177686691284, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.420188307762146, + "rewards/kidney_reward/std": 1.4897546768188477, + "rewards/length2tails_reward/mean": 0.7961421608924866, + "rewards/length2tails_reward/std": 0.1942632496356964, + "rewards/thermo_reward/mean": -0.1708083152770996, + "rewards/thermo_reward/std": 2.0713841915130615, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.09375, + "completions/mean_terminated_length": 269.09375, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "entropy": 0.13267110008746386, + "epoch": 1.036, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5470105409622192, + "learning_rate": 1.0330633693173082e-06, + "loss": 0.0028, + "num_tokens": 4535711.0, + "reward": 6.7112836837768555, + "reward_std": 1.220462679862976, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.1221885085105896, + "rewards/kidney_reward/std": 1.3579566478729248, + "rewards/length2tails_reward/mean": 0.8357151746749878, + "rewards/length2tails_reward/std": 0.1849537491798401, + "rewards/thermo_reward/mean": -0.05569875240325928, + "rewards/thermo_reward/std": 1.8645966053009033, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 273.96875, + "completions/mean_terminated_length": 273.96875, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.13998878747224808, + "epoch": 1.038, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4017844200134277, + "learning_rate": 1.0297580629631324e-06, + "loss": 0.0603, + "num_tokens": 4544510.0, + "reward": 6.162096977233887, + "reward_std": 3.3689303398132324, + "rewards/fitness_reward/mean": 5.874364852905273, + "rewards/fitness_reward/std": 2.821101188659668, + "rewards/kidney_reward/mean": 0.1349107325077057, + "rewards/kidney_reward/std": 1.3395761251449585, + "rewards/length2tails_reward/mean": 0.750157356262207, + "rewards/length2tails_reward/std": 0.3289134204387665, + "rewards/thermo_reward/mean": 0.06547515094280243, + "rewards/thermo_reward/std": 1.9795258045196533, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 275.75, + "completions/mean_terminated_length": 275.75, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.15451844595372677, + "epoch": 1.04, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7261816263198853, + "learning_rate": 1.0264524311795477e-06, + "loss": 0.0686, + "num_tokens": 4553366.0, + "reward": 6.272507667541504, + "reward_std": 2.775852680206299, + "rewards/fitness_reward/mean": 5.904332160949707, + "rewards/fitness_reward/std": 2.7064011096954346, + "rewards/kidney_reward/mean": 0.01566886156797409, + "rewards/kidney_reward/std": 1.1194953918457031, + "rewards/length2tails_reward/mean": 0.8368139266967773, + "rewards/length2tails_reward/std": 0.20790699124336243, + "rewards/thermo_reward/mean": 0.30227431654930115, + "rewards/thermo_reward/std": 1.9701894521713257, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.28125, + "completions/mean_terminated_length": 270.28125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1274288296699524, + "epoch": 1.042, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2885948121547699, + "learning_rate": 1.0231465101164138e-06, + "loss": 0.0002, + "num_tokens": 4562047.0, + "reward": 6.923990249633789, + "reward_std": 1.2962249517440796, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.18058733642101288, + "rewards/kidney_reward/std": 1.504853367805481, + "rewards/length2tails_reward/mean": 0.7315161824226379, + "rewards/length2tails_reward/std": 0.3234041929244995, + "rewards/thermo_reward/mean": 0.32501617074012756, + "rewards/thermo_reward/std": 1.7061312198638916, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 272.53125, + "completions/mean_terminated_length": 272.53125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.14175010193139315, + "epoch": 1.044, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4665096998214722, + "learning_rate": 1.0198403359267536e-06, + "loss": 0.0278, + "num_tokens": 4570800.0, + "reward": 5.75223445892334, + "reward_std": 2.7915806770324707, + "rewards/fitness_reward/mean": 5.585247039794922, + "rewards/fitness_reward/std": 2.488772392272949, + "rewards/kidney_reward/mean": -0.24288256466388702, + "rewards/kidney_reward/std": 1.4095439910888672, + "rewards/length2tails_reward/mean": 0.8185077905654907, + "rewards/length2tails_reward/std": 0.26394015550613403, + "rewards/thermo_reward/mean": 0.1676037311553955, + "rewards/thermo_reward/std": 1.8292311429977417, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14256253372877836, + "epoch": 1.046, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53201961517334, + "learning_rate": 1.0165339447663586e-06, + "loss": 0.0136, + "num_tokens": 4579486.0, + "reward": 5.869808673858643, + "reward_std": 2.5947656631469727, + "rewards/fitness_reward/mean": 6.2259650230407715, + "rewards/fitness_reward/std": 2.066636562347412, + "rewards/kidney_reward/mean": -0.5915475487709045, + "rewards/kidney_reward/std": 1.3409829139709473, + "rewards/length2tails_reward/mean": 0.7683144807815552, + "rewards/length2tails_reward/std": 0.27298229932785034, + "rewards/thermo_reward/mean": -0.5049220323562622, + "rewards/thermo_reward/std": 1.864272117614746, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11859462130814791, + "epoch": 1.048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5305758118629456, + "learning_rate": 1.0132273727933923e-06, + "loss": 0.0006, + "num_tokens": 4588166.0, + "reward": 6.6199212074279785, + "reward_std": 2.5161712169647217, + "rewards/fitness_reward/mean": 6.2628326416015625, + "rewards/fitness_reward/std": 1.8580807447433472, + "rewards/kidney_reward/mean": 0.13870906829833984, + "rewards/kidney_reward/std": 1.3581653833389282, + "rewards/length2tails_reward/mean": 0.7632534503936768, + "rewards/length2tails_reward/std": 0.2750368118286133, + "rewards/thermo_reward/mean": 0.1938406229019165, + "rewards/thermo_reward/std": 1.7894741296768188, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 270.875, + "completions/mean_terminated_length": 270.875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14264140371233225, + "epoch": 1.05, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.438153624534607, + "learning_rate": 1.0099206561679963e-06, + "loss": 0.0013, + "num_tokens": 4596866.0, + "reward": 6.146903038024902, + "reward_std": 3.041282892227173, + "rewards/fitness_reward/mean": 5.802824974060059, + "rewards/fitness_reward/std": 2.7383475303649902, + "rewards/kidney_reward/mean": -0.00892581045627594, + "rewards/kidney_reward/std": 1.4409884214401245, + "rewards/length2tails_reward/mean": 0.7481784820556641, + "rewards/length2tails_reward/std": 0.28880226612091064, + "rewards/thermo_reward/mean": 0.32299262285232544, + "rewards/thermo_reward/std": 1.8470505475997925, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 283.625, + "completions/mean_terminated_length": 268.45159912109375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "entropy": 0.17776819597929716, + "epoch": 1.052, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.3385443687438965, + "learning_rate": 1.0066138310518942e-06, + "loss": 0.1741, + "num_tokens": 4605974.0, + "reward": 6.1656975746154785, + "reward_std": 3.263472318649292, + "rewards/fitness_reward/mean": 5.848710060119629, + "rewards/fitness_reward/std": 2.923682689666748, + "rewards/kidney_reward/mean": -0.6391528248786926, + "rewards/kidney_reward/std": 1.3782894611358643, + "rewards/length2tails_reward/mean": 0.8260078430175781, + "rewards/length2tails_reward/std": 0.17865999042987823, + "rewards/thermo_reward/mean": 0.8601248264312744, + "rewards/thermo_reward/std": 1.563266396522522, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 269.0625, + "completions/mean_terminated_length": 269.0625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.16660993732511997, + "epoch": 1.054, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5620379447937012, + "learning_rate": 1.0033069336079952e-06, + "loss": -0.0249, + "num_tokens": 4614616.0, + "reward": 6.151926040649414, + "reward_std": 2.6613047122955322, + "rewards/fitness_reward/mean": 5.89830207824707, + "rewards/fitness_reward/std": 2.7269234657287598, + "rewards/kidney_reward/mean": -0.0820734053850174, + "rewards/kidney_reward/std": 1.2384506464004517, + "rewards/length2tails_reward/mean": 0.8303913474082947, + "rewards/length2tails_reward/std": 0.2801852226257324, + "rewards/thermo_reward/mean": 0.17412415146827698, + "rewards/thermo_reward/std": 1.9478468894958496, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.13714282773435116, + "epoch": 1.056, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8548287153244019, + "learning_rate": 1e-06, + "loss": 0.0104, + "num_tokens": 4623337.0, + "reward": 6.260705947875977, + "reward_std": 3.053499937057495, + "rewards/fitness_reward/mean": 5.850708961486816, + "rewards/fitness_reward/std": 2.922593355178833, + "rewards/kidney_reward/mean": 0.023052960634231567, + "rewards/kidney_reward/std": 1.3226453065872192, + "rewards/length2tails_reward/mean": 0.8229449391365051, + "rewards/length2tails_reward/std": 0.22886696457862854, + "rewards/thermo_reward/mean": 0.3854677081108093, + "rewards/thermo_reward/std": 1.8542811870574951, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 278.59375, + "completions/mean_terminated_length": 278.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1528241215273738, + "epoch": 1.058, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6255238056182861, + "learning_rate": 9.966930663920047e-07, + "loss": 0.0844, + "num_tokens": 4632284.0, + "reward": 6.275386810302734, + "reward_std": 2.969879627227783, + "rewards/fitness_reward/mean": 5.682027816772461, + "rewards/fitness_reward/std": 2.8381593227386475, + "rewards/kidney_reward/mean": 0.3764987289905548, + "rewards/kidney_reward/std": 1.3428179025650024, + "rewards/length2tails_reward/mean": 0.8416090607643127, + "rewards/length2tails_reward/std": 0.23421354591846466, + "rewards/thermo_reward/mean": 0.38941410183906555, + "rewards/thermo_reward/std": 1.8643651008605957, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 279.59375, + "completions/mean_terminated_length": 279.59375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.16059839446097612, + "epoch": 1.06, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5222561359405518, + "learning_rate": 9.93386168948106e-07, + "loss": 0.0432, + "num_tokens": 4641263.0, + "reward": 5.578651428222656, + "reward_std": 3.2182140350341797, + "rewards/fitness_reward/mean": 5.612772464752197, + "rewards/fitness_reward/std": 2.7972095012664795, + "rewards/kidney_reward/mean": -0.2904106676578522, + "rewards/kidney_reward/std": 1.1763945817947388, + "rewards/length2tails_reward/mean": 0.9197503328323364, + "rewards/length2tails_reward/std": 0.1497938632965088, + "rewards/thermo_reward/mean": -0.23770755529403687, + "rewards/thermo_reward/std": 2.2182846069335938, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15945592615753412, + "epoch": 1.062, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.7326860427856445, + "learning_rate": 9.900793438320036e-07, + "loss": 0.0048, + "num_tokens": 4649962.0, + "reward": 6.165236473083496, + "reward_std": 2.0870838165283203, + "rewards/fitness_reward/mean": 6.139778137207031, + "rewards/fitness_reward/std": 2.0377631187438965, + "rewards/kidney_reward/mean": -0.29442495107650757, + "rewards/kidney_reward/std": 0.9816387295722961, + "rewards/length2tails_reward/mean": 0.7879770994186401, + "rewards/length2tails_reward/std": 0.278515100479126, + "rewards/thermo_reward/mean": -0.04864644259214401, + "rewards/thermo_reward/std": 2.001068353652954, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.71875, + "completions/mean_terminated_length": 269.71875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1316318465396762, + "epoch": 1.064, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1423982381820679, + "learning_rate": 9.867726272066078e-07, + "loss": 0.0031, + "num_tokens": 4658625.0, + "reward": 6.170933723449707, + "reward_std": 3.059544086456299, + "rewards/fitness_reward/mean": 5.882389068603516, + "rewards/fitness_reward/std": 2.4371273517608643, + "rewards/kidney_reward/mean": -0.20426565408706665, + "rewards/kidney_reward/std": 1.4124196767807007, + "rewards/length2tails_reward/mean": 0.7186302542686462, + "rewards/length2tails_reward/std": 0.2689270079135895, + "rewards/thermo_reward/mean": 0.42203912138938904, + "rewards/thermo_reward/std": 1.79399573802948, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13159463834017515, + "epoch": 1.066, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7654881477355957, + "learning_rate": 9.834660552336415e-07, + "loss": 0.0134, + "num_tokens": 4667380.0, + "reward": 6.113855361938477, + "reward_std": 1.2902034521102905, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.4482192099094391, + "rewards/kidney_reward/std": 1.193291425704956, + "rewards/length2tails_reward/mean": 0.8577442169189453, + "rewards/length2tails_reward/std": 0.16505979001522064, + "rewards/thermo_reward/mean": -0.5235818028450012, + "rewards/thermo_reward/std": 2.1191470623016357, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11732593178749084, + "epoch": 1.068, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6562029719352722, + "learning_rate": 9.801596640732465e-07, + "loss": 0.0021, + "num_tokens": 4676064.0, + "reward": 6.969173431396484, + "reward_std": 1.4373183250427246, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.24578352272510529, + "rewards/kidney_reward/std": 1.4383959770202637, + "rewards/length2tails_reward/mean": 0.7385537028312683, + "rewards/length2tails_reward/std": 0.3359629213809967, + "rewards/thermo_reward/mean": 0.1406901478767395, + "rewards/thermo_reward/std": 1.933661937713623, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1390068093314767, + "epoch": 1.07, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43044525384902954, + "learning_rate": 9.76853489883586e-07, + "loss": -0.0011, + "num_tokens": 4684758.0, + "reward": 6.810073375701904, + "reward_std": 1.400303602218628, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.06369506567716599, + "rewards/kidney_reward/std": 1.3923709392547607, + "rewards/length2tails_reward/mean": 0.8017060160636902, + "rewards/length2tails_reward/std": 0.23364786803722382, + "rewards/thermo_reward/mean": -0.026997536420822144, + "rewards/thermo_reward/std": 2.1442739963531494, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14229949098080397, + "epoch": 1.072, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7712764739990234, + "learning_rate": 9.73547568820452e-07, + "loss": 0.0106, + "num_tokens": 4693462.0, + "reward": 6.350998401641846, + "reward_std": 2.644775867462158, + "rewards/fitness_reward/mean": 5.908863067626953, + "rewards/fitness_reward/std": 2.329244613647461, + "rewards/kidney_reward/mean": 0.43196019530296326, + "rewards/kidney_reward/std": 1.1989777088165283, + "rewards/length2tails_reward/mean": 0.776542603969574, + "rewards/length2tails_reward/std": 0.3111147880554199, + "rewards/thermo_reward/mean": 0.06403912603855133, + "rewards/thermo_reward/std": 1.878399133682251, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 282.03125, + "completions/mean_terminated_length": 282.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.17150241695344448, + "epoch": 1.074, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.284193277359009, + "learning_rate": 9.702419370368676e-07, + "loss": 0.1684, + "num_tokens": 4702519.0, + "reward": 6.48347282409668, + "reward_std": 2.568263053894043, + "rewards/fitness_reward/mean": 6.2233500480651855, + "rewards/fitness_reward/std": 2.081427574157715, + "rewards/kidney_reward/mean": 0.03654392808675766, + "rewards/kidney_reward/std": 1.3589805364608765, + "rewards/length2tails_reward/mean": 0.8815852403640747, + "rewards/length2tails_reward/std": 0.20788002014160156, + "rewards/thermo_reward/mean": 0.04290910065174103, + "rewards/thermo_reward/std": 1.958383560180664, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.625, + "completions/mean_terminated_length": 269.625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13951380364596844, + "epoch": 1.076, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3122531175613403, + "learning_rate": 9.669366306826917e-07, + "loss": -0.0024, + "num_tokens": 4711179.0, + "reward": 6.949626922607422, + "reward_std": 1.5579562187194824, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.4565337598323822, + "rewards/kidney_reward/std": 1.4615638256072998, + "rewards/length2tails_reward/mean": 0.7286112308502197, + "rewards/length2tails_reward/std": 0.2831724286079407, + "rewards/thermo_reward/mean": 0.10179591178894043, + "rewards/thermo_reward/std": 1.6984502077102661, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 267.125, + "completions/mean_terminated_length": 267.125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.17715083248913288, + "epoch": 1.078, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.699789047241211, + "learning_rate": 9.636316859042257e-07, + "loss": -0.0111, + "num_tokens": 4719759.0, + "reward": 6.4292707443237305, + "reward_std": 1.9660115242004395, + "rewards/fitness_reward/mean": 6.2418622970581055, + "rewards/fitness_reward/std": 1.9767087697982788, + "rewards/kidney_reward/mean": 0.21176302433013916, + "rewards/kidney_reward/std": 1.363947868347168, + "rewards/length2tails_reward/mean": 0.7133187055587769, + "rewards/length2tails_reward/std": 0.30997586250305176, + "rewards/thermo_reward/mean": -0.19360485672950745, + "rewards/thermo_reward/std": 2.003898859024048, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.129694988951087, + "epoch": 1.08, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6537262797355652, + "learning_rate": 9.603271388438158e-07, + "loss": 0.0015, + "num_tokens": 4728488.0, + "reward": 6.394089221954346, + "reward_std": 2.369520902633667, + "rewards/fitness_reward/mean": 5.975286483764648, + "rewards/fitness_reward/std": 1.942562222480774, + "rewards/kidney_reward/mean": 0.05214837193489075, + "rewards/kidney_reward/std": 1.531655192375183, + "rewards/length2tails_reward/mean": 0.8460031747817993, + "rewards/length2tails_reward/std": 0.1946304738521576, + "rewards/thermo_reward/mean": 0.3624553084373474, + "rewards/thermo_reward/std": 1.8877215385437012, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 282.375, + "completions/mean_terminated_length": 282.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.21266052313148975, + "epoch": 1.082, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3180181980133057, + "learning_rate": 9.570230256394596e-07, + "loss": 0.0639, + "num_tokens": 4737556.0, + "reward": 6.304543972015381, + "reward_std": 3.1608636379241943, + "rewards/fitness_reward/mean": 5.891756057739258, + "rewards/fitness_reward/std": 2.753159523010254, + "rewards/kidney_reward/mean": 0.3563697040081024, + "rewards/kidney_reward/std": 1.2493822574615479, + "rewards/length2tails_reward/mean": 0.7824121117591858, + "rewards/length2tails_reward/std": 0.3018985986709595, + "rewards/thermo_reward/mean": 0.07800028473138809, + "rewards/thermo_reward/std": 1.96294367313385, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 265.65625, + "completions/mean_terminated_length": 265.65625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.13771737553179264, + "epoch": 1.084, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0566332340240479, + "learning_rate": 9.53719382424409e-07, + "loss": -0.0788, + "num_tokens": 4746089.0, + "reward": 5.871472358703613, + "reward_std": 2.911451816558838, + "rewards/fitness_reward/mean": 5.845986366271973, + "rewards/fitness_reward/std": 2.5759243965148926, + "rewards/kidney_reward/mean": -0.3787839710712433, + "rewards/kidney_reward/std": 1.3494809865951538, + "rewards/length2tails_reward/mean": 0.7842440605163574, + "rewards/length2tails_reward/std": 0.3241458535194397, + "rewards/thermo_reward/mean": 0.03763420134782791, + "rewards/thermo_reward/std": 2.00175142288208, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.84375, + "completions/mean_terminated_length": 269.84375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13120271638035774, + "epoch": 1.086, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2890165150165558, + "learning_rate": 9.504162453267776e-07, + "loss": -0.0012, + "num_tokens": 4754756.0, + "reward": 6.692496299743652, + "reward_std": 1.545830249786377, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.22100891172885895, + "rewards/kidney_reward/std": 1.4090465307235718, + "rewards/length2tails_reward/mean": 0.7602061033248901, + "rewards/length2tails_reward/std": 0.26392310857772827, + "rewards/thermo_reward/mean": 0.24927972257137299, + "rewards/thermo_reward/std": 1.9046598672866821, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1574163269251585, + "epoch": 1.088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6579468250274658, + "learning_rate": 9.471136504691435e-07, + "loss": -0.0027, + "num_tokens": 4763470.0, + "reward": 5.896202087402344, + "reward_std": 3.3876235485076904, + "rewards/fitness_reward/mean": 5.515279769897461, + "rewards/fitness_reward/std": 3.1130592823028564, + "rewards/kidney_reward/mean": 0.3342685103416443, + "rewards/kidney_reward/std": 1.370279312133789, + "rewards/length2tails_reward/mean": 0.7904115319252014, + "rewards/length2tails_reward/std": 0.319992333650589, + "rewards/thermo_reward/mean": 0.032369282096624374, + "rewards/thermo_reward/std": 1.8723267316818237, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.15581514779478312, + "epoch": 1.09, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9107969999313354, + "learning_rate": 9.438116339681544e-07, + "loss": 0.0108, + "num_tokens": 4772189.0, + "reward": 7.041689395904541, + "reward_std": 2.610949993133545, + "rewards/fitness_reward/mean": 6.133593559265137, + "rewards/fitness_reward/std": 2.071310520172119, + "rewards/kidney_reward/mean": 0.4656294882297516, + "rewards/kidney_reward/std": 1.5366322994232178, + "rewards/length2tails_reward/mean": 0.7978523969650269, + "rewards/length2tails_reward/std": 0.2588934600353241, + "rewards/thermo_reward/mean": 0.9516354203224182, + "rewards/thermo_reward/std": 1.543983817100525, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1403808044269681, + "epoch": 1.092, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3943225145339966, + "learning_rate": 9.405102319341344e-07, + "loss": -0.0008, + "num_tokens": 4780917.0, + "reward": 6.371104717254639, + "reward_std": 2.6571381092071533, + "rewards/fitness_reward/mean": 5.908653259277344, + "rewards/fitness_reward/std": 2.329563856124878, + "rewards/kidney_reward/mean": 0.42575061321258545, + "rewards/kidney_reward/std": 1.3236973285675049, + "rewards/length2tails_reward/mean": 0.8670136332511902, + "rewards/length2tails_reward/std": 0.21976728737354279, + "rewards/thermo_reward/mean": 0.06564557552337646, + "rewards/thermo_reward/std": 2.133981704711914, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 279.8125, + "completions/mean_terminated_length": 279.8125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.15770368836820126, + "epoch": 1.094, + "frac_reward_zero_std": 0.0, + "grad_norm": 11.114337921142578, + "learning_rate": 9.372094804706866e-07, + "loss": 0.0889, + "num_tokens": 4789903.0, + "reward": 5.256720542907715, + "reward_std": 3.865236759185791, + "rewards/fitness_reward/mean": 4.968907356262207, + "rewards/fitness_reward/std": 3.8213346004486084, + "rewards/kidney_reward/mean": 0.1389780193567276, + "rewards/kidney_reward/std": 1.215377926826477, + "rewards/length2tails_reward/mean": 0.8178907632827759, + "rewards/length2tails_reward/std": 0.29823675751686096, + "rewards/thermo_reward/mean": 0.027702778577804565, + "rewards/thermo_reward/std": 1.8638756275177002, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13071757927536964, + "epoch": 1.096, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.138167381286621, + "learning_rate": 9.339094156743006e-07, + "loss": 0.001, + "num_tokens": 4798586.0, + "reward": 6.401601791381836, + "reward_std": 2.923518180847168, + "rewards/fitness_reward/mean": 5.93919038772583, + "rewards/fitness_reward/std": 2.580639600753784, + "rewards/kidney_reward/mean": 0.2830088138580322, + "rewards/kidney_reward/std": 1.5343753099441528, + "rewards/length2tails_reward/mean": 0.7652191519737244, + "rewards/length2tails_reward/std": 0.2832586467266083, + "rewards/thermo_reward/mean": 0.2592047154903412, + "rewards/thermo_reward/std": 1.913085699081421, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "entropy": 0.1419760538265109, + "epoch": 1.098, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9503770470619202, + "learning_rate": 9.306100736339559e-07, + "loss": 0.0155, + "num_tokens": 4807285.0, + "reward": 6.153724193572998, + "reward_std": 2.7506442070007324, + "rewards/fitness_reward/mean": 5.825443744659424, + "rewards/fitness_reward/std": 2.6551458835601807, + "rewards/kidney_reward/mean": -0.14597657322883606, + "rewards/kidney_reward/std": 1.294512391090393, + "rewards/length2tails_reward/mean": 0.7951149344444275, + "rewards/length2tails_reward/std": 0.256059855222702, + "rewards/thermo_reward/mean": 0.40497976541519165, + "rewards/thermo_reward/std": 1.7230664491653442, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12735261023044586, + "epoch": 1.1, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6177323460578918, + "learning_rate": 9.273114904307289e-07, + "loss": 0.0042, + "num_tokens": 4815978.0, + "reward": 6.4180498123168945, + "reward_std": 1.2532094717025757, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.2775779068470001, + "rewards/kidney_reward/std": 1.5295125246047974, + "rewards/length2tails_reward/mean": 0.78519207239151, + "rewards/length2tails_reward/std": 0.26373088359832764, + "rewards/thermo_reward/mean": -0.4615146517753601, + "rewards/thermo_reward/std": 1.9175395965576172, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 269.875, + "completions/mean_terminated_length": 269.875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13758530840277672, + "epoch": 1.102, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.344401240348816, + "learning_rate": 9.240137021373968e-07, + "loss": 0.0019, + "num_tokens": 4824646.0, + "reward": 6.916250228881836, + "reward_std": 1.3116036653518677, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.05725880339741707, + "rewards/kidney_reward/std": 1.536643624305725, + "rewards/length2tails_reward/mean": 0.6976279020309448, + "rewards/length2tails_reward/std": 0.33351606130599976, + "rewards/thermo_reward/mean": 0.24383237957954407, + "rewards/thermo_reward/std": 1.8003408908843994, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13356123864650726, + "epoch": 1.104, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6651228070259094, + "learning_rate": 9.20716744818044e-07, + "loss": 0.0035, + "num_tokens": 4833347.0, + "reward": 6.4406585693359375, + "reward_std": 2.958179473876953, + "rewards/fitness_reward/mean": 5.9377546310424805, + "rewards/fitness_reward/std": 2.217132806777954, + "rewards/kidney_reward/mean": -0.004210323095321655, + "rewards/kidney_reward/std": 1.4389758110046387, + "rewards/length2tails_reward/mean": 0.8237462043762207, + "rewards/length2tails_reward/std": 0.23207122087478638, + "rewards/thermo_reward/mean": 0.5981446504592896, + "rewards/thermo_reward/std": 1.7600765228271484, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.1324286963790655, + "epoch": 1.106, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0271905660629272, + "learning_rate": 9.174206545276677e-07, + "loss": -0.0185, + "num_tokens": 4842035.0, + "reward": 5.576254844665527, + "reward_std": 3.0881810188293457, + "rewards/fitness_reward/mean": 5.562891006469727, + "rewards/fitness_reward/std": 3.262460947036743, + "rewards/kidney_reward/mean": -0.2107272744178772, + "rewards/kidney_reward/std": 1.325615644454956, + "rewards/length2tails_reward/mean": 0.8381778001785278, + "rewards/length2tails_reward/std": 0.2260654717683792, + "rewards/thermo_reward/mean": -0.18163442611694336, + "rewards/thermo_reward/std": 1.969087839126587, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 263.9375, + "completions/mean_terminated_length": 263.9375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.12110783345997334, + "epoch": 1.108, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9053553938865662, + "learning_rate": 9.141254673117829e-07, + "loss": -0.0898, + "num_tokens": 4850513.0, + "reward": 6.17215633392334, + "reward_std": 2.411080837249756, + "rewards/fitness_reward/mean": 5.85194730758667, + "rewards/fitness_reward/std": 2.562154769897461, + "rewards/kidney_reward/mean": -0.026886343955993652, + "rewards/kidney_reward/std": 1.2255576848983765, + "rewards/length2tails_reward/mean": 0.7556699514389038, + "rewards/length2tails_reward/std": 0.2960347831249237, + "rewards/thermo_reward/mean": 0.2894690930843353, + "rewards/thermo_reward/std": 1.944090485572815, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.15741972997784615, + "epoch": 1.11, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39847999811172485, + "learning_rate": 9.108312192060296e-07, + "loss": -0.004, + "num_tokens": 4859245.0, + "reward": 6.996063232421875, + "reward_std": 1.321871280670166, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.3381817042827606, + "rewards/kidney_reward/std": 1.32902991771698, + "rewards/length2tails_reward/mean": 0.7464842796325684, + "rewards/length2tails_reward/std": 0.2994208037853241, + "rewards/thermo_reward/mean": 0.3040847182273865, + "rewards/thermo_reward/std": 1.4871476888656616, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.1875, + "completions/mean_terminated_length": 270.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1460885526612401, + "epoch": 1.112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4785166084766388, + "learning_rate": 9.075379462357765e-07, + "loss": 0.0048, + "num_tokens": 4867923.0, + "reward": 6.905086517333984, + "reward_std": 2.198906183242798, + "rewards/fitness_reward/mean": 6.181448936462402, + "rewards/fitness_reward/std": 1.8131176233291626, + "rewards/kidney_reward/mean": 0.3510921001434326, + "rewards/kidney_reward/std": 1.4123769998550415, + "rewards/length2tails_reward/mean": 0.7582607865333557, + "rewards/length2tails_reward/std": 0.2772643268108368, + "rewards/thermo_reward/mean": 0.717052698135376, + "rewards/thermo_reward/std": 1.7030202150344849, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14641050435602665, + "epoch": 1.114, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.412349134683609, + "learning_rate": 9.042456844157298e-07, + "loss": 0.0034, + "num_tokens": 4876600.0, + "reward": 6.473404884338379, + "reward_std": 1.7912063598632812, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": -0.42937731742858887, + "rewards/kidney_reward/std": 1.506989598274231, + "rewards/length2tails_reward/mean": 0.7886794805526733, + "rewards/length2tails_reward/std": 0.26198115944862366, + "rewards/thermo_reward/mean": 0.41718509793281555, + "rewards/thermo_reward/std": 1.5530680418014526, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 275.03125, + "completions/mean_terminated_length": 275.03125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.15689587220549583, + "epoch": 1.116, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0156164169311523, + "learning_rate": 9.009544697495373e-07, + "loss": 0.0652, + "num_tokens": 4885433.0, + "reward": 6.1605224609375, + "reward_std": 2.798455238342285, + "rewards/fitness_reward/mean": 5.870935916900635, + "rewards/fitness_reward/std": 2.4831860065460205, + "rewards/kidney_reward/mean": 0.3200324773788452, + "rewards/kidney_reward/std": 1.4810597896575928, + "rewards/length2tails_reward/mean": 0.7556763291358948, + "rewards/length2tails_reward/std": 0.2771874964237213, + "rewards/thermo_reward/mean": -0.1186968982219696, + "rewards/thermo_reward/std": 1.7100430727005005, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.875, + "completions/mean_terminated_length": 270.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1417178362607956, + "epoch": 1.1179999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7429875731468201, + "learning_rate": 8.97664338229395e-07, + "loss": 0.0051, + "num_tokens": 4894133.0, + "reward": 6.412052154541016, + "reward_std": 1.9399174451828003, + "rewards/fitness_reward/mean": 6.315244674682617, + "rewards/fitness_reward/std": 1.5615921020507812, + "rewards/kidney_reward/mean": -0.19362059235572815, + "rewards/kidney_reward/std": 1.4891669750213623, + "rewards/length2tails_reward/mean": 0.8171731233596802, + "rewards/length2tails_reward/std": 0.25483256578445435, + "rewards/thermo_reward/mean": -0.02135181427001953, + "rewards/thermo_reward/std": 1.9206360578536987, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 266.625, + "completions/mean_terminated_length": 266.625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.14815932419151068, + "epoch": 1.12, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.320993423461914, + "learning_rate": 8.943753258356545e-07, + "loss": -0.0404, + "num_tokens": 4902697.0, + "reward": 6.033912658691406, + "reward_std": 2.7762601375579834, + "rewards/fitness_reward/mean": 5.682241439819336, + "rewards/fitness_reward/std": 2.8782520294189453, + "rewards/kidney_reward/mean": -0.08795703947544098, + "rewards/kidney_reward/std": 1.190619945526123, + "rewards/length2tails_reward/mean": 0.7094908356666565, + "rewards/length2tails_reward/std": 0.3566397428512573, + "rewards/thermo_reward/mean": 0.4365541338920593, + "rewards/thermo_reward/std": 1.6525672674179077, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 281.875, + "completions/mean_terminated_length": 281.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14551371429115534, + "epoch": 1.1219999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8949246406555176, + "learning_rate": 8.910874685364273e-07, + "loss": 0.0631, + "num_tokens": 4911749.0, + "reward": 6.784656524658203, + "reward_std": 1.4535659551620483, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.008047327399253845, + "rewards/kidney_reward/std": 1.3660366535186768, + "rewards/length2tails_reward/mean": 0.7835389971733093, + "rewards/length2tails_reward/std": 0.25396519899368286, + "rewards/thermo_reward/mean": 0.002994745969772339, + "rewards/thermo_reward/std": 2.0884358882904053, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 276.6875, + "completions/mean_terminated_length": 276.6875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15507922414690256, + "epoch": 1.124, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5992161631584167, + "learning_rate": 8.878008022871958e-07, + "loss": 0.0077, + "num_tokens": 4920635.0, + "reward": 6.673271179199219, + "reward_std": 1.9338127374649048, + "rewards/fitness_reward/mean": 6.303162097930908, + "rewards/fitness_reward/std": 1.629944086074829, + "rewards/kidney_reward/mean": 0.1278885155916214, + "rewards/kidney_reward/std": 1.370566487312317, + "rewards/length2tails_reward/mean": 0.7881702184677124, + "rewards/length2tails_reward/std": 0.2620719075202942, + "rewards/thermo_reward/mean": 0.2182437926530838, + "rewards/thermo_reward/std": 1.86492919921875, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 264.96875, + "completions/mean_terminated_length": 264.96875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.15227935276925564, + "epoch": 1.126, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4911561906337738, + "learning_rate": 8.845153630304139e-07, + "loss": -0.12, + "num_tokens": 4929146.0, + "reward": 6.32029914855957, + "reward_std": 2.370889902114868, + "rewards/fitness_reward/mean": 6.15533447265625, + "rewards/fitness_reward/std": 1.9535919427871704, + "rewards/kidney_reward/mean": -0.14201894402503967, + "rewards/kidney_reward/std": 1.3033816814422607, + "rewards/length2tails_reward/mean": 0.872943103313446, + "rewards/length2tails_reward/std": 0.19380095601081848, + "rewards/thermo_reward/mean": 0.0354764461517334, + "rewards/thermo_reward/std": 2.0793440341949463, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1207326352596283, + "epoch": 1.1280000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5171439051628113, + "learning_rate": 8.812311866951198e-07, + "loss": -0.0029, + "num_tokens": 4937818.0, + "reward": 6.324062347412109, + "reward_std": 2.3502964973449707, + "rewards/fitness_reward/mean": 6.193474769592285, + "rewards/fitness_reward/std": 1.7488290071487427, + "rewards/kidney_reward/mean": -0.09047335386276245, + "rewards/kidney_reward/std": 1.4314613342285156, + "rewards/length2tails_reward/mean": 0.7447332143783569, + "rewards/length2tails_reward/std": 0.3103727400302887, + "rewards/thermo_reward/mean": -0.020717978477478027, + "rewards/thermo_reward/std": 2.023076057434082, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 267.6875, + "completions/mean_terminated_length": 267.6875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.146316884085536, + "epoch": 1.13, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9321585893630981, + "learning_rate": 8.779483091965398e-07, + "loss": -0.0518, + "num_tokens": 4946416.0, + "reward": 5.93803596496582, + "reward_std": 3.2448863983154297, + "rewards/fitness_reward/mean": 5.738128185272217, + "rewards/fitness_reward/std": 2.9822914600372314, + "rewards/kidney_reward/mean": -0.08433566242456436, + "rewards/kidney_reward/std": 1.4299689531326294, + "rewards/length2tails_reward/mean": 0.8150462508201599, + "rewards/length2tails_reward/std": 0.24676427245140076, + "rewards/thermo_reward/mean": 0.07662725448608398, + "rewards/thermo_reward/std": 2.0116751194000244, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13636144809424877, + "epoch": 1.1320000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8187625408172607, + "learning_rate": 8.746667664356955e-07, + "loss": 0.0034, + "num_tokens": 4955106.0, + "reward": 6.594188690185547, + "reward_std": 1.054268717765808, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.061730969697237015, + "rewards/kidney_reward/std": 1.3794859647750854, + "rewards/length2tails_reward/mean": 0.7963079810142517, + "rewards/length2tails_reward/std": 0.278277188539505, + "rewards/thermo_reward/mean": -0.3306412696838379, + "rewards/thermo_reward/std": 1.8308331966400146, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13865534961223602, + "epoch": 1.134, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8117534518241882, + "learning_rate": 8.713865942990141e-07, + "loss": 0.0017, + "num_tokens": 4963814.0, + "reward": 6.841464996337891, + "reward_std": 2.2679343223571777, + "rewards/fitness_reward/mean": 6.187190532684326, + "rewards/fitness_reward/std": 1.7823883295059204, + "rewards/kidney_reward/mean": 0.2006911188364029, + "rewards/kidney_reward/std": 1.465105414390564, + "rewards/length2tails_reward/mean": 0.8203763961791992, + "rewards/length2tails_reward/std": 0.21558518707752228, + "rewards/thermo_reward/mean": 0.6976690888404846, + "rewards/thermo_reward/std": 1.7370494604110718, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.1875, + "completions/mean_terminated_length": 269.1875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10764083079993725, + "epoch": 1.1360000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1352214813232422, + "learning_rate": 8.68107828657931e-07, + "loss": -0.0026, + "num_tokens": 4972460.0, + "reward": 6.440495491027832, + "reward_std": 1.5797516107559204, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.2657780051231384, + "rewards/kidney_reward/std": 1.2273304462432861, + "rewards/length2tails_reward/mean": 0.6784309148788452, + "rewards/length2tails_reward/std": 0.36211761832237244, + "rewards/thermo_reward/mean": 0.03691381216049194, + "rewards/thermo_reward/std": 1.959222674369812, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.15133801195770502, + "epoch": 1.138, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0756551027297974, + "learning_rate": 8.648305053685034e-07, + "loss": 0.003, + "num_tokens": 4981193.0, + "reward": 6.505936622619629, + "reward_std": 2.7848563194274902, + "rewards/fitness_reward/mean": 5.945821762084961, + "rewards/fitness_reward/std": 2.5500574111938477, + "rewards/kidney_reward/mean": -0.033743105828762054, + "rewards/kidney_reward/std": 1.4649971723556519, + "rewards/length2tails_reward/mean": 0.8186132907867432, + "rewards/length2tails_reward/std": 0.20480157434940338, + "rewards/thermo_reward/mean": 0.7446656823158264, + "rewards/thermo_reward/std": 1.6348015069961548, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 273.46875, + "completions/mean_terminated_length": 273.46875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.20986818429082632, + "epoch": 1.1400000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.184492826461792, + "learning_rate": 8.615546602710124e-07, + "loss": 0.0243, + "num_tokens": 4989976.0, + "reward": 6.095884323120117, + "reward_std": 3.152451515197754, + "rewards/fitness_reward/mean": 5.789475440979004, + "rewards/fitness_reward/std": 2.78916072845459, + "rewards/kidney_reward/mean": -0.12849631905555725, + "rewards/kidney_reward/std": 1.322196364402771, + "rewards/length2tails_reward/mean": 0.7794613838195801, + "rewards/length2tails_reward/std": 0.286723256111145, + "rewards/thermo_reward/mean": 0.35158205032348633, + "rewards/thermo_reward/std": 1.7895629405975342, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 276.4375, + "completions/mean_terminated_length": 276.4375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.16466512717306614, + "epoch": 1.142, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0158518552780151, + "learning_rate": 8.582803291895757e-07, + "loss": 0.0115, + "num_tokens": 4998854.0, + "reward": 6.794222831726074, + "reward_std": 2.24648118019104, + "rewards/fitness_reward/mean": 6.17574405670166, + "rewards/fitness_reward/std": 1.8437060117721558, + "rewards/kidney_reward/mean": 0.2754228711128235, + "rewards/kidney_reward/std": 1.228481411933899, + "rewards/length2tails_reward/mean": 0.7832163572311401, + "rewards/length2tails_reward/std": 0.2660101652145386, + "rewards/thermo_reward/mean": 0.5699260234832764, + "rewards/thermo_reward/std": 1.8275165557861328, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13757296092808247, + "epoch": 1.144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.332283616065979, + "learning_rate": 8.550075479317542e-07, + "loss": 0.0003, + "num_tokens": 5007538.0, + "reward": 6.745051860809326, + "reward_std": 1.3034541606903076, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.22021061182022095, + "rewards/kidney_reward/std": 1.3577102422714233, + "rewards/length2tails_reward/mean": 0.7379215955734253, + "rewards/length2tails_reward/std": 0.28174179792404175, + "rewards/thermo_reward/mean": -0.2816644310951233, + "rewards/thermo_reward/std": 1.8626832962036133, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.13993708230555058, + "epoch": 1.146, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4096217751502991, + "learning_rate": 8.517363522881578e-07, + "loss": -0.0013, + "num_tokens": 5016222.0, + "reward": 6.8677873611450195, + "reward_std": 1.459789752960205, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.05568697303533554, + "rewards/kidney_reward/std": 1.5286734104156494, + "rewards/length2tails_reward/mean": 0.8292410373687744, + "rewards/length2tails_reward/std": 0.23444099724292755, + "rewards/thermo_reward/mean": 0.40002214908599854, + "rewards/thermo_reward/std": 1.7137781381607056, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14512245077639818, + "epoch": 1.148, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.797149658203125, + "learning_rate": 8.484667780320597e-07, + "loss": 0.0029, + "num_tokens": 5024912.0, + "reward": 6.788018703460693, + "reward_std": 1.357239842414856, + "rewards/fitness_reward/mean": 6.179342269897461, + "rewards/fitness_reward/std": 1.1073734760284424, + "rewards/kidney_reward/mean": 0.006016001105308533, + "rewards/kidney_reward/std": 1.3125320672988892, + "rewards/length2tails_reward/mean": 0.8095096349716187, + "rewards/length2tails_reward/std": 0.25289738178253174, + "rewards/thermo_reward/mean": 0.8065821528434753, + "rewards/thermo_reward/std": 1.5437324047088623, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.1407864410430193, + "epoch": 1.15, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32123813033103943, + "learning_rate": 8.451988609189986e-07, + "loss": -0.0028, + "num_tokens": 5033610.0, + "reward": 7.116694450378418, + "reward_std": 1.1050056219100952, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.052993372082710266, + "rewards/kidney_reward/std": 1.4540084600448608, + "rewards/length2tails_reward/mean": 0.8535097241401672, + "rewards/length2tails_reward/std": 0.16250735521316528, + "rewards/thermo_reward/mean": 0.5710432529449463, + "rewards/thermo_reward/std": 1.459686517715454, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 273.40625, + "completions/mean_terminated_length": 273.40625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14363481849431992, + "epoch": 1.152, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.147871971130371, + "learning_rate": 8.419326366863937e-07, + "loss": 0.0589, + "num_tokens": 5042391.0, + "reward": 6.446317195892334, + "reward_std": 2.6782422065734863, + "rewards/fitness_reward/mean": 6.207469463348389, + "rewards/fitness_reward/std": 2.171262502670288, + "rewards/kidney_reward/mean": 0.12232324481010437, + "rewards/kidney_reward/std": 1.4508579969406128, + "rewards/length2tails_reward/mean": 0.7448511719703674, + "rewards/length2tails_reward/std": 0.3329923450946808, + "rewards/thermo_reward/mean": -0.017053082585334778, + "rewards/thermo_reward/std": 2.130506992340088, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 276.15625, + "completions/mean_terminated_length": 276.15625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11538595240563154, + "epoch": 1.154, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9062669277191162, + "learning_rate": 8.38668141053149e-07, + "loss": 0.0947, + "num_tokens": 5051260.0, + "reward": 6.2534942626953125, + "reward_std": 2.4839119911193848, + "rewards/fitness_reward/mean": 5.95405387878418, + "rewards/fitness_reward/std": 2.047356128692627, + "rewards/kidney_reward/mean": 0.3151780962944031, + "rewards/kidney_reward/std": 1.408141851425171, + "rewards/length2tails_reward/mean": 0.6764785051345825, + "rewards/length2tails_reward/std": 0.3764680027961731, + "rewards/thermo_reward/mean": -0.05453641712665558, + "rewards/thermo_reward/std": 1.8243632316589355, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 271.09375, + "completions/mean_terminated_length": 271.09375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13400636054575443, + "epoch": 1.156, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35709095001220703, + "learning_rate": 8.354054097192659e-07, + "loss": -0.0075, + "num_tokens": 5059967.0, + "reward": 7.217033863067627, + "reward_std": 1.222536563873291, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.5806408524513245, + "rewards/kidney_reward/std": 1.4004333019256592, + "rewards/length2tails_reward/mean": 0.7713372707366943, + "rewards/length2tails_reward/std": 0.28338032960891724, + "rewards/thermo_reward/mean": 0.6971180438995361, + "rewards/thermo_reward/std": 1.6416670083999634, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 266.1875, + "completions/mean_terminated_length": 266.1875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.2043990483507514, + "epoch": 1.158, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3258284330368042, + "learning_rate": 8.321444783654523e-07, + "loss": -0.05, + "num_tokens": 5068517.0, + "reward": 5.923011779785156, + "reward_std": 3.4309871196746826, + "rewards/fitness_reward/mean": 5.577899932861328, + "rewards/fitness_reward/std": 2.8630788326263428, + "rewards/kidney_reward/mean": 0.4526729881763458, + "rewards/kidney_reward/std": 1.3635318279266357, + "rewards/length2tails_reward/mean": 0.7340092658996582, + "rewards/length2tails_reward/std": 0.32914623618125916, + "rewards/thermo_reward/mean": -0.12945333123207092, + "rewards/thermo_reward/std": 1.788311243057251, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 262.96875, + "completions/mean_terminated_length": 262.96875, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.1358304200693965, + "epoch": 1.16, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9773128628730774, + "learning_rate": 8.288853826527299e-07, + "loss": -0.1039, + "num_tokens": 5076964.0, + "reward": 6.189246654510498, + "reward_std": 3.103529214859009, + "rewards/fitness_reward/mean": 5.797711372375488, + "rewards/fitness_reward/std": 2.756150245666504, + "rewards/kidney_reward/mean": -0.15518364310264587, + "rewards/kidney_reward/std": 1.3070719242095947, + "rewards/length2tails_reward/mean": 0.7876451015472412, + "rewards/length2tails_reward/std": 0.2878929376602173, + "rewards/thermo_reward/mean": 0.5444309711456299, + "rewards/thermo_reward/std": 1.6022015810012817, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 271.09375, + "completions/mean_terminated_length": 271.09375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1535765863955021, + "epoch": 1.162, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4475181102752686, + "learning_rate": 8.256281582220485e-07, + "loss": 0.0078, + "num_tokens": 5085671.0, + "reward": 6.406468868255615, + "reward_std": 2.172994613647461, + "rewards/fitness_reward/mean": 6.141479969024658, + "rewards/fitness_reward/std": 2.0285377502441406, + "rewards/kidney_reward/mean": 0.11090327054262161, + "rewards/kidney_reward/std": 1.238804817199707, + "rewards/length2tails_reward/mean": 0.7958652973175049, + "rewards/length2tails_reward/std": 0.27049940824508667, + "rewards/thermo_reward/mean": 0.021141860634088516, + "rewards/thermo_reward/std": 1.866187334060669, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.9375, + "completions/mean_terminated_length": 269.9375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12724142894148827, + "epoch": 1.164, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27392685413360596, + "learning_rate": 8.223728406938913e-07, + "loss": 0.0058, + "num_tokens": 5094341.0, + "reward": 6.203819274902344, + "reward_std": 2.6722285747528076, + "rewards/fitness_reward/mean": 5.789396286010742, + "rewards/fitness_reward/std": 2.4327392578125, + "rewards/kidney_reward/mean": 0.06290467828512192, + "rewards/kidney_reward/std": 1.399139165878296, + "rewards/length2tails_reward/mean": 0.7463275790214539, + "rewards/length2tails_reward/std": 0.3078264594078064, + "rewards/thermo_reward/mean": 0.3927791118621826, + "rewards/thermo_reward/std": 1.8644540309906006, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.12647257279604673, + "epoch": 1.166, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9638093709945679, + "learning_rate": 8.191194656678904e-07, + "loss": -0.0007, + "num_tokens": 5103013.0, + "reward": 6.61676025390625, + "reward_std": 2.203179359436035, + "rewards/fitness_reward/mean": 6.189640522003174, + "rewards/fitness_reward/std": 1.769295334815979, + "rewards/kidney_reward/mean": 0.0783485472202301, + "rewards/kidney_reward/std": 1.4041903018951416, + "rewards/length2tails_reward/mean": 0.678724467754364, + "rewards/length2tails_reward/std": 0.3460564613342285, + "rewards/thermo_reward/mean": 0.4365290701389313, + "rewards/thermo_reward/std": 1.6493560075759888, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 273.25, + "completions/mean_terminated_length": 273.25, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14152859803289175, + "epoch": 1.168, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.275541067123413, + "learning_rate": 8.158680687224328e-07, + "loss": 0.0421, + "num_tokens": 5111789.0, + "reward": 5.870347499847412, + "reward_std": 2.946139097213745, + "rewards/fitness_reward/mean": 5.712801456451416, + "rewards/fitness_reward/std": 2.7289793491363525, + "rewards/kidney_reward/mean": -0.1301552653312683, + "rewards/kidney_reward/std": 1.2211557626724243, + "rewards/length2tails_reward/mean": 0.7000634670257568, + "rewards/length2tails_reward/std": 0.35693538188934326, + "rewards/thermo_reward/mean": 0.09521618485450745, + "rewards/thermo_reward/std": 1.8078317642211914, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14893931802362204, + "epoch": 1.17, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5198249220848083, + "learning_rate": 8.126186854142751e-07, + "loss": -0.0046, + "num_tokens": 5120535.0, + "reward": 6.65510892868042, + "reward_std": 1.577199935913086, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.04684050381183624, + "rewards/kidney_reward/std": 1.266330599784851, + "rewards/length2tails_reward/mean": 0.8248117566108704, + "rewards/length2tails_reward/std": 0.22090297937393188, + "rewards/thermo_reward/mean": 0.17401191592216492, + "rewards/thermo_reward/std": 1.8194555044174194, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13215283025056124, + "epoch": 1.172, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7394059896469116, + "learning_rate": 8.093713512781532e-07, + "loss": -0.0017, + "num_tokens": 5129234.0, + "reward": 6.729998588562012, + "reward_std": 1.4616167545318604, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.06995169818401337, + "rewards/kidney_reward/std": 1.3386176824569702, + "rewards/length2tails_reward/mean": 0.7853714823722839, + "rewards/length2tails_reward/std": 0.28365448117256165, + "rewards/thermo_reward/mean": 0.16064414381980896, + "rewards/thermo_reward/std": 1.835227370262146, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14433594420552254, + "epoch": 1.174, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46236541867256165, + "learning_rate": 8.061261018263918e-07, + "loss": 0.0104, + "num_tokens": 5138004.0, + "reward": 7.0586652755737305, + "reward_std": 1.2240278720855713, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.03153465688228607, + "rewards/kidney_reward/std": 1.2283109426498413, + "rewards/length2tails_reward/mean": 0.8546296954154968, + "rewards/length2tails_reward/std": 0.22107666730880737, + "rewards/thermo_reward/mean": 0.4758845865726471, + "rewards/thermo_reward/std": 1.9577088356018066, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13494574837386608, + "epoch": 1.176, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5038905739784241, + "learning_rate": 8.028829725485197e-07, + "loss": 0.0011, + "num_tokens": 5146718.0, + "reward": 6.719306945800781, + "reward_std": 1.4132814407348633, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.3096298575401306, + "rewards/kidney_reward/std": 1.2510836124420166, + "rewards/length2tails_reward/mean": 0.7908936738967896, + "rewards/length2tails_reward/std": 0.27867591381073, + "rewards/thermo_reward/mean": -0.2430802285671234, + "rewards/thermo_reward/std": 1.8624811172485352, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14672483783215284, + "epoch": 1.178, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7989422082901001, + "learning_rate": 7.996419989108788e-07, + "loss": 0.0051, + "num_tokens": 5155482.0, + "reward": 6.9287614822387695, + "reward_std": 1.4752769470214844, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.0017717070877552032, + "rewards/kidney_reward/std": 1.246278166770935, + "rewards/length2tails_reward/mean": 0.8348451852798462, + "rewards/length2tails_reward/std": 0.22858649492263794, + "rewards/thermo_reward/mean": 0.46525460481643677, + "rewards/thermo_reward/std": 1.7359920740127563, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13120158202946186, + "epoch": 1.18, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4027276933193207, + "learning_rate": 7.964032163562377e-07, + "loss": 0.0032, + "num_tokens": 5164194.0, + "reward": 6.588923931121826, + "reward_std": 2.5632362365722656, + "rewards/fitness_reward/mean": 6.260285377502441, + "rewards/fitness_reward/std": 1.8724900484085083, + "rewards/kidney_reward/mean": 0.014665953814983368, + "rewards/kidney_reward/std": 1.5442041158676147, + "rewards/length2tails_reward/mean": 0.7987914085388184, + "rewards/length2tails_reward/std": 0.2774232029914856, + "rewards/thermo_reward/mean": 0.2432149201631546, + "rewards/thermo_reward/std": 1.9349545240402222, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 267.75, + "completions/mean_terminated_length": 267.75, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "entropy": 0.1326597034931183, + "epoch": 1.182, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5109566450119019, + "learning_rate": 7.931666603034032e-07, + "loss": -0.0355, + "num_tokens": 5172794.0, + "reward": 6.2049055099487305, + "reward_std": 3.610879421234131, + "rewards/fitness_reward/mean": 5.613210201263428, + "rewards/fitness_reward/std": 3.1262240409851074, + "rewards/kidney_reward/mean": 0.12749797105789185, + "rewards/kidney_reward/std": 1.3113789558410645, + "rewards/length2tails_reward/mean": 0.7408642768859863, + "rewards/length2tails_reward/std": 0.3128291368484497, + "rewards/thermo_reward/mean": 0.685460090637207, + "rewards/thermo_reward/std": 1.6072545051574707, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 729.0, + "completions/max_terminated_length": 729.0, + "completions/mean_length": 286.84375, + "completions/mean_terminated_length": 286.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.16178287006914616, + "epoch": 1.184, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.907186508178711, + "learning_rate": 7.899323661468343e-07, + "loss": 0.1937, + "num_tokens": 5182005.0, + "reward": 6.523500442504883, + "reward_std": 2.292417287826538, + "rewards/fitness_reward/mean": 6.153111457824707, + "rewards/fitness_reward/std": 1.9655989408493042, + "rewards/kidney_reward/mean": 0.2648105025291443, + "rewards/kidney_reward/std": 1.5198214054107666, + "rewards/length2tails_reward/mean": 0.7844908237457275, + "rewards/length2tails_reward/std": 0.2842472791671753, + "rewards/thermo_reward/mean": 0.08372097462415695, + "rewards/thermo_reward/std": 1.9427759647369385, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.1875, + "completions/mean_terminated_length": 270.1875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13564716465771198, + "epoch": 1.186, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.083633542060852, + "learning_rate": 7.867003692562533e-07, + "loss": 0.0022, + "num_tokens": 5190683.0, + "reward": 6.414722442626953, + "reward_std": 2.82068133354187, + "rewards/fitness_reward/mean": 5.61900520324707, + "rewards/fitness_reward/std": 2.731590509414673, + "rewards/kidney_reward/mean": 0.4946686923503876, + "rewards/kidney_reward/std": 1.3424859046936035, + "rewards/length2tails_reward/mean": 0.7192057371139526, + "rewards/length2tails_reward/std": 0.3258393108844757, + "rewards/thermo_reward/mean": 0.7371622920036316, + "rewards/thermo_reward/std": 1.6627081632614136, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 273.1875, + "completions/mean_terminated_length": 273.1875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14434853196144104, + "epoch": 1.188, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6702961325645447, + "learning_rate": 7.834707049762603e-07, + "loss": -0.0079, + "num_tokens": 5199457.0, + "reward": 7.122941017150879, + "reward_std": 1.1753270626068115, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.4700015187263489, + "rewards/kidney_reward/std": 1.2932708263397217, + "rewards/length2tails_reward/mean": 0.7321652173995972, + "rewards/length2tails_reward/std": 0.3327972888946533, + "rewards/thermo_reward/mean": 0.22720155119895935, + "rewards/thermo_reward/std": 1.7189815044403076, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13265551067888737, + "epoch": 1.19, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3333011865615845, + "learning_rate": 7.802434086259468e-07, + "loss": -0.0056, + "num_tokens": 5208150.0, + "reward": 7.169600009918213, + "reward_std": 1.2789298295974731, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.28085535764694214, + "rewards/kidney_reward/std": 1.210427165031433, + "rewards/length2tails_reward/mean": 0.7825354337692261, + "rewards/length2tails_reward/std": 0.25303974747657776, + "rewards/thermo_reward/mean": 0.6904585361480713, + "rewards/thermo_reward/std": 1.4946491718292236, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12822877150028944, + "epoch": 1.192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46452242136001587, + "learning_rate": 7.770185154985084e-07, + "loss": -0.0059, + "num_tokens": 5216833.0, + "reward": 6.615362167358398, + "reward_std": 1.523648977279663, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": -0.06162615120410919, + "rewards/kidney_reward/std": 1.327879548072815, + "rewards/length2tails_reward/mean": 0.7592154741287231, + "rewards/length2tails_reward/std": 0.28252682089805603, + "rewards/thermo_reward/mean": 0.3480813503265381, + "rewards/thermo_reward/std": 1.6789804697036743, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.875, + "completions/mean_terminated_length": 269.875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12154591176658869, + "epoch": 1.194, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5403944849967957, + "learning_rate": 7.737960608608599e-07, + "loss": -0.0004, + "num_tokens": 5225501.0, + "reward": 6.404595375061035, + "reward_std": 1.3753118515014648, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.03437618166208267, + "rewards/kidney_reward/std": 1.123569130897522, + "rewards/length2tails_reward/mean": 0.7464485764503479, + "rewards/length2tails_reward/std": 0.303548127412796, + "rewards/thermo_reward/mean": -0.506275475025177, + "rewards/thermo_reward/std": 2.1634864807128906, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 268.25, + "completions/mean_terminated_length": 268.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.14206543564796448, + "epoch": 1.196, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3620198965072632, + "learning_rate": 7.705760799532485e-07, + "loss": -0.0582, + "num_tokens": 5234117.0, + "reward": 5.687882423400879, + "reward_std": 3.8474349975585938, + "rewards/fitness_reward/mean": 5.23832368850708, + "rewards/fitness_reward/std": 3.389721632003784, + "rewards/kidney_reward/mean": 0.030551522970199585, + "rewards/kidney_reward/std": 1.4576585292816162, + "rewards/length2tails_reward/mean": 0.8674226403236389, + "rewards/length2tails_reward/std": 0.17617519199848175, + "rewards/thermo_reward/mean": 0.4348553419113159, + "rewards/thermo_reward/std": 1.88848876953125, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.8125, + "completions/mean_terminated_length": 269.8125, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.15390462707728148, + "epoch": 1.198, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1230080127716064, + "learning_rate": 7.673586079888697e-07, + "loss": -0.0114, + "num_tokens": 5242783.0, + "reward": 5.792421340942383, + "reward_std": 3.0871574878692627, + "rewards/fitness_reward/mean": 5.8299055099487305, + "rewards/fitness_reward/std": 2.6357665061950684, + "rewards/kidney_reward/mean": -0.5022517442703247, + "rewards/kidney_reward/std": 1.0752254724502563, + "rewards/length2tails_reward/mean": 0.7902992367744446, + "rewards/length2tails_reward/std": 0.3036975562572479, + "rewards/thermo_reward/mean": 0.03213368356227875, + "rewards/thermo_reward/std": 2.072735071182251, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.17657050769776106, + "epoch": 1.2, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.678818702697754, + "learning_rate": 7.641436801534817e-07, + "loss": 0.0086, + "num_tokens": 5251467.0, + "reward": 5.838535308837891, + "reward_std": 3.150883674621582, + "rewards/fitness_reward/mean": 5.399847984313965, + "rewards/fitness_reward/std": 3.16312837600708, + "rewards/kidney_reward/mean": -0.03744514286518097, + "rewards/kidney_reward/std": 1.2415835857391357, + "rewards/length2tails_reward/mean": 0.7904901504516602, + "rewards/length2tails_reward/std": 0.23812848329544067, + "rewards/thermo_reward/mean": 0.5195754766464233, + "rewards/thermo_reward/std": 1.652632474899292, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 269.78125, + "completions/mean_terminated_length": 269.78125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13160788360983133, + "epoch": 1.202, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27272090315818787, + "learning_rate": 7.609313316050199e-07, + "loss": -0.0037, + "num_tokens": 5260132.0, + "reward": 7.237815856933594, + "reward_std": 0.9438698887825012, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.20958009362220764, + "rewards/kidney_reward/std": 1.1185325384140015, + "rewards/length2tails_reward/mean": 0.7409188747406006, + "rewards/length2tails_reward/std": 0.2941511571407318, + "rewards/thermo_reward/mean": 0.7129966020584106, + "rewards/thermo_reward/std": 1.4531610012054443, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 263.59375, + "completions/mean_terminated_length": 263.59375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.13048188295215368, + "epoch": 1.204, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1722381114959717, + "learning_rate": 7.577215974732138e-07, + "loss": -0.1067, + "num_tokens": 5268599.0, + "reward": 6.594254493713379, + "reward_std": 2.2381532192230225, + "rewards/fitness_reward/mean": 6.136876106262207, + "rewards/fitness_reward/std": 2.0535004138946533, + "rewards/kidney_reward/mean": 0.05201803147792816, + "rewards/kidney_reward/std": 1.0911787748336792, + "rewards/length2tails_reward/mean": 0.713343620300293, + "rewards/length2tails_reward/std": 0.3508329689502716, + "rewards/thermo_reward/mean": 0.5060676336288452, + "rewards/thermo_reward/std": 1.9209312200546265, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13748124428093433, + "epoch": 1.206, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4540555775165558, + "learning_rate": 7.545145128592008e-07, + "loss": -0.0001, + "num_tokens": 5277302.0, + "reward": 6.775562286376953, + "reward_std": 1.4684947729110718, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.23769515752792358, + "rewards/kidney_reward/std": 1.3861310482025146, + "rewards/length2tails_reward/mean": 0.8420854210853577, + "rewards/length2tails_reward/std": 0.19192719459533691, + "rewards/thermo_reward/mean": 0.3911591172218323, + "rewards/thermo_reward/std": 1.9770121574401855, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 268.21875, + "completions/mean_terminated_length": 268.21875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "entropy": 0.16660019382834435, + "epoch": 1.208, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1002583503723145, + "learning_rate": 7.513101128351453e-07, + "loss": -0.0375, + "num_tokens": 5285917.0, + "reward": 6.294078350067139, + "reward_std": 2.8583486080169678, + "rewards/fitness_reward/mean": 5.810585021972656, + "rewards/fitness_reward/std": 2.7157864570617676, + "rewards/kidney_reward/mean": -0.1376389116048813, + "rewards/kidney_reward/std": 1.2746502161026, + "rewards/length2tails_reward/mean": 0.7809045910835266, + "rewards/length2tails_reward/std": 0.2980997562408447, + "rewards/thermo_reward/mean": 0.7141739130020142, + "rewards/thermo_reward/std": 1.7252804040908813, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13461593352258205, + "epoch": 1.21, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1397013664245605, + "learning_rate": 7.48108432443852e-07, + "loss": 0.0017, + "num_tokens": 5294628.0, + "reward": 5.489590644836426, + "reward_std": 3.355715274810791, + "rewards/fitness_reward/mean": 5.299132347106934, + "rewards/fitness_reward/std": 3.4843578338623047, + "rewards/kidney_reward/mean": -0.1558576077222824, + "rewards/kidney_reward/std": 1.2660735845565796, + "rewards/length2tails_reward/mean": 0.744053065776825, + "rewards/length2tails_reward/std": 0.3339652717113495, + "rewards/thermo_reward/mean": 0.16474637389183044, + "rewards/thermo_reward/std": 1.9988871812820435, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.8125, + "completions/mean_terminated_length": 269.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1321342485025525, + "epoch": 1.212, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46841612458229065, + "learning_rate": 7.449095066983848e-07, + "loss": 0.002, + "num_tokens": 5303294.0, + "reward": 7.165895938873291, + "reward_std": 1.181601643562317, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.2960917055606842, + "rewards/kidney_reward/std": 1.3534188270568848, + "rewards/length2tails_reward/mean": 0.7463406324386597, + "rewards/length2tails_reward/std": 0.23171299695968628, + "rewards/thermo_reward/mean": 0.6859114170074463, + "rewards/thermo_reward/std": 1.6473795175552368, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14923159778118134, + "epoch": 1.214, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5477750897407532, + "learning_rate": 7.417133705816836e-07, + "loss": 0.0049, + "num_tokens": 5312035.0, + "reward": 6.401854515075684, + "reward_std": 2.9134676456451416, + "rewards/fitness_reward/mean": 5.927549362182617, + "rewards/fitness_reward/std": 2.255948066711426, + "rewards/kidney_reward/mean": -0.17237554490566254, + "rewards/kidney_reward/std": 1.3583468198776245, + "rewards/length2tails_reward/mean": 0.8513798713684082, + "rewards/length2tails_reward/std": 0.21917584538459778, + "rewards/thermo_reward/mean": 0.6952966451644897, + "rewards/thermo_reward/std": 1.7249643802642822, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 263.28125, + "completions/mean_terminated_length": 263.28125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.12810958083719015, + "epoch": 1.216, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39304810762405396, + "learning_rate": 7.385200590461802e-07, + "loss": -0.1279, + "num_tokens": 5320492.0, + "reward": 6.815650463104248, + "reward_std": 2.4583919048309326, + "rewards/fitness_reward/mean": 6.164605617523193, + "rewards/fitness_reward/std": 1.9035958051681519, + "rewards/kidney_reward/mean": 0.5095815658569336, + "rewards/kidney_reward/std": 1.2497411966323853, + "rewards/length2tails_reward/mean": 0.733183741569519, + "rewards/length2tails_reward/std": 0.3376908004283905, + "rewards/thermo_reward/mean": 0.4259158968925476, + "rewards/thermo_reward/std": 1.882502555847168, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1363762691617012, + "epoch": 1.218, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0076494216918945, + "learning_rate": 7.353296070134185e-07, + "loss": 0.0061, + "num_tokens": 5329216.0, + "reward": 6.859580993652344, + "reward_std": 1.4216362237930298, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.07885540276765823, + "rewards/kidney_reward/std": 1.212181806564331, + "rewards/length2tails_reward/mean": 0.8184213638305664, + "rewards/length2tails_reward/std": 0.27560845017433167, + "rewards/thermo_reward/mean": 0.25447773933410645, + "rewards/thermo_reward/std": 1.891952395439148, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 271.0625, + "completions/mean_terminated_length": 271.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1458069821819663, + "epoch": 1.22, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6213310956954956, + "learning_rate": 7.321420493736704e-07, + "loss": -0.0033, + "num_tokens": 5337922.0, + "reward": 6.900601387023926, + "reward_std": 2.263007640838623, + "rewards/fitness_reward/mean": 6.306676864624023, + "rewards/fitness_reward/std": 1.6100612878799438, + "rewards/kidney_reward/mean": 0.48028111457824707, + "rewards/kidney_reward/std": 1.4778878688812256, + "rewards/length2tails_reward/mean": 0.737396776676178, + "rewards/length2tails_reward/std": 0.3109074532985687, + "rewards/thermo_reward/mean": 0.3388698697090149, + "rewards/thermo_reward/std": 1.8961843252182007, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13642543088644743, + "epoch": 1.222, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.48593640327453613, + "learning_rate": 7.289574209855559e-07, + "loss": 0.0008, + "num_tokens": 5346653.0, + "reward": 7.100461006164551, + "reward_std": 1.0841041803359985, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.4207899570465088, + "rewards/kidney_reward/std": 1.2486227750778198, + "rewards/length2tails_reward/mean": 0.7790296077728271, + "rewards/length2tails_reward/std": 0.2976951599121094, + "rewards/thermo_reward/mean": 0.20802102982997894, + "rewards/thermo_reward/std": 1.9539036750793457, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 267.71875, + "completions/mean_terminated_length": 267.71875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.11996054463088512, + "epoch": 1.224, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6139099597930908, + "learning_rate": 7.257757566756603e-07, + "loss": -0.0224, + "num_tokens": 5355252.0, + "reward": 6.390378952026367, + "reward_std": 2.353851318359375, + "rewards/fitness_reward/mean": 6.005577564239502, + "rewards/fitness_reward/std": 2.260528564453125, + "rewards/kidney_reward/mean": -0.10285365581512451, + "rewards/kidney_reward/std": 1.2881437540054321, + "rewards/length2tails_reward/mean": 0.7487176656723022, + "rewards/length2tails_reward/std": 0.27497732639312744, + "rewards/thermo_reward/mean": 0.4980979263782501, + "rewards/thermo_reward/std": 1.851843237876892, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14470160473138094, + "epoch": 1.226, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5000024437904358, + "learning_rate": 7.225970912381556e-07, + "loss": 0.0036, + "num_tokens": 5363947.0, + "reward": 6.9731855392456055, + "reward_std": 2.3545727729797363, + "rewards/fitness_reward/mean": 6.284873008728027, + "rewards/fitness_reward/std": 1.7334023714065552, + "rewards/kidney_reward/mean": -0.08439573645591736, + "rewards/kidney_reward/std": 1.3576686382293701, + "rewards/length2tails_reward/mean": 0.8042858839035034, + "rewards/length2tails_reward/std": 0.27037766575813293, + "rewards/thermo_reward/mean": 1.0588774681091309, + "rewards/thermo_reward/std": 1.2294719219207764, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 279.59375, + "completions/mean_terminated_length": 279.59375, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "entropy": 0.1592145711183548, + "epoch": 1.228, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4882171154022217, + "learning_rate": 7.194214594344168e-07, + "loss": 0.0696, + "num_tokens": 5372926.0, + "reward": 6.606222629547119, + "reward_std": 2.0426268577575684, + "rewards/fitness_reward/mean": 6.238626480102539, + "rewards/fitness_reward/std": 1.9950132369995117, + "rewards/kidney_reward/mean": -0.05644669756293297, + "rewards/kidney_reward/std": 1.307448387145996, + "rewards/length2tails_reward/mean": 0.8217949867248535, + "rewards/length2tails_reward/std": 0.22389177978038788, + "rewards/thermo_reward/mean": 0.3807413876056671, + "rewards/thermo_reward/std": 1.8864628076553345, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13522779755294323, + "epoch": 1.23, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34206509590148926, + "learning_rate": 7.162488959926449e-07, + "loss": -0.0019, + "num_tokens": 5381630.0, + "reward": 6.898789882659912, + "reward_std": 1.177194595336914, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.18356528878211975, + "rewards/kidney_reward/std": 1.204271912574768, + "rewards/length2tails_reward/mean": 0.8021769523620605, + "rewards/length2tails_reward/std": 0.25768741965293884, + "rewards/thermo_reward/mean": 0.23630744218826294, + "rewards/thermo_reward/std": 1.6089015007019043, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.3125, + "completions/mean_terminated_length": 270.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12843825947493315, + "epoch": 1.232, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4975128173828125, + "learning_rate": 7.130794356074858e-07, + "loss": -0.0023, + "num_tokens": 5390312.0, + "reward": 6.194328308105469, + "reward_std": 2.2656209468841553, + "rewards/fitness_reward/mean": 6.100290298461914, + "rewards/fitness_reward/std": 1.7707273960113525, + "rewards/kidney_reward/mean": 0.049002595245838165, + "rewards/kidney_reward/std": 1.3955203294754028, + "rewards/length2tails_reward/mean": 0.7013083696365356, + "rewards/length2tails_reward/std": 0.3351364731788635, + "rewards/thermo_reward/mean": -0.21158038079738617, + "rewards/thermo_reward/std": 1.9173842668533325, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.9375, + "completions/mean_terminated_length": 269.9375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.1260090097784996, + "epoch": 1.234, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4583533704280853, + "learning_rate": 7.099131129396501e-07, + "loss": -0.0108, + "num_tokens": 5398982.0, + "reward": 6.756742477416992, + "reward_std": 1.1847895383834839, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.03264933079481125, + "rewards/kidney_reward/std": 1.1256049871444702, + "rewards/length2tails_reward/mean": 0.7118386626243591, + "rewards/length2tails_reward/std": 0.3414039611816406, + "rewards/thermo_reward/mean": 0.21359705924987793, + "rewards/thermo_reward/std": 1.7328882217407227, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.15006422251462936, + "epoch": 1.236, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.626889228820801, + "learning_rate": 7.067499626155354e-07, + "loss": 0.0207, + "num_tokens": 5407705.0, + "reward": 6.472668170928955, + "reward_std": 2.401951789855957, + "rewards/fitness_reward/mean": 6.130404949188232, + "rewards/fitness_reward/std": 2.0886292457580566, + "rewards/kidney_reward/mean": 0.19612519443035126, + "rewards/kidney_reward/std": 1.4960027933120728, + "rewards/length2tails_reward/mean": 0.7643212080001831, + "rewards/length2tails_reward/std": 0.3224364221096039, + "rewards/thermo_reward/mean": 0.10624087601900101, + "rewards/thermo_reward/std": 1.887791633605957, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 710.0, + "completions/max_terminated_length": 710.0, + "completions/mean_length": 289.5, + "completions/mean_terminated_length": 289.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.22164984233677387, + "epoch": 1.238, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.335368633270264, + "learning_rate": 7.035900192268464e-07, + "loss": 0.1689, + "num_tokens": 5417001.0, + "reward": 6.091755390167236, + "reward_std": 3.304783582687378, + "rewards/fitness_reward/mean": 5.5214762687683105, + "rewards/fitness_reward/std": 3.0962703227996826, + "rewards/kidney_reward/mean": 0.27352970838546753, + "rewards/kidney_reward/std": 1.4632915258407593, + "rewards/length2tails_reward/mean": 0.7817375659942627, + "rewards/length2tails_reward/std": 0.257295697927475, + "rewards/thermo_reward/mean": 0.47616004943847656, + "rewards/thermo_reward/std": 1.7222274541854858, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 272.53125, + "completions/mean_terminated_length": 272.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14653821662068367, + "epoch": 1.24, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3712035417556763, + "learning_rate": 7.004333173302184e-07, + "loss": 0.0205, + "num_tokens": 5425754.0, + "reward": 6.424018859863281, + "reward_std": 2.1600019931793213, + "rewards/fitness_reward/mean": 6.1443328857421875, + "rewards/fitness_reward/std": 2.013084650039673, + "rewards/kidney_reward/mean": 0.06865222007036209, + "rewards/kidney_reward/std": 1.4318342208862305, + "rewards/length2tails_reward/mean": 0.7978615164756775, + "rewards/length2tails_reward/std": 0.2523939907550812, + "rewards/thermo_reward/mean": 0.09178858250379562, + "rewards/thermo_reward/std": 2.0147271156311035, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13927504792809486, + "epoch": 1.242, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34749701619148254, + "learning_rate": 6.972798914468369e-07, + "loss": 0.0022, + "num_tokens": 5434450.0, + "reward": 6.654448986053467, + "reward_std": 1.819215178489685, + "rewards/fitness_reward/mean": 6.088306903839111, + "rewards/fitness_reward/std": 1.8312757015228271, + "rewards/kidney_reward/mean": 0.21300119161605835, + "rewards/kidney_reward/std": 1.294221043586731, + "rewards/length2tails_reward/mean": 0.8022327423095703, + "rewards/length2tails_reward/std": 0.22164824604988098, + "rewards/thermo_reward/mean": 0.51816725730896, + "rewards/thermo_reward/std": 1.6381409168243408, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14412566553801298, + "epoch": 1.244, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0621916055679321, + "learning_rate": 6.941297760620626e-07, + "loss": 0.0004, + "num_tokens": 5443189.0, + "reward": 6.669079780578613, + "reward_std": 2.1582190990448, + "rewards/fitness_reward/mean": 6.000514984130859, + "rewards/fitness_reward/std": 1.8205080032348633, + "rewards/kidney_reward/mean": 0.5217428803443909, + "rewards/kidney_reward/std": 1.419599175453186, + "rewards/length2tails_reward/mean": 0.825745701789856, + "rewards/length2tails_reward/std": 0.30729860067367554, + "rewards/thermo_reward/mean": 0.4025149345397949, + "rewards/thermo_reward/std": 2.0498969554901123, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.71875, + "completions/mean_terminated_length": 269.71875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12816251255571842, + "epoch": 1.246, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3722991943359375, + "learning_rate": 6.909830056250526e-07, + "loss": -0.0031, + "num_tokens": 5451852.0, + "reward": 6.490742206573486, + "reward_std": 1.8934377431869507, + "rewards/fitness_reward/mean": 6.179342269897461, + "rewards/fitness_reward/std": 1.1073734760284424, + "rewards/kidney_reward/mean": 0.1469985544681549, + "rewards/kidney_reward/std": 1.4743796586990356, + "rewards/length2tails_reward/mean": 0.7471676468849182, + "rewards/length2tails_reward/std": 0.3187951445579529, + "rewards/thermo_reward/mean": 0.10221816599369049, + "rewards/thermo_reward/std": 2.0202748775482178, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 263.5, + "completions/mean_terminated_length": 263.5, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.1494145756587386, + "epoch": 1.248, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.507699489593506, + "learning_rate": 6.87839614548384e-07, + "loss": -0.101, + "num_tokens": 5460316.0, + "reward": 6.4404096603393555, + "reward_std": 2.601477861404419, + "rewards/fitness_reward/mean": 5.941213607788086, + "rewards/fitness_reward/std": 2.57167911529541, + "rewards/kidney_reward/mean": 0.15139764547348022, + "rewards/kidney_reward/std": 1.2842119932174683, + "rewards/length2tails_reward/mean": 0.8168920874595642, + "rewards/length2tails_reward/std": 0.19531317055225372, + "rewards/thermo_reward/mean": 0.4385473132133484, + "rewards/thermo_reward/std": 1.7423099279403687, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.0, + "completions/mean_terminated_length": 269.0, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "entropy": 0.13896430749446154, + "epoch": 1.25, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.873785138130188, + "learning_rate": 6.846996372076785e-07, + "loss": -0.0144, + "num_tokens": 5468956.0, + "reward": 6.3865509033203125, + "reward_std": 2.339273452758789, + "rewards/fitness_reward/mean": 6.012503623962402, + "rewards/fitness_reward/std": 2.2239937782287598, + "rewards/kidney_reward/mean": -0.36523696780204773, + "rewards/kidney_reward/std": 1.246549367904663, + "rewards/length2tails_reward/mean": 0.7499978542327881, + "rewards/length2tails_reward/std": 0.29789772629737854, + "rewards/thermo_reward/mean": 0.7383337020874023, + "rewards/thermo_reward/std": 1.651054859161377, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 277.59375, + "completions/mean_terminated_length": 277.59375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.1516404114663601, + "epoch": 1.252, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9114460945129395, + "learning_rate": 6.815631079412248e-07, + "loss": 0.078, + "num_tokens": 5477871.0, + "reward": 6.335674285888672, + "reward_std": 3.5130021572113037, + "rewards/fitness_reward/mean": 5.692418098449707, + "rewards/fitness_reward/std": 2.7999401092529297, + "rewards/kidney_reward/mean": 0.2563542425632477, + "rewards/kidney_reward/std": 1.1807318925857544, + "rewards/length2tails_reward/mean": 0.8174996376037598, + "rewards/length2tails_reward/std": 0.22419343888759613, + "rewards/thermo_reward/mean": 0.6214093565940857, + "rewards/thermo_reward/std": 1.8662943840026855, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 276.5625, + "completions/mean_terminated_length": 276.5625, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "entropy": 0.17259023897349834, + "epoch": 1.254, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.285747528076172, + "learning_rate": 6.784300610496047e-07, + "loss": 0.0554, + "num_tokens": 5486753.0, + "reward": 5.662915229797363, + "reward_std": 3.6996238231658936, + "rewards/fitness_reward/mean": 5.427659034729004, + "rewards/fitness_reward/std": 3.3753035068511963, + "rewards/kidney_reward/mean": -0.03573489561676979, + "rewards/kidney_reward/std": 1.3187494277954102, + "rewards/length2tails_reward/mean": 0.8155403733253479, + "rewards/length2tails_reward/std": 0.2902891933917999, + "rewards/thermo_reward/mean": 0.09847551584243774, + "rewards/thermo_reward/std": 2.0139334201812744, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 265.125, + "completions/mean_terminated_length": 265.125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.13919494859874249, + "epoch": 1.256, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5877822637557983, + "learning_rate": 6.753005307953165e-07, + "loss": -0.0901, + "num_tokens": 5495269.0, + "reward": 6.150241851806641, + "reward_std": 2.644890546798706, + "rewards/fitness_reward/mean": 6.110477447509766, + "rewards/fitness_reward/std": 2.1971089839935303, + "rewards/kidney_reward/mean": -0.05800964683294296, + "rewards/kidney_reward/std": 1.179059386253357, + "rewards/length2tails_reward/mean": 0.7819545269012451, + "rewards/length2tails_reward/std": 0.2708703875541687, + "rewards/thermo_reward/mean": -0.2534390985965729, + "rewards/thermo_reward/std": 2.063530206680298, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 279.0625, + "completions/mean_terminated_length": 279.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.16929593496024609, + "epoch": 1.258, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.441758155822754, + "learning_rate": 6.721745514024022e-07, + "loss": 0.1271, + "num_tokens": 5504231.0, + "reward": 6.402868747711182, + "reward_std": 2.77589750289917, + "rewards/fitness_reward/mean": 5.82520866394043, + "rewards/fitness_reward/std": 2.6672444343566895, + "rewards/kidney_reward/mean": -0.011563509702682495, + "rewards/kidney_reward/std": 1.5373965501785278, + "rewards/length2tails_reward/mean": 0.7859324216842651, + "rewards/length2tails_reward/std": 0.2572874426841736, + "rewards/thermo_reward/mean": 0.7739176750183105, + "rewards/thermo_reward/std": 1.6143525838851929, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13366108760237694, + "epoch": 1.26, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6790350079536438, + "learning_rate": 6.690521570560716e-07, + "loss": -0.003, + "num_tokens": 5512961.0, + "reward": 6.813434600830078, + "reward_std": 1.1079778671264648, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.07390785217285156, + "rewards/kidney_reward/std": 1.3826494216918945, + "rewards/length2tails_reward/mean": 0.8083479404449463, + "rewards/length2tails_reward/std": 0.26170846819877625, + "rewards/thermo_reward/mean": 0.11400671303272247, + "rewards/thermo_reward/std": 1.8987302780151367, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.15625, + "completions/mean_terminated_length": 269.15625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.1141181867569685, + "epoch": 1.262, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33284276723861694, + "learning_rate": 6.65933381902329e-07, + "loss": -0.005, + "num_tokens": 5521606.0, + "reward": 6.679983139038086, + "reward_std": 1.5212734937667847, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.07944856584072113, + "rewards/kidney_reward/std": 1.4168193340301514, + "rewards/length2tails_reward/mean": 0.7093742489814758, + "rewards/length2tails_reward/std": 0.3682093322277069, + "rewards/thermo_reward/mean": 0.1551906019449234, + "rewards/thermo_reward/std": 1.8044683933258057, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1311505250632763, + "epoch": 1.264, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6112185716629028, + "learning_rate": 6.628182600475997e-07, + "loss": 0.0089, + "num_tokens": 5530286.0, + "reward": 5.7865309715271, + "reward_std": 2.4798007011413574, + "rewards/fitness_reward/mean": 5.610774993896484, + "rewards/fitness_reward/std": 2.745197057723999, + "rewards/kidney_reward/mean": -0.23186977207660675, + "rewards/kidney_reward/std": 1.2328815460205078, + "rewards/length2tails_reward/mean": 0.7195959091186523, + "rewards/length2tails_reward/std": 0.31892168521881104, + "rewards/thermo_reward/mean": 0.22358444333076477, + "rewards/thermo_reward/std": 1.860578179359436, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14035332296043634, + "epoch": 1.266, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9554190635681152, + "learning_rate": 6.597068255583569e-07, + "loss": -0.0029, + "num_tokens": 5538999.0, + "reward": 6.898894786834717, + "reward_std": 1.9465094804763794, + "rewards/fitness_reward/mean": 6.27459716796875, + "rewards/fitness_reward/std": 1.7915318012237549, + "rewards/kidney_reward/mean": 0.18761423230171204, + "rewards/kidney_reward/std": 1.3303934335708618, + "rewards/length2tails_reward/mean": 0.8007993698120117, + "rewards/length2tails_reward/std": 0.2781968414783478, + "rewards/thermo_reward/mean": 0.6605822443962097, + "rewards/thermo_reward/std": 1.6764042377471924, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14992988761514425, + "epoch": 1.268, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5707008838653564, + "learning_rate": 6.565991124607506e-07, + "loss": 0.0031, + "num_tokens": 5547715.0, + "reward": 7.003487586975098, + "reward_std": 0.981229841709137, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.08413944393396378, + "rewards/kidney_reward/std": 1.4582713842391968, + "rewards/length2tails_reward/mean": 0.7759417295455933, + "rewards/length2tails_reward/std": 0.29535973072052, + "rewards/thermo_reward/mean": 0.5205470323562622, + "rewards/thermo_reward/std": 1.633626103401184, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 284.5625, + "completions/mean_terminated_length": 284.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.16557743027806282, + "epoch": 1.27, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.631587028503418, + "learning_rate": 6.534951547402321e-07, + "loss": 0.1614, + "num_tokens": 5556853.0, + "reward": 6.230566024780273, + "reward_std": 2.9813506603240967, + "rewards/fitness_reward/mean": 5.728328704833984, + "rewards/fitness_reward/std": 3.020080804824829, + "rewards/kidney_reward/mean": -0.07348226010799408, + "rewards/kidney_reward/std": 1.5445884466171265, + "rewards/length2tails_reward/mean": 0.8205157518386841, + "rewards/length2tails_reward/std": 0.2521876096725464, + "rewards/thermo_reward/mean": 0.667699933052063, + "rewards/thermo_reward/std": 1.7616760730743408, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 268.59375, + "completions/mean_terminated_length": 268.59375, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.14927682746201754, + "epoch": 1.272, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9488927125930786, + "learning_rate": 6.503949863411865e-07, + "loss": 0.0028, + "num_tokens": 5565480.0, + "reward": 6.727592468261719, + "reward_std": 1.2678565979003906, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.07832270860671997, + "rewards/kidney_reward/std": 1.2198609113693237, + "rewards/length2tails_reward/mean": 0.7817985415458679, + "rewards/length2tails_reward/std": 0.277799516916275, + "rewards/thermo_reward/mean": 0.37196844816207886, + "rewards/thermo_reward/std": 1.883579134941101, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.18589443992823362, + "epoch": 1.274, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6713123321533203, + "learning_rate": 6.472986411665588e-07, + "loss": -0.0266, + "num_tokens": 5574245.0, + "reward": 6.296464920043945, + "reward_std": 3.5569512844085693, + "rewards/fitness_reward/mean": 5.571892738342285, + "rewards/fitness_reward/std": 3.2508528232574463, + "rewards/kidney_reward/mean": 0.1352205127477646, + "rewards/kidney_reward/std": 1.409559726715088, + "rewards/length2tails_reward/mean": 0.820399284362793, + "rewards/length2tails_reward/std": 0.24890314042568207, + "rewards/thermo_reward/mean": 0.9037243127822876, + "rewards/thermo_reward/std": 1.5839658975601196, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.14778904803097248, + "epoch": 1.276, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3663407266139984, + "learning_rate": 6.442061530774834e-07, + "loss": 0.0056, + "num_tokens": 5582966.0, + "reward": 7.596940994262695, + "reward_std": 1.0984342098236084, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.6841093301773071, + "rewards/kidney_reward/std": 1.4593254327774048, + "rewards/length2tails_reward/mean": 0.8278961181640625, + "rewards/length2tails_reward/std": 0.2020004242658615, + "rewards/thermo_reward/mean": 0.9132281541824341, + "rewards/thermo_reward/std": 1.460633635520935, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13881055917590857, + "epoch": 1.278, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.38059312105178833, + "learning_rate": 6.411175558929152e-07, + "loss": -0.0028, + "num_tokens": 5591652.0, + "reward": 7.203988075256348, + "reward_std": 1.0541120767593384, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.5507540106773376, + "rewards/kidney_reward/std": 1.2353981733322144, + "rewards/length2tails_reward/mean": 0.7279623746871948, + "rewards/length2tails_reward/std": 0.2978818714618683, + "rewards/thermo_reward/mean": 0.31064373254776, + "rewards/thermo_reward/std": 1.6055090427398682, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.15258120652288198, + "epoch": 1.28, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.577871561050415, + "learning_rate": 6.38032883389257e-07, + "loss": -0.0087, + "num_tokens": 5600328.0, + "reward": 7.0710906982421875, + "reward_std": 1.4295557737350464, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.3058558404445648, + "rewards/kidney_reward/std": 1.3590362071990967, + "rewards/length2tails_reward/mean": 0.7754330635070801, + "rewards/length2tails_reward/std": 0.27673545479774475, + "rewards/thermo_reward/mean": 0.6779689788818359, + "rewards/thermo_reward/std": 1.530184030532837, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 268.40625, + "completions/mean_terminated_length": 268.40625, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 0.15540193300694227, + "epoch": 1.282, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9766852855682373, + "learning_rate": 6.349521692999944e-07, + "loss": -0.0202, + "num_tokens": 5608949.0, + "reward": 5.674541473388672, + "reward_std": 2.916369676589966, + "rewards/fitness_reward/mean": 5.654493808746338, + "rewards/fitness_reward/std": 2.9443953037261963, + "rewards/kidney_reward/mean": -0.05414886772632599, + "rewards/kidney_reward/std": 1.4154093265533447, + "rewards/length2tails_reward/mean": 0.766571044921875, + "rewards/length2tails_reward/std": 0.3015124201774597, + "rewards/thermo_reward/mean": -0.28904175758361816, + "rewards/thermo_reward/std": 1.7579821348190308, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.13397748861461878, + "epoch": 1.284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6520522236824036, + "learning_rate": 6.31875447315322e-07, + "loss": -0.0021, + "num_tokens": 5617653.0, + "reward": 6.948795318603516, + "reward_std": 1.4336069822311401, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.46346867084503174, + "rewards/kidney_reward/std": 1.2700620889663696, + "rewards/length2tails_reward/mean": 0.8374483585357666, + "rewards/length2tails_reward/std": 0.21488112211227417, + "rewards/thermo_reward/mean": 0.038780003786087036, + "rewards/thermo_reward/std": 1.8404791355133057, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1322827786207199, + "epoch": 1.286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6410660743713379, + "learning_rate": 6.288027510817791e-07, + "loss": -0.0036, + "num_tokens": 5626347.0, + "reward": 7.278005599975586, + "reward_std": 1.2659250497817993, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.5362731218338013, + "rewards/kidney_reward/std": 1.3232353925704956, + "rewards/length2tails_reward/mean": 0.7245739102363586, + "rewards/length2tails_reward/std": 0.30938565731048584, + "rewards/thermo_reward/mean": 0.680832028388977, + "rewards/thermo_reward/std": 1.6555609703063965, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 277.34375, + "completions/mean_terminated_length": 277.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14233141485601664, + "epoch": 1.288, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.912531852722168, + "learning_rate": 6.257341142018797e-07, + "loss": 0.08, + "num_tokens": 5635254.0, + "reward": 6.13580846786499, + "reward_std": 2.8155057430267334, + "rewards/fitness_reward/mean": 5.834390640258789, + "rewards/fitness_reward/std": 2.634599208831787, + "rewards/kidney_reward/mean": 0.1610783189535141, + "rewards/kidney_reward/std": 1.4255234003067017, + "rewards/length2tails_reward/mean": 0.8666987419128418, + "rewards/length2tails_reward/std": 0.21826976537704468, + "rewards/thermo_reward/mean": 0.00840771198272705, + "rewards/thermo_reward/std": 1.7903788089752197, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 265.15625, + "completions/mean_terminated_length": 265.15625, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "entropy": 0.16019840352237225, + "epoch": 1.29, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1110379695892334, + "learning_rate": 6.226695702337442e-07, + "loss": -0.0825, + "num_tokens": 5643771.0, + "reward": 5.976951599121094, + "reward_std": 3.158778429031372, + "rewards/fitness_reward/mean": 5.544244766235352, + "rewards/fitness_reward/std": 2.6484479904174805, + "rewards/kidney_reward/mean": 0.01712121069431305, + "rewards/kidney_reward/std": 1.6551475524902344, + "rewards/length2tails_reward/mean": 0.8136659860610962, + "rewards/length2tails_reward/std": 0.26795053482055664, + "rewards/thermo_reward/mean": 0.4414595365524292, + "rewards/thermo_reward/std": 1.9067299365997314, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14127318561077118, + "epoch": 1.292, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.40734344720840454, + "learning_rate": 6.196091526907355e-07, + "loss": -0.0012, + "num_tokens": 5652482.0, + "reward": 6.749227523803711, + "reward_std": 1.0654892921447754, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.1642061322927475, + "rewards/kidney_reward/std": 1.2673845291137695, + "rewards/length2tails_reward/mean": 0.7619192600250244, + "rewards/length2tails_reward/std": 0.33276113867759705, + "rewards/thermo_reward/mean": 0.3050832152366638, + "rewards/thermo_reward/std": 1.8505090475082397, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14173165522515774, + "epoch": 1.294, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4195237159729004, + "learning_rate": 6.165528950410884e-07, + "loss": 0.0134, + "num_tokens": 5661192.0, + "reward": 6.565463066101074, + "reward_std": 2.6815237998962402, + "rewards/fitness_reward/mean": 6.213047027587891, + "rewards/fitness_reward/std": 2.139711618423462, + "rewards/kidney_reward/mean": 0.2550015449523926, + "rewards/kidney_reward/std": 1.4037851095199585, + "rewards/length2tails_reward/mean": 0.8078710436820984, + "rewards/length2tails_reward/std": 0.22603370249271393, + "rewards/thermo_reward/mean": 0.045895226299762726, + "rewards/thermo_reward/std": 1.9364056587219238, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 264.75, + "completions/mean_terminated_length": 264.75, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.1734876073896885, + "epoch": 1.296, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4625022411346436, + "learning_rate": 6.13500830707548e-07, + "loss": -0.0255, + "num_tokens": 5669696.0, + "reward": 5.867452144622803, + "reward_std": 3.0157735347747803, + "rewards/fitness_reward/mean": 5.785667419433594, + "rewards/fitness_reward/std": 2.8030941486358643, + "rewards/kidney_reward/mean": -0.25507643818855286, + "rewards/kidney_reward/std": 1.2442216873168945, + "rewards/length2tails_reward/mean": 0.8725078701972961, + "rewards/length2tails_reward/std": 0.21126972138881683, + "rewards/thermo_reward/mean": -0.017607375979423523, + "rewards/thermo_reward/std": 1.8447613716125488, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 283.5, + "completions/mean_terminated_length": 283.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.15415276028215885, + "epoch": 1.298, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.34400749206543, + "learning_rate": 6.10452993067e-07, + "loss": 0.1574, + "num_tokens": 5678800.0, + "reward": 5.908939361572266, + "reward_std": 2.969799518585205, + "rewards/fitness_reward/mean": 5.738583564758301, + "rewards/fitness_reward/std": 2.9808731079101562, + "rewards/kidney_reward/mean": 0.2620808780193329, + "rewards/kidney_reward/std": 1.391344666481018, + "rewards/length2tails_reward/mean": 0.731244683265686, + "rewards/length2tails_reward/std": 0.3396046757698059, + "rewards/thermo_reward/mean": -0.28699085116386414, + "rewards/thermo_reward/std": 1.8723300695419312, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 277.5625, + "completions/mean_terminated_length": 277.5625, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.1882968582212925, + "epoch": 1.3, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4039666652679443, + "learning_rate": 6.074094154501086e-07, + "loss": 0.0814, + "num_tokens": 5687714.0, + "reward": 6.29148530960083, + "reward_std": 2.6734001636505127, + "rewards/fitness_reward/mean": 5.834356784820557, + "rewards/fitness_reward/std": 2.6169049739837646, + "rewards/kidney_reward/mean": 0.34884753823280334, + "rewards/kidney_reward/std": 1.3807021379470825, + "rewards/length2tails_reward/mean": 0.8212233781814575, + "rewards/length2tails_reward/std": 0.2387375682592392, + "rewards/thermo_reward/mean": 0.1547977775335312, + "rewards/thermo_reward/std": 1.9377219676971436, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1444950643926859, + "epoch": 1.302, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36122214794158936, + "learning_rate": 6.04370131140952e-07, + "loss": 0.0026, + "num_tokens": 5696443.0, + "reward": 6.898990631103516, + "reward_std": 1.3468652963638306, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.1589527130126953, + "rewards/kidney_reward/std": 1.5192503929138184, + "rewards/length2tails_reward/mean": 0.7818030714988708, + "rewards/length2tails_reward/std": 0.2831023931503296, + "rewards/thermo_reward/mean": 0.06553035974502563, + "rewards/thermo_reward/std": 2.091651201248169, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 273.71875, + "completions/mean_terminated_length": 273.71875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.20243182964622974, + "epoch": 1.304, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.959991693496704, + "learning_rate": 6.01335173376655e-07, + "loss": 0.0239, + "num_tokens": 5705234.0, + "reward": 5.641485214233398, + "reward_std": 3.5968315601348877, + "rewards/fitness_reward/mean": 5.2309136390686035, + "rewards/fitness_reward/std": 3.676581621170044, + "rewards/kidney_reward/mean": 0.17761388421058655, + "rewards/kidney_reward/std": 1.3631139993667603, + "rewards/length2tails_reward/mean": 0.8585853576660156, + "rewards/length2tails_reward/std": 0.16891047358512878, + "rewards/thermo_reward/mean": 0.21423634886741638, + "rewards/thermo_reward/std": 2.134769916534424, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1514090709388256, + "epoch": 1.306, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1887006759643555, + "learning_rate": 5.983045753470307e-07, + "loss": 0.0236, + "num_tokens": 5714006.0, + "reward": 5.898200988769531, + "reward_std": 3.060225009918213, + "rewards/fitness_reward/mean": 5.744778633117676, + "rewards/fitness_reward/std": 2.613828659057617, + "rewards/kidney_reward/mean": -0.10127445310354233, + "rewards/kidney_reward/std": 1.2469600439071655, + "rewards/length2tails_reward/mean": 0.7454930543899536, + "rewards/length2tails_reward/std": 0.34676027297973633, + "rewards/thermo_reward/mean": 0.03537212312221527, + "rewards/thermo_reward/std": 1.8324885368347168, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 281.1875, + "completions/mean_terminated_length": 281.1875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1525631584227085, + "epoch": 1.308, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7953178882598877, + "learning_rate": 5.952783701942129e-07, + "loss": 0.1617, + "num_tokens": 5723036.0, + "reward": 6.598310470581055, + "reward_std": 2.4306259155273438, + "rewards/fitness_reward/mean": 6.145068168640137, + "rewards/fitness_reward/std": 2.0091021060943604, + "rewards/kidney_reward/mean": 0.002159029245376587, + "rewards/kidney_reward/std": 1.3303543329238892, + "rewards/length2tails_reward/mean": 0.8235996961593628, + "rewards/length2tails_reward/std": 0.25856974720954895, + "rewards/thermo_reward/mean": 0.4925250709056854, + "rewards/thermo_reward/std": 1.778227686882019, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.90625, + "completions/mean_terminated_length": 269.90625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.12768492568284273, + "epoch": 1.31, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39604681730270386, + "learning_rate": 5.922565910122966e-07, + "loss": -0.0011, + "num_tokens": 5731705.0, + "reward": 6.629502296447754, + "reward_std": 2.039947748184204, + "rewards/fitness_reward/mean": 6.155602931976318, + "rewards/fitness_reward/std": 1.952143669128418, + "rewards/kidney_reward/mean": 0.21815750002861023, + "rewards/kidney_reward/std": 1.2914760112762451, + "rewards/length2tails_reward/mean": 0.7410340309143066, + "rewards/length2tails_reward/std": 0.30291491746902466, + "rewards/thermo_reward/mean": 0.359124094247818, + "rewards/thermo_reward/std": 1.5980550050735474, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12119963858276606, + "epoch": 1.312, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.407927930355072, + "learning_rate": 5.892392708469747e-07, + "loss": -0.0039, + "num_tokens": 5740385.0, + "reward": 6.151309967041016, + "reward_std": 1.8759739398956299, + "rewards/fitness_reward/mean": 6.188357830047607, + "rewards/fitness_reward/std": 1.7761485576629639, + "rewards/kidney_reward/mean": -0.3281180262565613, + "rewards/kidney_reward/std": 1.2943578958511353, + "rewards/length2tails_reward/mean": 0.7649803757667542, + "rewards/length2tails_reward/std": 0.30511534214019775, + "rewards/thermo_reward/mean": -0.12846702337265015, + "rewards/thermo_reward/std": 1.906110167503357, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 273.53125, + "completions/mean_terminated_length": 273.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.15560829732567072, + "epoch": 1.314, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1416964530944824, + "learning_rate": 5.862264426951768e-07, + "loss": 0.0558, + "num_tokens": 5749170.0, + "reward": 6.898611068725586, + "reward_std": 1.9748318195343018, + "rewards/fitness_reward/mean": 6.132171630859375, + "rewards/fitness_reward/std": 2.0790321826934814, + "rewards/kidney_reward/mean": 0.537623405456543, + "rewards/kidney_reward/std": 1.3657079935073853, + "rewards/length2tails_reward/mean": 0.7724308967590332, + "rewards/length2tails_reward/std": 0.2801792621612549, + "rewards/thermo_reward/mean": 0.609039843082428, + "rewards/thermo_reward/std": 1.5434783697128296, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1479223845526576, + "epoch": 1.316, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6348621249198914, + "learning_rate": 5.832181395047098e-07, + "loss": -0.0026, + "num_tokens": 5757881.0, + "reward": 7.095817565917969, + "reward_std": 2.352982521057129, + "rewards/fitness_reward/mean": 6.189311981201172, + "rewards/fitness_reward/std": 1.771049976348877, + "rewards/kidney_reward/mean": 0.508020281791687, + "rewards/kidney_reward/std": 1.354441523551941, + "rewards/length2tails_reward/mean": 0.7360724210739136, + "rewards/length2tails_reward/std": 0.28219249844551086, + "rewards/thermo_reward/mean": 0.9369553923606873, + "rewards/thermo_reward/std": 1.2352522611618042, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.1369036491960287, + "epoch": 1.318, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5892386436462402, + "learning_rate": 5.802143941738944e-07, + "loss": 0.0478, + "num_tokens": 5766623.0, + "reward": 6.1280012130737305, + "reward_std": 2.9481887817382812, + "rewards/fitness_reward/mean": 5.775424480438232, + "rewards/fitness_reward/std": 2.838524580001831, + "rewards/kidney_reward/mean": 0.03221467137336731, + "rewards/kidney_reward/std": 1.2222998142242432, + "rewards/length2tails_reward/mean": 0.667849063873291, + "rewards/length2tails_reward/std": 0.3331885039806366, + "rewards/thermo_reward/mean": 0.33901447057724, + "rewards/thermo_reward/std": 1.6431798934936523, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.59375, + "completions/mean_terminated_length": 270.59375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14061100222170353, + "epoch": 1.32, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8035763502120972, + "learning_rate": 5.772152395512087e-07, + "loss": 0.0022, + "num_tokens": 5775314.0, + "reward": 7.163296699523926, + "reward_std": 1.20790696144104, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.3790941536426544, + "rewards/kidney_reward/std": 1.34048593044281, + "rewards/length2tails_reward/mean": 0.7604628801345825, + "rewards/length2tails_reward/std": 0.3018241226673126, + "rewards/thermo_reward/mean": 0.3846713900566101, + "rewards/thermo_reward/std": 1.7411631345748901, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.59375, + "completions/mean_terminated_length": 269.59375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1336772684007883, + "epoch": 1.322, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6508118510246277, + "learning_rate": 5.742207084349273e-07, + "loss": -0.0055, + "num_tokens": 5783973.0, + "reward": 7.059962272644043, + "reward_std": 1.0898627042770386, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.17260214686393738, + "rewards/kidney_reward/std": 1.3055181503295898, + "rewards/length2tails_reward/mean": 0.6751230955123901, + "rewards/length2tails_reward/std": 0.3568343222141266, + "rewards/thermo_reward/mean": 0.6331421136856079, + "rewards/thermo_reward/std": 1.3784630298614502, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.8125, + "completions/mean_terminated_length": 269.8125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14024581480771303, + "epoch": 1.324, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2925182282924652, + "learning_rate": 5.712308335727628e-07, + "loss": -0.0006, + "num_tokens": 5792639.0, + "reward": 7.133518218994141, + "reward_std": 1.1250042915344238, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.004523918032646179, + "rewards/kidney_reward/std": 1.469503402709961, + "rewards/length2tails_reward/mean": 0.7807639837265015, + "rewards/length2tails_reward/std": 0.2387048751115799, + "rewards/thermo_reward/mean": 0.6895338296890259, + "rewards/thermo_reward/std": 1.6021491289138794, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1417333409190178, + "epoch": 1.326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43599823117256165, + "learning_rate": 5.682456476615072e-07, + "loss": 0.002, + "num_tokens": 5801390.0, + "reward": 6.68442440032959, + "reward_std": 1.4163669347763062, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.18505345284938812, + "rewards/kidney_reward/std": 1.4562594890594482, + "rewards/length2tails_reward/mean": 0.7570204734802246, + "rewards/length2tails_reward/std": 0.29161199927330017, + "rewards/thermo_reward/mean": 0.1987738311290741, + "rewards/thermo_reward/std": 1.9739598035812378, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 274.96875, + "completions/mean_terminated_length": 274.96875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.15057275351136923, + "epoch": 1.328, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8899993896484375, + "learning_rate": 5.652651833466755e-07, + "loss": 0.079, + "num_tokens": 5810221.0, + "reward": 6.272120475769043, + "reward_std": 2.248950481414795, + "rewards/fitness_reward/mean": 6.2373175621032715, + "rewards/fitness_reward/std": 2.0024170875549316, + "rewards/kidney_reward/mean": -0.05418562889099121, + "rewards/kidney_reward/std": 1.2909743785858154, + "rewards/length2tails_reward/mean": 0.7484981417655945, + "rewards/length2tails_reward/std": 0.33117684721946716, + "rewards/thermo_reward/mean": -0.2504574656486511, + "rewards/thermo_reward/std": 2.0557281970977783, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13654115051031113, + "epoch": 1.33, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2763078510761261, + "learning_rate": 5.622894732221482e-07, + "loss": 0.0003, + "num_tokens": 5818924.0, + "reward": 7.003393650054932, + "reward_std": 1.251980185508728, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.4440957307815552, + "rewards/kidney_reward/std": 1.3153462409973145, + "rewards/length2tails_reward/mean": 0.7653374671936035, + "rewards/length2tails_reward/std": 0.3081322908401489, + "rewards/thermo_reward/mean": 0.2034047245979309, + "rewards/thermo_reward/std": 1.8775660991668701, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15669972635805607, + "epoch": 1.332, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7499356269836426, + "learning_rate": 5.593185498298141e-07, + "loss": 0.0205, + "num_tokens": 5827664.0, + "reward": 6.445470809936523, + "reward_std": 2.2563424110412598, + "rewards/fitness_reward/mean": 6.136785984039307, + "rewards/fitness_reward/std": 2.053987503051758, + "rewards/kidney_reward/mean": -0.17048430442810059, + "rewards/kidney_reward/std": 1.280286431312561, + "rewards/length2tails_reward/mean": 0.8183590173721313, + "rewards/length2tails_reward/std": 0.22113774716854095, + "rewards/thermo_reward/mean": 0.37867385149002075, + "rewards/thermo_reward/std": 1.8788390159606934, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14466068148612976, + "epoch": 1.334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6656396985054016, + "learning_rate": 5.563524456592163e-07, + "loss": 0.006, + "num_tokens": 5836375.0, + "reward": 6.719061374664307, + "reward_std": 1.7237056493759155, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.2651819586753845, + "rewards/kidney_reward/std": 1.5847587585449219, + "rewards/length2tails_reward/mean": 0.7623664140701294, + "rewards/length2tails_reward/std": 0.2931446433067322, + "rewards/thermo_reward/mean": 0.021117717027664185, + "rewards/thermo_reward/std": 1.997339129447937, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14278834592550993, + "epoch": 1.336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45834881067276, + "learning_rate": 5.533911931471935e-07, + "loss": 0.0022, + "num_tokens": 5845084.0, + "reward": 7.0002899169921875, + "reward_std": 1.327841877937317, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.024180158972740173, + "rewards/kidney_reward/std": 1.4476650953292847, + "rewards/length2tails_reward/mean": 0.8485123515129089, + "rewards/length2tails_reward/std": 0.17107199132442474, + "rewards/thermo_reward/mean": 0.3695473074913025, + "rewards/thermo_reward/std": 1.9741339683532715, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.09375, + "completions/mean_terminated_length": 271.09375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13745083380490541, + "epoch": 1.338, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5631520748138428, + "learning_rate": 5.504348246775299e-07, + "loss": 0.0027, + "num_tokens": 5853791.0, + "reward": 7.046502113342285, + "reward_std": 1.2177540063858032, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.2630627751350403, + "rewards/kidney_reward/std": 1.46834135055542, + "rewards/length2tails_reward/mean": 0.8024006485939026, + "rewards/length2tails_reward/std": 0.26976141333580017, + "rewards/thermo_reward/mean": 0.4521217346191406, + "rewards/thermo_reward/std": 1.9688328504562378, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 284.875, + "completions/mean_terminated_length": 284.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.18276775442063808, + "epoch": 1.34, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.479960918426514, + "learning_rate": 5.474833725805962e-07, + "loss": 0.2367, + "num_tokens": 5862939.0, + "reward": 6.991754531860352, + "reward_std": 2.0754315853118896, + "rewards/fitness_reward/mean": 6.240487575531006, + "rewards/fitness_reward/std": 1.984483242034912, + "rewards/kidney_reward/mean": 0.42164015769958496, + "rewards/kidney_reward/std": 1.3554811477661133, + "rewards/length2tails_reward/mean": 0.8755493760108948, + "rewards/length2tails_reward/std": 0.16724464297294617, + "rewards/thermo_reward/mean": 0.6431186199188232, + "rewards/thermo_reward/std": 1.570862054824829, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1310231052339077, + "epoch": 1.342, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.409902960062027, + "learning_rate": 5.445368691330006e-07, + "loss": -0.0001, + "num_tokens": 5871625.0, + "reward": 6.525934219360352, + "reward_std": 1.633767008781433, + "rewards/fitness_reward/mean": 6.179342269897461, + "rewards/fitness_reward/std": 1.1073734760284424, + "rewards/kidney_reward/mean": -0.21257570385932922, + "rewards/kidney_reward/std": 1.396531581878662, + "rewards/length2tails_reward/mean": 0.7744640111923218, + "rewards/length2tails_reward/std": 0.27144792675971985, + "rewards/thermo_reward/mean": 0.5185284614562988, + "rewards/thermo_reward/std": 1.6811809539794922, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 274.75, + "completions/mean_terminated_length": 274.75, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.179467367939651, + "epoch": 1.3439999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.118148326873779, + "learning_rate": 5.415953465572332e-07, + "loss": 0.0739, + "num_tokens": 5880449.0, + "reward": 6.553123474121094, + "reward_std": 3.325801134109497, + "rewards/fitness_reward/mean": 5.857428073883057, + "rewards/fitness_reward/std": 2.8894855976104736, + "rewards/kidney_reward/mean": 0.7388929128646851, + "rewards/kidney_reward/std": 1.2432023286819458, + "rewards/length2tails_reward/mean": 0.6661558747291565, + "rewards/length2tails_reward/std": 0.3547395169734955, + "rewards/thermo_reward/mean": 0.31941914558410645, + "rewards/thermo_reward/std": 1.9431196451187134, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.53125, + "completions/mean_terminated_length": 270.53125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14087828435003757, + "epoch": 1.346, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6226058602333069, + "learning_rate": 5.386588370213123e-07, + "loss": 0.0053, + "num_tokens": 5889138.0, + "reward": 6.679670333862305, + "reward_std": 1.9523361921310425, + "rewards/fitness_reward/mean": 6.203366756439209, + "rewards/fitness_reward/std": 1.6961663961410522, + "rewards/kidney_reward/mean": 0.4498825669288635, + "rewards/kidney_reward/std": 1.325005292892456, + "rewards/length2tails_reward/mean": 0.8054025173187256, + "rewards/length2tails_reward/std": 0.2683117389678955, + "rewards/thermo_reward/mean": 0.10002326965332031, + "rewards/thermo_reward/std": 1.884231448173523, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 275.75, + "completions/mean_terminated_length": 275.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.15696079470217228, + "epoch": 1.3479999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2696923315525055, + "learning_rate": 5.357273726384367e-07, + "loss": 0.0005, + "num_tokens": 5897994.0, + "reward": 6.942432403564453, + "reward_std": 1.6423234939575195, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.3043510317802429, + "rewards/kidney_reward/std": 1.3465080261230469, + "rewards/length2tails_reward/mean": 0.9036553502082825, + "rewards/length2tails_reward/std": 0.1465829759836197, + "rewards/thermo_reward/mean": 0.3580460846424103, + "rewards/thermo_reward/std": 1.8777705430984497, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13921087235212326, + "epoch": 1.35, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5983933210372925, + "learning_rate": 5.328009854666302e-07, + "loss": 0.0007, + "num_tokens": 5906693.0, + "reward": 7.0876359939575195, + "reward_std": 1.326507806777954, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.4664854407310486, + "rewards/kidney_reward/std": 1.3527214527130127, + "rewards/length2tails_reward/mean": 0.7940385341644287, + "rewards/length2tails_reward/std": 0.2716478705406189, + "rewards/thermo_reward/mean": 0.54112708568573, + "rewards/thermo_reward/std": 1.7181748151779175, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.125, + "completions/mean_terminated_length": 269.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12440503854304552, + "epoch": 1.3519999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44946378469467163, + "learning_rate": 5.298797075083955e-07, + "loss": 0.0043, + "num_tokens": 5915337.0, + "reward": 6.417015075683594, + "reward_std": 2.319885015487671, + "rewards/fitness_reward/mean": 6.186835289001465, + "rewards/fitness_reward/std": 1.784287452697754, + "rewards/kidney_reward/mean": -0.08183324337005615, + "rewards/kidney_reward/std": 1.3728009462356567, + "rewards/length2tails_reward/mean": 0.7380324006080627, + "rewards/length2tails_reward/std": 0.24326679110527039, + "rewards/thermo_reward/mean": 0.17317692935466766, + "rewards/thermo_reward/std": 1.980185627937317, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14245464466512203, + "epoch": 1.354, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3716691732406616, + "learning_rate": 5.269635707103593e-07, + "loss": 0.0123, + "num_tokens": 5924013.0, + "reward": 6.7908549308776855, + "reward_std": 2.4631636142730713, + "rewards/fitness_reward/mean": 6.110771179199219, + "rewards/fitness_reward/std": 2.1955087184906006, + "rewards/kidney_reward/mean": 0.49149662256240845, + "rewards/kidney_reward/std": 1.2913416624069214, + "rewards/length2tails_reward/mean": 0.7306331396102905, + "rewards/length2tails_reward/std": 0.27333998680114746, + "rewards/thermo_reward/mean": 0.5033540725708008, + "rewards/thermo_reward/std": 1.6740689277648926, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.0, + "completions/max_terminated_length": 610.0, + "completions/mean_length": 289.53125, + "completions/mean_terminated_length": 289.53125, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "entropy": 0.2265687696635723, + "epoch": 1.3559999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6275124549865723, + "learning_rate": 5.240526069629264e-07, + "loss": 0.149, + "num_tokens": 5933310.0, + "reward": 5.4968719482421875, + "reward_std": 3.5532803535461426, + "rewards/fitness_reward/mean": 5.255889892578125, + "rewards/fitness_reward/std": 3.3009347915649414, + "rewards/kidney_reward/mean": -0.0853227972984314, + "rewards/kidney_reward/std": 1.4309494495391846, + "rewards/length2tails_reward/mean": 0.8283058404922485, + "rewards/length2tails_reward/std": 0.26944589614868164, + "rewards/thermo_reward/mean": 0.15313391387462616, + "rewards/thermo_reward/std": 2.1520845890045166, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 278.96875, + "completions/mean_terminated_length": 278.96875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13733403012156487, + "epoch": 1.358, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8472709655761719, + "learning_rate": 5.211468480999304e-07, + "loss": 0.1001, + "num_tokens": 5942269.0, + "reward": 6.766761779785156, + "reward_std": 2.455509901046753, + "rewards/fitness_reward/mean": 6.127740383148193, + "rewards/fitness_reward/std": 2.1031084060668945, + "rewards/kidney_reward/mean": 0.489413321018219, + "rewards/kidney_reward/std": 1.1277484893798828, + "rewards/length2tails_reward/mean": 0.7868244647979736, + "rewards/length2tails_reward/std": 0.3167828321456909, + "rewards/thermo_reward/mean": 0.39521703124046326, + "rewards/thermo_reward/std": 1.8430607318878174, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 266.375, + "completions/mean_terminated_length": 266.375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.1501946747303009, + "epoch": 1.3599999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.276226043701172, + "learning_rate": 5.182463258982846e-07, + "loss": -0.0829, + "num_tokens": 5950825.0, + "reward": 6.895351409912109, + "reward_std": 2.2152953147888184, + "rewards/fitness_reward/mean": 6.146317481994629, + "rewards/fitness_reward/std": 2.0023388862609863, + "rewards/kidney_reward/mean": 0.48449423909187317, + "rewards/kidney_reward/std": 1.4067612886428833, + "rewards/length2tails_reward/mean": 0.8560777902603149, + "rewards/length2tails_reward/std": 0.20544761419296265, + "rewards/thermo_reward/mean": 0.5855341553688049, + "rewards/thermo_reward/std": 1.6604307889938354, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.15200999192893505, + "epoch": 1.362, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35106155276298523, + "learning_rate": 5.153510720776353e-07, + "loss": 0.0093, + "num_tokens": 5959575.0, + "reward": 6.509156227111816, + "reward_std": 2.3999266624450684, + "rewards/fitness_reward/mean": 6.1674699783325195, + "rewards/fitness_reward/std": 1.8881745338439941, + "rewards/kidney_reward/mean": 0.03378329426050186, + "rewards/kidney_reward/std": 1.2819344997406006, + "rewards/length2tails_reward/mean": 0.8100302815437317, + "rewards/length2tails_reward/std": 0.21821154654026031, + "rewards/thermo_reward/mean": 0.24457307159900665, + "rewards/thermo_reward/std": 1.9244755506515503, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.28125, + "completions/mean_terminated_length": 269.28125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12327281478792429, + "epoch": 1.3639999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0517131090164185, + "learning_rate": 5.124611183000137e-07, + "loss": -0.0007, + "num_tokens": 5968224.0, + "reward": 7.198990821838379, + "reward_std": 1.1014410257339478, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.20418256521224976, + "rewards/kidney_reward/std": 1.3175435066223145, + "rewards/length2tails_reward/mean": 0.7419940233230591, + "rewards/length2tails_reward/std": 0.26973870396614075, + "rewards/thermo_reward/mean": 0.640204668045044, + "rewards/thermo_reward/std": 1.5962918996810913, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12007454875856638, + "epoch": 1.366, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5568537712097168, + "learning_rate": 5.095764961694922e-07, + "loss": -0.008, + "num_tokens": 5976895.0, + "reward": 6.600795745849609, + "reward_std": 1.7843338251113892, + "rewards/fitness_reward/mean": 6.076353073120117, + "rewards/fitness_reward/std": 1.2157716751098633, + "rewards/kidney_reward/mean": -0.1611795723438263, + "rewards/kidney_reward/std": 1.2040907144546509, + "rewards/length2tails_reward/mean": 0.755807638168335, + "rewards/length2tails_reward/std": 0.3260086178779602, + "rewards/thermo_reward/mean": 0.8321608901023865, + "rewards/thermo_reward/std": 1.3782223463058472, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.28125, + "completions/mean_terminated_length": 270.28125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13120761420577765, + "epoch": 1.3679999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.735037088394165, + "learning_rate": 5.06697237231835e-07, + "loss": 0.0041, + "num_tokens": 5985576.0, + "reward": 6.2987141609191895, + "reward_std": 2.951741933822632, + "rewards/fitness_reward/mean": 5.815138816833496, + "rewards/fitness_reward/std": 2.690138101577759, + "rewards/kidney_reward/mean": 0.7322832345962524, + "rewards/kidney_reward/std": 1.3732749223709106, + "rewards/length2tails_reward/mean": 0.7410439848899841, + "rewards/length2tails_reward/std": 0.33157578110694885, + "rewards/thermo_reward/mean": -0.13565552234649658, + "rewards/thermo_reward/std": 1.845339059829712, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 273.25, + "completions/mean_terminated_length": 273.25, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.14077529218047857, + "epoch": 1.37, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.83749258518219, + "learning_rate": 5.038233729741577e-07, + "loss": 0.0621, + "num_tokens": 5994352.0, + "reward": 6.197768688201904, + "reward_std": 2.1141865253448486, + "rewards/fitness_reward/mean": 6.1678900718688965, + "rewards/fitness_reward/std": 1.885914921760559, + "rewards/kidney_reward/mean": -0.3956373929977417, + "rewards/kidney_reward/std": 1.379032015800476, + "rewards/length2tails_reward/mean": 0.8157838582992554, + "rewards/length2tails_reward/std": 0.26494520902633667, + "rewards/thermo_reward/mean": 0.04750201106071472, + "rewards/thermo_reward/std": 1.8462481498718262, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 285.90625, + "completions/mean_terminated_length": 285.90625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.16495055705308914, + "epoch": 1.3719999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.541355609893799, + "learning_rate": 5.009549348245795e-07, + "loss": 0.0821, + "num_tokens": 6003533.0, + "reward": 5.699135780334473, + "reward_std": 4.05197811126709, + "rewards/fitness_reward/mean": 5.069490909576416, + "rewards/fitness_reward/std": 3.5660157203674316, + "rewards/kidney_reward/mean": 0.3704063892364502, + "rewards/kidney_reward/std": 1.5749589204788208, + "rewards/length2tails_reward/mean": 0.7373260259628296, + "rewards/length2tails_reward/std": 0.3231446444988251, + "rewards/thermo_reward/mean": 0.5202198624610901, + "rewards/thermo_reward/std": 1.6503164768218994, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 276.4375, + "completions/mean_terminated_length": 276.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11670089419931173, + "epoch": 1.374, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0955183506011963, + "learning_rate": 4.980919541518795e-07, + "loss": 0.1149, + "num_tokens": 6012411.0, + "reward": 6.278100967407227, + "reward_std": 2.291395425796509, + "rewards/fitness_reward/mean": 6.2216620445251465, + "rewards/fitness_reward/std": 2.0909764766693115, + "rewards/kidney_reward/mean": 0.06045813485980034, + "rewards/kidney_reward/std": 1.4518604278564453, + "rewards/length2tails_reward/mean": 0.721335768699646, + "rewards/length2tails_reward/std": 0.3074924051761627, + "rewards/thermo_reward/mean": -0.3082483112812042, + "rewards/thermo_reward/std": 1.8238410949707031, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.125, + "completions/mean_terminated_length": 269.125, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.13439594209194183, + "epoch": 1.376, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3209054470062256, + "learning_rate": 4.952344622651566e-07, + "loss": -0.0077, + "num_tokens": 6021055.0, + "reward": 7.090793132781982, + "reward_std": 1.2847994565963745, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.35663118958473206, + "rewards/kidney_reward/std": 1.2885410785675049, + "rewards/length2tails_reward/mean": 0.7761370539665222, + "rewards/length2tails_reward/std": 0.2594932019710541, + "rewards/thermo_reward/mean": 0.4602685272693634, + "rewards/thermo_reward/std": 1.6738771200180054, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.625, + "completions/mean_terminated_length": 270.625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13340663630515337, + "epoch": 1.3780000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6911230087280273, + "learning_rate": 4.923824904134829e-07, + "loss": 0.001, + "num_tokens": 6029747.0, + "reward": 6.794254302978516, + "reward_std": 1.3270739316940308, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.26135924458503723, + "rewards/kidney_reward/std": 1.3938097953796387, + "rewards/length2tails_reward/mean": 0.7796659469604492, + "rewards/length2tails_reward/std": 0.294088214635849, + "rewards/thermo_reward/mean": -0.24528031051158905, + "rewards/thermo_reward/std": 2.0109946727752686, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.65625, + "completions/mean_terminated_length": 269.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13948645628988743, + "epoch": 1.38, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6078920364379883, + "learning_rate": 4.895360697855674e-07, + "loss": 0.0001, + "num_tokens": 6038408.0, + "reward": 7.045919418334961, + "reward_std": 1.057389497756958, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.06258798390626907, + "rewards/kidney_reward/std": 1.3073828220367432, + "rewards/length2tails_reward/mean": 0.7211709022521973, + "rewards/length2tails_reward/std": 0.24401357769966125, + "rewards/thermo_reward/mean": 0.4860687851905823, + "rewards/thermo_reward/std": 1.6293129920959473, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.59375, + "completions/mean_terminated_length": 269.59375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.147435849532485, + "epoch": 1.3820000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921867847442627, + "learning_rate": 4.866952315094087e-07, + "loss": -0.0307, + "num_tokens": 6047067.0, + "reward": 6.443836688995361, + "reward_std": 2.4145994186401367, + "rewards/fitness_reward/mean": 6.215827941894531, + "rewards/fitness_reward/std": 2.1239800453186035, + "rewards/kidney_reward/mean": 0.13426579535007477, + "rewards/kidney_reward/std": 1.329345703125, + "rewards/length2tails_reward/mean": 0.7906262874603271, + "rewards/length2tails_reward/std": 0.2970091998577118, + "rewards/thermo_reward/mean": -0.07356198132038116, + "rewards/thermo_reward/std": 1.9798394441604614, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 280.9375, + "completions/mean_terminated_length": 280.9375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "entropy": 0.16280218120664358, + "epoch": 1.384, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.788339376449585, + "learning_rate": 4.838600066519596e-07, + "loss": 0.1016, + "num_tokens": 6056089.0, + "reward": 5.891046524047852, + "reward_std": 3.585066795349121, + "rewards/fitness_reward/mean": 5.487031936645508, + "rewards/fitness_reward/std": 3.200364828109741, + "rewards/kidney_reward/mean": 0.2090635895729065, + "rewards/kidney_reward/std": 1.172345757484436, + "rewards/length2tails_reward/mean": 0.896310567855835, + "rewards/length2tails_reward/std": 0.19149312376976013, + "rewards/thermo_reward/mean": 0.15081097185611725, + "rewards/thermo_reward/std": 2.0388600826263428, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 274.03125, + "completions/mean_terminated_length": 274.03125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1421026885509491, + "epoch": 1.3860000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7108883857727051, + "learning_rate": 4.810304262187851e-07, + "loss": 0.0068, + "num_tokens": 6064890.0, + "reward": 7.138672351837158, + "reward_std": 1.2033448219299316, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.26511430740356445, + "rewards/kidney_reward/std": 1.2008064985275269, + "rewards/length2tails_reward/mean": 0.8442004919052124, + "rewards/length2tails_reward/std": 0.27326276898384094, + "rewards/thermo_reward/mean": 0.6135116815567017, + "rewards/thermo_reward/std": 1.6334232091903687, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 277.25, + "completions/mean_terminated_length": 277.25, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.16671243961900473, + "epoch": 1.388, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3696401119232178, + "learning_rate": 4.782065211537225e-07, + "loss": 0.086, + "num_tokens": 6073794.0, + "reward": 5.975656509399414, + "reward_std": 2.807621717453003, + "rewards/fitness_reward/mean": 5.584591865539551, + "rewards/fitness_reward/std": 2.838075876235962, + "rewards/kidney_reward/mean": 0.04642722010612488, + "rewards/kidney_reward/std": 1.391793966293335, + "rewards/length2tails_reward/mean": 0.702508807182312, + "rewards/length2tails_reward/std": 0.35916393995285034, + "rewards/thermo_reward/mean": 0.3844468593597412, + "rewards/thermo_reward/std": 1.737068772315979, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 271.0625, + "completions/mean_terminated_length": 271.0625, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.15083182603120804, + "epoch": 1.3900000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8778202533721924, + "learning_rate": 4.7538832233854665e-07, + "loss": 0.0129, + "num_tokens": 6082500.0, + "reward": 5.957052230834961, + "reward_std": 3.099594831466675, + "rewards/fitness_reward/mean": 5.869346618652344, + "rewards/fitness_reward/std": 2.8409500122070312, + "rewards/kidney_reward/mean": 0.050046831369400024, + "rewards/kidney_reward/std": 1.2768054008483887, + "rewards/length2tails_reward/mean": 0.8028084635734558, + "rewards/length2tails_reward/std": 0.2285366654396057, + "rewards/thermo_reward/mean": -0.2760394811630249, + "rewards/thermo_reward/std": 1.9791443347930908, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 276.8125, + "completions/mean_terminated_length": 276.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.16747636813670397, + "epoch": 1.392, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4245190620422363, + "learning_rate": 4.72575860592627e-07, + "loss": 0.0737, + "num_tokens": 6091390.0, + "reward": 6.087239742279053, + "reward_std": 3.4649014472961426, + "rewards/fitness_reward/mean": 5.56655216217041, + "rewards/fitness_reward/std": 3.268420934677124, + "rewards/kidney_reward/mean": 0.37435388565063477, + "rewards/kidney_reward/std": 1.3217610120773315, + "rewards/length2tails_reward/mean": 0.7429100275039673, + "rewards/length2tails_reward/std": 0.2799903154373169, + "rewards/thermo_reward/mean": 0.29556602239608765, + "rewards/thermo_reward/std": 1.8402382135391235, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 284.125, + "completions/mean_terminated_length": 284.125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1663734009489417, + "epoch": 1.3940000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.678997039794922, + "learning_rate": 4.6976916667259545e-07, + "loss": 0.2274, + "num_tokens": 6100514.0, + "reward": 6.876226425170898, + "reward_std": 1.9914690256118774, + "rewards/fitness_reward/mean": 6.232907295227051, + "rewards/fitness_reward/std": 2.0273635387420654, + "rewards/kidney_reward/mean": 0.42593345046043396, + "rewards/kidney_reward/std": 1.3813767433166504, + "rewards/length2tails_reward/mean": 0.781768798828125, + "rewards/length2tails_reward/std": 0.26373201608657837, + "rewards/thermo_reward/mean": 0.4698195457458496, + "rewards/thermo_reward/std": 1.7537208795547485, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13711238093674183, + "epoch": 1.396, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7596772909164429, + "learning_rate": 4.6696827127200645e-07, + "loss": 0.0345, + "num_tokens": 6109249.0, + "reward": 6.154385566711426, + "reward_std": 2.3298048973083496, + "rewards/fitness_reward/mean": 6.0287909507751465, + "rewards/fitness_reward/std": 2.1384429931640625, + "rewards/kidney_reward/mean": 0.17962273955345154, + "rewards/kidney_reward/std": 1.431584119796753, + "rewards/length2tails_reward/mean": 0.7196434736251831, + "rewards/length2tails_reward/std": 0.3107772171497345, + "rewards/thermo_reward/mean": -0.2882553040981293, + "rewards/thermo_reward/std": 2.1877593994140625, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.875, + "completions/mean_terminated_length": 269.875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13093996793031693, + "epoch": 1.3980000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4921972155570984, + "learning_rate": 4.641732050210031e-07, + "loss": -0.0016, + "num_tokens": 6117917.0, + "reward": 6.080353260040283, + "reward_std": 3.1049883365631104, + "rewards/fitness_reward/mean": 5.818460464477539, + "rewards/fitness_reward/std": 2.7074880599975586, + "rewards/kidney_reward/mean": -0.03080900013446808, + "rewards/kidney_reward/std": 1.447316288948059, + "rewards/length2tails_reward/mean": 0.7562304735183716, + "rewards/length2tails_reward/std": 0.30384624004364014, + "rewards/thermo_reward/mean": 0.17647922039031982, + "rewards/thermo_reward/std": 1.8737000226974487, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.59375, + "completions/mean_terminated_length": 270.59375, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "entropy": 0.1426345268264413, + "epoch": 1.4, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1829280853271484, + "learning_rate": 4.613839984859834e-07, + "loss": -0.0066, + "num_tokens": 6126608.0, + "reward": 6.628046989440918, + "reward_std": 2.157825231552124, + "rewards/fitness_reward/mean": 6.300601005554199, + "rewards/fitness_reward/std": 1.6444309949874878, + "rewards/kidney_reward/mean": 0.13302013278007507, + "rewards/kidney_reward/std": 1.1151403188705444, + "rewards/length2tails_reward/mean": 0.832331657409668, + "rewards/length2tails_reward/std": 0.23637966811656952, + "rewards/thermo_reward/mean": 0.10570541024208069, + "rewards/thermo_reward/std": 1.9269342422485352, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.14826463162899017, + "epoch": 1.4020000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9852263927459717, + "learning_rate": 4.5860068216926193e-07, + "loss": -0.0041, + "num_tokens": 6135321.0, + "reward": 6.435057640075684, + "reward_std": 2.8785433769226074, + "rewards/fitness_reward/mean": 5.807590484619141, + "rewards/fitness_reward/std": 2.732304811477661, + "rewards/kidney_reward/mean": 0.12211328744888306, + "rewards/kidney_reward/std": 1.2959322929382324, + "rewards/length2tails_reward/mean": 0.7865521907806396, + "rewards/length2tails_reward/std": 0.2832757234573364, + "rewards/thermo_reward/mean": 0.7395450472831726, + "rewards/thermo_reward/std": 1.5342499017715454, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.1477870885282755, + "epoch": 1.404, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6566662192344666, + "learning_rate": 4.5582328650874093e-07, + "loss": -0.0002, + "num_tokens": 6144024.0, + "reward": 7.028265953063965, + "reward_std": 1.0408565998077393, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.11437780410051346, + "rewards/kidney_reward/std": 1.422194004058838, + "rewards/length2tails_reward/mean": 0.8510051965713501, + "rewards/length2tails_reward/std": 0.15942418575286865, + "rewards/thermo_reward/mean": 0.3340543508529663, + "rewards/thermo_reward/std": 1.6969246864318848, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 262.90625, + "completions/mean_terminated_length": 262.90625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "entropy": 0.13171542342752218, + "epoch": 1.4060000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2658623456954956, + "learning_rate": 4.530518418775733e-07, + "loss": -0.1102, + "num_tokens": 6152469.0, + "reward": 6.7991437911987305, + "reward_std": 2.2850501537323, + "rewards/fitness_reward/mean": 6.139388084411621, + "rewards/fitness_reward/std": 2.039874792098999, + "rewards/kidney_reward/mean": 0.4406880736351013, + "rewards/kidney_reward/std": 1.2506152391433716, + "rewards/length2tails_reward/mean": 0.748024046421051, + "rewards/length2tails_reward/std": 0.31513237953186035, + "rewards/thermo_reward/mean": 0.504810094833374, + "rewards/thermo_reward/std": 1.7205911874771118, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 279.0, + "completions/mean_terminated_length": 279.0, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "entropy": 0.15343958605080843, + "epoch": 1.408, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9244414567947388, + "learning_rate": 4.502863785838341e-07, + "loss": 0.0728, + "num_tokens": 6161429.0, + "reward": 5.295343399047852, + "reward_std": 4.2171220779418945, + "rewards/fitness_reward/mean": 4.8122711181640625, + "rewards/fitness_reward/std": 4.208260536193848, + "rewards/kidney_reward/mean": 0.4207957983016968, + "rewards/kidney_reward/std": 1.3003346920013428, + "rewards/length2tails_reward/mean": 0.7871172428131104, + "rewards/length2tails_reward/std": 0.3086947202682495, + "rewards/thermo_reward/mean": 0.151790052652359, + "rewards/thermo_reward/std": 1.9177764654159546, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15190527960658073, + "epoch": 1.41, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.009446382522583, + "learning_rate": 4.475269268701868e-07, + "loss": -0.0039, + "num_tokens": 6170162.0, + "reward": 6.023007392883301, + "reward_std": 2.6928510665893555, + "rewards/fitness_reward/mean": 6.067559242248535, + "rewards/fitness_reward/std": 1.9372423887252808, + "rewards/kidney_reward/mean": -0.1874266266822815, + "rewards/kidney_reward/std": 1.2976768016815186, + "rewards/length2tails_reward/mean": 0.8091951012611389, + "rewards/length2tails_reward/std": 0.3004447817802429, + "rewards/thermo_reward/mean": -0.30627530813217163, + "rewards/thermo_reward/std": 2.1211562156677246, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.13627486024051905, + "epoch": 1.412, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6029331684112549, + "learning_rate": 4.447735169135532e-07, + "loss": -0.0003, + "num_tokens": 6178856.0, + "reward": 7.332171440124512, + "reward_std": 1.241804599761963, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.4455115795135498, + "rewards/kidney_reward/std": 1.292559266090393, + "rewards/length2tails_reward/mean": 0.8162680268287659, + "rewards/length2tails_reward/std": 0.2966054379940033, + "rewards/thermo_reward/mean": 0.6281002759933472, + "rewards/thermo_reward/std": 1.5638302564620972, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 274.8125, + "completions/mean_terminated_length": 274.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13102816976606846, + "epoch": 1.414, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6926238536834717, + "learning_rate": 4.42026178824784e-07, + "loss": 0.0572, + "num_tokens": 6187682.0, + "reward": 6.521988391876221, + "reward_std": 2.370529890060425, + "rewards/fitness_reward/mean": 5.97020959854126, + "rewards/fitness_reward/std": 2.4585437774658203, + "rewards/kidney_reward/mean": 0.2856016457080841, + "rewards/kidney_reward/std": 1.395315408706665, + "rewards/length2tails_reward/mean": 0.7549359202384949, + "rewards/length2tails_reward/std": 0.30937156081199646, + "rewards/thermo_reward/mean": 0.4404873251914978, + "rewards/thermo_reward/std": 1.609288215637207, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 268.75, + "completions/mean_terminated_length": 268.75, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.15934985969215631, + "epoch": 1.416, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1592636108398438, + "learning_rate": 4.3928494264832736e-07, + "loss": -0.0237, + "num_tokens": 6196314.0, + "reward": 6.460187911987305, + "reward_std": 3.1838345527648926, + "rewards/fitness_reward/mean": 5.746843338012695, + "rewards/fitness_reward/std": 2.949125051498413, + "rewards/kidney_reward/mean": 0.1856032907962799, + "rewards/kidney_reward/std": 1.2820947170257568, + "rewards/length2tails_reward/mean": 0.7322978973388672, + "rewards/length2tails_reward/std": 0.35097241401672363, + "rewards/thermo_reward/mean": 0.8749363422393799, + "rewards/thermo_reward/std": 1.5884485244750977, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 275.03125, + "completions/mean_terminated_length": 275.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1464253282174468, + "epoch": 1.418, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1439844369888306, + "learning_rate": 4.3654983836190353e-07, + "loss": 0.0062, + "num_tokens": 6205147.0, + "reward": 7.147367477416992, + "reward_std": 1.3127976655960083, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.5982942581176758, + "rewards/kidney_reward/std": 1.1980109214782715, + "rewards/length2tails_reward/mean": 0.816143274307251, + "rewards/length2tails_reward/std": 0.24777448177337646, + "rewards/thermo_reward/mean": 0.10577264428138733, + "rewards/thermo_reward/std": 1.8736296892166138, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 280.75, + "completions/mean_terminated_length": 280.75, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.18232927843928337, + "epoch": 1.42, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.643324851989746, + "learning_rate": 4.3382089587617466e-07, + "loss": 0.0791, + "num_tokens": 6214163.0, + "reward": 5.536927223205566, + "reward_std": 4.0529279708862305, + "rewards/fitness_reward/mean": 5.117922782897949, + "rewards/fitness_reward/std": 3.7145843505859375, + "rewards/kidney_reward/mean": 0.32142871618270874, + "rewards/kidney_reward/std": 1.3823848962783813, + "rewards/length2tails_reward/mean": 0.732482373714447, + "rewards/length2tails_reward/std": 0.32569506764411926, + "rewards/thermo_reward/mean": 0.15033851563930511, + "rewards/thermo_reward/std": 1.758060336112976, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 275.875, + "completions/mean_terminated_length": 275.875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.1543517718091607, + "epoch": 1.422, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5001479983329773, + "learning_rate": 4.310981450344189e-07, + "loss": 0.0008, + "num_tokens": 6223023.0, + "reward": 6.842094898223877, + "reward_std": 2.057859182357788, + "rewards/fitness_reward/mean": 6.2024149894714355, + "rewards/fitness_reward/std": 1.701224446296692, + "rewards/kidney_reward/mean": 0.35328245162963867, + "rewards/kidney_reward/std": 1.293013334274292, + "rewards/length2tails_reward/mean": 0.7150019407272339, + "rewards/length2tails_reward/std": 0.3326190114021301, + "rewards/thermo_reward/mean": 0.568576455116272, + "rewards/thermo_reward/std": 1.4404157400131226, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 277.625, + "completions/mean_terminated_length": 277.625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.16860883217304945, + "epoch": 1.424, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1438326835632324, + "learning_rate": 4.2838161561220244e-07, + "loss": 0.0812, + "num_tokens": 6231939.0, + "reward": 6.197243690490723, + "reward_std": 3.0033605098724365, + "rewards/fitness_reward/mean": 5.695122718811035, + "rewards/fitness_reward/std": 2.782221555709839, + "rewards/kidney_reward/mean": 0.34427207708358765, + "rewards/kidney_reward/std": 1.3794307708740234, + "rewards/length2tails_reward/mean": 0.7902058362960815, + "rewards/length2tails_reward/std": 0.271558940410614, + "rewards/thermo_reward/mean": 0.2648661136627197, + "rewards/thermo_reward/std": 1.9792098999023438, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.59375, + "completions/mean_terminated_length": 270.59375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13542633317410946, + "epoch": 1.426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2660166025161743, + "learning_rate": 4.256713373170564e-07, + "loss": -0.0042, + "num_tokens": 6240630.0, + "reward": 6.704099655151367, + "reward_std": 1.2607440948486328, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.02215583622455597, + "rewards/kidney_reward/std": 1.2196651697158813, + "rewards/length2tails_reward/mean": 0.783263623714447, + "rewards/length2tails_reward/std": 0.28573325276374817, + "rewards/thermo_reward/mean": 0.06210419535636902, + "rewards/thermo_reward/std": 1.921356201171875, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.13486134819686413, + "epoch": 1.428, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6709299087524414, + "learning_rate": 4.2296733978814993e-07, + "loss": 0.0063, + "num_tokens": 6249313.0, + "reward": 6.4241414070129395, + "reward_std": 2.8587379455566406, + "rewards/fitness_reward/mean": 5.806258678436279, + "rewards/fitness_reward/std": 2.7234749794006348, + "rewards/kidney_reward/mean": 0.1877380609512329, + "rewards/kidney_reward/std": 1.3938689231872559, + "rewards/length2tails_reward/mean": 0.7809352874755859, + "rewards/length2tails_reward/std": 0.2129465937614441, + "rewards/thermo_reward/mean": 0.6575589179992676, + "rewards/thermo_reward/std": 1.4148534536361694, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 285.09375, + "completions/mean_terminated_length": 285.09375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.16030831821262836, + "epoch": 1.43, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9095091819763184, + "learning_rate": 4.202696525959666e-07, + "loss": 0.1271, + "num_tokens": 6258468.0, + "reward": 6.294330596923828, + "reward_std": 3.1249303817749023, + "rewards/fitness_reward/mean": 5.614455699920654, + "rewards/fitness_reward/std": 3.1002912521362305, + "rewards/kidney_reward/mean": 0.4036597013473511, + "rewards/kidney_reward/std": 1.2593122720718384, + "rewards/length2tails_reward/mean": 0.8412476778030396, + "rewards/length2tails_reward/std": 0.2489180564880371, + "rewards/thermo_reward/mean": 0.5354666113853455, + "rewards/thermo_reward/std": 1.867226481437683, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12977517396211624, + "epoch": 1.432, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7026931047439575, + "learning_rate": 4.175783052419819e-07, + "loss": 0.0048, + "num_tokens": 6267161.0, + "reward": 6.894268989562988, + "reward_std": 2.2060632705688477, + "rewards/fitness_reward/mean": 6.192601203918457, + "rewards/fitness_reward/std": 1.7534875869750977, + "rewards/kidney_reward/mean": 0.4717509150505066, + "rewards/kidney_reward/std": 1.4465810060501099, + "rewards/length2tails_reward/mean": 0.7974022030830383, + "rewards/length2tails_reward/std": 0.2455170601606369, + "rewards/thermo_reward/mean": 0.5328824520111084, + "rewards/thermo_reward/std": 1.7917709350585938, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14737006183713675, + "epoch": 1.434, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37094080448150635, + "learning_rate": 4.148933271583385e-07, + "loss": -0.0051, + "num_tokens": 6275855.0, + "reward": 6.644562244415283, + "reward_std": 1.907673716545105, + "rewards/fitness_reward/mean": 6.208745956420898, + "rewards/fitness_reward/std": 1.667616844177246, + "rewards/kidney_reward/mean": 0.17287281155586243, + "rewards/kidney_reward/std": 1.3916083574295044, + "rewards/length2tails_reward/mean": 0.7228462100028992, + "rewards/length2tails_reward/std": 0.33311793208122253, + "rewards/thermo_reward/mean": 0.3373357951641083, + "rewards/thermo_reward/std": 1.5994102954864502, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.14476253651082516, + "epoch": 1.436, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4074987173080444, + "learning_rate": 4.1221474770752696e-07, + "loss": 0.0015, + "num_tokens": 6284565.0, + "reward": 6.082912445068359, + "reward_std": 2.8743603229522705, + "rewards/fitness_reward/mean": 5.716564655303955, + "rewards/fitness_reward/std": 2.714968204498291, + "rewards/kidney_reward/mean": 0.4139128625392914, + "rewards/kidney_reward/std": 1.3907051086425781, + "rewards/length2tails_reward/mean": 0.7764250040054321, + "rewards/length2tails_reward/std": 0.3029692769050598, + "rewards/thermo_reward/mean": -0.06942936778068542, + "rewards/thermo_reward/std": 1.9993343353271484, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 271.40625, + "completions/mean_terminated_length": 271.40625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13786921091377735, + "epoch": 1.438, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.49812331795692444, + "learning_rate": 4.095425961820629e-07, + "loss": 0.0024, + "num_tokens": 6293282.0, + "reward": 6.756870746612549, + "reward_std": 2.109835624694824, + "rewards/fitness_reward/mean": 6.285638332366943, + "rewards/fitness_reward/std": 1.7290723323822021, + "rewards/kidney_reward/mean": 0.2809326648712158, + "rewards/kidney_reward/std": 1.3989744186401367, + "rewards/length2tails_reward/mean": 0.7933116555213928, + "rewards/length2tails_reward/std": 0.25988250970840454, + "rewards/thermo_reward/mean": 0.26487648487091064, + "rewards/thermo_reward/std": 1.68715238571167, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12949824333190918, + "epoch": 1.44, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.40592843294143677, + "learning_rate": 4.0687690180416735e-07, + "loss": 0.0058, + "num_tokens": 6301970.0, + "reward": 6.743653297424316, + "reward_std": 2.1691648960113525, + "rewards/fitness_reward/mean": 6.305906295776367, + "rewards/fitness_reward/std": 1.6144205331802368, + "rewards/kidney_reward/mean": 0.3100203573703766, + "rewards/kidney_reward/std": 1.57994544506073, + "rewards/length2tails_reward/mean": 0.7571724057197571, + "rewards/length2tails_reward/std": 0.280365914106369, + "rewards/thermo_reward/mean": 0.18688730895519257, + "rewards/thermo_reward/std": 2.027261257171631, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.0625, + "completions/mean_terminated_length": 271.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1389858890324831, + "epoch": 1.442, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.682332456111908, + "learning_rate": 4.0421769372544735e-07, + "loss": 0.0016, + "num_tokens": 6310676.0, + "reward": 6.990462303161621, + "reward_std": 1.2100632190704346, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.08624760806560516, + "rewards/kidney_reward/std": 1.4657799005508423, + "rewards/length2tails_reward/mean": 0.787990152835846, + "rewards/length2tails_reward/std": 0.2811163067817688, + "rewards/thermo_reward/mean": 0.5240628719329834, + "rewards/thermo_reward/std": 1.766283392906189, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 271.40625, + "completions/mean_terminated_length": 271.40625, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.1607711762189865, + "epoch": 1.444, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2282334566116333, + "learning_rate": 4.0156500102657565e-07, + "loss": -0.0008, + "num_tokens": 6319393.0, + "reward": 6.009991645812988, + "reward_std": 2.9848763942718506, + "rewards/fitness_reward/mean": 5.738341331481934, + "rewards/fitness_reward/std": 2.6460328102111816, + "rewards/kidney_reward/mean": -0.1635066568851471, + "rewards/kidney_reward/std": 1.2695133686065674, + "rewards/length2tails_reward/mean": 0.7677510976791382, + "rewards/length2tails_reward/std": 0.317500501871109, + "rewards/thermo_reward/mean": 0.32293200492858887, + "rewards/thermo_reward/std": 1.7275975942611694, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1493921745568514, + "epoch": 1.446, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7565339803695679, + "learning_rate": 3.989188527169749e-07, + "loss": 0.0019, + "num_tokens": 6328102.0, + "reward": 7.0062761306762695, + "reward_std": 1.3715838193893433, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.20004817843437195, + "rewards/kidney_reward/std": 1.3357537984848022, + "rewards/length2tails_reward/mean": 0.8116669654846191, + "rewards/length2tails_reward/std": 0.2387673556804657, + "rewards/thermo_reward/mean": 0.4300525188446045, + "rewards/thermo_reward/std": 1.686034917831421, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 269.3125, + "completions/mean_terminated_length": 269.3125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.16109525505453348, + "epoch": 1.448, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2488136291503906, + "learning_rate": 3.962792777344992e-07, + "loss": -0.0079, + "num_tokens": 6336752.0, + "reward": 5.827999114990234, + "reward_std": 3.409680128097534, + "rewards/fitness_reward/mean": 5.630746364593506, + "rewards/fitness_reward/std": 3.0257010459899902, + "rewards/kidney_reward/mean": 0.071767657995224, + "rewards/kidney_reward/std": 1.482879877090454, + "rewards/length2tails_reward/mean": 0.780424952507019, + "rewards/length2tails_reward/std": 0.2868749499320984, + "rewards/thermo_reward/mean": -0.06747539341449738, + "rewards/thermo_reward/std": 1.8410335779190063, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 274.84375, + "completions/mean_terminated_length": 274.84375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.1310996413230896, + "epoch": 1.45, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6772858500480652, + "learning_rate": 3.9364630494511785e-07, + "loss": 0.0615, + "num_tokens": 6345579.0, + "reward": 6.597156524658203, + "reward_std": 2.6364808082580566, + "rewards/fitness_reward/mean": 5.959640026092529, + "rewards/fitness_reward/std": 2.4948079586029053, + "rewards/kidney_reward/mean": 0.16516277194023132, + "rewards/kidney_reward/std": 1.3675750494003296, + "rewards/length2tails_reward/mean": 0.8238409161567688, + "rewards/length2tails_reward/std": 0.22302155196666718, + "rewards/thermo_reward/mean": 0.6979496479034424, + "rewards/thermo_reward/std": 1.7199643850326538, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.65625, + "completions/mean_terminated_length": 269.65625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13089086394757032, + "epoch": 1.452, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2512903213500977, + "learning_rate": 3.910199631425989e-07, + "loss": 0.0021, + "num_tokens": 6354240.0, + "reward": 6.735081195831299, + "reward_std": 1.1765868663787842, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.17595863342285156, + "rewards/kidney_reward/std": 1.393303394317627, + "rewards/length2tails_reward/mean": 0.7197104692459106, + "rewards/length2tails_reward/std": 0.28845617175102234, + "rewards/thermo_reward/mean": 0.3096475303173065, + "rewards/thermo_reward/std": 1.8400081396102905, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.1427407544106245, + "epoch": 1.454, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0994105339050293, + "learning_rate": 3.884002810481958e-07, + "loss": 0.026, + "num_tokens": 6362917.0, + "reward": 6.652989387512207, + "reward_std": 2.174849510192871, + "rewards/fitness_reward/mean": 6.154126167297363, + "rewards/fitness_reward/std": 1.9601200819015503, + "rewards/kidney_reward/mean": 0.07902175188064575, + "rewards/kidney_reward/std": 1.5565807819366455, + "rewards/length2tails_reward/mean": 0.7475125789642334, + "rewards/length2tails_reward/std": 0.2664393484592438, + "rewards/thermo_reward/mean": 0.5449486374855042, + "rewards/thermo_reward/std": 1.9561376571655273, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 265.78125, + "completions/mean_terminated_length": 265.78125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.1477462099865079, + "epoch": 1.456, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5613549947738647, + "learning_rate": 3.8578728731033214e-07, + "loss": -0.0858, + "num_tokens": 6371454.0, + "reward": 6.234399795532227, + "reward_std": 3.3079497814178467, + "rewards/fitness_reward/mean": 5.8375749588012695, + "rewards/fitness_reward/std": 2.622896432876587, + "rewards/kidney_reward/mean": 0.4113604724407196, + "rewards/kidney_reward/std": 1.4180752038955688, + "rewards/length2tails_reward/mean": 0.7879568338394165, + "rewards/length2tails_reward/std": 0.28058895468711853, + "rewards/thermo_reward/mean": -0.011688388884067535, + "rewards/thermo_reward/std": 1.937193751335144, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.03125, + "completions/mean_terminated_length": 271.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1305059539154172, + "epoch": 1.458, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.48471352458000183, + "learning_rate": 3.83181010504289e-07, + "loss": 0.0015, + "num_tokens": 6380159.0, + "reward": 7.542658805847168, + "reward_std": 1.0278334617614746, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.6681248545646667, + "rewards/kidney_reward/std": 1.405533790588379, + "rewards/length2tails_reward/mean": 0.8058980107307434, + "rewards/length2tails_reward/std": 0.23396119475364685, + "rewards/thermo_reward/mean": 0.8316472172737122, + "rewards/thermo_reward/std": 1.488695502281189, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14527638908475637, + "epoch": 1.46, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6321706771850586, + "learning_rate": 3.805814791318921e-07, + "loss": 0.0374, + "num_tokens": 6388902.0, + "reward": 6.352047920227051, + "reward_std": 2.3665616512298584, + "rewards/fitness_reward/mean": 6.037888526916504, + "rewards/fitness_reward/std": 2.0909011363983154, + "rewards/kidney_reward/mean": -0.15655474364757538, + "rewards/kidney_reward/std": 1.4381589889526367, + "rewards/length2tails_reward/mean": 0.757357656955719, + "rewards/length2tails_reward/std": 0.2956610321998596, + "rewards/thermo_reward/mean": 0.4061948359012604, + "rewards/thermo_reward/std": 1.599929928779602, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 278.28125, + "completions/mean_terminated_length": 278.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.15624709613621235, + "epoch": 1.462, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.671223521232605, + "learning_rate": 3.7798872162119944e-07, + "loss": -0.0213, + "num_tokens": 6397839.0, + "reward": 7.021697998046875, + "reward_std": 1.865939736366272, + "rewards/fitness_reward/mean": 6.300729751586914, + "rewards/fitness_reward/std": 1.6437019109725952, + "rewards/kidney_reward/mean": 0.26250338554382324, + "rewards/kidney_reward/std": 1.2467670440673828, + "rewards/length2tails_reward/mean": 0.8321989178657532, + "rewards/length2tails_reward/std": 0.2071176916360855, + "rewards/thermo_reward/mean": 0.7633329629898071, + "rewards/thermo_reward/std": 1.7512410879135132, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.71875, + "completions/mean_terminated_length": 269.71875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.12121308408677578, + "epoch": 1.464, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6998794078826904, + "learning_rate": 3.754027663261922e-07, + "loss": 0.0009, + "num_tokens": 6406502.0, + "reward": 6.616936206817627, + "reward_std": 1.400047779083252, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.12396623194217682, + "rewards/kidney_reward/std": 1.2756109237670898, + "rewards/length2tails_reward/mean": 0.7037824392318726, + "rewards/length2tails_reward/std": 0.32783785462379456, + "rewards/thermo_reward/mean": -0.21860280632972717, + "rewards/thermo_reward/std": 1.9138315916061401, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 274.28125, + "completions/mean_terminated_length": 274.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.15775783639401197, + "epoch": 1.466, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.551760673522949, + "learning_rate": 3.7282364152646295e-07, + "loss": 0.0629, + "num_tokens": 6415311.0, + "reward": 6.355654239654541, + "reward_std": 3.179107427597046, + "rewards/fitness_reward/mean": 5.740793228149414, + "rewards/fitness_reward/std": 2.9725489616394043, + "rewards/kidney_reward/mean": -0.011758260428905487, + "rewards/kidney_reward/std": 1.2169761657714844, + "rewards/length2tails_reward/mean": 0.7925014495849609, + "rewards/length2tails_reward/std": 0.24488230049610138, + "rewards/thermo_reward/mean": 0.8452297449111938, + "rewards/thermo_reward/std": 1.5913621187210083, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 267.03125, + "completions/mean_terminated_length": 267.03125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.1718240324407816, + "epoch": 1.468, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.152747631072998, + "learning_rate": 3.7025137542690755e-07, + "loss": -0.0355, + "num_tokens": 6423888.0, + "reward": 6.196893692016602, + "reward_std": 3.082811117172241, + "rewards/fitness_reward/mean": 5.675358772277832, + "rewards/fitness_reward/std": 2.875387191772461, + "rewards/kidney_reward/mean": 0.2695336639881134, + "rewards/kidney_reward/std": 1.4880645275115967, + "rewards/length2tails_reward/mean": 0.8455706834793091, + "rewards/length2tails_reward/std": 0.21372568607330322, + "rewards/thermo_reward/mean": 0.35075080394744873, + "rewards/thermo_reward/std": 1.8669308423995972, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1462901495397091, + "epoch": 1.47, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.639938473701477, + "learning_rate": 3.676859961574161e-07, + "loss": 0.0018, + "num_tokens": 6432603.0, + "reward": 6.459516525268555, + "reward_std": 2.2774791717529297, + "rewards/fitness_reward/mean": 6.283863067626953, + "rewards/fitness_reward/std": 1.7391149997711182, + "rewards/kidney_reward/mean": 0.06943115592002869, + "rewards/kidney_reward/std": 1.2155630588531494, + "rewards/length2tails_reward/mean": 0.7969694137573242, + "rewards/length2tails_reward/std": 0.23394715785980225, + "rewards/thermo_reward/mean": -0.11660884320735931, + "rewards/thermo_reward/std": 2.0059316158294678, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 278.5625, + "completions/mean_terminated_length": 278.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14995314087718725, + "epoch": 1.472, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5865871906280518, + "learning_rate": 3.651275317725647e-07, + "loss": 0.0141, + "num_tokens": 6441549.0, + "reward": 6.942385673522949, + "reward_std": 1.2349860668182373, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.5532947182655334, + "rewards/kidney_reward/std": 1.4463164806365967, + "rewards/length2tails_reward/mean": 0.778032124042511, + "rewards/length2tails_reward/std": 0.2799816131591797, + "rewards/thermo_reward/mean": -0.03415824472904205, + "rewards/thermo_reward/std": 1.9036084413528442, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.148209435865283, + "epoch": 1.474, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4772525131702423, + "learning_rate": 3.625760102513102e-07, + "loss": -0.0094, + "num_tokens": 6450300.0, + "reward": 7.0528411865234375, + "reward_std": 1.1199755668640137, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.4002948999404907, + "rewards/kidney_reward/std": 1.1845812797546387, + "rewards/length2tails_reward/mean": 0.7716137170791626, + "rewards/length2tails_reward/std": 0.28937315940856934, + "rewards/thermo_reward/mean": 0.13698430359363556, + "rewards/thermo_reward/std": 1.7761671543121338, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 277.09375, + "completions/mean_terminated_length": 277.09375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.16739436611533165, + "epoch": 1.476, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3354683816432953, + "learning_rate": 3.6003145949668337e-07, + "loss": 0.0001, + "num_tokens": 6459199.0, + "reward": 6.582250595092773, + "reward_std": 1.9421275854110718, + "rewards/fitness_reward/mean": 6.31489372253418, + "rewards/fitness_reward/std": 1.563578486442566, + "rewards/kidney_reward/mean": 0.23158857226371765, + "rewards/kidney_reward/std": 1.3394967317581177, + "rewards/length2tails_reward/mean": 0.7185154557228088, + "rewards/length2tails_reward/std": 0.3428117632865906, + "rewards/thermo_reward/mean": -0.05613328516483307, + "rewards/thermo_reward/std": 1.9670833349227905, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 281.34375, + "completions/mean_terminated_length": 281.34375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.18640653602778912, + "epoch": 1.478, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.452791690826416, + "learning_rate": 3.574939073354838e-07, + "loss": 0.1234, + "num_tokens": 6468234.0, + "reward": 6.656317710876465, + "reward_std": 2.1846671104431152, + "rewards/fitness_reward/mean": 6.233187675476074, + "rewards/fitness_reward/std": 2.0257773399353027, + "rewards/kidney_reward/mean": 0.15020138025283813, + "rewards/kidney_reward/std": 1.2639641761779785, + "rewards/length2tails_reward/mean": 0.8355987071990967, + "rewards/length2tails_reward/std": 0.26534393429756165, + "rewards/thermo_reward/mean": 0.2782592177391052, + "rewards/thermo_reward/std": 1.6797844171524048, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.13094077538698912, + "epoch": 1.48, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1043927669525146, + "learning_rate": 3.5496338151797455e-07, + "loss": 0.0285, + "num_tokens": 6476955.0, + "reward": 6.581585884094238, + "reward_std": 3.3811569213867188, + "rewards/fitness_reward/mean": 5.868983268737793, + "rewards/fitness_reward/std": 2.8453943729400635, + "rewards/kidney_reward/mean": 0.2880333662033081, + "rewards/kidney_reward/std": 1.3903483152389526, + "rewards/length2tails_reward/mean": 0.764015793800354, + "rewards/length2tails_reward/std": 0.304887980222702, + "rewards/thermo_reward/mean": 0.755163848400116, + "rewards/thermo_reward/std": 1.597685694694519, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.78125, + "completions/mean_terminated_length": 269.78125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1269969353452325, + "epoch": 1.482, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.49138695001602173, + "learning_rate": 3.524399097175812e-07, + "loss": 0.0031, + "num_tokens": 6485620.0, + "reward": 6.329876899719238, + "reward_std": 2.166242837905884, + "rewards/fitness_reward/mean": 6.006310939788818, + "rewards/fitness_reward/std": 1.7928966283798218, + "rewards/kidney_reward/mean": 0.23051409423351288, + "rewards/kidney_reward/std": 1.360937237739563, + "rewards/length2tails_reward/mean": 0.6920793056488037, + "rewards/length2tails_reward/std": 0.34749844670295715, + "rewards/thermo_reward/mean": 0.07057763636112213, + "rewards/thermo_reward/std": 1.8305820226669312, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 275.59375, + "completions/mean_terminated_length": 275.59375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1454880153760314, + "epoch": 1.484, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2874481678009033, + "learning_rate": 3.4992351953058674e-07, + "loss": 0.0562, + "num_tokens": 6494471.0, + "reward": 6.655930519104004, + "reward_std": 2.397416830062866, + "rewards/fitness_reward/mean": 6.119657516479492, + "rewards/fitness_reward/std": 2.1470818519592285, + "rewards/kidney_reward/mean": 0.4230097234249115, + "rewards/kidney_reward/std": 1.2466100454330444, + "rewards/length2tails_reward/mean": 0.778205394744873, + "rewards/length2tails_reward/std": 0.32155641913414, + "rewards/thermo_reward/mean": 0.26043346524238586, + "rewards/thermo_reward/std": 1.7763069868087769, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.14261612948030233, + "epoch": 1.486, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6479060649871826, + "learning_rate": 3.4741423847583127e-07, + "loss": 0.0011, + "num_tokens": 6503202.0, + "reward": 6.945497989654541, + "reward_std": 1.229175329208374, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.250993937253952, + "rewards/kidney_reward/std": 1.3482167720794678, + "rewards/length2tails_reward/mean": 0.8469655513763428, + "rewards/length2tails_reward/std": 0.17578914761543274, + "rewards/thermo_reward/mean": 0.5359105467796326, + "rewards/thermo_reward/std": 1.7304365634918213, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.53125, + "completions/mean_terminated_length": 270.53125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1367456018924713, + "epoch": 1.488, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4656844735145569, + "learning_rate": 3.449120939944107e-07, + "loss": -0.0022, + "num_tokens": 6511891.0, + "reward": 6.813487529754639, + "reward_std": 1.253298044204712, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.1098024919629097, + "rewards/kidney_reward/std": 1.445411205291748, + "rewards/length2tails_reward/mean": 0.7371547818183899, + "rewards/length2tails_reward/std": 0.3081863820552826, + "rewards/thermo_reward/mean": 0.17197644710540771, + "rewards/thermo_reward/std": 2.0185134410858154, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 277.03125, + "completions/mean_terminated_length": 277.03125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1425960147753358, + "epoch": 1.49, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9627833962440491, + "learning_rate": 3.424171134493755e-07, + "loss": -0.0259, + "num_tokens": 6520788.0, + "reward": 6.977728843688965, + "reward_std": 1.2582330703735352, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.09188287705183029, + "rewards/kidney_reward/std": 1.2674611806869507, + "rewards/length2tails_reward/mean": 0.7306835651397705, + "rewards/length2tails_reward/std": 0.3111197054386139, + "rewards/thermo_reward/mean": 0.3156362771987915, + "rewards/thermo_reward/std": 1.8351783752441406, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13787766080349684, + "epoch": 1.492, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3223366141319275, + "learning_rate": 3.399293241254335e-07, + "loss": 0.0005, + "num_tokens": 6529487.0, + "reward": 7.105623722076416, + "reward_std": 1.2418174743652344, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.3590460419654846, + "rewards/kidney_reward/std": 1.3093960285186768, + "rewards/length2tails_reward/mean": 0.8181041479110718, + "rewards/length2tails_reward/std": 0.2645956575870514, + "rewards/thermo_reward/mean": 0.26055315136909485, + "rewards/thermo_reward/std": 1.879141092300415, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "entropy": 0.14608249254524708, + "epoch": 1.494, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.280642032623291, + "learning_rate": 3.374487532286503e-07, + "loss": -0.0227, + "num_tokens": 6538158.0, + "reward": 6.483077049255371, + "reward_std": 2.6963179111480713, + "rewards/fitness_reward/mean": 5.915217399597168, + "rewards/fitness_reward/std": 2.243011236190796, + "rewards/kidney_reward/mean": 0.2806551158428192, + "rewards/kidney_reward/std": 1.3728772401809692, + "rewards/length2tails_reward/mean": 0.788128674030304, + "rewards/length2tails_reward/std": 0.30882179737091064, + "rewards/thermo_reward/mean": 0.4609989523887634, + "rewards/thermo_reward/std": 1.7979570627212524, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "entropy": 0.13642570655792952, + "epoch": 1.496, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.656134843826294, + "learning_rate": 3.349754278861516e-07, + "loss": -0.004, + "num_tokens": 6546860.0, + "reward": 6.611035346984863, + "reward_std": 2.2067809104919434, + "rewards/fitness_reward/mean": 6.207198143005371, + "rewards/fitness_reward/std": 2.1727962493896484, + "rewards/kidney_reward/mean": 0.1110004112124443, + "rewards/kidney_reward/std": 1.3077661991119385, + "rewards/length2tails_reward/mean": 0.8213242888450623, + "rewards/length2tails_reward/std": 0.22857342660427094, + "rewards/thermo_reward/mean": 0.28601163625717163, + "rewards/thermo_reward/std": 1.7221519947052002, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1397933829575777, + "epoch": 1.498, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.934018075466156, + "learning_rate": 3.3250937514582753e-07, + "loss": -0.0035, + "num_tokens": 6555568.0, + "reward": 7.2150750160217285, + "reward_std": 1.4517903327941895, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.5459381937980652, + "rewards/kidney_reward/std": 1.1985267400741577, + "rewards/length2tails_reward/mean": 0.8230719566345215, + "rewards/length2tails_reward/std": 0.22582411766052246, + "rewards/thermo_reward/mean": 0.7020360231399536, + "rewards/thermo_reward/std": 1.766005516052246, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.149237509816885, + "epoch": 1.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5774146914482117, + "learning_rate": 3.3005062197603506e-07, + "loss": 0.0005, + "num_tokens": 6564300.0, + "reward": 7.204289436340332, + "reward_std": 1.24748957157135, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.5114156007766724, + "rewards/kidney_reward/std": 1.2313088178634644, + "rewards/length2tails_reward/mean": 0.8665796518325806, + "rewards/length2tails_reward/std": 0.18371771275997162, + "rewards/thermo_reward/mean": 0.28127649426460266, + "rewards/thermo_reward/std": 1.792626142501831, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 281.09375, + "completions/mean_terminated_length": 281.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14412524551153183, + "epoch": 1.502, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6420033574104309, + "learning_rate": 3.275991952653053e-07, + "loss": 0.0054, + "num_tokens": 6573327.0, + "reward": 7.389340400695801, + "reward_std": 1.008957862854004, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.5648700594902039, + "rewards/kidney_reward/std": 1.3394099473953247, + "rewards/length2tails_reward/mean": 0.7444267868995667, + "rewards/length2tails_reward/std": 0.29618191719055176, + "rewards/thermo_reward/mean": 0.6590012311935425, + "rewards/thermo_reward/std": 1.6438076496124268, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1457577347755432, + "epoch": 1.504, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5573708415031433, + "learning_rate": 3.25155121822048e-07, + "loss": -0.0014, + "num_tokens": 6582048.0, + "reward": 7.003170013427734, + "reward_std": 1.8233615159988403, + "rewards/fitness_reward/mean": 6.211255073547363, + "rewards/fitness_reward/std": 1.6543270349502563, + "rewards/kidney_reward/mean": 0.5857619047164917, + "rewards/kidney_reward/std": 1.37223219871521, + "rewards/length2tails_reward/mean": 0.8062136173248291, + "rewards/length2tails_reward/std": 0.28524044156074524, + "rewards/thermo_reward/mean": 0.5949615240097046, + "rewards/thermo_reward/std": 1.6294947862625122, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 273.28125, + "completions/mean_terminated_length": 273.28125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.1419423883780837, + "epoch": 1.506, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9684510231018066, + "learning_rate": 3.227184283742591e-07, + "loss": 0.0262, + "num_tokens": 6590825.0, + "reward": 6.279714107513428, + "reward_std": 3.0279858112335205, + "rewards/fitness_reward/mean": 5.8744001388549805, + "rewards/fitness_reward/std": 2.8213443756103516, + "rewards/kidney_reward/mean": -0.025787919759750366, + "rewards/kidney_reward/std": 1.2580734491348267, + "rewards/length2tails_reward/mean": 0.8213226795196533, + "rewards/length2tails_reward/std": 0.2884416878223419, + "rewards/thermo_reward/mean": 0.4257543683052063, + "rewards/thermo_reward/std": 1.9172329902648926, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 272.8125, + "completions/mean_terminated_length": 272.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1424771547317505, + "epoch": 1.508, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0744562149047852, + "learning_rate": 3.20289141569227e-07, + "loss": -0.0169, + "num_tokens": 6599587.0, + "reward": 7.331998825073242, + "reward_std": 0.9264138340950012, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.17860844731330872, + "rewards/kidney_reward/std": 1.1688541173934937, + "rewards/length2tails_reward/mean": 0.7586427927017212, + "rewards/length2tails_reward/std": 0.2912338376045227, + "rewards/thermo_reward/mean": 0.9234717488288879, + "rewards/thermo_reward/std": 1.4338617324829102, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13977038767188787, + "epoch": 1.51, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6949334740638733, + "learning_rate": 3.178672879732435e-07, + "loss": -0.0046, + "num_tokens": 6608299.0, + "reward": 6.426409721374512, + "reward_std": 1.485270380973816, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": -0.16230611503124237, + "rewards/kidney_reward/std": 1.4444512128829956, + "rewards/length2tails_reward/mean": 0.7741398811340332, + "rewards/length2tails_reward/std": 0.310896635055542, + "rewards/thermo_reward/mean": 0.06339424848556519, + "rewards/thermo_reward/std": 1.902835488319397, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.14292740728706121, + "epoch": 1.512, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.283230185508728, + "learning_rate": 3.154528940713113e-07, + "loss": -0.014, + "num_tokens": 6617014.0, + "reward": 6.407959461212158, + "reward_std": 2.7665135860443115, + "rewards/fitness_reward/mean": 5.613762378692627, + "rewards/fitness_reward/std": 2.7442517280578613, + "rewards/kidney_reward/mean": 0.08089083433151245, + "rewards/kidney_reward/std": 1.2893985509872437, + "rewards/length2tails_reward/mean": 0.8684854507446289, + "rewards/length2tails_reward/std": 0.19367212057113647, + "rewards/thermo_reward/mean": 1.0732603073120117, + "rewards/thermo_reward/std": 1.2305798530578613, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.84375, + "completions/mean_terminated_length": 269.84375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.12837980967015028, + "epoch": 1.514, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46868693828582764, + "learning_rate": 3.130459862668554e-07, + "loss": -0.0007, + "num_tokens": 6625681.0, + "reward": 6.868636131286621, + "reward_std": 2.4470386505126953, + "rewards/fitness_reward/mean": 6.196429252624512, + "rewards/fitness_reward/std": 1.7330775260925293, + "rewards/kidney_reward/mean": 0.29288044571876526, + "rewards/kidney_reward/std": 1.4604089260101318, + "rewards/length2tails_reward/mean": 0.732858419418335, + "rewards/length2tails_reward/std": 0.3018275499343872, + "rewards/thermo_reward/mean": 0.6851028203964233, + "rewards/thermo_reward/std": 1.8471310138702393, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 273.09375, + "completions/mean_terminated_length": 273.09375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.14259129762649536, + "epoch": 1.516, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7912111878395081, + "learning_rate": 3.106465908814342e-07, + "loss": -0.0194, + "num_tokens": 6634452.0, + "reward": 7.157774448394775, + "reward_std": 1.1669427156448364, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.19365474581718445, + "rewards/kidney_reward/std": 1.4141430854797363, + "rewards/length2tails_reward/mean": 0.7690622806549072, + "rewards/length2tails_reward/std": 0.2600538730621338, + "rewards/thermo_reward/mean": 0.5547664165496826, + "rewards/thermo_reward/std": 1.8259605169296265, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 269.46875, + "completions/mean_terminated_length": 269.46875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "entropy": 0.1724933609366417, + "epoch": 1.518, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.958454132080078, + "learning_rate": 3.082547341544507e-07, + "loss": -0.0201, + "num_tokens": 6643107.0, + "reward": 6.448651313781738, + "reward_std": 2.6311211585998535, + "rewards/fitness_reward/mean": 5.9628705978393555, + "rewards/fitness_reward/std": 2.4928791522979736, + "rewards/kidney_reward/mean": 0.640070915222168, + "rewards/kidney_reward/std": 1.338715672492981, + "rewards/length2tails_reward/mean": 0.8216780424118042, + "rewards/length2tails_reward/std": 0.2406664788722992, + "rewards/thermo_reward/mean": -0.07934901118278503, + "rewards/thermo_reward/std": 1.9766124486923218, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.14426173083484173, + "epoch": 1.52, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3123664855957031, + "learning_rate": 3.058704422428674e-07, + "loss": -0.0195, + "num_tokens": 6651879.0, + "reward": 5.917682647705078, + "reward_std": 3.5878570079803467, + "rewards/fitness_reward/mean": 5.518614768981934, + "rewards/fitness_reward/std": 3.100980520248413, + "rewards/kidney_reward/mean": 0.08244429528713226, + "rewards/kidney_reward/std": 1.3578275442123413, + "rewards/length2tails_reward/mean": 0.7430970072746277, + "rewards/length2tails_reward/std": 0.3185111880302429, + "rewards/thermo_reward/mean": 0.34414273500442505, + "rewards/thermo_reward/std": 1.9183611869812012, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1381951654329896, + "epoch": 1.522, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7138792872428894, + "learning_rate": 3.034937412209178e-07, + "loss": 0.0002, + "num_tokens": 6660582.0, + "reward": 7.420782089233398, + "reward_std": 1.140636682510376, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.6168582439422607, + "rewards/kidney_reward/std": 1.0981253385543823, + "rewards/length2tails_reward/mean": 0.7969210147857666, + "rewards/length2tails_reward/std": 0.2585451602935791, + "rewards/thermo_reward/mean": 0.6436493396759033, + "rewards/thermo_reward/std": 1.69111967086792, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14546612836420536, + "epoch": 1.524, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8966149091720581, + "learning_rate": 3.0112465707982416e-07, + "loss": -0.0007, + "num_tokens": 6669292.0, + "reward": 6.910525321960449, + "reward_std": 1.5989652872085571, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.32800981402397156, + "rewards/kidney_reward/std": 1.3178656101226807, + "rewards/length2tails_reward/mean": 0.8122183084487915, + "rewards/length2tails_reward/std": 0.24153052270412445, + "rewards/thermo_reward/mean": 0.31629177927970886, + "rewards/thermo_reward/std": 1.8588348627090454, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 268.125, + "completions/mean_terminated_length": 268.125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.16970859467983246, + "epoch": 1.526, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0278172492980957, + "learning_rate": 2.987632157275114e-07, + "loss": -0.022, + "num_tokens": 6677904.0, + "reward": 6.156267166137695, + "reward_std": 3.5620105266571045, + "rewards/fitness_reward/mean": 5.511993408203125, + "rewards/fitness_reward/std": 3.427915334701538, + "rewards/kidney_reward/mean": 0.068385548889637, + "rewards/kidney_reward/std": 1.282875418663025, + "rewards/length2tails_reward/mean": 0.7827757596969604, + "rewards/length2tails_reward/std": 0.2792876958847046, + "rewards/thermo_reward/mean": 0.828774094581604, + "rewards/thermo_reward/std": 1.5107433795928955, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.17203214205801487, + "epoch": 1.528, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5960419178009033, + "learning_rate": 2.9640944298832305e-07, + "loss": -0.0086, + "num_tokens": 6686648.0, + "reward": 6.601698875427246, + "reward_std": 2.5591580867767334, + "rewards/fitness_reward/mean": 6.115531921386719, + "rewards/fitness_reward/std": 2.169556140899658, + "rewards/kidney_reward/mean": -0.007775813341140747, + "rewards/kidney_reward/std": 1.2828702926635742, + "rewards/length2tails_reward/mean": 0.8065013885498047, + "rewards/length2tails_reward/std": 0.24861988425254822, + "rewards/thermo_reward/mean": 0.5768600702285767, + "rewards/thermo_reward/std": 1.7211017608642578, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.13950800150632858, + "epoch": 1.53, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.167220950126648, + "learning_rate": 2.940633646027414e-07, + "loss": -0.0092, + "num_tokens": 6695357.0, + "reward": 6.477987289428711, + "reward_std": 2.6941211223602295, + "rewards/fitness_reward/mean": 6.027511119842529, + "rewards/fitness_reward/std": 2.1451451778411865, + "rewards/kidney_reward/mean": 0.24668362736701965, + "rewards/kidney_reward/std": 1.1877830028533936, + "rewards/length2tails_reward/mean": 0.8343377113342285, + "rewards/length2tails_reward/std": 0.2589648962020874, + "rewards/thermo_reward/mean": 0.23709911108016968, + "rewards/thermo_reward/std": 1.8579859733581543, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.28125, + "completions/mean_terminated_length": 269.28125, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "entropy": 0.1609698971733451, + "epoch": 1.532, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4610633850097656, + "learning_rate": 2.9172500622710263e-07, + "loss": -0.027, + "num_tokens": 6704006.0, + "reward": 6.39097785949707, + "reward_std": 2.239888906478882, + "rewards/fitness_reward/mean": 6.104963302612305, + "rewards/fitness_reward/std": 1.7472589015960693, + "rewards/kidney_reward/mean": 0.10971612483263016, + "rewards/kidney_reward/std": 1.3672573566436768, + "rewards/length2tails_reward/mean": 0.8449288010597229, + "rewards/length2tails_reward/std": 0.23866154253482819, + "rewards/thermo_reward/mean": 0.03984944522380829, + "rewards/thermo_reward/std": 2.0633909702301025, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14749841205775738, + "epoch": 1.534, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4970659911632538, + "learning_rate": 2.8939439343332085e-07, + "loss": 0.0055, + "num_tokens": 6712725.0, + "reward": 6.730075359344482, + "reward_std": 2.084306001663208, + "rewards/fitness_reward/mean": 6.205704689025879, + "rewards/fitness_reward/std": 1.6837490797042847, + "rewards/kidney_reward/mean": 0.2538720965385437, + "rewards/kidney_reward/std": 1.404616117477417, + "rewards/length2tails_reward/mean": 0.812264084815979, + "rewards/length2tails_reward/std": 0.2586541473865509, + "rewards/thermo_reward/mean": 0.3887367248535156, + "rewards/thermo_reward/std": 1.8735015392303467, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 264.6875, + "completions/mean_terminated_length": 264.6875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.1574457697570324, + "epoch": 1.536, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2018728256225586, + "learning_rate": 2.87071551708603e-07, + "loss": -0.0648, + "num_tokens": 6721227.0, + "reward": 6.428178787231445, + "reward_std": 3.0503768920898438, + "rewards/fitness_reward/mean": 5.852085590362549, + "rewards/fitness_reward/std": 2.909271240234375, + "rewards/kidney_reward/mean": 0.39190569519996643, + "rewards/kidney_reward/std": 1.2846938371658325, + "rewards/length2tails_reward/mean": 0.7614554166793823, + "rewards/length2tails_reward/std": 0.29023200273513794, + "rewards/thermo_reward/mean": 0.3795527219772339, + "rewards/thermo_reward/std": 1.9699292182922363, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.142701574601233, + "epoch": 1.538, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5678212642669678, + "learning_rate": 2.847565064551747e-07, + "loss": 0.0046, + "num_tokens": 6729955.0, + "reward": 7.310421466827393, + "reward_std": 1.179175853729248, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.23811368644237518, + "rewards/kidney_reward/std": 1.3418374061584473, + "rewards/length2tails_reward/mean": 0.7479592561721802, + "rewards/length2tails_reward/std": 0.2607904374599457, + "rewards/thermo_reward/mean": 0.8261528015136719, + "rewards/thermo_reward/std": 1.7512696981430054, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 265.75, + "completions/mean_terminated_length": 265.75, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "entropy": 0.18231241684406996, + "epoch": 1.54, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5674819946289062, + "learning_rate": 2.824492829899994e-07, + "loss": -0.0263, + "num_tokens": 6738491.0, + "reward": 5.8658342361450195, + "reward_std": 3.3635613918304443, + "rewards/fitness_reward/mean": 5.505724906921387, + "rewards/fitness_reward/std": 3.4293646812438965, + "rewards/kidney_reward/mean": 0.09991557896137238, + "rewards/kidney_reward/std": 1.4150450229644775, + "rewards/length2tails_reward/mean": 0.8093632459640503, + "rewards/length2tails_reward/std": 0.26971057057380676, + "rewards/thermo_reward/mean": 0.21562257409095764, + "rewards/thermo_reward/std": 2.0421526432037354, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13571206014603376, + "epoch": 1.542, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6787880659103394, + "learning_rate": 2.801499065445032e-07, + "loss": 0.0041, + "num_tokens": 6747163.0, + "reward": 6.675548553466797, + "reward_std": 2.7135398387908936, + "rewards/fitness_reward/mean": 5.891197204589844, + "rewards/fitness_reward/std": 2.3944013118743896, + "rewards/kidney_reward/mean": 0.19133597612380981, + "rewards/kidney_reward/std": 1.3772848844528198, + "rewards/length2tails_reward/mean": 0.7056885957717896, + "rewards/length2tails_reward/std": 0.30836087465286255, + "rewards/thermo_reward/mean": 1.0245215892791748, + "rewards/thermo_reward/std": 1.319177508354187, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.15427696518599987, + "epoch": 1.544, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3900893926620483, + "learning_rate": 2.778584022642996e-07, + "loss": -0.0041, + "num_tokens": 6755909.0, + "reward": 7.161505222320557, + "reward_std": 0.9952014088630676, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.21117135882377625, + "rewards/kidney_reward/std": 1.4653359651565552, + "rewards/length2tails_reward/mean": 0.8569097518920898, + "rewards/length2tails_reward/std": 0.22087019681930542, + "rewards/thermo_reward/mean": 0.5007882118225098, + "rewards/thermo_reward/std": 1.9092121124267578, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.375, + "completions/mean_terminated_length": 269.375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.14175315853208303, + "epoch": 1.546, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0321834087371826, + "learning_rate": 2.7557479520891104e-07, + "loss": -0.0096, + "num_tokens": 6764561.0, + "reward": 6.4538679122924805, + "reward_std": 3.3221423625946045, + "rewards/fitness_reward/mean": 5.78311824798584, + "rewards/fitness_reward/std": 2.816518545150757, + "rewards/kidney_reward/mean": 0.38053661584854126, + "rewards/kidney_reward/std": 1.2438056468963623, + "rewards/length2tails_reward/mean": 0.737084150314331, + "rewards/length2tails_reward/std": 0.308733195066452, + "rewards/thermo_reward/mean": 0.5924209952354431, + "rewards/thermo_reward/std": 1.7645432949066162, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1274666776880622, + "epoch": 1.548, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47710728645324707, + "learning_rate": 2.7329911035149934e-07, + "loss": 0.0166, + "num_tokens": 6773279.0, + "reward": 6.905461311340332, + "reward_std": 1.1370421648025513, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.29974085092544556, + "rewards/kidney_reward/std": 1.2740517854690552, + "rewards/length2tails_reward/mean": 0.738558292388916, + "rewards/length2tails_reward/std": 0.26039358973503113, + "rewards/thermo_reward/mean": -0.040694430470466614, + "rewards/thermo_reward/std": 1.8345433473587036, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 267.3125, + "completions/mean_terminated_length": 267.3125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.1419068006798625, + "epoch": 1.55, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1457231044769287, + "learning_rate": 2.7103137257858863e-07, + "loss": -0.0786, + "num_tokens": 6781865.0, + "reward": 6.60987663269043, + "reward_std": 2.389982223510742, + "rewards/fitness_reward/mean": 5.82939338684082, + "rewards/fitness_reward/std": 2.2017037868499756, + "rewards/kidney_reward/mean": 0.5430529117584229, + "rewards/kidney_reward/std": 1.3035211563110352, + "rewards/length2tails_reward/mean": 0.7117866277694702, + "rewards/length2tails_reward/std": 0.34175872802734375, + "rewards/thermo_reward/mean": 0.6620202660560608, + "rewards/thermo_reward/std": 1.5475127696990967, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 283.84375, + "completions/mean_terminated_length": 283.84375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.1547088474035263, + "epoch": 1.552, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7892978191375732, + "learning_rate": 2.6877160668979636e-07, + "loss": 0.0089, + "num_tokens": 6790980.0, + "reward": 6.873981475830078, + "reward_std": 1.389536738395691, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.42670226097106934, + "rewards/kidney_reward/std": 1.1826317310333252, + "rewards/length2tails_reward/mean": 0.7123697400093079, + "rewards/length2tails_reward/std": 0.324055552482605, + "rewards/thermo_reward/mean": 0.19443663954734802, + "rewards/thermo_reward/std": 1.7825313806533813, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.875, + "completions/mean_terminated_length": 269.875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13518366497009993, + "epoch": 1.554, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7815073728561401, + "learning_rate": 2.6651983739756023e-07, + "loss": 0.0037, + "num_tokens": 6799648.0, + "reward": 7.146909713745117, + "reward_std": 2.248424768447876, + "rewards/fitness_reward/mean": 6.238016605377197, + "rewards/fitness_reward/std": 1.9984614849090576, + "rewards/kidney_reward/mean": 0.6763706803321838, + "rewards/kidney_reward/std": 1.2657153606414795, + "rewards/length2tails_reward/mean": 0.7601178884506226, + "rewards/length2tails_reward/std": 0.2368663251399994, + "rewards/thermo_reward/mean": 0.7613566517829895, + "rewards/thermo_reward/std": 1.490486741065979, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14888322167098522, + "epoch": 1.556, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078140497207642, + "learning_rate": 2.642760893268684e-07, + "loss": 0.0032, + "num_tokens": 6808361.0, + "reward": 7.304175853729248, + "reward_std": 1.061590552330017, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.7734180092811584, + "rewards/kidney_reward/std": 1.291848063468933, + "rewards/length2tails_reward/mean": 0.8478966951370239, + "rewards/length2tails_reward/std": 0.22242490947246552, + "rewards/thermo_reward/mean": 0.43436625599861145, + "rewards/thermo_reward/std": 1.6876534223556519, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 283.6875, + "completions/mean_terminated_length": 283.6875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.15383954532444477, + "epoch": 1.558, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.014909744262695, + "learning_rate": 2.6204038701499053e-07, + "loss": 0.1809, + "num_tokens": 6817471.0, + "reward": 6.61821174621582, + "reward_std": 2.701411008834839, + "rewards/fitness_reward/mean": 5.924351692199707, + "rewards/fitness_reward/std": 2.1965856552124023, + "rewards/kidney_reward/mean": 0.06441353261470795, + "rewards/kidney_reward/std": 1.5847315788269043, + "rewards/length2tails_reward/mean": 0.8142881393432617, + "rewards/length2tails_reward/std": 0.26185664534568787, + "rewards/thermo_reward/mean": 0.9161627888679504, + "rewards/thermo_reward/std": 1.5615767240524292, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.14910172019153833, + "epoch": 1.56, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5022507309913635, + "learning_rate": 2.598127549112084e-07, + "loss": -0.0045, + "num_tokens": 6826142.0, + "reward": 7.229008674621582, + "reward_std": 1.0317364931106567, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.5430067777633667, + "rewards/kidney_reward/std": 1.281389832496643, + "rewards/length2tails_reward/mean": 0.8323854207992554, + "rewards/length2tails_reward/std": 0.20013611018657684, + "rewards/thermo_reward/mean": 0.3162211775779724, + "rewards/thermo_reward/std": 1.700129508972168, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 278.40625, + "completions/mean_terminated_length": 278.40625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.16043044719845057, + "epoch": 1.562, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1040589809417725, + "learning_rate": 2.575932173765502e-07, + "loss": -0.0811, + "num_tokens": 6835083.0, + "reward": 6.40438985824585, + "reward_std": 2.2401676177978516, + "rewards/fitness_reward/mean": 6.154246807098389, + "rewards/fitness_reward/std": 1.9594675302505493, + "rewards/kidney_reward/mean": -0.48664194345474243, + "rewards/kidney_reward/std": 1.3095874786376953, + "rewards/length2tails_reward/mean": 0.902005672454834, + "rewards/length2tails_reward/std": 0.1389356106519699, + "rewards/thermo_reward/mean": 0.535925567150116, + "rewards/thermo_reward/std": 1.9621033668518066, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 276.3125, + "completions/mean_terminated_length": 276.3125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.15305947791785002, + "epoch": 1.564, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1466064453125, + "learning_rate": 2.553817986835225e-07, + "loss": 0.0243, + "num_tokens": 6843957.0, + "reward": 7.012514591217041, + "reward_std": 2.4925708770751953, + "rewards/fitness_reward/mean": 6.2459917068481445, + "rewards/fitness_reward/std": 1.95334792137146, + "rewards/kidney_reward/mean": 0.39113542437553406, + "rewards/kidney_reward/std": 1.2302358150482178, + "rewards/length2tails_reward/mean": 0.7674136161804199, + "rewards/length2tails_reward/std": 0.29283997416496277, + "rewards/thermo_reward/mean": 0.75820392370224, + "rewards/thermo_reward/std": 1.780465841293335, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 264.9375, + "completions/mean_terminated_length": 264.9375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.1664528949186206, + "epoch": 1.5659999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.524474620819092, + "learning_rate": 2.5317852301584643e-07, + "loss": -0.0789, + "num_tokens": 6852467.0, + "reward": 6.491879463195801, + "reward_std": 3.0557470321655273, + "rewards/fitness_reward/mean": 5.886578559875488, + "rewards/fitness_reward/std": 2.7737200260162354, + "rewards/kidney_reward/mean": 0.5440658926963806, + "rewards/kidney_reward/std": 1.4770121574401855, + "rewards/length2tails_reward/mean": 0.8402332663536072, + "rewards/length2tails_reward/std": 0.20666223764419556, + "rewards/thermo_reward/mean": 0.24641892313957214, + "rewards/thermo_reward/std": 1.538109302520752, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.15580147691071033, + "epoch": 1.568, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296324253082275, + "learning_rate": 2.5098341446819093e-07, + "loss": 0.0065, + "num_tokens": 6861212.0, + "reward": 6.969008922576904, + "reward_std": 1.5260560512542725, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": -0.008192509412765503, + "rewards/kidney_reward/std": 1.3466236591339111, + "rewards/length2tails_reward/mean": 0.7792737483978271, + "rewards/length2tails_reward/std": 0.29194724559783936, + "rewards/thermo_reward/mean": 0.991911768913269, + "rewards/thermo_reward/std": 1.4307126998901367, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 260.1875, + "completions/mean_terminated_length": 260.1875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.227813720703125, + "epoch": 1.5699999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0767550468444824, + "learning_rate": 2.487964970459118e-07, + "loss": -0.1137, + "num_tokens": 6869570.0, + "reward": 5.950715065002441, + "reward_std": 3.853970766067505, + "rewards/fitness_reward/mean": 5.527054309844971, + "rewards/fitness_reward/std": 3.3675484657287598, + "rewards/kidney_reward/mean": 0.4216958284378052, + "rewards/kidney_reward/std": 1.341357707977295, + "rewards/length2tails_reward/mean": 0.7649134993553162, + "rewards/length2tails_reward/std": 0.32988977432250977, + "rewards/thermo_reward/mean": 0.04316858574748039, + "rewards/thermo_reward/std": 1.859584927558899, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.15625, + "completions/mean_terminated_length": 269.15625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.15193017665296793, + "epoch": 1.572, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2276837825775146, + "learning_rate": 2.4661779466478736e-07, + "loss": -0.0173, + "num_tokens": 6878215.0, + "reward": 6.380091667175293, + "reward_std": 2.3457376956939697, + "rewards/fitness_reward/mean": 6.121331691741943, + "rewards/fitness_reward/std": 2.137967586517334, + "rewards/kidney_reward/mean": 0.07171832025051117, + "rewards/kidney_reward/std": 1.1949107646942139, + "rewards/length2tails_reward/mean": 0.7781567573547363, + "rewards/length2tails_reward/std": 0.28654590249061584, + "rewards/thermo_reward/mean": 0.056723058223724365, + "rewards/thermo_reward/std": 1.9479655027389526, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.13805292826145887, + "epoch": 1.5739999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7220945358276367, + "learning_rate": 2.444473311507582e-07, + "loss": 0.0056, + "num_tokens": 6886898.0, + "reward": 6.924524307250977, + "reward_std": 1.4148192405700684, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.10553894191980362, + "rewards/kidney_reward/std": 1.2154009342193604, + "rewards/length2tails_reward/mean": 0.7812000513076782, + "rewards/length2tails_reward/std": 0.24531032145023346, + "rewards/thermo_reward/mean": 0.3762912154197693, + "rewards/thermo_reward/std": 1.8832708597183228, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.28125, + "completions/mean_terminated_length": 269.28125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.1284910449758172, + "epoch": 1.576, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46782413125038147, + "learning_rate": 2.422851302396655e-07, + "loss": 0.0074, + "num_tokens": 6895547.0, + "reward": 6.3736066818237305, + "reward_std": 2.541079521179199, + "rewards/fitness_reward/mean": 6.151097297668457, + "rewards/fitness_reward/std": 1.9764838218688965, + "rewards/kidney_reward/mean": -0.024903282523155212, + "rewards/kidney_reward/std": 1.3204262256622314, + "rewards/length2tails_reward/mean": 0.7805046439170837, + "rewards/length2tails_reward/std": 0.26294493675231934, + "rewards/thermo_reward/mean": 0.07966911792755127, + "rewards/thermo_reward/std": 1.8099154233932495, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1412920905277133, + "epoch": 1.5779999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0019266605377197, + "learning_rate": 2.4013121557699157e-07, + "loss": 0.0026, + "num_tokens": 6904288.0, + "reward": 7.319521903991699, + "reward_std": 1.0260179042816162, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.05846037715673447, + "rewards/kidney_reward/std": 1.342934489250183, + "rewards/length2tails_reward/mean": 0.8107945919036865, + "rewards/length2tails_reward/std": 0.24635398387908936, + "rewards/thermo_reward/mean": 0.9925893545150757, + "rewards/thermo_reward/std": 1.4179774522781372, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.16240247525274754, + "epoch": 1.58, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.670385479927063, + "learning_rate": 2.3798561071760238e-07, + "loss": 0.0031, + "num_tokens": 6913020.0, + "reward": 6.83107328414917, + "reward_std": 2.257927179336548, + "rewards/fitness_reward/mean": 6.23477840423584, + "rewards/fitness_reward/std": 2.0167782306671143, + "rewards/kidney_reward/mean": 0.3128570318222046, + "rewards/kidney_reward/std": 1.3388986587524414, + "rewards/length2tails_reward/mean": 0.8407726287841797, + "rewards/length2tails_reward/std": 0.21106097102165222, + "rewards/thermo_reward/mean": 0.4593451917171478, + "rewards/thermo_reward/std": 1.5769519805908203, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 267.46875, + "completions/mean_terminated_length": 267.46875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.13401680998504162, + "epoch": 1.5819999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4992716312408447, + "learning_rate": 2.3584833912548885e-07, + "loss": -0.0272, + "num_tokens": 6921611.0, + "reward": 6.293649196624756, + "reward_std": 2.6007704734802246, + "rewards/fitness_reward/mean": 5.80429744720459, + "rewards/fitness_reward/std": 2.375201940536499, + "rewards/kidney_reward/mean": -0.09890797734260559, + "rewards/kidney_reward/std": 1.3485544919967651, + "rewards/length2tails_reward/mean": 0.7262336015701294, + "rewards/length2tails_reward/std": 0.32025158405303955, + "rewards/thermo_reward/mean": 0.7144942879676819, + "rewards/thermo_reward/std": 1.566086769104004, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1410212367773056, + "epoch": 1.584, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5930579304695129, + "learning_rate": 2.3371942417351076e-07, + "loss": -0.0036, + "num_tokens": 6930329.0, + "reward": 6.707575798034668, + "reward_std": 1.335412859916687, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": -0.0902789831161499, + "rewards/kidney_reward/std": 1.2444789409637451, + "rewards/length2tails_reward/mean": 0.7977325916290283, + "rewards/length2tails_reward/std": 0.3074288070201874, + "rewards/thermo_reward/mean": 0.5419025421142578, + "rewards/thermo_reward/std": 1.7505912780761719, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1581169944256544, + "epoch": 1.5859999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.256063222885132, + "learning_rate": 2.3159888914314119e-07, + "loss": -0.0001, + "num_tokens": 6939015.0, + "reward": 6.67472505569458, + "reward_std": 2.342672109603882, + "rewards/fitness_reward/mean": 6.134800910949707, + "rewards/fitness_reward/std": 2.0647575855255127, + "rewards/kidney_reward/mean": -0.04198363423347473, + "rewards/kidney_reward/std": 1.4810811281204224, + "rewards/length2tails_reward/mean": 0.7866643071174622, + "rewards/length2tails_reward/std": 0.26788610219955444, + "rewards/thermo_reward/mean": 0.7284992337226868, + "rewards/thermo_reward/std": 1.5208215713500977, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 264.9375, + "completions/mean_terminated_length": 264.9375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.14164164289832115, + "epoch": 1.588, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.337836503982544, + "learning_rate": 2.2948675722421085e-07, + "loss": -0.0781, + "num_tokens": 6947525.0, + "reward": 6.28550910949707, + "reward_std": 2.7345645427703857, + "rewards/fitness_reward/mean": 6.2102885246276855, + "rewards/fitness_reward/std": 2.155316114425659, + "rewards/kidney_reward/mean": 0.0910630151629448, + "rewards/kidney_reward/std": 1.3561928272247314, + "rewards/length2tails_reward/mean": 0.8018416166305542, + "rewards/length2tails_reward/std": 0.2113863080739975, + "rewards/thermo_reward/mean": -0.3415431082248688, + "rewards/thermo_reward/std": 1.6621954441070557, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.28125, + "completions/mean_terminated_length": 270.28125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13372136000543833, + "epoch": 1.5899999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7315834760665894, + "learning_rate": 2.2738305151465642e-07, + "loss": -0.0008, + "num_tokens": 6956206.0, + "reward": 7.293285369873047, + "reward_std": 1.2811592817306519, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.7350008487701416, + "rewards/kidney_reward/std": 1.335471510887146, + "rewards/length2tails_reward/mean": 0.7437633872032166, + "rewards/length2tails_reward/std": 0.29865309596061707, + "rewards/thermo_reward/mean": 0.503070592880249, + "rewards/thermo_reward/std": 1.568402647972107, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 274.5625, + "completions/mean_terminated_length": 274.5625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.1477584159001708, + "epoch": 1.592, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7809637784957886, + "learning_rate": 2.252877950202665e-07, + "loss": -0.003, + "num_tokens": 6965024.0, + "reward": 6.985225677490234, + "reward_std": 1.19850754737854, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.12417389452457428, + "rewards/kidney_reward/std": 1.200655221939087, + "rewards/length2tails_reward/mean": 0.8002365827560425, + "rewards/length2tails_reward/std": 0.202206090092659, + "rewards/thermo_reward/mean": 0.46954041719436646, + "rewards/thermo_reward/std": 1.8501996994018555, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.0, + "completions/max_terminated_length": 587.0, + "completions/mean_length": 273.0, + "completions/mean_terminated_length": 273.0, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.15518053621053696, + "epoch": 1.5939999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2437355518341064, + "learning_rate": 2.2320101065443054e-07, + "loss": 0.0495, + "num_tokens": 6973792.0, + "reward": 6.102499961853027, + "reward_std": 3.0417087078094482, + "rewards/fitness_reward/mean": 5.797402858734131, + "rewards/fitness_reward/std": 2.7604498863220215, + "rewards/kidney_reward/mean": 0.17722631990909576, + "rewards/kidney_reward/std": 1.234021544456482, + "rewards/length2tails_reward/mean": 0.7253434658050537, + "rewards/length2tails_reward/std": 0.3229313790798187, + "rewards/thermo_reward/mean": 0.07029581815004349, + "rewards/thermo_reward/std": 1.8266401290893555, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.13572121411561966, + "epoch": 1.596, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45929965376853943, + "learning_rate": 2.2112272123788768e-07, + "loss": 0.0055, + "num_tokens": 6982527.0, + "reward": 6.9664306640625, + "reward_std": 1.2069658041000366, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.05869106948375702, + "rewards/kidney_reward/std": 1.2762436866760254, + "rewards/length2tails_reward/mean": 0.8596430420875549, + "rewards/length2tails_reward/std": 0.18852636218070984, + "rewards/thermo_reward/mean": 0.46772995591163635, + "rewards/thermo_reward/std": 1.4941520690917969, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 277.3125, + "completions/mean_terminated_length": 277.3125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.15002206526696682, + "epoch": 1.5979999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4861711263656616, + "learning_rate": 2.190529494984782e-07, + "loss": -0.0281, + "num_tokens": 6991433.0, + "reward": 6.680995941162109, + "reward_std": 1.2425885200500488, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.03977344185113907, + "rewards/kidney_reward/std": 1.210723876953125, + "rewards/length2tails_reward/mean": 0.7880594730377197, + "rewards/length2tails_reward/std": 0.25015008449554443, + "rewards/thermo_reward/mean": 0.03111743927001953, + "rewards/thermo_reward/std": 1.84101402759552, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14804279245436192, + "epoch": 1.6, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3991144597530365, + "learning_rate": 2.1699171807089411e-07, + "loss": 0.0012, + "num_tokens": 7000137.0, + "reward": 7.197426795959473, + "reward_std": 1.6150994300842285, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.6555510759353638, + "rewards/kidney_reward/std": 1.4246822595596313, + "rewards/length2tails_reward/mean": 0.8065072298049927, + "rewards/length2tails_reward/std": 0.25084543228149414, + "rewards/thermo_reward/mean": 0.35943105816841125, + "rewards/thermo_reward/std": 1.762044072151184, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.12980582658201456, + "epoch": 1.6019999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5571575164794922, + "learning_rate": 2.1493904949643225e-07, + "loss": 0.0059, + "num_tokens": 7008801.0, + "reward": 6.574358940124512, + "reward_std": 2.3424017429351807, + "rewards/fitness_reward/mean": 6.138425827026367, + "rewards/fitness_reward/std": 2.0450949668884277, + "rewards/kidney_reward/mean": 0.027852151542901993, + "rewards/kidney_reward/std": 1.4272733926773071, + "rewards/length2tails_reward/mean": 0.7272623181343079, + "rewards/length2tails_reward/std": 0.27904385328292847, + "rewards/thermo_reward/mean": 0.48038309812545776, + "rewards/thermo_reward/std": 1.7055349349975586, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.84375, + "completions/mean_terminated_length": 269.84375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1358693977817893, + "epoch": 1.604, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7121937870979309, + "learning_rate": 2.128949662227475e-07, + "loss": -0.0013, + "num_tokens": 7017468.0, + "reward": 7.3364996910095215, + "reward_std": 1.1265954971313477, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.5245989561080933, + "rewards/kidney_reward/std": 1.398279070854187, + "rewards/length2tails_reward/mean": 0.7204639911651611, + "rewards/length2tails_reward/std": 0.3124878704547882, + "rewards/thermo_reward/mean": 0.6055716276168823, + "rewards/thermo_reward/std": 1.518963098526001, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 264.1875, + "completions/mean_terminated_length": 264.1875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.12570040207356215, + "epoch": 1.6059999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6690869331359863, + "learning_rate": 2.1085949060360653e-07, + "loss": -0.0384, + "num_tokens": 7025954.0, + "reward": 6.881993293762207, + "reward_std": 2.2557709217071533, + "rewards/fitness_reward/mean": 6.231057167053223, + "rewards/fitness_reward/std": 2.0378293991088867, + "rewards/kidney_reward/mean": 0.3923177719116211, + "rewards/kidney_reward/std": 1.2387869358062744, + "rewards/length2tails_reward/mean": 0.6261150240898132, + "rewards/length2tails_reward/std": 0.33638495206832886, + "rewards/thermo_reward/mean": 0.5964961051940918, + "rewards/thermo_reward/std": 1.7912360429763794, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.1450491389259696, + "epoch": 1.608, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.38868409395217896, + "learning_rate": 2.088326448986447e-07, + "loss": -0.0064, + "num_tokens": 7034667.0, + "reward": 6.616262435913086, + "reward_std": 2.1470398902893066, + "rewards/fitness_reward/mean": 6.103446006774902, + "rewards/fitness_reward/std": 1.7548693418502808, + "rewards/kidney_reward/mean": -0.06676850467920303, + "rewards/kidney_reward/std": 1.32046639919281, + "rewards/length2tails_reward/mean": 0.844550609588623, + "rewards/length2tails_reward/std": 0.2336605340242386, + "rewards/thermo_reward/mean": 0.6701264381408691, + "rewards/thermo_reward/std": 1.9092317819595337, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.1405103961005807, + "epoch": 1.6099999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5941565036773682, + "learning_rate": 2.0681445127312213e-07, + "loss": -0.0026, + "num_tokens": 7043418.0, + "reward": 6.242457389831543, + "reward_std": 2.2547147274017334, + "rewards/fitness_reward/mean": 6.002241134643555, + "rewards/fitness_reward/std": 1.8122645616531372, + "rewards/kidney_reward/mean": -0.08604323118925095, + "rewards/kidney_reward/std": 1.29534113407135, + "rewards/length2tails_reward/mean": 0.7693860530853271, + "rewards/length2tails_reward/std": 0.2814192771911621, + "rewards/thermo_reward/mean": 0.18178275227546692, + "rewards/thermo_reward/std": 1.8364304304122925, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.53125, + "completions/mean_terminated_length": 269.53125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.1399152297526598, + "epoch": 1.612, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6891273260116577, + "learning_rate": 2.048049317976809e-07, + "loss": 0.0037, + "num_tokens": 7052075.0, + "reward": 6.234017372131348, + "reward_std": 2.945122480392456, + "rewards/fitness_reward/mean": 5.858860969543457, + "rewards/fitness_reward/std": 2.5184099674224854, + "rewards/kidney_reward/mean": 0.10943882912397385, + "rewards/kidney_reward/std": 1.201502799987793, + "rewards/length2tails_reward/mean": 0.7075182199478149, + "rewards/length2tails_reward/std": 0.32359179854393005, + "rewards/thermo_reward/mean": 0.2871146500110626, + "rewards/thermo_reward/std": 1.7703351974487305, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 270.59375, + "completions/mean_terminated_length": 270.59375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1428690068423748, + "epoch": 1.6139999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46447667479515076, + "learning_rate": 2.0280410844810424e-07, + "loss": 0.0009, + "num_tokens": 7060766.0, + "reward": 7.032682418823242, + "reward_std": 1.0429326295852661, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.426297128200531, + "rewards/kidney_reward/std": 1.3421649932861328, + "rewards/length2tails_reward/mean": 0.6655017733573914, + "rewards/length2tails_reward/std": 0.34165987372398376, + "rewards/thermo_reward/mean": 0.1237204298377037, + "rewards/thermo_reward/std": 1.7502018213272095, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14284992218017578, + "epoch": 1.616, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1443132162094116, + "learning_rate": 2.0081200310507528e-07, + "loss": 0.0021, + "num_tokens": 7069456.0, + "reward": 7.200804710388184, + "reward_std": 1.1046377420425415, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.023301243782043457, + "rewards/kidney_reward/std": 1.3528456687927246, + "rewards/length2tails_reward/mean": 0.8265809416770935, + "rewards/length2tails_reward/std": 0.20167027413845062, + "rewards/thermo_reward/mean": 0.782421886920929, + "rewards/thermo_reward/std": 1.600035548210144, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14760678727179766, + "epoch": 1.6179999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5073155760765076, + "learning_rate": 1.9882863755393908e-07, + "loss": 0.0051, + "num_tokens": 7078159.0, + "reward": 6.877035617828369, + "reward_std": 2.390430450439453, + "rewards/fitness_reward/mean": 6.3059892654418945, + "rewards/fitness_reward/std": 1.613950490951538, + "rewards/kidney_reward/mean": 0.0437052845954895, + "rewards/kidney_reward/std": 1.29998779296875, + "rewards/length2tails_reward/mean": 0.8162371516227722, + "rewards/length2tails_reward/std": 0.22516478598117828, + "rewards/thermo_reward/mean": 0.6902687549591064, + "rewards/thermo_reward/std": 1.5998762845993042, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 270.875, + "completions/mean_terminated_length": 270.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14818468876183033, + "epoch": 1.62, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8331058621406555, + "learning_rate": 1.9685403348446373e-07, + "loss": 0.0064, + "num_tokens": 7086859.0, + "reward": 7.085220813751221, + "reward_std": 0.9025030732154846, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.27463358640670776, + "rewards/kidney_reward/std": 1.150914192199707, + "rewards/length2tails_reward/mean": 0.7218012809753418, + "rewards/length2tails_reward/std": 0.30797049403190613, + "rewards/thermo_reward/mean": 0.3523106575012207, + "rewards/thermo_reward/std": 1.8932554721832275, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 273.28125, + "completions/mean_terminated_length": 273.28125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.15223387628793716, + "epoch": 1.6219999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0666444301605225, + "learning_rate": 1.9488821249060293e-07, + "loss": -0.0714, + "num_tokens": 7095636.0, + "reward": 6.91280460357666, + "reward_std": 2.35825514793396, + "rewards/fitness_reward/mean": 6.128098011016846, + "rewards/fitness_reward/std": 2.1011643409729004, + "rewards/kidney_reward/mean": 0.3937515616416931, + "rewards/kidney_reward/std": 1.2993850708007812, + "rewards/length2tails_reward/mean": 0.787409245967865, + "rewards/length2tails_reward/std": 0.2610521614551544, + "rewards/thermo_reward/mean": 0.7819565534591675, + "rewards/thermo_reward/std": 1.629075050354004, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13425441551953554, + "epoch": 1.624, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33635297417640686, + "learning_rate": 1.9293119607025987e-07, + "loss": -0.0018, + "num_tokens": 7104319.0, + "reward": 6.992291450500488, + "reward_std": 1.3824111223220825, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.39375168085098267, + "rewards/kidney_reward/std": 1.306369662284851, + "rewards/length2tails_reward/mean": 0.759617805480957, + "rewards/length2tails_reward/std": 0.2773108184337616, + "rewards/thermo_reward/mean": 0.44038185477256775, + "rewards/thermo_reward/std": 1.6529444456100464, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 271.40625, + "completions/mean_terminated_length": 271.40625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1357668610289693, + "epoch": 1.626, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9993639588356018, + "learning_rate": 1.9098300562505264e-07, + "loss": 0.0025, + "num_tokens": 7113036.0, + "reward": 6.908905982971191, + "reward_std": 1.4435275793075562, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.26299813389778137, + "rewards/kidney_reward/std": 1.391488790512085, + "rewards/length2tails_reward/mean": 0.7875744700431824, + "rewards/length2tails_reward/std": 0.27959251403808594, + "rewards/thermo_reward/mean": 0.18440839648246765, + "rewards/thermo_reward/std": 1.9216700792312622, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 266.5, + "completions/mean_terminated_length": 266.5, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.14211956411600113, + "epoch": 1.6280000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9641854763031006, + "learning_rate": 1.8904366246007997e-07, + "loss": -0.0494, + "num_tokens": 7121596.0, + "reward": 5.956068992614746, + "reward_std": 2.598559617996216, + "rewards/fitness_reward/mean": 5.666597366333008, + "rewards/fitness_reward/std": 2.548593759536743, + "rewards/kidney_reward/mean": 0.26107099652290344, + "rewards/kidney_reward/std": 1.2811338901519775, + "rewards/length2tails_reward/mean": 0.7258620262145996, + "rewards/length2tails_reward/std": 0.3040766716003418, + "rewards/thermo_reward/mean": -0.04505925998091698, + "rewards/thermo_reward/std": 1.9570527076721191, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.71875, + "completions/mean_terminated_length": 269.71875, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.12681808788329363, + "epoch": 1.63, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5192362666130066, + "learning_rate": 1.8711318778368789e-07, + "loss": 0.0061, + "num_tokens": 7130259.0, + "reward": 6.6440229415893555, + "reward_std": 2.303011417388916, + "rewards/fitness_reward/mean": 6.205258369445801, + "rewards/fitness_reward/std": 1.6861180067062378, + "rewards/kidney_reward/mean": 0.30537569522857666, + "rewards/kidney_reward/std": 1.3576401472091675, + "rewards/length2tails_reward/mean": 0.7708885669708252, + "rewards/length2tails_reward/std": 0.2693851888179779, + "rewards/thermo_reward/mean": 0.18670809268951416, + "rewards/thermo_reward/std": 2.076995611190796, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13475374784320593, + "epoch": 1.6320000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6971718668937683, + "learning_rate": 1.8519160270723854e-07, + "loss": 0.0024, + "num_tokens": 7138963.0, + "reward": 7.206413269042969, + "reward_std": 1.1324994564056396, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.5670938491821289, + "rewards/kidney_reward/std": 1.452012538909912, + "rewards/length2tails_reward/mean": 0.7637877464294434, + "rewards/length2tails_reward/std": 0.28725025057792664, + "rewards/thermo_reward/mean": 0.48721978068351746, + "rewards/thermo_reward/std": 1.802435040473938, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13609969802200794, + "epoch": 1.634, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6328021883964539, + "learning_rate": 1.832789282448779e-07, + "loss": -0.0071, + "num_tokens": 7147714.0, + "reward": 7.324970245361328, + "reward_std": 0.9820321202278137, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.47436121106147766, + "rewards/kidney_reward/std": 1.2194892168045044, + "rewards/length2tails_reward/mean": 0.8134773969650269, + "rewards/length2tails_reward/std": 0.24075032770633698, + "rewards/thermo_reward/mean": 0.586244523525238, + "rewards/thermo_reward/std": 1.6655681133270264, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.1564929075539112, + "epoch": 1.6360000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7099519371986389, + "learning_rate": 1.8137518531330764e-07, + "loss": -0.0056, + "num_tokens": 7156478.0, + "reward": 7.158771514892578, + "reward_std": 1.0499330759048462, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.7384054064750671, + "rewards/kidney_reward/std": 1.1375004053115845, + "rewards/length2tails_reward/mean": 0.7614933848381042, + "rewards/length2tails_reward/std": 0.33037009835243225, + "rewards/thermo_reward/mean": 0.4277508854866028, + "rewards/thermo_reward/std": 1.6975600719451904, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 263.90625, + "completions/mean_terminated_length": 263.90625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.12976836785674095, + "epoch": 1.638, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7589884400367737, + "learning_rate": 1.794803947315555e-07, + "loss": -0.0824, + "num_tokens": 7164955.0, + "reward": 6.463073253631592, + "reward_std": 2.420807361602783, + "rewards/fitness_reward/mean": 6.005888938903809, + "rewards/fitness_reward/std": 2.258882999420166, + "rewards/kidney_reward/mean": 0.1028832420706749, + "rewards/kidney_reward/std": 1.2945185899734497, + "rewards/length2tails_reward/mean": 0.7627241611480713, + "rewards/length2tails_reward/std": 0.29401490092277527, + "rewards/thermo_reward/mean": 0.43012282252311707, + "rewards/thermo_reward/std": 1.7847391366958618, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "entropy": 0.17181082256138325, + "epoch": 1.6400000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.445078134536743, + "learning_rate": 1.7759457722074766e-07, + "loss": -0.0121, + "num_tokens": 7173688.0, + "reward": 5.95964241027832, + "reward_std": 2.90617299079895, + "rewards/fitness_reward/mean": 5.662052154541016, + "rewards/fitness_reward/std": 2.907393455505371, + "rewards/kidney_reward/mean": -0.028043724596500397, + "rewards/kidney_reward/std": 1.2703628540039062, + "rewards/length2tails_reward/mean": 0.8470255136489868, + "rewards/length2tails_reward/std": 0.22639378905296326, + "rewards/thermo_reward/mean": 0.19971126317977905, + "rewards/thermo_reward/std": 2.109585762023926, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 275.75, + "completions/mean_terminated_length": 275.75, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.1558460621163249, + "epoch": 1.642, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7270005345344543, + "learning_rate": 1.7571775340388272e-07, + "loss": 0.0045, + "num_tokens": 7182544.0, + "reward": 6.957390308380127, + "reward_std": 1.1817389726638794, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.1663399338722229, + "rewards/kidney_reward/std": 1.3476557731628418, + "rewards/length2tails_reward/mean": 0.8373498320579529, + "rewards/length2tails_reward/std": 0.1934429109096527, + "rewards/thermo_reward/mean": 0.47984856367111206, + "rewards/thermo_reward/std": 1.604375958442688, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 263.0625, + "completions/mean_terminated_length": 263.0625, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.15894608478993177, + "epoch": 1.6440000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.293469190597534, + "learning_rate": 1.7384994380560448e-07, + "loss": -0.095, + "num_tokens": 7190994.0, + "reward": 5.762653350830078, + "reward_std": 3.522106170654297, + "rewards/fitness_reward/mean": 5.67220401763916, + "rewards/fitness_reward/std": 2.8764731884002686, + "rewards/kidney_reward/mean": -0.11606693267822266, + "rewards/kidney_reward/std": 1.4680663347244263, + "rewards/length2tails_reward/mean": 0.8622728586196899, + "rewards/length2tails_reward/std": 0.22785277664661407, + "rewards/thermo_reward/mean": -0.1341715008020401, + "rewards/thermo_reward/std": 2.0464932918548584, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1348210908472538, + "epoch": 1.646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4558381140232086, + "learning_rate": 1.7199116885197996e-07, + "loss": 0.0043, + "num_tokens": 7199693.0, + "reward": 6.687112331390381, + "reward_std": 2.265033483505249, + "rewards/fitness_reward/mean": 6.074307441711426, + "rewards/fitness_reward/std": 1.9026298522949219, + "rewards/kidney_reward/mean": 0.49384766817092896, + "rewards/kidney_reward/std": 1.4014723300933838, + "rewards/length2tails_reward/mean": 0.7414044737815857, + "rewards/length2tails_reward/std": 0.3127528429031372, + "rewards/thermo_reward/mean": 0.36106032133102417, + "rewards/thermo_reward/std": 1.657314419746399, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 280.65625, + "completions/mean_terminated_length": 280.65625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.16019416879862547, + "epoch": 1.6480000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.579204261302948, + "learning_rate": 1.7014144887027405e-07, + "loss": -0.004, + "num_tokens": 7208706.0, + "reward": 6.605236053466797, + "reward_std": 2.2676591873168945, + "rewards/fitness_reward/mean": 6.283232688903809, + "rewards/fitness_reward/std": 1.7426812648773193, + "rewards/kidney_reward/mean": -0.06617752462625504, + "rewards/kidney_reward/std": 1.3891545534133911, + "rewards/length2tails_reward/mean": 0.7619524002075195, + "rewards/length2tails_reward/std": 0.3083850145339966, + "rewards/thermo_reward/mean": 0.3292073607444763, + "rewards/thermo_reward/std": 1.7445483207702637, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13707383908331394, + "epoch": 1.65, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8161059021949768, + "learning_rate": 1.683008040887285e-07, + "loss": 0.0003, + "num_tokens": 7217396.0, + "reward": 6.974254608154297, + "reward_std": 1.2776795625686646, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.14366000890731812, + "rewards/kidney_reward/std": 1.3012430667877197, + "rewards/length2tails_reward/mean": 0.7557255625724792, + "rewards/length2tails_reward/std": 0.3187832832336426, + "rewards/thermo_reward/mean": 0.45036783814430237, + "rewards/thermo_reward/std": 1.8396073579788208, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.13950870279222727, + "epoch": 1.6520000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6956560611724854, + "learning_rate": 1.664692546363392e-07, + "loss": -0.0082, + "num_tokens": 7226131.0, + "reward": 6.393102645874023, + "reward_std": 2.120586633682251, + "rewards/fitness_reward/mean": 6.197003364562988, + "rewards/fitness_reward/std": 1.730019450187683, + "rewards/kidney_reward/mean": -0.06846331059932709, + "rewards/kidney_reward/std": 1.4518868923187256, + "rewards/length2tails_reward/mean": 0.7773817777633667, + "rewards/length2tails_reward/std": 0.2807996869087219, + "rewards/thermo_reward/mean": 0.07197131961584091, + "rewards/thermo_reward/std": 2.163224458694458, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.03125, + "completions/mean_terminated_length": 270.03125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.12410412635654211, + "epoch": 1.654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8254323601722717, + "learning_rate": 1.6464682054263767e-07, + "loss": 0.0052, + "num_tokens": 7234804.0, + "reward": 7.0719709396362305, + "reward_std": 1.0682038068771362, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.15627434849739075, + "rewards/kidney_reward/std": 1.2921384572982788, + "rewards/length2tails_reward/mean": 0.7281785011291504, + "rewards/length2tails_reward/std": 0.3003217577934265, + "rewards/thermo_reward/mean": 0.44098129868507385, + "rewards/thermo_reward/std": 1.7667676210403442, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.65625, + "completions/mean_terminated_length": 271.65625, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.15098392963409424, + "epoch": 1.6560000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.884126603603363, + "learning_rate": 1.6283352173747146e-07, + "loss": -0.0049, + "num_tokens": 7243529.0, + "reward": 6.955116271972656, + "reward_std": 1.2057723999023438, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.11913199722766876, + "rewards/kidney_reward/std": 1.388401746749878, + "rewards/length2tails_reward/mean": 0.8637431859970093, + "rewards/length2tails_reward/std": 0.2176281362771988, + "rewards/thermo_reward/mean": 0.620873749256134, + "rewards/thermo_reward/std": 1.4743573665618896, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.15889708884060383, + "epoch": 1.658, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546905755996704, + "learning_rate": 1.6102937805078542e-07, + "loss": -0.0044, + "num_tokens": 7252240.0, + "reward": 6.587793350219727, + "reward_std": 1.9747923612594604, + "rewards/fitness_reward/mean": 6.283938407897949, + "rewards/fitness_reward/std": 1.738689661026001, + "rewards/kidney_reward/mean": -0.4210178256034851, + "rewards/kidney_reward/std": 0.928088903427124, + "rewards/length2tails_reward/mean": 0.8309322595596313, + "rewards/length2tails_reward/std": 0.2687700092792511, + "rewards/thermo_reward/mean": 0.6132618188858032, + "rewards/thermo_reward/std": 1.4882124662399292, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 283.875, + "completions/mean_terminated_length": 283.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14827707316726446, + "epoch": 1.6600000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5123414993286133, + "learning_rate": 1.5923440921240639e-07, + "loss": 0.146, + "num_tokens": 7261356.0, + "reward": 6.196780204772949, + "reward_std": 4.031224250793457, + "rewards/fitness_reward/mean": 5.475707054138184, + "rewards/fitness_reward/std": 3.526364326477051, + "rewards/kidney_reward/mean": 0.34759849309921265, + "rewards/kidney_reward/std": 1.2305727005004883, + "rewards/length2tails_reward/mean": 0.7912404537200928, + "rewards/length2tails_reward/std": 0.25785160064697266, + "rewards/thermo_reward/mean": 0.6989284753799438, + "rewards/thermo_reward/std": 1.6143102645874023, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14084977097809315, + "epoch": 1.662, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5071995854377747, + "learning_rate": 1.5744863485182535e-07, + "loss": 0.0002, + "num_tokens": 7270059.0, + "reward": 7.149096488952637, + "reward_std": 1.3311703205108643, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.7804248332977295, + "rewards/kidney_reward/std": 1.2438985109329224, + "rewards/length2tails_reward/mean": 0.7402344942092896, + "rewards/length2tails_reward/std": 0.31538769602775574, + "rewards/thermo_reward/mean": 0.3770107328891754, + "rewards/thermo_reward/std": 1.8036571741104126, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.18307852651923895, + "epoch": 1.6640000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.794896125793457, + "learning_rate": 1.5567207449798515e-07, + "loss": 0.0169, + "num_tokens": 7278774.0, + "reward": 5.479580879211426, + "reward_std": 3.697575569152832, + "rewards/fitness_reward/mean": 5.116209983825684, + "rewards/fitness_reward/std": 3.4127492904663086, + "rewards/kidney_reward/mean": 0.3621028661727905, + "rewards/kidney_reward/std": 1.4506288766860962, + "rewards/length2tails_reward/mean": 0.7454742193222046, + "rewards/length2tails_reward/std": 0.3160284757614136, + "rewards/thermo_reward/mean": -0.008097946643829346, + "rewards/thermo_reward/std": 1.974648118019104, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 273.6875, + "completions/mean_terminated_length": 273.6875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.1516521004959941, + "epoch": 1.666, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.571131706237793, + "learning_rate": 1.5390474757906448e-07, + "loss": 0.0448, + "num_tokens": 7287564.0, + "reward": 6.586319923400879, + "reward_std": 2.2640275955200195, + "rewards/fitness_reward/mean": 6.125868797302246, + "rewards/fitness_reward/std": 2.1132843494415283, + "rewards/kidney_reward/mean": -0.13343212008476257, + "rewards/kidney_reward/std": 1.5046125650405884, + "rewards/length2tails_reward/mean": 0.8478919863700867, + "rewards/length2tails_reward/std": 0.16571584343910217, + "rewards/thermo_reward/mean": 0.6303893327713013, + "rewards/thermo_reward/std": 1.7262694835662842, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.0625, + "completions/mean_terminated_length": 269.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12323288340121508, + "epoch": 1.6680000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.849729597568512, + "learning_rate": 1.5214667342226816e-07, + "loss": 0.0041, + "num_tokens": 7296206.0, + "reward": 6.9101948738098145, + "reward_std": 1.0292601585388184, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.18415537476539612, + "rewards/kidney_reward/std": 1.192476749420166, + "rewards/length2tails_reward/mean": 0.746961236000061, + "rewards/length2tails_reward/std": 0.28581270575523376, + "rewards/thermo_reward/mean": 0.44846826791763306, + "rewards/thermo_reward/std": 1.7755752801895142, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 268.53125, + "completions/mean_terminated_length": 268.53125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "entropy": 0.1294352300465107, + "epoch": 1.67, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5036392211914062, + "learning_rate": 1.5039787125361326e-07, + "loss": -0.0003, + "num_tokens": 7304831.0, + "reward": 6.762212753295898, + "reward_std": 2.5893232822418213, + "rewards/fitness_reward/mean": 5.9096784591674805, + "rewards/fitness_reward/std": 2.325226068496704, + "rewards/kidney_reward/mean": 0.4972172677516937, + "rewards/kidney_reward/std": 1.136783480644226, + "rewards/length2tails_reward/mean": 0.7451069355010986, + "rewards/length2tails_reward/std": 0.31411874294281006, + "rewards/thermo_reward/mean": 0.8352988958358765, + "rewards/thermo_reward/std": 1.3285595178604126, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 286.71875, + "completions/mean_terminated_length": 286.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.18842580541968346, + "epoch": 1.6720000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.40046215057373, + "learning_rate": 1.4865836019771993e-07, + "loss": 0.1679, + "num_tokens": 7314038.0, + "reward": 6.866921424865723, + "reward_std": 2.5648040771484375, + "rewards/fitness_reward/mean": 5.974783897399902, + "rewards/fitness_reward/std": 2.4268317222595215, + "rewards/kidney_reward/mean": 0.6484463214874268, + "rewards/kidney_reward/std": 1.3802800178527832, + "rewards/length2tails_reward/mean": 0.7863222360610962, + "rewards/length2tails_reward/std": 0.2881692349910736, + "rewards/thermo_reward/mean": 0.7426664233207703, + "rewards/thermo_reward/std": 1.5013645887374878, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 282.90625, + "completions/mean_terminated_length": 282.90625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.1520282533019781, + "epoch": 1.674, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.291491508483887, + "learning_rate": 1.469281592776027e-07, + "loss": 0.1507, + "num_tokens": 7323123.0, + "reward": 6.265990257263184, + "reward_std": 2.812899589538574, + "rewards/fitness_reward/mean": 5.733098983764648, + "rewards/fitness_reward/std": 2.639847993850708, + "rewards/kidney_reward/mean": 0.014443039894104004, + "rewards/kidney_reward/std": 1.5156126022338867, + "rewards/length2tails_reward/mean": 0.8318912982940674, + "rewards/length2tails_reward/std": 0.2504137456417084, + "rewards/thermo_reward/mean": 0.6353940963745117, + "rewards/thermo_reward/std": 1.5909487009048462, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13566430285573006, + "epoch": 1.6760000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3573877811431885, + "learning_rate": 1.4520728741446086e-07, + "loss": 0.0002, + "num_tokens": 7331822.0, + "reward": 7.143100261688232, + "reward_std": 1.27162766456604, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.2948033809661865, + "rewards/kidney_reward/std": 1.2164710760116577, + "rewards/length2tails_reward/mean": 0.804263710975647, + "rewards/length2tails_reward/std": 0.2332266867160797, + "rewards/thermo_reward/mean": 0.4066682457923889, + "rewards/thermo_reward/std": 1.9505378007888794, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.03125, + "completions/mean_terminated_length": 270.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1343477349728346, + "epoch": 1.678, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4439067244529724, + "learning_rate": 1.434957634274746e-07, + "loss": 0.0022, + "num_tokens": 7340495.0, + "reward": 6.519376754760742, + "reward_std": 2.0977344512939453, + "rewards/fitness_reward/mean": 6.103977680206299, + "rewards/fitness_reward/std": 1.7522006034851074, + "rewards/kidney_reward/mean": 0.152456596493721, + "rewards/kidney_reward/std": 1.440082311630249, + "rewards/length2tails_reward/mean": 0.7946549654006958, + "rewards/length2tails_reward/std": 0.2629484236240387, + "rewards/thermo_reward/mean": 0.28101375699043274, + "rewards/thermo_reward/std": 1.9162328243255615, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13612825609743595, + "epoch": 1.6800000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1147886514663696, + "learning_rate": 1.4179360603359503e-07, + "loss": 0.0011, + "num_tokens": 7349198.0, + "reward": 7.322343826293945, + "reward_std": 1.2475579977035522, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.49828895926475525, + "rewards/kidney_reward/std": 1.3315337896347046, + "rewards/length2tails_reward/mean": 0.8391580581665039, + "rewards/length2tails_reward/std": 0.2007627934217453, + "rewards/thermo_reward/mean": 0.750201404094696, + "rewards/thermo_reward/std": 1.4005272388458252, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 280.1875, + "completions/mean_terminated_length": 280.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.15768698323518038, + "epoch": 1.682, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0588042736053467, + "learning_rate": 1.4010083384734306e-07, + "loss": 0.1321, + "num_tokens": 7358196.0, + "reward": 6.309168815612793, + "reward_std": 2.858778238296509, + "rewards/fitness_reward/mean": 5.777714729309082, + "rewards/fitness_reward/std": 2.8353958129882812, + "rewards/kidney_reward/mean": 0.10814039409160614, + "rewards/kidney_reward/std": 1.4258649349212646, + "rewards/length2tails_reward/mean": 0.7436399459838867, + "rewards/length2tails_reward/std": 0.31671804189682007, + "rewards/thermo_reward/mean": 0.5829468369483948, + "rewards/thermo_reward/std": 1.772304654121399, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.46875, + "completions/mean_terminated_length": 269.46875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "entropy": 0.1528618261218071, + "epoch": 1.6840000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0883593559265137, + "learning_rate": 1.384174653806044e-07, + "loss": -0.0153, + "num_tokens": 7366851.0, + "reward": 6.111975193023682, + "reward_std": 3.416835308074951, + "rewards/fitness_reward/mean": 5.5633440017700195, + "rewards/fitness_reward/std": 2.9556238651275635, + "rewards/kidney_reward/mean": 0.5944656133651733, + "rewards/kidney_reward/std": 1.4616034030914307, + "rewards/length2tails_reward/mean": 0.7819907069206238, + "rewards/length2tails_reward/std": 0.28782516717910767, + "rewards/thermo_reward/mean": 0.11180165410041809, + "rewards/thermo_reward/std": 1.9642974138259888, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15140019077807665, + "epoch": 1.686, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.618274211883545, + "learning_rate": 1.3674351904242608e-07, + "loss": 0.0354, + "num_tokens": 7375564.0, + "reward": 6.797853946685791, + "reward_std": 1.9600287675857544, + "rewards/fitness_reward/mean": 6.159884452819824, + "rewards/fitness_reward/std": 1.929038405418396, + "rewards/kidney_reward/mean": 0.2869061827659607, + "rewards/kidney_reward/std": 1.224570870399475, + "rewards/length2tails_reward/mean": 0.7241692543029785, + "rewards/length2tails_reward/std": 0.2719430923461914, + "rewards/thermo_reward/mean": 0.6269471645355225, + "rewards/thermo_reward/std": 1.5864113569259644, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 274.9375, + "completions/mean_terminated_length": 274.9375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13365026656538248, + "epoch": 1.688, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5036654472351074, + "learning_rate": 1.350790131388181e-07, + "loss": 0.0851, + "num_tokens": 7384394.0, + "reward": 6.495261192321777, + "reward_std": 2.676274061203003, + "rewards/fitness_reward/mean": 6.114790916442871, + "rewards/fitness_reward/std": 2.173590898513794, + "rewards/kidney_reward/mean": 0.3495478630065918, + "rewards/kidney_reward/std": 1.3514161109924316, + "rewards/length2tails_reward/mean": 0.7509225010871887, + "rewards/length2tails_reward/std": 0.2755270302295685, + "rewards/thermo_reward/mean": 0.03593030571937561, + "rewards/thermo_reward/std": 1.7318520545959473, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 274.84375, + "completions/mean_terminated_length": 274.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14785962086170912, + "epoch": 1.69, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7910501956939697, + "learning_rate": 1.3342396587254957e-07, + "loss": 0.0588, + "num_tokens": 7393221.0, + "reward": 6.4793291091918945, + "reward_std": 2.450364589691162, + "rewards/fitness_reward/mean": 5.838988304138184, + "rewards/fitness_reward/std": 2.601452350616455, + "rewards/kidney_reward/mean": 0.31948888301849365, + "rewards/kidney_reward/std": 1.538975715637207, + "rewards/length2tails_reward/mean": 0.7570391297340393, + "rewards/length2tails_reward/std": 0.29220935702323914, + "rewards/thermo_reward/mean": 0.5826727747917175, + "rewards/thermo_reward/std": 1.7818015813827515, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 273.78125, + "completions/mean_terminated_length": 273.78125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.17782294657081366, + "epoch": 1.692, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8454959392547607, + "learning_rate": 1.3177839534295277e-07, + "loss": 0.0171, + "num_tokens": 7402014.0, + "reward": 6.645394325256348, + "reward_std": 2.350458860397339, + "rewards/fitness_reward/mean": 6.233956336975098, + "rewards/fitness_reward/std": 2.0214316844940186, + "rewards/kidney_reward/mean": 0.33905676007270813, + "rewards/kidney_reward/std": 1.3427672386169434, + "rewards/length2tails_reward/mean": 0.7578026652336121, + "rewards/length2tails_reward/std": 0.2790738344192505, + "rewards/thermo_reward/mean": 0.10491818189620972, + "rewards/thermo_reward/std": 2.076547384262085, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.59375, + "completions/mean_terminated_length": 270.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12934922520071268, + "epoch": 1.694, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6795424818992615, + "learning_rate": 1.3014231954572286e-07, + "loss": 0.0031, + "num_tokens": 7410705.0, + "reward": 6.923931121826172, + "reward_std": 1.6895887851715088, + "rewards/fitness_reward/mean": 6.179342269897461, + "rewards/fitness_reward/std": 1.1073734760284424, + "rewards/kidney_reward/mean": 0.4949648380279541, + "rewards/kidney_reward/std": 1.4649431705474854, + "rewards/length2tails_reward/mean": 0.7548559308052063, + "rewards/length2tails_reward/std": 0.269429087638855, + "rewards/thermo_reward/mean": 0.6167846918106079, + "rewards/thermo_reward/std": 1.759171724319458, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 271.0625, + "completions/mean_terminated_length": 271.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14902009721845388, + "epoch": 1.696, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7889152765274048, + "learning_rate": 1.285157563727226e-07, + "loss": -0.0052, + "num_tokens": 7419411.0, + "reward": 6.836027145385742, + "reward_std": 1.8570570945739746, + "rewards/fitness_reward/mean": 6.19685697555542, + "rewards/fitness_reward/std": 1.7307991981506348, + "rewards/kidney_reward/mean": 0.4804500937461853, + "rewards/kidney_reward/std": 1.4231055974960327, + "rewards/length2tails_reward/mean": 0.7190946936607361, + "rewards/length2tails_reward/std": 0.3033592104911804, + "rewards/thermo_reward/mean": 0.43834346532821655, + "rewards/thermo_reward/std": 1.6495345830917358, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.40625, + "completions/mean_terminated_length": 269.40625, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "entropy": 0.13939962070435286, + "epoch": 1.698, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5602226257324219, + "learning_rate": 1.26898723611787e-07, + "loss": 0.0033, + "num_tokens": 7428064.0, + "reward": 7.376455307006836, + "reward_std": 1.5139089822769165, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.5721681714057922, + "rewards/kidney_reward/std": 1.167947769165039, + "rewards/length2tails_reward/mean": 0.7647218704223633, + "rewards/length2tails_reward/std": 0.24678917229175568, + "rewards/thermo_reward/mean": 0.8217629194259644, + "rewards/thermo_reward/std": 1.6348930597305298, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1650812430307269, + "epoch": 1.7, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4563441276550293, + "learning_rate": 1.252912389465266e-07, + "loss": 0.0299, + "num_tokens": 7436793.0, + "reward": 6.556080341339111, + "reward_std": 2.8344311714172363, + "rewards/fitness_reward/mean": 6.109508514404297, + "rewards/fitness_reward/std": 2.2023956775665283, + "rewards/kidney_reward/mean": 0.6741557121276855, + "rewards/kidney_reward/std": 1.199660062789917, + "rewards/length2tails_reward/mean": 0.7195312976837158, + "rewards/length2tails_reward/std": 0.30274638533592224, + "rewards/thermo_reward/mean": -0.1407776176929474, + "rewards/thermo_reward/std": 1.9188156127929688, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.59375, + "completions/mean_terminated_length": 270.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13511865586042404, + "epoch": 1.702, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6227527856826782, + "learning_rate": 1.2369331995613663e-07, + "loss": 0.0014, + "num_tokens": 7445484.0, + "reward": 6.782176494598389, + "reward_std": 1.1895499229431152, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.0177227221429348, + "rewards/kidney_reward/std": 1.5055136680603027, + "rewards/length2tails_reward/mean": 0.7957772016525269, + "rewards/length2tails_reward/std": 0.25311028957366943, + "rewards/thermo_reward/mean": -0.03385445475578308, + "rewards/thermo_reward/std": 1.7607090473175049, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.78125, + "completions/mean_terminated_length": 269.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1365264095366001, + "epoch": 1.704, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.40608924627304077, + "learning_rate": 1.2210498411520253e-07, + "loss": 0.0004, + "num_tokens": 7454149.0, + "reward": 7.0736541748046875, + "reward_std": 1.7951874732971191, + "rewards/fitness_reward/mean": 6.280205249786377, + "rewards/fitness_reward/std": 1.7598060369491577, + "rewards/kidney_reward/mean": 0.1357700079679489, + "rewards/kidney_reward/std": 1.3923213481903076, + "rewards/length2tails_reward/mean": 0.7220439314842224, + "rewards/length2tails_reward/std": 0.2839628756046295, + "rewards/thermo_reward/mean": 1.0901048183441162, + "rewards/thermo_reward/std": 1.1645164489746094, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14461984485387802, + "epoch": 1.706, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6263294219970703, + "learning_rate": 1.2052624879351103e-07, + "loss": -0.0072, + "num_tokens": 7462872.0, + "reward": 7.149246692657471, + "reward_std": 1.0854411125183105, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.31492069363594055, + "rewards/kidney_reward/std": 1.3848581314086914, + "rewards/length2tails_reward/mean": 0.7686634063720703, + "rewards/length2tails_reward/std": 0.31319841742515564, + "rewards/thermo_reward/mean": 0.41664430499076843, + "rewards/thermo_reward/std": 1.6596086025238037, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1368907280266285, + "epoch": 1.708, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3722684681415558, + "learning_rate": 1.1895713125585849e-07, + "loss": -0.0025, + "num_tokens": 7471593.0, + "reward": 6.9046430587768555, + "reward_std": 1.4990440607070923, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.3640343248844147, + "rewards/kidney_reward/std": 1.2583822011947632, + "rewards/length2tails_reward/mean": 0.8037251234054565, + "rewards/length2tails_reward/std": 0.2687337100505829, + "rewards/thermo_reward/mean": 0.06677071005105972, + "rewards/thermo_reward/std": 1.854303240776062, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.16624288819730282, + "epoch": 1.71, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.259831666946411, + "learning_rate": 1.1739764866186308e-07, + "loss": 0.013, + "num_tokens": 7480330.0, + "reward": 6.618877410888672, + "reward_std": 2.3948142528533936, + "rewards/fitness_reward/mean": 6.026947021484375, + "rewards/fitness_reward/std": 2.1481029987335205, + "rewards/kidney_reward/mean": 0.21269099414348602, + "rewards/kidney_reward/std": 1.3428399562835693, + "rewards/length2tails_reward/mean": 0.8097426891326904, + "rewards/length2tails_reward/std": 0.261301189661026, + "rewards/thermo_reward/mean": 0.5662983655929565, + "rewards/thermo_reward/std": 1.617175817489624, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 275.28125, + "completions/mean_terminated_length": 275.28125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13487814739346504, + "epoch": 1.712, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.240163803100586, + "learning_rate": 1.1584781806577659e-07, + "loss": 0.0944, + "num_tokens": 7489171.0, + "reward": 6.826386451721191, + "reward_std": 2.2010557651519775, + "rewards/fitness_reward/mean": 6.130312919616699, + "rewards/fitness_reward/std": 2.089127779006958, + "rewards/kidney_reward/mean": 0.8020090460777283, + "rewards/kidney_reward/std": 1.1603827476501465, + "rewards/length2tails_reward/mean": 0.6492934226989746, + "rewards/length2tails_reward/std": 0.3738612234592438, + "rewards/thermo_reward/mean": 0.2654907703399658, + "rewards/thermo_reward/std": 1.7134251594543457, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 273.21875, + "completions/mean_terminated_length": 273.21875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.19597869087010622, + "epoch": 1.714, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.632514476776123, + "learning_rate": 1.1430765641629769e-07, + "loss": -0.0028, + "num_tokens": 7497946.0, + "reward": 6.539190769195557, + "reward_std": 3.013807773590088, + "rewards/fitness_reward/mean": 5.791252136230469, + "rewards/fitness_reward/std": 2.7777340412139893, + "rewards/kidney_reward/mean": 0.18271785974502563, + "rewards/kidney_reward/std": 1.481676697731018, + "rewards/length2tails_reward/mean": 0.8152397871017456, + "rewards/length2tails_reward/std": 0.21467870473861694, + "rewards/thermo_reward/mean": 0.9055393934249878, + "rewards/thermo_reward/std": 1.590433120727539, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.09375, + "completions/mean_terminated_length": 271.09375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13027154561132193, + "epoch": 1.716, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7954562306404114, + "learning_rate": 1.1277718055638819e-07, + "loss": -0.0037, + "num_tokens": 7506653.0, + "reward": 6.789008140563965, + "reward_std": 1.3232008218765259, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.0037052780389785767, + "rewards/kidney_reward/std": 1.0773756504058838, + "rewards/length2tails_reward/mean": 0.8135291337966919, + "rewards/length2tails_reward/std": 0.26040005683898926, + "rewards/thermo_reward/mean": 0.19092856347560883, + "rewards/thermo_reward/std": 1.8923982381820679, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 270.59375, + "completions/mean_terminated_length": 270.59375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1470224941149354, + "epoch": 1.718, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7486069202423096, + "learning_rate": 1.1125640722308626e-07, + "loss": 0.0052, + "num_tokens": 7515344.0, + "reward": 6.916852951049805, + "reward_std": 1.3720299005508423, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.26475033164024353, + "rewards/kidney_reward/std": 1.4243637323379517, + "rewards/length2tails_reward/mean": 0.7544494271278381, + "rewards/length2tails_reward/std": 0.29098063707351685, + "rewards/thermo_reward/mean": 0.21511265635490417, + "rewards/thermo_reward/std": 1.7419779300689697, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.19795255735516548, + "epoch": 1.72, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.887604236602783, + "learning_rate": 1.097453530473258e-07, + "loss": -0.0017, + "num_tokens": 7524060.0, + "reward": 6.920472145080566, + "reward_std": 2.917083978652954, + "rewards/fitness_reward/mean": 5.953352451324463, + "rewards/fitness_reward/std": 2.5297625064849854, + "rewards/kidney_reward/mean": 0.49193012714385986, + "rewards/kidney_reward/std": 1.5334367752075195, + "rewards/length2tails_reward/mean": 0.8205807209014893, + "rewards/length2tails_reward/std": 0.2580236494541168, + "rewards/thermo_reward/mean": 1.0320186614990234, + "rewards/thermo_reward/std": 1.3274585008621216, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 269.46875, + "completions/mean_terminated_length": 269.46875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.13470555748790503, + "epoch": 1.722, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9139510989189148, + "learning_rate": 1.0824403455375286e-07, + "loss": -0.0066, + "num_tokens": 7532715.0, + "reward": 5.958120822906494, + "reward_std": 3.3007261753082275, + "rewards/fitness_reward/mean": 5.6109299659729, + "rewards/fitness_reward/std": 2.758577585220337, + "rewards/kidney_reward/mean": 0.34058311581611633, + "rewards/kidney_reward/std": 1.5214576721191406, + "rewards/length2tails_reward/mean": 0.7589548826217651, + "rewards/length2tails_reward/std": 0.300558865070343, + "rewards/thermo_reward/mean": -0.02567894756793976, + "rewards/thermo_reward/std": 1.9741401672363281, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.90625, + "completions/mean_terminated_length": 269.90625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1334414891898632, + "epoch": 1.724, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5678361654281616, + "learning_rate": 1.0675246816054584e-07, + "loss": -0.0012, + "num_tokens": 7541384.0, + "reward": 7.025940895080566, + "reward_std": 1.4083274602890015, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.09456261992454529, + "rewards/kidney_reward/std": 1.3638854026794434, + "rewards/length2tails_reward/mean": 0.7774710655212402, + "rewards/length2tails_reward/std": 0.260670930147171, + "rewards/thermo_reward/mean": 0.38598716259002686, + "rewards/thermo_reward/std": 1.948667287826538, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.14441207330673933, + "epoch": 1.726, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37577173113822937, + "learning_rate": 1.0527067017923652e-07, + "loss": 0.0009, + "num_tokens": 7550077.0, + "reward": 7.621313571929932, + "reward_std": 0.9573418498039246, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.6574143767356873, + "rewards/kidney_reward/std": 1.2293705940246582, + "rewards/length2tails_reward/mean": 0.7869528532028198, + "rewards/length2tails_reward/std": 0.2935744822025299, + "rewards/thermo_reward/mean": 1.0091394186019897, + "rewards/thermo_reward/std": 1.5361192226409912, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14310902636498213, + "epoch": 1.728, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6794575452804565, + "learning_rate": 1.0379865681452971e-07, + "loss": -0.0045, + "num_tokens": 7558757.0, + "reward": 7.241986274719238, + "reward_std": 1.2677793502807617, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.43365275859832764, + "rewards/kidney_reward/std": 1.2535395622253418, + "rewards/length2tails_reward/mean": 0.822176456451416, + "rewards/length2tails_reward/std": 0.24815700948238373, + "rewards/thermo_reward/mean": 0.4566337466239929, + "rewards/thermo_reward/std": 1.8501441478729248, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 274.9375, + "completions/mean_terminated_length": 274.9375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.1765538053587079, + "epoch": 1.73, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.416180372238159, + "learning_rate": 1.023364441641279e-07, + "loss": -0.0468, + "num_tokens": 7567587.0, + "reward": 6.229531764984131, + "reward_std": 3.4541232585906982, + "rewards/fitness_reward/mean": 5.801756858825684, + "rewards/fitness_reward/std": 2.73724102973938, + "rewards/kidney_reward/mean": 0.24523508548736572, + "rewards/kidney_reward/std": 1.5411834716796875, + "rewards/length2tails_reward/mean": 0.8638964891433716, + "rewards/length2tails_reward/std": 0.23644766211509705, + "rewards/thermo_reward/mean": 0.1783655732870102, + "rewards/thermo_reward/std": 1.8947030305862427, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.14107922557741404, + "epoch": 1.732, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8553034067153931, + "learning_rate": 1.0088404821855412e-07, + "loss": 0.0208, + "num_tokens": 7576341.0, + "reward": 6.971405506134033, + "reward_std": 1.288536787033081, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.23505310714244843, + "rewards/kidney_reward/std": 1.1954965591430664, + "rewards/length2tails_reward/mean": 0.7517733573913574, + "rewards/length2tails_reward/std": 0.3153061270713806, + "rewards/thermo_reward/mean": 0.35525327920913696, + "rewards/thermo_reward/std": 1.7748334407806396, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.15056451689451933, + "epoch": 1.734, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5692810416221619, + "learning_rate": 9.944148486097792e-08, + "loss": -0.0043, + "num_tokens": 7585063.0, + "reward": 7.364882946014404, + "reward_std": 1.3964899778366089, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.9183021187782288, + "rewards/kidney_reward/std": 1.2635247707366943, + "rewards/length2tails_reward/mean": 0.8077138662338257, + "rewards/length2tails_reward/std": 0.292235791683197, + "rewards/thermo_reward/mean": 0.43098878860473633, + "rewards/thermo_reward/std": 1.5578397512435913, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 270.0625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14127114601433277, + "epoch": 1.736, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7149919271469116, + "learning_rate": 9.800876986704109e-08, + "loss": 0.0008, + "num_tokens": 7593737.0, + "reward": 6.592535018920898, + "reward_std": 2.406205177307129, + "rewards/fitness_reward/mean": 6.248412609100342, + "rewards/fitness_reward/std": 1.939652442932129, + "rewards/kidney_reward/mean": 0.008557301014661789, + "rewards/kidney_reward/std": 1.5363945960998535, + "rewards/length2tails_reward/mean": 0.7467073202133179, + "rewards/length2tails_reward/std": 0.2775883078575134, + "rewards/thermo_reward/mean": 0.3063332736492157, + "rewards/thermo_reward/std": 1.8663932085037231, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.34375, + "completions/mean_terminated_length": 269.34375, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "entropy": 0.13826614152640104, + "epoch": 1.738, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1713569164276123, + "learning_rate": 9.658591890468514e-08, + "loss": -0.0056, + "num_tokens": 7602388.0, + "reward": 5.815595626831055, + "reward_std": 3.479989528656006, + "rewards/fitness_reward/mean": 5.640358924865723, + "rewards/fitness_reward/std": 2.988858938217163, + "rewards/kidney_reward/mean": 0.15935379266738892, + "rewards/kidney_reward/std": 1.3506901264190674, + "rewards/length2tails_reward/mean": 0.7489001750946045, + "rewards/length2tails_reward/std": 0.31669992208480835, + "rewards/thermo_reward/mean": -0.18332980573177338, + "rewards/thermo_reward/std": 1.9829697608947754, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 268.25, + "completions/mean_terminated_length": 268.25, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.15171773824840784, + "epoch": 1.74, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.003143787384033, + "learning_rate": 9.517294753398064e-08, + "loss": -0.0149, + "num_tokens": 7611004.0, + "reward": 5.929316520690918, + "reward_std": 3.3550143241882324, + "rewards/fitness_reward/mean": 5.575882434844971, + "rewards/fitness_reward/std": 3.2158267498016357, + "rewards/kidney_reward/mean": 0.10459327697753906, + "rewards/kidney_reward/std": 1.4213767051696777, + "rewards/length2tails_reward/mean": 0.7260620594024658, + "rewards/length2tails_reward/std": 0.2993079423904419, + "rewards/thermo_reward/mean": 0.23924347758293152, + "rewards/thermo_reward/std": 1.769123911857605, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.28125, + "completions/mean_terminated_length": 269.28125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13259113393723965, + "epoch": 1.742, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5299409627914429, + "learning_rate": 9.376987120695545e-08, + "loss": -0.0011, + "num_tokens": 7619653.0, + "reward": 7.236759662628174, + "reward_std": 1.0758943557739258, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.5730593204498291, + "rewards/kidney_reward/std": 1.3397749662399292, + "rewards/length2tails_reward/mean": 0.7181392908096313, + "rewards/length2tails_reward/std": 0.3129393458366394, + "rewards/thermo_reward/mean": 0.3587936758995056, + "rewards/thermo_reward/std": 1.7715466022491455, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12837810162454844, + "epoch": 1.744, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6544708013534546, + "learning_rate": 9.237670526742791e-08, + "loss": 0.0017, + "num_tokens": 7628350.0, + "reward": 6.732784271240234, + "reward_std": 1.5039775371551514, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.13095012307167053, + "rewards/kidney_reward/std": 1.3360952138900757, + "rewards/length2tails_reward/mean": 0.814708948135376, + "rewards/length2tails_reward/std": 0.2178410440683365, + "rewards/thermo_reward/mean": 0.41852378845214844, + "rewards/thermo_reward/std": 1.7801318168640137, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 272.8125, + "completions/mean_terminated_length": 272.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14584413263946772, + "epoch": 1.746, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6890602111816406, + "learning_rate": 9.099346495083749e-08, + "loss": 0.0427, + "num_tokens": 7637112.0, + "reward": 6.361073017120361, + "reward_std": 2.8738362789154053, + "rewards/fitness_reward/mean": 5.92578125, + "rewards/fitness_reward/std": 2.6346867084503174, + "rewards/kidney_reward/mean": 0.3251514434814453, + "rewards/kidney_reward/std": 1.198372483253479, + "rewards/length2tails_reward/mean": 0.7073013782501221, + "rewards/length2tails_reward/std": 0.32669582962989807, + "rewards/thermo_reward/mean": 0.19178199768066406, + "rewards/thermo_reward/std": 1.8539153337478638, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 262.90625, + "completions/mean_terminated_length": 262.90625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.20052525494247675, + "epoch": 1.748, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7776063680648804, + "learning_rate": 8.96201653840788e-08, + "loss": -0.0952, + "num_tokens": 7645557.0, + "reward": 5.555744171142578, + "reward_std": 3.82283091545105, + "rewards/fitness_reward/mean": 5.20194149017334, + "rewards/fitness_reward/std": 3.491765260696411, + "rewards/kidney_reward/mean": -0.038750261068344116, + "rewards/kidney_reward/std": 1.3175898790359497, + "rewards/length2tails_reward/mean": 0.8112931251525879, + "rewards/length2tails_reward/std": 0.25601938366889954, + "rewards/thermo_reward/mean": 0.34071022272109985, + "rewards/thermo_reward/std": 1.9983152151107788, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 280.25, + "completions/mean_terminated_length": 280.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.15280383359640837, + "epoch": 1.75, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2936370372772217, + "learning_rate": 8.825682158533553e-08, + "loss": 0.0633, + "num_tokens": 7654557.0, + "reward": 6.904385089874268, + "reward_std": 1.084137201309204, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.1500115543603897, + "rewards/kidney_reward/std": 1.357714295387268, + "rewards/length2tails_reward/mean": 0.8248668909072876, + "rewards/length2tails_reward/std": 0.1853381097316742, + "rewards/thermo_reward/mean": 0.36375191807746887, + "rewards/thermo_reward/std": 1.8171186447143555, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 275.90625, + "completions/mean_terminated_length": 275.90625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14687765389680862, + "epoch": 1.752, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3300075829029083, + "learning_rate": 8.690344846391729e-08, + "loss": 0.0091, + "num_tokens": 7663418.0, + "reward": 7.049173831939697, + "reward_std": 1.1484054327011108, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.16825039684772491, + "rewards/kidney_reward/std": 1.362807273864746, + "rewards/length2tails_reward/mean": 0.7734864950180054, + "rewards/length2tails_reward/std": 0.2753138542175293, + "rewards/thermo_reward/mean": 0.5667352676391602, + "rewards/thermo_reward/std": 1.5838664770126343, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 270.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13390665873885155, + "epoch": 1.754, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6301953792572021, + "learning_rate": 8.556006082009559e-08, + "loss": 0.0004, + "num_tokens": 7672092.0, + "reward": 7.138387680053711, + "reward_std": 1.2911202907562256, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.5173478722572327, + "rewards/kidney_reward/std": 1.373558521270752, + "rewards/length2tails_reward/mean": 0.7490646243095398, + "rewards/length2tails_reward/std": 0.2916795611381531, + "rewards/thermo_reward/mean": 0.2022983878850937, + "rewards/thermo_reward/std": 1.8432154655456543, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 273.0, + "completions/mean_terminated_length": 273.0, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.15134918317198753, + "epoch": 1.756, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9220150113105774, + "learning_rate": 8.422667334494249e-08, + "loss": -0.0124, + "num_tokens": 7680860.0, + "reward": 7.084668159484863, + "reward_std": 1.864588737487793, + "rewards/fitness_reward/mean": 6.310906887054443, + "rewards/fitness_reward/std": 1.5861327648162842, + "rewards/kidney_reward/mean": 0.3537640869617462, + "rewards/kidney_reward/std": 1.1724754571914673, + "rewards/length2tails_reward/mean": 0.7611767649650574, + "rewards/length2tails_reward/std": 0.2823205590248108, + "rewards/thermo_reward/mean": 0.813170313835144, + "rewards/thermo_reward/std": 1.5483834743499756, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 270.625, + "completions/mean_terminated_length": 270.625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1470579979941249, + "epoch": 1.758, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37683096528053284, + "learning_rate": 8.290330062017014e-08, + "loss": -0.0096, + "num_tokens": 7689552.0, + "reward": 7.244678497314453, + "reward_std": 1.1117514371871948, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.5936733484268188, + "rewards/kidney_reward/std": 1.2107832431793213, + "rewards/length2tails_reward/mean": 0.7467259764671326, + "rewards/length2tails_reward/std": 0.3155352771282196, + "rewards/thermo_reward/mean": 0.5457018613815308, + "rewards/thermo_reward/std": 1.5984930992126465, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 274.9375, + "completions/mean_terminated_length": 274.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.17980888672173023, + "epoch": 1.76, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.92822265625, + "learning_rate": 8.158995711797e-08, + "loss": 0.0742, + "num_tokens": 7698382.0, + "reward": 6.669667720794678, + "reward_std": 2.109738349914551, + "rewards/fitness_reward/mean": 6.226877212524414, + "rewards/fitness_reward/std": 2.061476945877075, + "rewards/kidney_reward/mean": -0.0723903626203537, + "rewards/kidney_reward/std": 1.3391774892807007, + "rewards/length2tails_reward/mean": 0.8269414901733398, + "rewards/length2tails_reward/std": 0.22345077991485596, + "rewards/thermo_reward/mean": 0.5445005893707275, + "rewards/thermo_reward/std": 1.7287259101867676, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1400172933936119, + "epoch": 1.762, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24981628358364105, + "learning_rate": 8.028665720085659e-08, + "loss": -0.0062, + "num_tokens": 7707101.0, + "reward": 6.8972272872924805, + "reward_std": 2.1188056468963623, + "rewards/fitness_reward/mean": 6.206755638122559, + "rewards/fitness_reward/std": 1.6781727075576782, + "rewards/kidney_reward/mean": 0.3389732241630554, + "rewards/kidney_reward/std": 1.286360502243042, + "rewards/length2tails_reward/mean": 0.7938892841339111, + "rewards/length2tails_reward/std": 0.28922373056411743, + "rewards/thermo_reward/mean": 0.6450251340866089, + "rewards/thermo_reward/std": 1.8009586334228516, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 265.6875, + "completions/mean_terminated_length": 265.6875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.17370106279850006, + "epoch": 1.764, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4611979722976685, + "learning_rate": 7.899341512150892e-08, + "loss": -0.0341, + "num_tokens": 7715635.0, + "reward": 6.483504295349121, + "reward_std": 1.6373029947280884, + "rewards/fitness_reward/mean": 6.105436325073242, + "rewards/fitness_reward/std": 1.7448866367340088, + "rewards/kidney_reward/mean": 0.28378981351852417, + "rewards/kidney_reward/std": 1.2895056009292603, + "rewards/length2tails_reward/mean": 0.7321654558181763, + "rewards/length2tails_reward/std": 0.3196871280670166, + "rewards/thermo_reward/mean": 0.10626305639743805, + "rewards/thermo_reward/std": 1.7605559825897217, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.09375, + "completions/mean_terminated_length": 270.09375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "entropy": 0.14771200716495514, + "epoch": 1.766, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.318401575088501, + "learning_rate": 7.771024502261525e-08, + "loss": -0.02, + "num_tokens": 7724310.0, + "reward": 6.454639911651611, + "reward_std": 2.4978907108306885, + "rewards/fitness_reward/mean": 6.211494445800781, + "rewards/fitness_reward/std": 2.1484947204589844, + "rewards/kidney_reward/mean": -0.548114538192749, + "rewards/kidney_reward/std": 1.4144492149353027, + "rewards/length2tails_reward/mean": 0.8405541181564331, + "rewards/length2tails_reward/std": 0.19978399574756622, + "rewards/thermo_reward/mean": 0.6141285300254822, + "rewards/thermo_reward/std": 1.7411296367645264, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1276962710544467, + "epoch": 1.768, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5441233515739441, + "learning_rate": 7.643716093671826e-08, + "loss": 0.002, + "num_tokens": 7733009.0, + "reward": 6.225451469421387, + "reward_std": 2.8613123893737793, + "rewards/fitness_reward/mean": 5.846860885620117, + "rewards/fitness_reward/std": 2.5778920650482178, + "rewards/kidney_reward/mean": -0.29991012811660767, + "rewards/kidney_reward/std": 1.223515272140503, + "rewards/length2tails_reward/mean": 0.7918553948402405, + "rewards/length2tails_reward/std": 0.2854537069797516, + "rewards/thermo_reward/mean": 0.6611631512641907, + "rewards/thermo_reward/std": 1.8043864965438843, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.12592537607997656, + "epoch": 1.77, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1005501747131348, + "learning_rate": 7.51741767860612e-08, + "loss": 0.0084, + "num_tokens": 7741704.0, + "reward": 7.101147651672363, + "reward_std": 1.5783418416976929, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.4693070650100708, + "rewards/kidney_reward/std": 1.224630355834961, + "rewards/length2tails_reward/mean": 0.7284267544746399, + "rewards/length2tails_reward/std": 0.3301963210105896, + "rewards/thermo_reward/mean": 0.3921566307544708, + "rewards/thermo_reward/std": 1.8385539054870605, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1400387305766344, + "epoch": 1.772, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2703351974487305, + "learning_rate": 7.392130638243665e-08, + "loss": 0.0014, + "num_tokens": 7750427.0, + "reward": 6.691196441650391, + "reward_std": 2.474586248397827, + "rewards/fitness_reward/mean": 5.979096412658691, + "rewards/fitness_reward/std": 2.4129538536071777, + "rewards/kidney_reward/mean": 0.5387409925460815, + "rewards/kidney_reward/std": 1.5256801843643188, + "rewards/length2tails_reward/mean": 0.8083356618881226, + "rewards/length2tails_reward/std": 0.2541159391403198, + "rewards/thermo_reward/mean": 0.48129159212112427, + "rewards/thermo_reward/std": 1.8087587356567383, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.09375, + "completions/mean_terminated_length": 271.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1450464529916644, + "epoch": 1.774, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1582823991775513, + "learning_rate": 7.26785634270346e-08, + "loss": -0.0005, + "num_tokens": 7759134.0, + "reward": 7.495002746582031, + "reward_std": 1.0697494745254517, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.436704158782959, + "rewards/kidney_reward/std": 1.3325331211090088, + "rewards/length2tails_reward/mean": 0.8409305810928345, + "rewards/length2tails_reward/std": 0.16789835691452026, + "rewards/thermo_reward/mean": 0.9502402544021606, + "rewards/thermo_reward/std": 1.3539648056030273, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.17105464451014996, + "epoch": 1.776, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.4417619705200195, + "learning_rate": 7.144596151029303e-08, + "loss": -0.0113, + "num_tokens": 7767818.0, + "reward": 6.825206756591797, + "reward_std": 2.593777656555176, + "rewards/fitness_reward/mean": 6.118467330932617, + "rewards/fitness_reward/std": 2.1535637378692627, + "rewards/kidney_reward/mean": 0.359657883644104, + "rewards/kidney_reward/std": 1.5858391523361206, + "rewards/length2tails_reward/mean": 0.7853270769119263, + "rewards/length2tails_reward/std": 0.29089006781578064, + "rewards/thermo_reward/mean": 0.6611579060554504, + "rewards/thermo_reward/std": 1.627814531326294, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1219852464273572, + "epoch": 1.778, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6431599259376526, + "learning_rate": 7.022351411174865e-08, + "loss": 0.0029, + "num_tokens": 7776505.0, + "reward": 6.884721755981445, + "reward_std": 1.1603665351867676, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.10085730999708176, + "rewards/kidney_reward/std": 1.325021505355835, + "rewards/length2tails_reward/mean": 0.7144224643707275, + "rewards/length2tails_reward/std": 0.2776328921318054, + "rewards/thermo_reward/mean": 0.5407341122627258, + "rewards/thermo_reward/std": 1.7512967586517334, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 284.28125, + "completions/mean_terminated_length": 284.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.17884879652410746, + "epoch": 1.78, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.968055725097656, + "learning_rate": 6.901123459989066e-08, + "loss": 0.2512, + "num_tokens": 7785634.0, + "reward": 6.550700664520264, + "reward_std": 2.344426155090332, + "rewards/fitness_reward/mean": 6.221928596496582, + "rewards/fitness_reward/std": 2.0894691944122314, + "rewards/kidney_reward/mean": 0.07067226618528366, + "rewards/kidney_reward/std": 1.3066167831420898, + "rewards/length2tails_reward/mean": 0.7352291941642761, + "rewards/length2tails_reward/std": 0.271246075630188, + "rewards/thermo_reward/mean": 0.2192571461200714, + "rewards/thermo_reward/std": 1.9306559562683105, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 269.9375, + "completions/mean_terminated_length": 269.9375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1206783689558506, + "epoch": 1.782, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4432059526443481, + "learning_rate": 6.780913623201345e-08, + "loss": 0.004, + "num_tokens": 7794304.0, + "reward": 6.147652626037598, + "reward_std": 2.3736939430236816, + "rewards/fitness_reward/mean": 6.129303932189941, + "rewards/fitness_reward/std": 2.094611883163452, + "rewards/kidney_reward/mean": -0.20430654287338257, + "rewards/kidney_reward/std": 1.285444736480713, + "rewards/length2tails_reward/mean": 0.7060990333557129, + "rewards/length2tails_reward/std": 0.3547629117965698, + "rewards/thermo_reward/mean": -0.11204513907432556, + "rewards/thermo_reward/std": 2.097207546234131, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.16053821239620447, + "epoch": 1.784, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9191988706588745, + "learning_rate": 6.661723215407222e-08, + "loss": 0.0089, + "num_tokens": 7802980.0, + "reward": 6.828422546386719, + "reward_std": 2.34560227394104, + "rewards/fitness_reward/mean": 6.0311808586120605, + "rewards/fitness_reward/std": 2.1259360313415527, + "rewards/kidney_reward/mean": 0.24174097180366516, + "rewards/kidney_reward/std": 1.343316674232483, + "rewards/length2tails_reward/mean": 0.855763852596283, + "rewards/length2tails_reward/std": 0.23646438121795654, + "rewards/thermo_reward/mean": 0.9248610734939575, + "rewards/thermo_reward/std": 1.6797188520431519, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1538703255355358, + "epoch": 1.786, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6966981887817383, + "learning_rate": 6.543553540053926e-08, + "loss": 0.0118, + "num_tokens": 7811693.0, + "reward": 6.792122840881348, + "reward_std": 2.712594747543335, + "rewards/fitness_reward/mean": 6.116663932800293, + "rewards/fitness_reward/std": 2.163386106491089, + "rewards/kidney_reward/mean": 0.6079121828079224, + "rewards/kidney_reward/std": 1.1184345483779907, + "rewards/length2tails_reward/mean": 0.7678369283676147, + "rewards/length2tails_reward/std": 0.29102542996406555, + "rewards/thermo_reward/mean": 0.3590865433216095, + "rewards/thermo_reward/std": 1.8563789129257202, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.03125, + "completions/mean_terminated_length": 270.03125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1296884110197425, + "epoch": 1.788, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3490790128707886, + "learning_rate": 6.426405889426046e-08, + "loss": 0.0034, + "num_tokens": 7820366.0, + "reward": 5.937580108642578, + "reward_std": 3.626763105392456, + "rewards/fitness_reward/mean": 5.515270709991455, + "rewards/fitness_reward/std": 3.1108944416046143, + "rewards/kidney_reward/mean": 0.18754181265830994, + "rewards/kidney_reward/std": 1.4053640365600586, + "rewards/length2tails_reward/mean": 0.7833015322685242, + "rewards/length2tails_reward/std": 0.28317564725875854, + "rewards/thermo_reward/mean": 0.26542627811431885, + "rewards/thermo_reward/std": 1.8957198858261108, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 263.4375, + "completions/mean_terminated_length": 263.4375, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.14165944885462523, + "epoch": 1.79, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6650650501251221, + "learning_rate": 6.310281544631546e-08, + "loss": -0.1071, + "num_tokens": 7828828.0, + "reward": 6.596134185791016, + "reward_std": 2.8833301067352295, + "rewards/fitness_reward/mean": 5.952981948852539, + "rewards/fitness_reward/std": 2.51713490486145, + "rewards/kidney_reward/mean": 0.4031717777252197, + "rewards/kidney_reward/std": 1.3050614595413208, + "rewards/length2tails_reward/mean": 0.718561053276062, + "rewards/length2tails_reward/std": 0.32841771841049194, + "rewards/thermo_reward/mean": 0.5238514542579651, + "rewards/thermo_reward/std": 1.6414804458618164, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.13612266164273024, + "epoch": 1.792, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7729809284210205, + "learning_rate": 6.195181775587654e-08, + "loss": -0.0014, + "num_tokens": 7837515.0, + "reward": 7.305784702301025, + "reward_std": 1.2178415060043335, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.11849098652601242, + "rewards/kidney_reward/std": 1.4096735715866089, + "rewards/length2tails_reward/mean": 0.7737295627593994, + "rewards/length2tails_reward/std": 0.299077570438385, + "rewards/thermo_reward/mean": 0.9236170649528503, + "rewards/thermo_reward/std": 1.5738530158996582, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.1875, + "completions/mean_terminated_length": 270.1875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13240704778581858, + "epoch": 1.794, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4106767475605011, + "learning_rate": 6.081107841007004e-08, + "loss": -0.0058, + "num_tokens": 7846193.0, + "reward": 6.7430419921875, + "reward_std": 1.967689871788025, + "rewards/fitness_reward/mean": 6.283493518829346, + "rewards/fitness_reward/std": 1.741206407546997, + "rewards/kidney_reward/mean": -0.15647932887077332, + "rewards/kidney_reward/std": 1.4461088180541992, + "rewards/length2tails_reward/mean": 0.7610254287719727, + "rewards/length2tails_reward/std": 0.2843559980392456, + "rewards/thermo_reward/mean": 0.6950637698173523, + "rewards/thermo_reward/std": 1.5674270391464233, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 276.3125, + "completions/mean_terminated_length": 276.3125, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.15826256200671196, + "epoch": 1.796, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.426916241645813, + "learning_rate": 5.968060988383883e-08, + "loss": -0.0008, + "num_tokens": 7855067.0, + "reward": 6.635344505310059, + "reward_std": 3.0358753204345703, + "rewards/fitness_reward/mean": 5.7535176277160645, + "rewards/fitness_reward/std": 2.581522226333618, + "rewards/kidney_reward/mean": 0.38117876648902893, + "rewards/kidney_reward/std": 1.3751283884048462, + "rewards/length2tails_reward/mean": 0.8234817981719971, + "rewards/length2tails_reward/std": 0.23257982730865479, + "rewards/thermo_reward/mean": 0.9707328677177429, + "rewards/thermo_reward/std": 1.3778488636016846, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 278.15625, + "completions/mean_terminated_length": 278.15625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1904309457167983, + "epoch": 1.798, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.930673122406006, + "learning_rate": 5.8560424539805255e-08, + "loss": 0.1001, + "num_tokens": 7864000.0, + "reward": 6.422598361968994, + "reward_std": 2.912506103515625, + "rewards/fitness_reward/mean": 5.703462600708008, + "rewards/fitness_reward/std": 2.7510440349578857, + "rewards/kidney_reward/mean": 0.4721541404724121, + "rewards/kidney_reward/std": 1.3861840963363647, + "rewards/length2tails_reward/mean": 0.7354128360748291, + "rewards/length2tails_reward/std": 0.2993165850639343, + "rewards/thermo_reward/mean": 0.5984115600585938, + "rewards/thermo_reward/std": 1.752209186553955, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 267.65625, + "completions/mean_terminated_length": 267.65625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "entropy": 0.13632580637931824, + "epoch": 1.8, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7912124395370483, + "learning_rate": 5.745053462813698e-08, + "loss": -0.0321, + "num_tokens": 7872597.0, + "reward": 6.588764667510986, + "reward_std": 2.591097593307495, + "rewards/fitness_reward/mean": 6.114811420440674, + "rewards/fitness_reward/std": 2.1734812259674072, + "rewards/kidney_reward/mean": 0.43896540999412537, + "rewards/kidney_reward/std": 1.2711267471313477, + "rewards/length2tails_reward/mean": 0.6907632946968079, + "rewards/length2tails_reward/std": 0.3464326858520508, + "rewards/thermo_reward/mean": 0.16355997323989868, + "rewards/thermo_reward/std": 1.9130176305770874, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 277.40625, + "completions/mean_terminated_length": 277.40625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.16155453212559223, + "epoch": 1.802, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1367851495742798, + "learning_rate": 5.635095228641229e-08, + "loss": -0.007, + "num_tokens": 7881506.0, + "reward": 6.575996398925781, + "reward_std": 1.951995611190796, + "rewards/fitness_reward/mean": 6.191231727600098, + "rewards/fitness_reward/std": 1.7607989311218262, + "rewards/kidney_reward/mean": 0.38982832431793213, + "rewards/kidney_reward/std": 1.3408080339431763, + "rewards/length2tails_reward/mean": 0.8052226305007935, + "rewards/length2tails_reward/std": 0.2901822030544281, + "rewards/thermo_reward/mean": -0.02291056513786316, + "rewards/thermo_reward/std": 2.0373799800872803, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 269.0625, + "completions/mean_terminated_length": 269.0625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.1660868115723133, + "epoch": 1.804, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.083187103271484, + "learning_rate": 5.526168953948751e-08, + "loss": -0.0157, + "num_tokens": 7890148.0, + "reward": 6.006649017333984, + "reward_std": 3.3939294815063477, + "rewards/fitness_reward/mean": 5.553999423980713, + "rewards/fitness_reward/std": 3.2893714904785156, + "rewards/kidney_reward/mean": 0.020442910492420197, + "rewards/kidney_reward/std": 1.3696351051330566, + "rewards/length2tails_reward/mean": 0.7577709555625916, + "rewards/length2tails_reward/std": 0.2945621907711029, + "rewards/thermo_reward/mean": 0.5059705972671509, + "rewards/thermo_reward/std": 1.7821928262710571, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 266.75, + "completions/mean_terminated_length": 266.75, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.14469025656580925, + "epoch": 1.806, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4151942729949951, + "learning_rate": 5.4182758299365364e-08, + "loss": -0.0545, + "num_tokens": 7898716.0, + "reward": 6.381585121154785, + "reward_std": 3.3608179092407227, + "rewards/fitness_reward/mean": 5.826628684997559, + "rewards/fitness_reward/std": 3.0092155933380127, + "rewards/kidney_reward/mean": 0.24827441573143005, + "rewards/kidney_reward/std": 1.2708081007003784, + "rewards/length2tails_reward/mean": 0.8759379386901855, + "rewards/length2tails_reward/std": 0.19582390785217285, + "rewards/thermo_reward/mean": 0.42367005348205566, + "rewards/thermo_reward/std": 1.8299076557159424, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 265.96875, + "completions/mean_terminated_length": 265.96875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.14840120822191238, + "epoch": 1.808, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7651766538619995, + "learning_rate": 5.311417036506516e-08, + "loss": -0.0554, + "num_tokens": 7907259.0, + "reward": 6.51305627822876, + "reward_std": 2.578157901763916, + "rewards/fitness_reward/mean": 5.82460880279541, + "rewards/fitness_reward/std": 2.6523773670196533, + "rewards/kidney_reward/mean": 0.1977565884590149, + "rewards/kidney_reward/std": 1.5997408628463745, + "rewards/length2tails_reward/mean": 0.7544538378715515, + "rewards/length2tails_reward/std": 0.2939538359642029, + "rewards/thermo_reward/mean": 0.8019118309020996, + "rewards/thermo_reward/std": 1.5719510316848755, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.90625, + "completions/mean_terminated_length": 269.90625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1253397213295102, + "epoch": 1.81, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.51847243309021, + "learning_rate": 5.2055937422493254e-08, + "loss": -0.0053, + "num_tokens": 7915928.0, + "reward": 6.530694007873535, + "reward_std": 2.1286239624023438, + "rewards/fitness_reward/mean": 6.198827743530273, + "rewards/fitness_reward/std": 1.72030508518219, + "rewards/kidney_reward/mean": 0.2919164299964905, + "rewards/kidney_reward/std": 1.3949557542800903, + "rewards/length2tails_reward/mean": 0.7195277214050293, + "rewards/length2tails_reward/std": 0.3228347897529602, + "rewards/thermo_reward/mean": 0.0120522640645504, + "rewards/thermo_reward/std": 1.8861885070800781, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14113183226436377, + "epoch": 1.812, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5314672589302063, + "learning_rate": 5.10080710443157e-08, + "loss": 0.0056, + "num_tokens": 7924600.0, + "reward": 6.955929756164551, + "reward_std": 1.5478595495224, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.2759285569190979, + "rewards/kidney_reward/std": 1.3739991188049316, + "rewards/length2tails_reward/mean": 0.7502394914627075, + "rewards/length2tails_reward/std": 0.24875158071517944, + "rewards/thermo_reward/mean": 0.28419262170791626, + "rewards/thermo_reward/std": 1.8665560483932495, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 269.40625, + "completions/mean_terminated_length": 269.40625, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.13395661301910877, + "epoch": 1.814, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3607888221740723, + "learning_rate": 4.9970582689831345e-08, + "loss": -0.0024, + "num_tokens": 7933253.0, + "reward": 7.333518981933594, + "reward_std": 1.1018414497375488, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.46436917781829834, + "rewards/kidney_reward/std": 1.3686516284942627, + "rewards/length2tails_reward/mean": 0.8126377463340759, + "rewards/length2tails_reward/std": 0.26576361060142517, + "rewards/thermo_reward/mean": 0.6137527823448181, + "rewards/thermo_reward/std": 1.6033440828323364, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 286.125, + "completions/mean_terminated_length": 286.125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1834298437461257, + "epoch": 1.8159999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.362483501434326, + "learning_rate": 4.8943483704846465e-08, + "loss": 0.0976, + "num_tokens": 7942441.0, + "reward": 6.293238162994385, + "reward_std": 2.936807870864868, + "rewards/fitness_reward/mean": 5.7916669845581055, + "rewards/fitness_reward/std": 2.7825539112091064, + "rewards/kidney_reward/mean": 0.1086951196193695, + "rewards/kidney_reward/std": 1.4603363275527954, + "rewards/length2tails_reward/mean": 0.8296551704406738, + "rewards/length2tails_reward/std": 0.2563319504261017, + "rewards/thermo_reward/mean": 0.47961995005607605, + "rewards/thermo_reward/std": 1.8772151470184326, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 270.28125, + "completions/mean_terminated_length": 270.28125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14250972401350737, + "epoch": 1.818, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4105027914047241, + "learning_rate": 4.792678532155114e-08, + "loss": 0.0091, + "num_tokens": 7951122.0, + "reward": 6.543004512786865, + "reward_std": 2.8884639739990234, + "rewards/fitness_reward/mean": 5.912921905517578, + "rewards/fitness_reward/std": 2.254716396331787, + "rewards/kidney_reward/mean": 0.5233614444732666, + "rewards/kidney_reward/std": 1.234429121017456, + "rewards/length2tails_reward/mean": 0.7206652760505676, + "rewards/length2tails_reward/std": 0.3155767023563385, + "rewards/thermo_reward/mean": 0.37647131085395813, + "rewards/thermo_reward/std": 1.75032377243042, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 264.9375, + "completions/mean_terminated_length": 264.9375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.14505137130618095, + "epoch": 1.8199999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3851816654205322, + "learning_rate": 4.6920498658395645e-08, + "loss": -0.085, + "num_tokens": 7959632.0, + "reward": 6.135385513305664, + "reward_std": 2.7251851558685303, + "rewards/fitness_reward/mean": 5.789863586425781, + "rewards/fitness_reward/std": 2.7894539833068848, + "rewards/kidney_reward/mean": 0.47935307025909424, + "rewards/kidney_reward/std": 1.44709050655365, + "rewards/length2tails_reward/mean": 0.7476708889007568, + "rewards/length2tails_reward/std": 0.3154660761356354, + "rewards/thermo_reward/mean": -0.16214501857757568, + "rewards/thermo_reward/std": 1.980732798576355, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.40625, + "completions/mean_terminated_length": 269.40625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1313042137771845, + "epoch": 1.822, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6158406734466553, + "learning_rate": 4.5924634719970215e-08, + "loss": -0.003, + "num_tokens": 7968285.0, + "reward": 6.9807281494140625, + "reward_std": 1.2773802280426025, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": 0.3752599358558655, + "rewards/kidney_reward/std": 1.352660059928894, + "rewards/length2tails_reward/mean": 0.6983648538589478, + "rewards/length2tails_reward/std": 0.28671926259994507, + "rewards/thermo_reward/mean": 0.6723517775535583, + "rewards/thermo_reward/std": 1.542386770248413, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.59375, + "completions/mean_terminated_length": 269.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12930038757622242, + "epoch": 1.8239999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6919370293617249, + "learning_rate": 4.4939204396883146e-08, + "loss": -0.0009, + "num_tokens": 7976944.0, + "reward": 7.2305908203125, + "reward_std": 1.895634651184082, + "rewards/fitness_reward/mean": 6.289312362670898, + "rewards/fitness_reward/std": 1.7082889080047607, + "rewards/kidney_reward/mean": 0.5778818726539612, + "rewards/kidney_reward/std": 1.224001169204712, + "rewards/length2tails_reward/mean": 0.719877302646637, + "rewards/length2tails_reward/std": 0.283610999584198, + "rewards/thermo_reward/mean": 0.9447357654571533, + "rewards/thermo_reward/std": 1.415595531463623, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.1875, + "completions/mean_terminated_length": 270.1875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15086272917687893, + "epoch": 1.826, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5846606492996216, + "learning_rate": 4.396421846564235e-08, + "loss": 0.0007, + "num_tokens": 7985622.0, + "reward": 6.879499912261963, + "reward_std": 1.3751397132873535, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": 0.26464492082595825, + "rewards/kidney_reward/std": 1.2152553796768188, + "rewards/length2tails_reward/mean": 0.6895222663879395, + "rewards/length2tails_reward/std": 0.34168052673339844, + "rewards/thermo_reward/mean": 0.5849318504333496, + "rewards/thermo_reward/std": 1.6798861026763916, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 273.5, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.12939132563769817, + "epoch": 1.8279999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6918027997016907, + "learning_rate": 4.2999687588538117e-08, + "loss": 0.0135, + "num_tokens": 7994406.0, + "reward": 6.786921501159668, + "reward_std": 1.7943003177642822, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.1254861205816269, + "rewards/kidney_reward/std": 1.3599218130111694, + "rewards/length2tails_reward/mean": 0.728277862071991, + "rewards/length2tails_reward/std": 0.35588890314102173, + "rewards/thermo_reward/mean": 0.3135784864425659, + "rewards/thermo_reward/std": 1.6825498342514038, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.21875, + "completions/mean_terminated_length": 269.21875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.1532157612964511, + "epoch": 1.83, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.137582778930664, + "learning_rate": 4.204562231352515e-08, + "loss": -0.0103, + "num_tokens": 8003053.0, + "reward": 6.5856146812438965, + "reward_std": 2.3236374855041504, + "rewards/fitness_reward/mean": 6.0249199867248535, + "rewards/fitness_reward/std": 2.158726692199707, + "rewards/kidney_reward/mean": 0.4450969994068146, + "rewards/kidney_reward/std": 1.502671480178833, + "rewards/length2tails_reward/mean": 0.7668288350105286, + "rewards/length2tails_reward/std": 0.31348109245300293, + "rewards/thermo_reward/mean": 0.29287809133529663, + "rewards/thermo_reward/std": 1.889686107635498, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13073045760393143, + "epoch": 1.8319999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.54015052318573, + "learning_rate": 4.110203307410898e-08, + "loss": -0.0051, + "num_tokens": 8011733.0, + "reward": 6.931526184082031, + "reward_std": 1.3817802667617798, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.23195861279964447, + "rewards/kidney_reward/std": 1.168290376663208, + "rewards/length2tails_reward/mean": 0.6876571178436279, + "rewards/length2tails_reward/std": 0.3494473993778229, + "rewards/thermo_reward/mean": 0.5166248679161072, + "rewards/thermo_reward/std": 1.4714998006820679, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 269.71875, + "completions/mean_terminated_length": 269.71875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.14845786150544882, + "epoch": 1.834, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3271218538284302, + "learning_rate": 4.016893018922996e-08, + "loss": -0.0138, + "num_tokens": 8020396.0, + "reward": 6.578068733215332, + "reward_std": 2.395250082015991, + "rewards/fitness_reward/mean": 5.997559547424316, + "rewards/fitness_reward/std": 2.3363797664642334, + "rewards/kidney_reward/mean": 0.19487053155899048, + "rewards/kidney_reward/std": 1.410597562789917, + "rewards/length2tails_reward/mean": 0.8126527667045593, + "rewards/length2tails_reward/std": 0.2656821012496948, + "rewards/thermo_reward/mean": 0.5598208904266357, + "rewards/thermo_reward/std": 1.6639939546585083, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 265.71875, + "completions/mean_terminated_length": 265.71875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.1842678403481841, + "epoch": 1.8359999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1664130687713623, + "learning_rate": 3.924632386315185e-08, + "loss": -0.0521, + "num_tokens": 8028931.0, + "reward": 5.476888656616211, + "reward_std": 4.09977388381958, + "rewards/fitness_reward/mean": 5.117265701293945, + "rewards/fitness_reward/std": 3.962766170501709, + "rewards/kidney_reward/mean": -0.026276148855686188, + "rewards/kidney_reward/std": 1.5192123651504517, + "rewards/length2tails_reward/mean": 0.8071082830429077, + "rewards/length2tails_reward/std": 0.28930649161338806, + "rewards/thermo_reward/mean": 0.3419671654701233, + "rewards/thermo_reward/std": 1.7738760709762573, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 261.5625, + "completions/mean_terminated_length": 261.5625, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "entropy": 0.13398587610572577, + "epoch": 1.838, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8205679655075073, + "learning_rate": 3.833422418534959e-08, + "loss": -0.1458, + "num_tokens": 8037333.0, + "reward": 6.507333278656006, + "reward_std": 2.474182605743408, + "rewards/fitness_reward/mean": 6.131397247314453, + "rewards/fitness_reward/std": 2.083240032196045, + "rewards/kidney_reward/mean": -0.20961281657218933, + "rewards/kidney_reward/std": 1.2751054763793945, + "rewards/length2tails_reward/mean": 0.7548587918281555, + "rewards/length2tails_reward/std": 0.3330869674682617, + "rewards/thermo_reward/mean": 0.5840553045272827, + "rewards/thermo_reward/std": 1.7713924646377563, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.15625, + "completions/mean_terminated_length": 269.15625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12410981953144073, + "epoch": 1.8399999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8048571348190308, + "learning_rate": 3.7432641130399236e-08, + "loss": -0.0037, + "num_tokens": 8045978.0, + "reward": 6.347002029418945, + "reward_std": 2.320749044418335, + "rewards/fitness_reward/mean": 5.887991905212402, + "rewards/fitness_reward/std": 1.9221867322921753, + "rewards/kidney_reward/mean": 0.28953516483306885, + "rewards/kidney_reward/std": 1.3673288822174072, + "rewards/length2tails_reward/mean": 0.6951801180839539, + "rewards/length2tails_reward/std": 0.30936911702156067, + "rewards/thermo_reward/mean": 0.2808953523635864, + "rewards/thermo_reward/std": 2.02681303024292, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.5, + "completions/mean_terminated_length": 269.5, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1332868579775095, + "epoch": 1.842, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5726380348205566, + "learning_rate": 3.65415845578686e-08, + "loss": 0.0045, + "num_tokens": 8054634.0, + "reward": 6.981606960296631, + "reward_std": 1.257129430770874, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.06643954664468765, + "rewards/kidney_reward/std": 1.5494112968444824, + "rewards/length2tails_reward/mean": 0.7483483552932739, + "rewards/length2tails_reward/std": 0.27187320590019226, + "rewards/thermo_reward/mean": 0.3400038480758667, + "rewards/thermo_reward/std": 1.9042646884918213, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 263.0, + "completions/mean_terminated_length": 263.0, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.15293076913803816, + "epoch": 1.8439999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8495379686355591, + "learning_rate": 3.566106421220949e-08, + "loss": -0.0883, + "num_tokens": 8063082.0, + "reward": 6.1899309158325195, + "reward_std": 3.331753969192505, + "rewards/fitness_reward/mean": 5.777008056640625, + "rewards/fitness_reward/std": 2.4754786491394043, + "rewards/kidney_reward/mean": 0.5471885204315186, + "rewards/kidney_reward/std": 1.3780357837677002, + "rewards/length2tails_reward/mean": 0.760220468044281, + "rewards/length2tails_reward/std": 0.3178078830242157, + "rewards/thermo_reward/mean": -0.10145311057567596, + "rewards/thermo_reward/std": 1.939703345298767, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1835830295458436, + "epoch": 1.846, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.043091297149658, + "learning_rate": 3.4791089722651435e-08, + "loss": 0.0138, + "num_tokens": 8071852.0, + "reward": 6.146520614624023, + "reward_std": 2.735840320587158, + "rewards/fitness_reward/mean": 5.695765972137451, + "rewards/fitness_reward/std": 2.7802505493164062, + "rewards/kidney_reward/mean": 0.3207208514213562, + "rewards/kidney_reward/std": 1.2620347738265991, + "rewards/length2tails_reward/mean": 0.8141761422157288, + "rewards/length2tails_reward/std": 0.25408387184143066, + "rewards/thermo_reward/mean": 0.17370061576366425, + "rewards/thermo_reward/std": 1.9788756370544434, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.8125, + "completions/mean_terminated_length": 269.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13533665426075459, + "epoch": 1.8479999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5187647342681885, + "learning_rate": 3.393167060309587e-08, + "loss": -0.0044, + "num_tokens": 8080518.0, + "reward": 7.110082149505615, + "reward_std": 1.1385376453399658, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.15414252877235413, + "rewards/kidney_reward/std": 1.2835636138916016, + "rewards/length2tails_reward/mean": 0.752368152141571, + "rewards/length2tails_reward/std": 0.29145577549934387, + "rewards/thermo_reward/mean": 0.9191972613334656, + "rewards/thermo_reward/std": 1.439913034439087, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 275.09375, + "completions/mean_terminated_length": 275.09375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14910481311380863, + "epoch": 1.85, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6289584636688232, + "learning_rate": 3.308281625201292e-08, + "loss": 0.0574, + "num_tokens": 8089353.0, + "reward": 6.773259162902832, + "reward_std": 2.9144980907440186, + "rewards/fitness_reward/mean": 5.900321960449219, + "rewards/fitness_reward/std": 2.7295475006103516, + "rewards/kidney_reward/mean": 0.6643511056900024, + "rewards/kidney_reward/std": 1.3800365924835205, + "rewards/length2tails_reward/mean": 0.8439217805862427, + "rewards/length2tails_reward/std": 0.22160586714744568, + "rewards/thermo_reward/mean": 0.659561812877655, + "rewards/thermo_reward/std": 1.6633926630020142, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.13854139670729637, + "epoch": 1.8519999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5805995464324951, + "learning_rate": 3.224453595233756e-08, + "loss": -0.0124, + "num_tokens": 8098025.0, + "reward": 6.881084442138672, + "reward_std": 2.2928292751312256, + "rewards/fitness_reward/mean": 5.905848503112793, + "rewards/fitness_reward/std": 2.290864944458008, + "rewards/kidney_reward/mean": 0.5628536939620972, + "rewards/kidney_reward/std": 1.2252541780471802, + "rewards/length2tails_reward/mean": 0.7827929258346558, + "rewards/length2tails_reward/std": 0.2992306053638458, + "rewards/thermo_reward/mean": 0.9962208867073059, + "rewards/thermo_reward/std": 1.4137659072875977, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.1604912392795086, + "epoch": 1.854, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6376395225524902, + "learning_rate": 3.141683887136892e-08, + "loss": 0.0009, + "num_tokens": 8106752.0, + "reward": 6.256638526916504, + "reward_std": 2.544832468032837, + "rewards/fitness_reward/mean": 6.20842170715332, + "rewards/fitness_reward/std": 2.1658761501312256, + "rewards/kidney_reward/mean": -0.20843149721622467, + "rewards/kidney_reward/std": 1.3011715412139893, + "rewards/length2tails_reward/mean": 0.8740514516830444, + "rewards/length2tails_reward/std": 0.13756117224693298, + "rewards/thermo_reward/mean": -0.13216084241867065, + "rewards/thermo_reward/std": 1.9434444904327393, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.16501410491764545, + "epoch": 1.8559999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.319856882095337, + "learning_rate": 3.0599734060669626e-08, + "loss": -0.016, + "num_tokens": 8115442.0, + "reward": 6.095468521118164, + "reward_std": 3.1856777667999268, + "rewards/fitness_reward/mean": 5.663470268249512, + "rewards/fitness_reward/std": 2.9017081260681152, + "rewards/kidney_reward/mean": 0.22629426419734955, + "rewards/kidney_reward/std": 1.1162402629852295, + "rewards/length2tails_reward/mean": 0.8293063640594482, + "rewards/length2tails_reward/std": 0.2396925985813141, + "rewards/thermo_reward/mean": 0.22304965555667877, + "rewards/thermo_reward/std": 1.9373891353607178, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14276581443846226, + "epoch": 1.858, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3562889099121094, + "learning_rate": 2.9793230455966933e-08, + "loss": -0.0043, + "num_tokens": 8124207.0, + "reward": 6.063336372375488, + "reward_std": 2.666656494140625, + "rewards/fitness_reward/mean": 6.110805988311768, + "rewards/fitness_reward/std": 2.1953189373016357, + "rewards/kidney_reward/mean": -0.20782563090324402, + "rewards/kidney_reward/std": 1.3456701040267944, + "rewards/length2tails_reward/mean": 0.7628924250602722, + "rewards/length2tails_reward/std": 0.3469874858856201, + "rewards/thermo_reward/mean": -0.2685595154762268, + "rewards/thermo_reward/std": 2.1554925441741943, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 269.59375, + "completions/mean_terminated_length": 269.59375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13766338024288416, + "epoch": 1.8599999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7447472214698792, + "learning_rate": 2.899733687705519e-08, + "loss": -0.0009, + "num_tokens": 8132866.0, + "reward": 7.308985233306885, + "reward_std": 1.2130883932113647, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.372771680355072, + "rewards/kidney_reward/std": 1.365782618522644, + "rewards/length2tails_reward/mean": 0.7648360729217529, + "rewards/length2tails_reward/std": 0.2774202823638916, + "rewards/thermo_reward/mean": 0.6801839470863342, + "rewards/thermo_reward/std": 1.5270103216171265, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 276.15625, + "completions/mean_terminated_length": 276.15625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1571539742872119, + "epoch": 1.862, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9150276184082031, + "learning_rate": 2.821206202769899e-08, + "loss": 0.0321, + "num_tokens": 8141735.0, + "reward": 7.159791946411133, + "reward_std": 1.2407819032669067, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.005715738981962204, + "rewards/kidney_reward/std": 1.41999351978302, + "rewards/length2tails_reward/mean": 0.812694787979126, + "rewards/length2tails_reward/std": 0.302489697933197, + "rewards/thermo_reward/mean": 0.7249242067337036, + "rewards/thermo_reward/std": 1.9012635946273804, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 273.84375, + "completions/mean_terminated_length": 273.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.179732377640903, + "epoch": 1.8639999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2649433612823486, + "learning_rate": 2.7437414495538268e-08, + "loss": 0.0293, + "num_tokens": 8150530.0, + "reward": 6.815646171569824, + "reward_std": 2.067653179168701, + "rewards/fitness_reward/mean": 6.250324249267578, + "rewards/fitness_reward/std": 1.928839921951294, + "rewards/kidney_reward/mean": 0.2132837325334549, + "rewards/kidney_reward/std": 1.3482290506362915, + "rewards/length2tails_reward/mean": 0.7969915270805359, + "rewards/length2tails_reward/std": 0.27745360136032104, + "rewards/thermo_reward/mean": 0.5188645124435425, + "rewards/thermo_reward/std": 1.7175190448760986, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 265.0, + "completions/mean_terminated_length": 265.0, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.13339482620358467, + "epoch": 1.866, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.485048532485962, + "learning_rate": 2.6673402751994255e-08, + "loss": -0.0711, + "num_tokens": 8159042.0, + "reward": 6.450176239013672, + "reward_std": 2.2387845516204834, + "rewards/fitness_reward/mean": 6.216984272003174, + "rewards/fitness_reward/std": 2.1174392700195312, + "rewards/kidney_reward/mean": 0.4534660279750824, + "rewards/kidney_reward/std": 1.3501545190811157, + "rewards/length2tails_reward/mean": 0.7150939106941223, + "rewards/length2tails_reward/std": 0.3303479254245758, + "rewards/thermo_reward/mean": -0.34462833404541016, + "rewards/thermo_reward/std": 2.178049325942993, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.65625, + "completions/mean_terminated_length": 269.65625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.12634824588894844, + "epoch": 1.8679999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0149636268615723, + "learning_rate": 2.5920035152176888e-08, + "loss": -0.0062, + "num_tokens": 8167703.0, + "reward": 6.959228992462158, + "reward_std": 2.12825345993042, + "rewards/fitness_reward/mean": 6.192783355712891, + "rewards/fitness_reward/std": 1.7525182962417603, + "rewards/kidney_reward/mean": 0.522956371307373, + "rewards/kidney_reward/std": 1.215049386024475, + "rewards/length2tails_reward/mean": 0.7069827318191528, + "rewards/length2tails_reward/std": 0.32622021436691284, + "rewards/thermo_reward/mean": 0.6564444303512573, + "rewards/thermo_reward/std": 1.7787535190582275, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14050300978124142, + "epoch": 1.87, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1499427556991577, + "learning_rate": 2.5177319934793995e-08, + "loss": -0.0018, + "num_tokens": 8176404.0, + "reward": 6.47938871383667, + "reward_std": 2.063114881515503, + "rewards/fitness_reward/mean": 6.2062859535217285, + "rewards/fitness_reward/std": 1.6806658506393433, + "rewards/kidney_reward/mean": 0.3686825633049011, + "rewards/kidney_reward/std": 1.3746600151062012, + "rewards/length2tails_reward/mean": 0.764694333076477, + "rewards/length2tails_reward/std": 0.33083832263946533, + "rewards/thermo_reward/mean": -0.20482441782951355, + "rewards/thermo_reward/std": 2.022061347961426, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 267.4375, + "completions/mean_terminated_length": 267.4375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "entropy": 0.1446819854900241, + "epoch": 1.8719999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1421144008636475, + "learning_rate": 2.4445265222059696e-08, + "loss": -0.0373, + "num_tokens": 8184994.0, + "reward": 6.529933929443359, + "reward_std": 2.5808632373809814, + "rewards/fitness_reward/mean": 6.11342191696167, + "rewards/fitness_reward/std": 2.1810550689697266, + "rewards/kidney_reward/mean": 0.4963648319244385, + "rewards/kidney_reward/std": 1.2529908418655396, + "rewards/length2tails_reward/mean": 0.7412991523742676, + "rewards/length2tails_reward/std": 0.2942882776260376, + "rewards/thermo_reward/mean": -0.03399108350276947, + "rewards/thermo_reward/std": 1.8310524225234985, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 270.21875, + "completions/mean_terminated_length": 270.21875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "entropy": 0.13848150335252285, + "epoch": 1.874, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9806099534034729, + "learning_rate": 2.3723879019607372e-08, + "loss": 0.0094, + "num_tokens": 8193673.0, + "reward": 5.797906875610352, + "reward_std": 3.6753249168395996, + "rewards/fitness_reward/mean": 5.525224208831787, + "rewards/fitness_reward/std": 3.0931413173675537, + "rewards/kidney_reward/mean": -0.13821545243263245, + "rewards/kidney_reward/std": 1.3629382848739624, + "rewards/length2tails_reward/mean": 0.8245513439178467, + "rewards/length2tails_reward/std": 0.2758937180042267, + "rewards/thermo_reward/mean": 0.27130526304244995, + "rewards/thermo_reward/std": 1.8035014867782593, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.78125, + "completions/mean_terminated_length": 269.78125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14592111762613058, + "epoch": 1.876, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4143927991390228, + "learning_rate": 2.301316921640073e-08, + "loss": 0.0012, + "num_tokens": 8202338.0, + "reward": 6.765290260314941, + "reward_std": 1.3325190544128418, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.25032171607017517, + "rewards/kidney_reward/std": 1.3222986459732056, + "rewards/length2tails_reward/mean": 0.752405047416687, + "rewards/length2tails_reward/std": 0.2956734895706177, + "rewards/thermo_reward/mean": 0.13341672718524933, + "rewards/thermo_reward/std": 1.6025434732437134, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 273.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1734840516000986, + "epoch": 1.8780000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.312769889831543, + "learning_rate": 2.231314358464842e-08, + "loss": 0.0181, + "num_tokens": 8211118.0, + "reward": 5.865645408630371, + "reward_std": 3.3251376152038574, + "rewards/fitness_reward/mean": 5.342177391052246, + "rewards/fitness_reward/std": 3.3306922912597656, + "rewards/kidney_reward/mean": 0.059922389686107635, + "rewards/kidney_reward/std": 1.5671170949935913, + "rewards/length2tails_reward/mean": 0.7989176511764526, + "rewards/length2tails_reward/std": 0.29013344645500183, + "rewards/thermo_reward/mean": 0.5875534415245056, + "rewards/thermo_reward/std": 1.8112404346466064, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14410028886049986, + "epoch": 1.88, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.665503740310669, + "learning_rate": 2.162380977971867e-08, + "loss": -0.0024, + "num_tokens": 8219864.0, + "reward": 6.580451965332031, + "reward_std": 1.2298115491867065, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.011381879448890686, + "rewards/kidney_reward/std": 1.29026460647583, + "rewards/length2tails_reward/mean": 0.7866833806037903, + "rewards/length2tails_reward/std": 0.30524909496307373, + "rewards/thermo_reward/mean": -0.19767357409000397, + "rewards/thermo_reward/std": 2.1044399738311768, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 277.90625, + "completions/mean_terminated_length": 277.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.17427287716418505, + "epoch": 1.8820000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1263842582702637, + "learning_rate": 2.0945175340055356e-08, + "loss": 0.0159, + "num_tokens": 8228789.0, + "reward": 6.641036033630371, + "reward_std": 2.3909459114074707, + "rewards/fitness_reward/mean": 6.147917747497559, + "rewards/fitness_reward/std": 1.993681788444519, + "rewards/kidney_reward/mean": 0.237917959690094, + "rewards/kidney_reward/std": 1.372987985610962, + "rewards/length2tails_reward/mean": 0.8562071323394775, + "rewards/length2tails_reward/std": 0.24018478393554688, + "rewards/thermo_reward/mean": 0.32021546363830566, + "rewards/thermo_reward/std": 2.029773712158203, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.03125, + "completions/mean_terminated_length": 270.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12821525242179632, + "epoch": 1.884, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5536962747573853, + "learning_rate": 2.0277247687096156e-08, + "loss": 0.0016, + "num_tokens": 8237462.0, + "reward": 7.233901023864746, + "reward_std": 1.3383896350860596, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.5777004957199097, + "rewards/kidney_reward/std": 1.2210582494735718, + "rewards/length2tails_reward/mean": 0.6944358348846436, + "rewards/length2tails_reward/std": 0.3006775379180908, + "rewards/thermo_reward/mean": 0.5662657618522644, + "rewards/thermo_reward/std": 1.7290948629379272, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 289.3125, + "completions/mean_terminated_length": 289.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.17242048867046833, + "epoch": 1.8860000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 13.910305976867676, + "learning_rate": 1.9620034125190643e-08, + "loss": 0.2218, + "num_tokens": 8246752.0, + "reward": 6.562683582305908, + "reward_std": 2.351170063018799, + "rewards/fitness_reward/mean": 6.120943069458008, + "rewards/fitness_reward/std": 2.1400814056396484, + "rewards/kidney_reward/mean": 0.45652854442596436, + "rewards/kidney_reward/std": 1.4538203477859497, + "rewards/length2tails_reward/mean": 0.7753629088401794, + "rewards/length2tails_reward/std": 0.25896573066711426, + "rewards/thermo_reward/mean": 0.03927020728588104, + "rewards/thermo_reward/std": 1.696890115737915, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 279.78125, + "completions/mean_terminated_length": 279.78125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1254733633249998, + "epoch": 1.888, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3429434299468994, + "learning_rate": 1.8973541841521335e-08, + "loss": 0.0086, + "num_tokens": 8255737.0, + "reward": 7.076135635375977, + "reward_std": 2.0791258811950684, + "rewards/fitness_reward/mean": 6.267436981201172, + "rewards/fitness_reward/std": 1.832034945487976, + "rewards/kidney_reward/mean": 0.4168388843536377, + "rewards/kidney_reward/std": 1.405630111694336, + "rewards/length2tails_reward/mean": 0.7340405583381653, + "rewards/length2tails_reward/std": 0.33415549993515015, + "rewards/thermo_reward/mean": 0.8335381746292114, + "rewards/thermo_reward/std": 1.5960578918457031, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 269.84375, + "completions/mean_terminated_length": 269.84375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "entropy": 0.13880954030901194, + "epoch": 1.8900000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5699567794799805, + "learning_rate": 1.8337777906023978e-08, + "loss": 0.0019, + "num_tokens": 8264404.0, + "reward": 6.177238464355469, + "reward_std": 3.55812668800354, + "rewards/fitness_reward/mean": 5.752303600311279, + "rewards/fitness_reward/std": 2.9278998374938965, + "rewards/kidney_reward/mean": 0.2940111756324768, + "rewards/kidney_reward/std": 1.353538990020752, + "rewards/length2tails_reward/mean": 0.7436470985412598, + "rewards/length2tails_reward/std": 0.31428641080856323, + "rewards/thermo_reward/mean": 0.1840343326330185, + "rewards/thermo_reward/std": 1.7772465944290161, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 273.28125, + "completions/mean_terminated_length": 273.28125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13904016464948654, + "epoch": 1.892, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3630752563476562, + "learning_rate": 1.771274927131139e-08, + "loss": 0.0165, + "num_tokens": 8273181.0, + "reward": 7.021105766296387, + "reward_std": 1.1301379203796387, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.15205594897270203, + "rewards/kidney_reward/std": 1.2428613901138306, + "rewards/length2tails_reward/mean": 0.7818559408187866, + "rewards/length2tails_reward/std": 0.2699720561504364, + "rewards/thermo_reward/mean": 0.5226083397865295, + "rewards/thermo_reward/std": 1.7218000888824463, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1288521196693182, + "epoch": 1.8940000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.571827232837677, + "learning_rate": 1.70984627725963e-08, + "loss": 0.0059, + "num_tokens": 8281883.0, + "reward": 6.850930690765381, + "reward_std": 1.3462207317352295, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.17735858261585236, + "rewards/kidney_reward/std": 1.3910008668899536, + "rewards/length2tails_reward/mean": 0.8206785917282104, + "rewards/length2tails_reward/std": 0.24182750284671783, + "rewards/thermo_reward/mean": 0.28628480434417725, + "rewards/thermo_reward/std": 2.0024726390838623, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.09375, + "completions/mean_terminated_length": 270.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12647749204188585, + "epoch": 1.896, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6499899625778198, + "learning_rate": 1.6494925127617632e-08, + "loss": 0.0006, + "num_tokens": 8290558.0, + "reward": 6.7706298828125, + "reward_std": 1.20357346534729, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.14062845706939697, + "rewards/kidney_reward/std": 1.410576581954956, + "rewards/length2tails_reward/mean": 0.7547130584716797, + "rewards/length2tails_reward/std": 0.2741793394088745, + "rewards/thermo_reward/mean": 0.3279137909412384, + "rewards/thermo_reward/std": 1.476394534111023, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 267.1875, + "completions/mean_terminated_length": 267.1875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.16196651756763458, + "epoch": 1.8980000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5557541847229004, + "learning_rate": 1.5902142936566333e-08, + "loss": -0.011, + "num_tokens": 8299140.0, + "reward": 6.2072224617004395, + "reward_std": 3.536663055419922, + "rewards/fitness_reward/mean": 5.421911716461182, + "rewards/fitness_reward/std": 3.388042688369751, + "rewards/kidney_reward/mean": 0.7025492191314697, + "rewards/kidney_reward/std": 1.4041755199432373, + "rewards/length2tails_reward/mean": 0.775486171245575, + "rewards/length2tails_reward/std": 0.2873021960258484, + "rewards/thermo_reward/mean": 0.48032817244529724, + "rewards/thermo_reward/std": 1.6371543407440186, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14380111452192068, + "epoch": 1.9, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9278229475021362, + "learning_rate": 1.5320122682013436e-08, + "loss": -0.0021, + "num_tokens": 8307820.0, + "reward": 6.878548622131348, + "reward_std": 0.9973581433296204, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.08399184048175812, + "rewards/kidney_reward/std": 1.498388648033142, + "rewards/length2tails_reward/mean": 0.8148068189620972, + "rewards/length2tails_reward/std": 0.19870658218860626, + "rewards/thermo_reward/mean": 0.45706725120544434, + "rewards/thermo_reward/std": 1.5130258798599243, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.8125, + "completions/mean_terminated_length": 269.8125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "entropy": 0.1445015063509345, + "epoch": 1.9020000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4408010244369507, + "learning_rate": 1.4748870728839346e-08, + "loss": 0.0001, + "num_tokens": 8316486.0, + "reward": 7.336445331573486, + "reward_std": 1.1107890605926514, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.5126566886901855, + "rewards/kidney_reward/std": 1.1569898128509521, + "rewards/length2tails_reward/mean": 0.8199333548545837, + "rewards/length2tails_reward/std": 0.2412588745355606, + "rewards/thermo_reward/mean": 0.5676702260971069, + "rewards/thermo_reward/std": 1.607494592666626, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 263.25, + "completions/mean_terminated_length": 263.25, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.13587166368961334, + "epoch": 1.904, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7369056344032288, + "learning_rate": 1.418839332416366e-08, + "loss": -0.1034, + "num_tokens": 8324942.0, + "reward": 6.689387321472168, + "reward_std": 2.189192533493042, + "rewards/fitness_reward/mean": 6.232475280761719, + "rewards/fitness_reward/std": 2.0298075675964355, + "rewards/kidney_reward/mean": 0.392331600189209, + "rewards/kidney_reward/std": 1.4163700342178345, + "rewards/length2tails_reward/mean": 0.7279583811759949, + "rewards/length2tails_reward/std": 0.29974088072776794, + "rewards/thermo_reward/mean": 0.15751248598098755, + "rewards/thermo_reward/std": 1.9366596937179565, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.5625, + "completions/mean_terminated_length": 269.5625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1603428991511464, + "epoch": 1.9060000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2207043170928955, + "learning_rate": 1.3638696597277677e-08, + "loss": -0.0007, + "num_tokens": 8333600.0, + "reward": 6.097584247589111, + "reward_std": 2.687563419342041, + "rewards/fitness_reward/mean": 5.846081733703613, + "rewards/fitness_reward/std": 2.5933988094329834, + "rewards/kidney_reward/mean": 0.09651625156402588, + "rewards/kidney_reward/std": 1.3750468492507935, + "rewards/length2tails_reward/mean": 0.659031867980957, + "rewards/length2tails_reward/std": 0.3770754039287567, + "rewards/thermo_reward/mean": 0.07697249948978424, + "rewards/thermo_reward/std": 2.1390299797058105, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.13993188831955194, + "epoch": 1.908, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6311830282211304, + "learning_rate": 1.3099786559576553e-08, + "loss": -0.0041, + "num_tokens": 8342301.0, + "reward": 5.810885429382324, + "reward_std": 3.713392734527588, + "rewards/fitness_reward/mean": 5.401959419250488, + "rewards/fitness_reward/std": 3.1684982776641846, + "rewards/kidney_reward/mean": 0.24048873782157898, + "rewards/kidney_reward/std": 1.3586989641189575, + "rewards/length2tails_reward/mean": 0.8623353242874146, + "rewards/length2tails_reward/std": 0.2329462468624115, + "rewards/thermo_reward/mean": 0.14619500935077667, + "rewards/thermo_reward/std": 1.9154062271118164, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.17670791130512953, + "epoch": 1.9100000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.211955547332764, + "learning_rate": 1.2571669104494253e-08, + "loss": 0.022, + "num_tokens": 8351064.0, + "reward": 6.751359939575195, + "reward_std": 2.1091582775115967, + "rewards/fitness_reward/mean": 6.240219593048096, + "rewards/fitness_reward/std": 1.9860002994537354, + "rewards/kidney_reward/mean": 0.5815110206604004, + "rewards/kidney_reward/std": 1.173783779144287, + "rewards/length2tails_reward/mean": 0.7230339646339417, + "rewards/length2tails_reward/std": 0.31141066551208496, + "rewards/thermo_reward/mean": 0.0792519599199295, + "rewards/thermo_reward/std": 1.7948559522628784, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 268.9375, + "completions/mean_terminated_length": 268.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12876643519848585, + "epoch": 1.912, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23859953880310059, + "learning_rate": 1.2054350007438707e-08, + "loss": -0.0027, + "num_tokens": 8359702.0, + "reward": 6.874235153198242, + "reward_std": 2.0526840686798096, + "rewards/fitness_reward/mean": 6.157598972320557, + "rewards/fitness_reward/std": 1.9413694143295288, + "rewards/kidney_reward/mean": 0.3022220730781555, + "rewards/kidney_reward/std": 1.2430528402328491, + "rewards/length2tails_reward/mean": 0.6893478631973267, + "rewards/length2tails_reward/std": 0.2837839424610138, + "rewards/thermo_reward/mean": 0.786375880241394, + "rewards/thermo_reward/std": 1.280945897102356, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.16495475824922323, + "epoch": 1.9140000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.778078317642212, + "learning_rate": 1.1547834925728529e-08, + "loss": -0.0098, + "num_tokens": 8368378.0, + "reward": 6.672608375549316, + "reward_std": 2.1042580604553223, + "rewards/fitness_reward/mean": 6.23095703125, + "rewards/fitness_reward/std": 2.0383970737457275, + "rewards/kidney_reward/mean": 0.25353166460990906, + "rewards/kidney_reward/std": 1.3146024942398071, + "rewards/length2tails_reward/mean": 0.8096996545791626, + "rewards/length2tails_reward/std": 0.2513669431209564, + "rewards/thermo_reward/mean": 0.2249206304550171, + "rewards/thermo_reward/std": 1.8566633462905884, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 274.25, + "completions/mean_terminated_length": 274.25, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1538853645324707, + "epoch": 1.916, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.990831732749939, + "learning_rate": 1.1052129398531506e-08, + "loss": 0.0026, + "num_tokens": 8377186.0, + "reward": 6.762454986572266, + "reward_std": 1.5834013223648071, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.3770258128643036, + "rewards/kidney_reward/std": 1.3981250524520874, + "rewards/length2tails_reward/mean": 0.7407108545303345, + "rewards/length2tails_reward/std": 0.31770798563957214, + "rewards/thermo_reward/mean": 0.006888486444950104, + "rewards/thermo_reward/std": 1.8492512702941895, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 274.03125, + "completions/mean_terminated_length": 274.03125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15242012217640877, + "epoch": 1.9180000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.574830949306488, + "learning_rate": 1.0567238846803995e-08, + "loss": -0.0102, + "num_tokens": 8385987.0, + "reward": 7.116518974304199, + "reward_std": 1.3782036304473877, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.5639160871505737, + "rewards/kidney_reward/std": 1.3345348834991455, + "rewards/length2tails_reward/mean": 0.7730758786201477, + "rewards/length2tails_reward/std": 0.2996107339859009, + "rewards/thermo_reward/mean": 0.305965781211853, + "rewards/thermo_reward/std": 1.7593438625335693, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1460734736174345, + "epoch": 1.92, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6456813216209412, + "learning_rate": 1.0093168573231392e-08, + "loss": -0.0015, + "num_tokens": 8394714.0, + "reward": 7.087253570556641, + "reward_std": 1.490665078163147, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.33409491181373596, + "rewards/kidney_reward/std": 1.2545254230499268, + "rewards/length2tails_reward/mean": 0.8459086418151855, + "rewards/length2tails_reward/std": 0.23028500378131866, + "rewards/thermo_reward/mean": 0.6468173265457153, + "rewards/thermo_reward/std": 1.6957744359970093, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 277.40625, + "completions/mean_terminated_length": 277.40625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.15890014078468084, + "epoch": 1.9220000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.470006227493286, + "learning_rate": 9.629923762170089e-09, + "loss": 0.1093, + "num_tokens": 8403623.0, + "reward": 7.169706344604492, + "reward_std": 2.429119825363159, + "rewards/fitness_reward/mean": 6.208606719970703, + "rewards/fitness_reward/std": 2.1648309230804443, + "rewards/kidney_reward/mean": 0.5683605670928955, + "rewards/kidney_reward/std": 1.3351720571517944, + "rewards/length2tails_reward/mean": 0.783476710319519, + "rewards/length2tails_reward/std": 0.27854791283607483, + "rewards/thermo_reward/mean": 0.9621009826660156, + "rewards/thermo_reward/std": 1.4489096403121948, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13510340731590986, + "epoch": 1.924, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5293810367584229, + "learning_rate": 9.177509479591172e-09, + "loss": -0.0005, + "num_tokens": 8412320.0, + "reward": 6.762943744659424, + "reward_std": 2.1946609020233154, + "rewards/fitness_reward/mean": 6.204280853271484, + "rewards/fitness_reward/std": 1.691309928894043, + "rewards/kidney_reward/mean": 0.6458013653755188, + "rewards/kidney_reward/std": 1.1593345403671265, + "rewards/length2tails_reward/mean": 0.7527172565460205, + "rewards/length2tails_reward/std": 0.32795438170433044, + "rewards/thermo_reward/mean": 0.0951644629240036, + "rewards/thermo_reward/std": 1.9184809923171997, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.0, + "completions/max_terminated_length": 588.0, + "completions/mean_length": 278.53125, + "completions/mean_terminated_length": 278.53125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.14739348739385605, + "epoch": 1.9260000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328462839126587, + "learning_rate": 8.735930673024805e-09, + "loss": -0.0411, + "num_tokens": 8421265.0, + "reward": 6.972890377044678, + "reward_std": 0.9983483552932739, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.042522259056568146, + "rewards/kidney_reward/std": 1.1645569801330566, + "rewards/length2tails_reward/mean": 0.754568874835968, + "rewards/length2tails_reward/std": 0.28561484813690186, + "rewards/thermo_reward/mean": 0.34337741136550903, + "rewards/thermo_reward/std": 1.8097610473632812, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.3125, + "completions/mean_terminated_length": 270.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13897535763680935, + "epoch": 1.928, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.269151210784912, + "learning_rate": 8.305192171506047e-09, + "loss": -0.001, + "num_tokens": 8429947.0, + "reward": 6.4728474617004395, + "reward_std": 2.936232805252075, + "rewards/fitness_reward/mean": 5.991489410400391, + "rewards/fitness_reward/std": 2.3668925762176514, + "rewards/kidney_reward/mean": 0.31890788674354553, + "rewards/kidney_reward/std": 1.329576015472412, + "rewards/length2tails_reward/mean": 0.7545139193534851, + "rewards/length2tails_reward/std": 0.28510746359825134, + "rewards/thermo_reward/mean": 0.2665518522262573, + "rewards/thermo_reward/std": 1.779471755027771, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.16893195919692516, + "epoch": 1.9300000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0434110164642334, + "learning_rate": 7.885298685522235e-09, + "loss": -0.0263, + "num_tokens": 8438708.0, + "reward": 5.675036907196045, + "reward_std": 4.0992021560668945, + "rewards/fitness_reward/mean": 5.013741970062256, + "rewards/fitness_reward/std": 3.720815658569336, + "rewards/kidney_reward/mean": 0.22295311093330383, + "rewards/kidney_reward/std": 1.560072422027588, + "rewards/length2tails_reward/mean": 0.8102078437805176, + "rewards/length2tails_reward/std": 0.2832273840904236, + "rewards/thermo_reward/mean": 0.6945326328277588, + "rewards/thermo_reward/std": 1.606467843055725, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 272.40625, + "completions/mean_terminated_length": 272.40625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1341484859585762, + "epoch": 1.932, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.204648017883301, + "learning_rate": 7.476254806961013e-09, + "loss": 0.0374, + "num_tokens": 8447457.0, + "reward": 6.475647926330566, + "reward_std": 2.279242992401123, + "rewards/fitness_reward/mean": 6.227713108062744, + "rewards/fitness_reward/std": 2.0567479133605957, + "rewards/kidney_reward/mean": 0.31183722615242004, + "rewards/kidney_reward/std": 1.2472810745239258, + "rewards/length2tails_reward/mean": 0.7391623854637146, + "rewards/length2tails_reward/std": 0.3342468738555908, + "rewards/thermo_reward/mean": -0.18554893136024475, + "rewards/thermo_reward/std": 1.875700831413269, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 275.03125, + "completions/mean_terminated_length": 275.03125, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.13423006981611252, + "epoch": 1.9340000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6570208072662354, + "learning_rate": 7.07806500906094e-09, + "loss": 0.0656, + "num_tokens": 8456290.0, + "reward": 6.393370628356934, + "reward_std": 2.6049020290374756, + "rewards/fitness_reward/mean": 5.847797393798828, + "rewards/fitness_reward/std": 2.56826114654541, + "rewards/kidney_reward/mean": 0.3454914093017578, + "rewards/kidney_reward/std": 1.3565188646316528, + "rewards/length2tails_reward/mean": 0.7164158821105957, + "rewards/length2tails_reward/std": 0.3534887433052063, + "rewards/thermo_reward/mean": 0.38744595646858215, + "rewards/thermo_reward/std": 1.7472296953201294, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 261.9375, + "completions/mean_terminated_length": 261.9375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.1777421524748206, + "epoch": 1.936, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7004880905151367, + "learning_rate": 6.690733646361857e-09, + "loss": -0.1168, + "num_tokens": 8464704.0, + "reward": 6.54952335357666, + "reward_std": 2.5894832611083984, + "rewards/fitness_reward/mean": 6.136770248413086, + "rewards/fitness_reward/std": 2.0540735721588135, + "rewards/kidney_reward/mean": 0.7571455240249634, + "rewards/kidney_reward/std": 1.1609498262405396, + "rewards/length2tails_reward/mean": 0.6343657374382019, + "rewards/length2tails_reward/std": 0.3522615134716034, + "rewards/thermo_reward/mean": -0.2488224059343338, + "rewards/thermo_reward/std": 1.936090111732483, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14278618898242712, + "epoch": 1.938, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.497648149728775, + "learning_rate": 6.3142649546572556e-09, + "loss": 0.0022, + "num_tokens": 8473397.0, + "reward": 6.889687538146973, + "reward_std": 2.141871213912964, + "rewards/fitness_reward/mean": 6.114391326904297, + "rewards/fitness_reward/std": 2.175769805908203, + "rewards/kidney_reward/mean": 0.28055596351623535, + "rewards/kidney_reward/std": 1.1416901350021362, + "rewards/length2tails_reward/mean": 0.7848101854324341, + "rewards/length2tails_reward/std": 0.24613912403583527, + "rewards/thermo_reward/mean": 0.8776310086250305, + "rewards/thermo_reward/std": 1.4135630130767822, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14457148406654596, + "epoch": 1.94, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5096526145935059, + "learning_rate": 5.9486630509487656e-09, + "loss": -0.0024, + "num_tokens": 8482100.0, + "reward": 7.406411647796631, + "reward_std": 1.1937562227249146, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.7429991364479065, + "rewards/kidney_reward/std": 1.4751752614974976, + "rewards/length2tails_reward/mean": 0.7410036325454712, + "rewards/length2tails_reward/std": 0.27641117572784424, + "rewards/thermo_reward/mean": 0.7227034568786621, + "rewards/thermo_reward/std": 1.5439373254776, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.1412757895886898, + "epoch": 1.942, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6617432236671448, + "learning_rate": 5.593931933399853e-09, + "loss": 0.0011, + "num_tokens": 8490813.0, + "reward": 6.486895561218262, + "reward_std": 2.1942341327667236, + "rewards/fitness_reward/mean": 6.063414573669434, + "rewards/fitness_reward/std": 1.9585680961608887, + "rewards/kidney_reward/mean": 0.2579805850982666, + "rewards/kidney_reward/std": 1.3123995065689087, + "rewards/length2tails_reward/mean": 0.7991708517074585, + "rewards/length2tails_reward/std": 0.28088587522506714, + "rewards/thermo_reward/mean": 0.18939608335494995, + "rewards/thermo_reward/std": 1.8844977617263794, + "step": 971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 259.96875, + "completions/mean_terminated_length": 259.96875, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.17512109503149986, + "epoch": 1.944, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.834974765777588, + "learning_rate": 5.2500754812935255e-09, + "loss": -0.0722, + "num_tokens": 8499164.0, + "reward": 5.162237644195557, + "reward_std": 4.027027606964111, + "rewards/fitness_reward/mean": 4.8625946044921875, + "rewards/fitness_reward/std": 3.861126661300659, + "rewards/kidney_reward/mean": -0.0016990229487419128, + "rewards/kidney_reward/std": 1.3565279245376587, + "rewards/length2tails_reward/mean": 0.724521279335022, + "rewards/length2tails_reward/std": 0.3177802562713623, + "rewards/thermo_reward/mean": 0.23872420191764832, + "rewards/thermo_reward/std": 1.8257545232772827, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.65625, + "completions/mean_terminated_length": 269.65625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13331248704344034, + "epoch": 1.946, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2248444557189941, + "learning_rate": 4.917097454988584e-09, + "loss": -0.003, + "num_tokens": 8507825.0, + "reward": 7.373482704162598, + "reward_std": 1.1651809215545654, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.48615729808807373, + "rewards/kidney_reward/std": 1.4306813478469849, + "rewards/length2tails_reward/mean": 0.7148531675338745, + "rewards/length2tails_reward/std": 0.3141395151615143, + "rewards/thermo_reward/mean": 0.926762580871582, + "rewards/thermo_reward/std": 1.5720893144607544, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13873163983225822, + "epoch": 1.948, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.888440728187561, + "learning_rate": 4.595001495879547e-09, + "loss": 0.0015, + "num_tokens": 8516522.0, + "reward": 6.947755813598633, + "reward_std": 2.5576391220092773, + "rewards/fitness_reward/mean": 6.024421215057373, + "rewards/fitness_reward/std": 2.1613428592681885, + "rewards/kidney_reward/mean": 0.6351155042648315, + "rewards/kidney_reward/std": 1.4090044498443604, + "rewards/length2tails_reward/mean": 0.7617043852806091, + "rewards/length2tails_reward/std": 0.32511842250823975, + "rewards/thermo_reward/mean": 0.8307015895843506, + "rewards/thermo_reward/std": 1.5262112617492676, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 257.40625, + "completions/mean_terminated_length": 257.40625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.14579259417951107, + "epoch": 1.95, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8553688526153564, + "learning_rate": 4.28379112635624e-09, + "loss": -0.1524, + "num_tokens": 8524791.0, + "reward": 6.329962730407715, + "reward_std": 3.87939190864563, + "rewards/fitness_reward/mean": 5.379385948181152, + "rewards/fitness_reward/std": 3.5193686485290527, + "rewards/kidney_reward/mean": 0.6601889133453369, + "rewards/kidney_reward/std": 1.496724247932434, + "rewards/length2tails_reward/mean": 0.784470796585083, + "rewards/length2tails_reward/std": 0.27918195724487305, + "rewards/thermo_reward/mean": 0.8487285375595093, + "rewards/thermo_reward/std": 1.3128643035888672, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.0, + "completions/max_terminated_length": 570.0, + "completions/mean_length": 293.4375, + "completions/mean_terminated_length": 293.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1697930544614792, + "epoch": 1.952, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0914337635040283, + "learning_rate": 3.983469749765267e-09, + "loss": 0.1849, + "num_tokens": 8534213.0, + "reward": 5.23145866394043, + "reward_std": 3.992888927459717, + "rewards/fitness_reward/mean": 5.051059246063232, + "rewards/fitness_reward/std": 3.8726022243499756, + "rewards/kidney_reward/mean": -0.35731613636016846, + "rewards/kidney_reward/std": 1.267041563987732, + "rewards/length2tails_reward/mean": 0.8290630578994751, + "rewards/length2tails_reward/std": 0.22550584375858307, + "rewards/thermo_reward/mean": 0.3035840392112732, + "rewards/thermo_reward/std": 1.82564115524292, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 279.71875, + "completions/mean_terminated_length": 279.71875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13200210873037577, + "epoch": 1.954, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4034455418586731, + "learning_rate": 3.6940406503731536e-09, + "loss": -0.006, + "num_tokens": 8543196.0, + "reward": 7.0576090812683105, + "reward_std": 1.6878362894058228, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.49719035625457764, + "rewards/kidney_reward/std": 1.5422898530960083, + "rewards/length2tails_reward/mean": 0.7089413404464722, + "rewards/length2tails_reward/std": 0.2897905111312866, + "rewards/thermo_reward/mean": 0.2869378924369812, + "rewards/thermo_reward/std": 1.9030399322509766, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1465275790542364, + "epoch": 1.956, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6455718278884888, + "learning_rate": 3.415506993330153e-09, + "loss": 0.0029, + "num_tokens": 8551904.0, + "reward": 6.497686386108398, + "reward_std": 2.1245858669281006, + "rewards/fitness_reward/mean": 6.101364612579346, + "rewards/fitness_reward/std": 1.7653228044509888, + "rewards/kidney_reward/mean": -0.2941461503505707, + "rewards/kidney_reward/std": 1.2977572679519653, + "rewards/length2tails_reward/mean": 0.7674025297164917, + "rewards/length2tails_reward/std": 0.2881282567977905, + "rewards/thermo_reward/mean": 0.7030870914459229, + "rewards/thermo_reward/std": 1.489495873451233, + "step": 978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14045925345271826, + "epoch": 1.958, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9108378887176514, + "learning_rate": 3.147871824635717e-09, + "loss": -0.001, + "num_tokens": 8560618.0, + "reward": 6.498957633972168, + "reward_std": 2.654280662536621, + "rewards/fitness_reward/mean": 5.817835330963135, + "rewards/fitness_reward/std": 2.2584965229034424, + "rewards/kidney_reward/mean": 0.5320569276809692, + "rewards/kidney_reward/std": 1.2327033281326294, + "rewards/length2tails_reward/mean": 0.7533510327339172, + "rewards/length2tails_reward/std": 0.3051977753639221, + "rewards/thermo_reward/mean": 0.453511506319046, + "rewards/thermo_reward/std": 1.6546717882156372, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 270.0625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1423749178647995, + "epoch": 1.96, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.860144853591919, + "learning_rate": 2.8911380711051926e-09, + "loss": 0.0, + "num_tokens": 8569292.0, + "reward": 7.016634464263916, + "reward_std": 1.170081377029419, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.5771361589431763, + "rewards/kidney_reward/std": 1.3184949159622192, + "rewards/length2tails_reward/mean": 0.7746425867080688, + "rewards/length2tails_reward/std": 0.27873504161834717, + "rewards/thermo_reward/mean": -0.11378508806228638, + "rewards/thermo_reward/std": 2.0121984481811523, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13332789577543736, + "epoch": 1.962, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2614786624908447, + "learning_rate": 2.645308540337843e-09, + "loss": 0.0119, + "num_tokens": 8577963.0, + "reward": 7.004083156585693, + "reward_std": 1.9548842906951904, + "rewards/fitness_reward/mean": 6.309075355529785, + "rewards/fitness_reward/std": 1.5964937210083008, + "rewards/kidney_reward/mean": 0.37824541330337524, + "rewards/kidney_reward/std": 1.4799045324325562, + "rewards/length2tails_reward/mean": 0.7553489208221436, + "rewards/length2tails_reward/std": 0.22973304986953735, + "rewards/thermo_reward/mean": 0.6340956687927246, + "rewards/thermo_reward/std": 1.5909950733184814, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 265.375, + "completions/mean_terminated_length": 265.375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.12256316281855106, + "epoch": 1.964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8833571672439575, + "learning_rate": 2.4103859206857646e-09, + "loss": -0.0716, + "num_tokens": 8586487.0, + "reward": 6.177903652191162, + "reward_std": 3.6157333850860596, + "rewards/fitness_reward/mean": 5.307476997375488, + "rewards/fitness_reward/std": 3.1408731937408447, + "rewards/kidney_reward/mean": 0.523635983467102, + "rewards/kidney_reward/std": 1.3970985412597656, + "rewards/length2tails_reward/mean": 0.7733547687530518, + "rewards/length2tails_reward/std": 0.3167370557785034, + "rewards/thermo_reward/mean": 0.830540657043457, + "rewards/thermo_reward/std": 1.589536190032959, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 265.0625, + "completions/mean_terminated_length": 265.0625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.12876670714467764, + "epoch": 1.966, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33229905366897583, + "learning_rate": 2.186372781225465e-09, + "loss": -0.0764, + "num_tokens": 8595001.0, + "reward": 6.409072399139404, + "reward_std": 3.0007176399230957, + "rewards/fitness_reward/mean": 5.817593574523926, + "rewards/fitness_reward/std": 2.6983437538146973, + "rewards/kidney_reward/mean": 0.23583148419857025, + "rewards/kidney_reward/std": 1.3426491022109985, + "rewards/length2tails_reward/mean": 0.7170102596282959, + "rewards/length2tails_reward/std": 0.3426230251789093, + "rewards/thermo_reward/mean": 0.5886213779449463, + "rewards/thermo_reward/std": 1.7685461044311523, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.14222273230552673, + "epoch": 1.968, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7764557600021362, + "learning_rate": 1.973271571728441e-09, + "loss": 0.0024, + "num_tokens": 8603715.0, + "reward": 6.988193511962891, + "reward_std": 1.4371585845947266, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.3408289849758148, + "rewards/kidney_reward/std": 1.5297675132751465, + "rewards/length2tails_reward/mean": 0.7079341411590576, + "rewards/length2tails_reward/std": 0.3303755223751068, + "rewards/thermo_reward/mean": 0.3049730062484741, + "rewards/thermo_reward/std": 1.7036911249160767, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 274.71875, + "completions/mean_terminated_length": 274.71875, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.14168483205139637, + "epoch": 1.97, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.752838134765625, + "learning_rate": 1.7710846226355324e-09, + "loss": 0.0408, + "num_tokens": 8612538.0, + "reward": 6.691742897033691, + "reward_std": 3.174473524093628, + "rewards/fitness_reward/mean": 5.871668815612793, + "rewards/fitness_reward/std": 2.8317642211914062, + "rewards/kidney_reward/mean": 0.20092883706092834, + "rewards/kidney_reward/std": 1.3441205024719238, + "rewards/length2tails_reward/mean": 0.7975866794586182, + "rewards/length2tails_reward/std": 0.27011606097221375, + "rewards/thermo_reward/mean": 1.0404248237609863, + "rewards/thermo_reward/std": 1.3371044397354126, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.40625, + "completions/mean_terminated_length": 271.40625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13731133844703436, + "epoch": 1.972, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7989093661308289, + "learning_rate": 1.5798141450307222e-09, + "loss": -0.0064, + "num_tokens": 8621255.0, + "reward": 6.877331733703613, + "reward_std": 1.3518424034118652, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.2687470316886902, + "rewards/kidney_reward/std": 1.3318414688110352, + "rewards/length2tails_reward/mean": 0.8222988843917847, + "rewards/length2tails_reward/std": 0.286232590675354, + "rewards/thermo_reward/mean": 0.3041267395019531, + "rewards/thermo_reward/std": 1.7883063554763794, + "step": 986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 277.03125, + "completions/mean_terminated_length": 277.03125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14333887957036495, + "epoch": 1.974, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7833943963050842, + "learning_rate": 1.3994622306173764e-09, + "loss": 0.0043, + "num_tokens": 8630152.0, + "reward": 6.3205389976501465, + "reward_std": 2.4729175567626953, + "rewards/fitness_reward/mean": 6.126613616943359, + "rewards/fitness_reward/std": 2.109231948852539, + "rewards/kidney_reward/mean": -0.23827064037322998, + "rewards/kidney_reward/std": 1.0368083715438843, + "rewards/length2tails_reward/mean": 0.7699821591377258, + "rewards/length2tails_reward/std": 0.3016716539859772, + "rewards/thermo_reward/mean": 0.2411297708749771, + "rewards/thermo_reward/std": 1.830080509185791, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.03125, + "completions/mean_terminated_length": 269.03125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.1397005682811141, + "epoch": 1.976, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.412941336631775, + "learning_rate": 1.2300308516952628e-09, + "loss": -0.0056, + "num_tokens": 8638793.0, + "reward": 6.3557634353637695, + "reward_std": 3.2753348350524902, + "rewards/fitness_reward/mean": 5.816671371459961, + "rewards/fitness_reward/std": 2.6990725994110107, + "rewards/kidney_reward/mean": 0.22815299034118652, + "rewards/kidney_reward/std": 1.3506786823272705, + "rewards/length2tails_reward/mean": 0.7859964966773987, + "rewards/length2tails_reward/std": 0.29051700234413147, + "rewards/thermo_reward/mean": 0.4570317566394806, + "rewards/thermo_reward/std": 1.9421643018722534, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 265.0625, + "completions/mean_terminated_length": 265.0625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.15622761566191912, + "epoch": 1.978, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5199759006500244, + "learning_rate": 1.071521861138458e-09, + "loss": -0.0731, + "num_tokens": 8647307.0, + "reward": 6.576416969299316, + "reward_std": 2.463517665863037, + "rewards/fitness_reward/mean": 6.047708988189697, + "rewards/fitness_reward/std": 2.0397956371307373, + "rewards/kidney_reward/mean": 0.44981682300567627, + "rewards/kidney_reward/std": 1.55868399143219, + "rewards/length2tails_reward/mean": 0.7272934317588806, + "rewards/length2tails_reward/std": 0.2920091152191162, + "rewards/thermo_reward/mean": 0.2439517378807068, + "rewards/thermo_reward/std": 1.7475476264953613, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 277.96875, + "completions/mean_terminated_length": 277.96875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.16981867235153913, + "epoch": 1.98, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.1260294914245605, + "learning_rate": 9.239369923762508e-10, + "loss": 0.1504, + "num_tokens": 8656234.0, + "reward": 7.21594762802124, + "reward_std": 2.2391276359558105, + "rewards/fitness_reward/mean": 6.243906497955322, + "rewards/fitness_reward/std": 1.965144395828247, + "rewards/kidney_reward/mean": 0.5612843036651611, + "rewards/kidney_reward/std": 1.3982468843460083, + "rewards/length2tails_reward/mean": 0.7463132739067078, + "rewards/length2tails_reward/std": 0.29110872745513916, + "rewards/thermo_reward/mean": 1.009641170501709, + "rewards/thermo_reward/std": 1.0832643508911133, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 279.78125, + "completions/mean_terminated_length": 279.78125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.16158413514494896, + "epoch": 1.982, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3589365482330322, + "learning_rate": 7.872778593728257e-10, + "loss": 0.0831, + "num_tokens": 8665219.0, + "reward": 5.645939826965332, + "reward_std": 3.246232032775879, + "rewards/fitness_reward/mean": 5.57554817199707, + "rewards/fitness_reward/std": 2.8704137802124023, + "rewards/kidney_reward/mean": 0.25519901514053345, + "rewards/kidney_reward/std": 1.3954426050186157, + "rewards/length2tails_reward/mean": 0.8234630823135376, + "rewards/length2tails_reward/std": 0.2541621923446655, + "rewards/thermo_reward/mean": -0.5261476039886475, + "rewards/thermo_reward/std": 2.1192498207092285, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.40625, + "completions/mean_terminated_length": 269.40625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13131759129464626, + "epoch": 1.984, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4174315631389618, + "learning_rate": 6.615459566108317e-10, + "loss": 0.0012, + "num_tokens": 8673872.0, + "reward": 6.760034084320068, + "reward_std": 1.2639107704162598, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.04447201266884804, + "rewards/kidney_reward/std": 1.3067702054977417, + "rewards/length2tails_reward/mean": 0.7073255777359009, + "rewards/length2tails_reward/std": 0.32474252581596375, + "rewards/thermo_reward/mean": -0.060663118958473206, + "rewards/thermo_reward/std": 1.955005407333374, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.1439357027411461, + "epoch": 1.986, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4547436535358429, + "learning_rate": 5.46742659073951e-10, + "loss": -0.0022, + "num_tokens": 8682548.0, + "reward": 7.087684631347656, + "reward_std": 1.245467185974121, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.011648640036582947, + "rewards/kidney_reward/std": 1.4016153812408447, + "rewards/length2tails_reward/mean": 0.793976902961731, + "rewards/length2tails_reward/std": 0.27520573139190674, + "rewards/thermo_reward/mean": 0.8134108781814575, + "rewards/thermo_reward/std": 1.263830304145813, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.25, + "completions/mean_terminated_length": 269.25, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1455384586006403, + "epoch": 1.988, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8107917308807373, + "learning_rate": 4.4286922223291113e-10, + "loss": -0.0039, + "num_tokens": 8691196.0, + "reward": 6.902514457702637, + "reward_std": 2.2672624588012695, + "rewards/fitness_reward/mean": 6.154026985168457, + "rewards/fitness_reward/std": 1.960654377937317, + "rewards/kidney_reward/mean": 0.24663349986076355, + "rewards/kidney_reward/std": 1.490953803062439, + "rewards/length2tails_reward/mean": 0.7305619716644287, + "rewards/length2tails_reward/std": 0.34179335832595825, + "rewards/thermo_reward/mean": 0.8850600719451904, + "rewards/thermo_reward/std": 1.7797703742980957, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 277.15625, + "completions/mean_terminated_length": 277.15625, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.16037398017942905, + "epoch": 1.99, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9811228513717651, + "learning_rate": 3.4992678203071835e-10, + "loss": 0.0299, + "num_tokens": 8700097.0, + "reward": 6.01025390625, + "reward_std": 3.615368127822876, + "rewards/fitness_reward/mean": 5.56502103805542, + "rewards/fitness_reward/std": 3.262300968170166, + "rewards/kidney_reward/mean": 0.08462587743997574, + "rewards/kidney_reward/std": 1.3246054649353027, + "rewards/length2tails_reward/mean": 0.8482843637466431, + "rewards/length2tails_reward/std": 0.23385828733444214, + "rewards/thermo_reward/mean": 0.3816964626312256, + "rewards/thermo_reward/std": 1.7316465377807617, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 277.25, + "completions/mean_terminated_length": 277.25, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1387176290154457, + "epoch": 1.992, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3029630184173584, + "learning_rate": 2.6791635487122265e-10, + "loss": 0.0295, + "num_tokens": 8709001.0, + "reward": 6.606256484985352, + "reward_std": 2.234869956970215, + "rewards/fitness_reward/mean": 6.119274139404297, + "rewards/fitness_reward/std": 2.149167537689209, + "rewards/kidney_reward/mean": 0.23796230554580688, + "rewards/kidney_reward/std": 1.2390286922454834, + "rewards/length2tails_reward/mean": 0.7743632793426514, + "rewards/length2tails_reward/std": 0.3019777834415436, + "rewards/thermo_reward/mean": 0.34881943464279175, + "rewards/thermo_reward/std": 1.8036251068115234, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.1713090566918254, + "epoch": 1.994, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.826282978057861, + "learning_rate": 1.9683883760723829e-10, + "loss": 0.0112, + "num_tokens": 8717715.0, + "reward": 5.834600448608398, + "reward_std": 3.596980571746826, + "rewards/fitness_reward/mean": 5.253976821899414, + "rewards/fitness_reward/std": 3.6114492416381836, + "rewards/kidney_reward/mean": 0.08025416731834412, + "rewards/kidney_reward/std": 1.4779082536697388, + "rewards/length2tails_reward/mean": 0.7445960640907288, + "rewards/length2tails_reward/std": 0.3318171203136444, + "rewards/thermo_reward/mean": 0.708694577217102, + "rewards/thermo_reward/std": 1.6918680667877197, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.625, + "completions/mean_terminated_length": 270.625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14164281729608774, + "epoch": 1.996, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36646345257759094, + "learning_rate": 1.3669500753099582e-10, + "loss": -0.0003, + "num_tokens": 8726407.0, + "reward": 6.91595983505249, + "reward_std": 1.4034615755081177, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.0707603394985199, + "rewards/kidney_reward/std": 1.380049705505371, + "rewards/length2tails_reward/mean": 0.7668553590774536, + "rewards/length2tails_reward/std": 0.26946744322776794, + "rewards/thermo_reward/mean": 0.5426340103149414, + "rewards/thermo_reward/std": 1.5267906188964844, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.40625, + "completions/mean_terminated_length": 269.40625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12591685727238655, + "epoch": 1.998, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7351046800613403, + "learning_rate": 8.748552236603757e-11, + "loss": -0.0027, + "num_tokens": 8735060.0, + "reward": 7.033902168273926, + "reward_std": 1.4914648532867432, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.36085501313209534, + "rewards/kidney_reward/std": 1.2216144800186157, + "rewards/length2tails_reward/mean": 0.6885391473770142, + "rewards/length2tails_reward/std": 0.31900057196617126, + "rewards/thermo_reward/mean": 0.5920403003692627, + "rewards/thermo_reward/std": 1.595741868019104, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 269.65625, + "completions/mean_terminated_length": 269.65625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.16086839232593775, + "epoch": 2.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4666213989257812, + "learning_rate": 4.9210920259112884e-11, + "loss": -0.0184, + "num_tokens": 8743721.0, + "reward": 5.754283905029297, + "reward_std": 3.096195936203003, + "rewards/fitness_reward/mean": 5.39070987701416, + "rewards/fitness_reward/std": 3.201241970062256, + "rewards/kidney_reward/mean": -0.31286269426345825, + "rewards/kidney_reward/std": 1.1052496433258057, + "rewards/length2tails_reward/mean": 0.8245663642883301, + "rewards/length2tails_reward/std": 0.2187739759683609, + "rewards/thermo_reward/mean": 0.6277287006378174, + "rewards/thermo_reward/std": 1.7101068496704102, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.18065129965543747, + "epoch": 2.002, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.705759286880493, + "learning_rate": 2.1871619775404304e-11, + "loss": 0.0092, + "num_tokens": 8752478.0, + "reward": 6.072945594787598, + "reward_std": 3.231497287750244, + "rewards/fitness_reward/mean": 5.765591621398926, + "rewards/fitness_reward/std": 2.8769266605377197, + "rewards/kidney_reward/mean": 0.03829430788755417, + "rewards/kidney_reward/std": 1.5032477378845215, + "rewards/length2tails_reward/mean": 0.8247227668762207, + "rewards/length2tails_reward/std": 0.24547211825847626, + "rewards/thermo_reward/mean": 0.16405287384986877, + "rewards/thermo_reward/std": 1.9016590118408203, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 272.8125, + "completions/mean_terminated_length": 272.8125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.146556805819273, + "epoch": 2.004, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1743465662002563, + "learning_rate": 5.335068882077925e-07, + "loss": 0.0197, + "num_tokens": 8761240.0, + "reward": 6.12457275390625, + "reward_std": 2.874589204788208, + "rewards/fitness_reward/mean": 5.706284523010254, + "rewards/fitness_reward/std": 2.8013696670532227, + "rewards/kidney_reward/mean": 0.28543713688850403, + "rewards/kidney_reward/std": 1.410324215888977, + "rewards/length2tails_reward/mean": 0.7576284408569336, + "rewards/length2tails_reward/std": 0.306517094373703, + "rewards/thermo_reward/mean": 0.17232564091682434, + "rewards/thermo_reward/std": 1.9014241695404053, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 280.09375, + "completions/mean_terminated_length": 280.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.17651361506432295, + "epoch": 2.006, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.253736972808838, + "learning_rate": 5.3159155930021e-07, + "loss": 0.0284, + "num_tokens": 8770235.0, + "reward": 6.582602500915527, + "reward_std": 2.3814492225646973, + "rewards/fitness_reward/mean": 6.131250381469727, + "rewards/fitness_reward/std": 2.0840365886688232, + "rewards/kidney_reward/mean": 0.5142073035240173, + "rewards/kidney_reward/std": 1.5123721361160278, + "rewards/length2tails_reward/mean": 0.8203818202018738, + "rewards/length2tails_reward/std": 0.2657608091831207, + "rewards/thermo_reward/mean": -0.021693453192710876, + "rewards/thermo_reward/std": 1.92860746383667, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 269.125, + "completions/mean_terminated_length": 269.125, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "entropy": 0.1370587982237339, + "epoch": 2.008, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5654973983764648, + "learning_rate": 5.296784292055645e-07, + "loss": 0.0111, + "num_tokens": 8778879.0, + "reward": 7.150189399719238, + "reward_std": 1.1193904876708984, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.38423269987106323, + "rewards/kidney_reward/std": 1.2884421348571777, + "rewards/length2tails_reward/mean": 0.7920459508895874, + "rewards/length2tails_reward/std": 0.2746240496635437, + "rewards/thermo_reward/mean": 0.33752650022506714, + "rewards/thermo_reward/std": 1.8023523092269897, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 273.09375, + "completions/mean_terminated_length": 273.09375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1421522106975317, + "epoch": 2.01, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5743650794029236, + "learning_rate": 5.277675069045116e-07, + "loss": -0.0118, + "num_tokens": 8787650.0, + "reward": 6.969651222229004, + "reward_std": 1.2634530067443848, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.30543041229248047, + "rewards/kidney_reward/std": 1.3223390579223633, + "rewards/length2tails_reward/mean": 0.7702364921569824, + "rewards/length2tails_reward/std": 0.2765919268131256, + "rewards/thermo_reward/mean": 0.27213478088378906, + "rewards/thermo_reward/std": 1.7751259803771973, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.625, + "completions/mean_terminated_length": 270.625, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "entropy": 0.17656460963189602, + "epoch": 2.012, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8673627376556396, + "learning_rate": 5.258588013673444e-07, + "loss": -0.0198, + "num_tokens": 8796342.0, + "reward": 6.75380802154541, + "reward_std": 2.699786901473999, + "rewards/fitness_reward/mean": 6.132498741149902, + "rewards/fitness_reward/std": 2.077256441116333, + "rewards/kidney_reward/mean": 0.5581216812133789, + "rewards/kidney_reward/std": 1.5172115564346313, + "rewards/length2tails_reward/mean": 0.8007129430770874, + "rewards/length2tails_reward/std": 0.2791340947151184, + "rewards/thermo_reward/mean": 0.28413981199264526, + "rewards/thermo_reward/std": 1.7684566974639893, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 270.1875, + "completions/mean_terminated_length": 270.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1381118129938841, + "epoch": 2.014, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3306095004081726, + "learning_rate": 5.239523215539491e-07, + "loss": 0.0007, + "num_tokens": 8805020.0, + "reward": 7.531400203704834, + "reward_std": 0.7371511459350586, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.5756287574768066, + "rewards/kidney_reward/std": 1.3534458875656128, + "rewards/length2tails_reward/mean": 0.7543559670448303, + "rewards/length2tails_reward/std": 0.23938201367855072, + "rewards/thermo_reward/mean": 0.9273970127105713, + "rewards/thermo_reward/std": 1.1465744972229004, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 273.78125, + "completions/mean_terminated_length": 273.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14849392883479595, + "epoch": 2.016, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.038259744644165, + "learning_rate": 5.220480764137635e-07, + "loss": 0.0203, + "num_tokens": 8813813.0, + "reward": 7.1274237632751465, + "reward_std": 1.2729523181915283, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.2676663398742676, + "rewards/kidney_reward/std": 1.457764983177185, + "rewards/length2tails_reward/mean": 0.7976438403129578, + "rewards/length2tails_reward/std": 0.2521284520626068, + "rewards/thermo_reward/mean": 0.40576279163360596, + "rewards/thermo_reward/std": 1.7218644618988037, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 264.5625, + "completions/mean_terminated_length": 264.5625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.1335236243903637, + "epoch": 2.018, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0158857107162476, + "learning_rate": 5.201460748857368e-07, + "loss": -0.0758, + "num_tokens": 8822311.0, + "reward": 6.338800430297852, + "reward_std": 2.406759738922119, + "rewards/fitness_reward/mean": 6.113437652587891, + "rewards/fitness_reward/std": 2.180967092514038, + "rewards/kidney_reward/mean": 0.3218197822570801, + "rewards/kidney_reward/std": 1.241258978843689, + "rewards/length2tails_reward/mean": 0.745987057685852, + "rewards/length2tails_reward/std": 0.3025417625904083, + "rewards/thermo_reward/mean": -0.24408863484859467, + "rewards/thermo_reward/std": 1.954667091369629, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 270.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1291707633063197, + "epoch": 2.02, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8104145526885986, + "learning_rate": 5.182463258982846e-07, + "loss": 0.0018, + "num_tokens": 8830985.0, + "reward": 6.746305465698242, + "reward_std": 1.381626009941101, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": 0.0006652399897575378, + "rewards/kidney_reward/std": 1.2213475704193115, + "rewards/length2tails_reward/mean": 0.7403728365898132, + "rewards/length2tails_reward/std": 0.28268083930015564, + "rewards/thermo_reward/mean": 0.5570966005325317, + "rewards/thermo_reward/std": 1.5892870426177979, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.9375, + "completions/mean_terminated_length": 269.9375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.12860173545777798, + "epoch": 2.022, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3056609630584717, + "learning_rate": 5.163488383692498e-07, + "loss": -0.0104, + "num_tokens": 8839655.0, + "reward": 6.64842414855957, + "reward_std": 2.5638086795806885, + "rewards/fitness_reward/mean": 6.12932014465332, + "rewards/fitness_reward/std": 2.0945234298706055, + "rewards/kidney_reward/mean": 0.44590768218040466, + "rewards/kidney_reward/std": 1.2501837015151978, + "rewards/length2tails_reward/mean": 0.8183757662773132, + "rewards/length2tails_reward/std": 0.27306491136550903, + "rewards/thermo_reward/mean": 0.18311208486557007, + "rewards/thermo_reward/std": 1.777248740196228, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.13803936913609505, + "epoch": 2.024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.70677649974823, + "learning_rate": 5.14453621205859e-07, + "loss": -0.0024, + "num_tokens": 8848364.0, + "reward": 7.2003607749938965, + "reward_std": 0.9274915456771851, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.10395548492670059, + "rewards/kidney_reward/std": 1.252205491065979, + "rewards/length2tails_reward/mean": 0.7211539149284363, + "rewards/length2tails_reward/std": 0.3456481993198395, + "rewards/thermo_reward/mean": 0.753592848777771, + "rewards/thermo_reward/std": 1.50831139087677, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 269.4375, + "completions/mean_terminated_length": 269.4375, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "entropy": 0.14095101039856672, + "epoch": 2.026, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5184077620506287, + "learning_rate": 5.125606833046809e-07, + "loss": -0.0056, + "num_tokens": 8857018.0, + "reward": 7.197516441345215, + "reward_std": 1.146944522857666, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.444333553314209, + "rewards/kidney_reward/std": 1.53773832321167, + "rewards/length2tails_reward/mean": 0.7341883778572083, + "rewards/length2tails_reward/std": 0.281776487827301, + "rewards/thermo_reward/mean": 0.40100783109664917, + "rewards/thermo_reward/std": 1.7158565521240234, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.14430040586739779, + "epoch": 2.028, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5178224444389343, + "learning_rate": 5.10670033551585e-07, + "loss": -0.0035, + "num_tokens": 8865736.0, + "reward": 7.221906661987305, + "reward_std": 1.2299948930740356, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.3076668381690979, + "rewards/kidney_reward/std": 1.42197847366333, + "rewards/length2tails_reward/mean": 0.8056067824363708, + "rewards/length2tails_reward/std": 0.29334819316864014, + "rewards/thermo_reward/mean": 0.5507456064224243, + "rewards/thermo_reward/std": 1.7214829921722412, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.1360547747462988, + "epoch": 2.03, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5048738718032837, + "learning_rate": 5.087816808217005e-07, + "loss": -0.0072, + "num_tokens": 8874412.0, + "reward": 6.321559429168701, + "reward_std": 2.758941411972046, + "rewards/fitness_reward/mean": 6.007652282714844, + "rewards/fitness_reward/std": 2.296945095062256, + "rewards/kidney_reward/mean": 0.15560796856880188, + "rewards/kidney_reward/std": 1.418445110321045, + "rewards/length2tails_reward/mean": 0.8135966062545776, + "rewards/length2tails_reward/std": 0.25863197445869446, + "rewards/thermo_reward/mean": 0.06540745496749878, + "rewards/thermo_reward/std": 1.8245373964309692, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.139171889051795, + "epoch": 2.032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.467204213142395, + "learning_rate": 5.068956339793729e-07, + "loss": -0.0021, + "num_tokens": 8883125.0, + "reward": 6.411627769470215, + "reward_std": 2.601857900619507, + "rewards/fitness_reward/mean": 5.908493995666504, + "rewards/fitness_reward/std": 2.330082654953003, + "rewards/kidney_reward/mean": 0.15019190311431885, + "rewards/kidney_reward/std": 1.3077176809310913, + "rewards/length2tails_reward/mean": 0.8175090551376343, + "rewards/length2tails_reward/std": 0.21576936542987823, + "rewards/thermo_reward/mean": 0.4473201334476471, + "rewards/thermo_reward/std": 1.5371614694595337, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 285.84375, + "completions/mean_terminated_length": 270.7419128417969, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.18575789779424667, + "epoch": 2.034, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.596858024597168, + "learning_rate": 5.050119018781232e-07, + "loss": 0.0869, + "num_tokens": 8892304.0, + "reward": 6.052281379699707, + "reward_std": 3.941791296005249, + "rewards/fitness_reward/mean": 5.1664958000183105, + "rewards/fitness_reward/std": 3.558793783187866, + "rewards/kidney_reward/mean": 0.18012037873268127, + "rewards/kidney_reward/std": 1.382004976272583, + "rewards/length2tails_reward/mean": 0.9148824214935303, + "rewards/length2tails_reward/std": 0.1406431645154953, + "rewards/thermo_reward/mean": 1.134009599685669, + "rewards/thermo_reward/std": 1.2101317644119263, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.15625, + "completions/mean_terminated_length": 269.15625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1177364056929946, + "epoch": 2.036, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34429147839546204, + "learning_rate": 5.031304933606079e-07, + "loss": -0.0047, + "num_tokens": 8900949.0, + "reward": 7.072885513305664, + "reward_std": 1.2893093824386597, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.39102882146835327, + "rewards/kidney_reward/std": 1.3922938108444214, + "rewards/length2tails_reward/mean": 0.6944444179534912, + "rewards/length2tails_reward/std": 0.32499122619628906, + "rewards/thermo_reward/mean": 0.6368792653083801, + "rewards/thermo_reward/std": 1.591661810874939, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 267.4375, + "completions/mean_terminated_length": 267.4375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "entropy": 0.13640966173261404, + "epoch": 2.038, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.478261113166809, + "learning_rate": 5.012514172585743e-07, + "loss": -0.045, + "num_tokens": 8909539.0, + "reward": 6.289767265319824, + "reward_std": 3.241828441619873, + "rewards/fitness_reward/mean": 5.732573509216309, + "rewards/fitness_reward/std": 3.0037834644317627, + "rewards/kidney_reward/mean": 0.30768224596977234, + "rewards/kidney_reward/std": 1.4869787693023682, + "rewards/length2tails_reward/mean": 0.8040010929107666, + "rewards/length2tails_reward/std": 0.29502782225608826, + "rewards/thermo_reward/mean": 0.40470457077026367, + "rewards/thermo_reward/std": 1.8686271905899048, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 271.09375, + "completions/mean_terminated_length": 271.09375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.14380425121635199, + "epoch": 2.04, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.067531108856201, + "learning_rate": 4.993746823928225e-07, + "loss": -0.0603, + "num_tokens": 8918246.0, + "reward": 6.643622398376465, + "reward_std": 2.1346213817596436, + "rewards/fitness_reward/mean": 6.22418212890625, + "rewards/fitness_reward/std": 2.0767228603363037, + "rewards/kidney_reward/mean": 0.1659611612558365, + "rewards/kidney_reward/std": 1.2983551025390625, + "rewards/length2tails_reward/mean": 0.7676478624343872, + "rewards/length2tails_reward/std": 0.2882402241230011, + "rewards/thermo_reward/mean": 0.2890951633453369, + "rewards/thermo_reward/std": 1.8776684999465942, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 268.78125, + "completions/mean_terminated_length": 268.78125, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.13559523690491915, + "epoch": 2.042, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4033116400241852, + "learning_rate": 4.975002975731613e-07, + "loss": -0.0045, + "num_tokens": 8926879.0, + "reward": 7.048352241516113, + "reward_std": 1.3205783367156982, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.19255104660987854, + "rewards/kidney_reward/std": 1.3897212743759155, + "rewards/length2tails_reward/mean": 0.7244784832000732, + "rewards/length2tails_reward/std": 0.3199754059314728, + "rewards/thermo_reward/mean": 0.5652961730957031, + "rewards/thermo_reward/std": 1.5974398851394653, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 279.0, + "completions/mean_terminated_length": 279.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14934919960796833, + "epoch": 2.044, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9944722652435303, + "learning_rate": 4.956282715983676e-07, + "loss": 0.1101, + "num_tokens": 8935839.0, + "reward": 6.356311321258545, + "reward_std": 2.982609510421753, + "rewards/fitness_reward/mean": 5.7888336181640625, + "rewards/fitness_reward/std": 2.7868924140930176, + "rewards/kidney_reward/mean": 0.42405635118484497, + "rewards/kidney_reward/std": 1.410830020904541, + "rewards/length2tails_reward/mean": 0.7570521831512451, + "rewards/length2tails_reward/std": 0.3036043047904968, + "rewards/thermo_reward/mean": 0.332373708486557, + "rewards/thermo_reward/std": 1.613415241241455, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 742.0, + "completions/max_terminated_length": 742.0, + "completions/mean_length": 284.34375, + "completions/mean_terminated_length": 284.34375, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.1542113833129406, + "epoch": 2.046, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.622277736663818, + "learning_rate": 4.93758613256146e-07, + "loss": 0.1722, + "num_tokens": 8944970.0, + "reward": 6.475259780883789, + "reward_std": 3.02022647857666, + "rewards/fitness_reward/mean": 5.855088233947754, + "rewards/fitness_reward/std": 2.8969881534576416, + "rewards/kidney_reward/mean": 0.42711734771728516, + "rewards/kidney_reward/std": 1.2462999820709229, + "rewards/length2tails_reward/mean": 0.776665449142456, + "rewards/length2tails_reward/std": 0.29678577184677124, + "rewards/thermo_reward/mean": 0.4248921871185303, + "rewards/thermo_reward/std": 1.7062424421310425, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 282.3125, + "completions/mean_terminated_length": 282.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.171330819837749, + "epoch": 2.048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9251850247383118, + "learning_rate": 4.918913313230872e-07, + "loss": -0.0407, + "num_tokens": 8954036.0, + "reward": 7.037768840789795, + "reward_std": 2.081699848175049, + "rewards/fitness_reward/mean": 6.210823059082031, + "rewards/fitness_reward/std": 1.6566131114959717, + "rewards/kidney_reward/mean": 0.3910766839981079, + "rewards/kidney_reward/std": 1.5224268436431885, + "rewards/length2tails_reward/mean": 0.8150783777236938, + "rewards/length2tails_reward/std": 0.2721725404262543, + "rewards/thermo_reward/mean": 0.8552758097648621, + "rewards/thermo_reward/std": 1.2768155336380005, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 273.46875, + "completions/mean_terminated_length": 273.46875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14151266776025295, + "epoch": 2.05, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6221606731414795, + "learning_rate": 4.900264345646252e-07, + "loss": 0.0061, + "num_tokens": 8962819.0, + "reward": 6.020671844482422, + "reward_std": 2.9835360050201416, + "rewards/fitness_reward/mean": 5.836017608642578, + "rewards/fitness_reward/std": 2.637022018432617, + "rewards/kidney_reward/mean": 0.06206980347633362, + "rewards/kidney_reward/std": 1.3039110898971558, + "rewards/length2tails_reward/mean": 0.7436022758483887, + "rewards/length2tails_reward/std": 0.3127517104148865, + "rewards/thermo_reward/mean": -0.06456241011619568, + "rewards/thermo_reward/std": 1.8445780277252197, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14390545524656773, + "epoch": 2.052, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6293951272964478, + "learning_rate": 4.881639317349993e-07, + "loss": -0.0016, + "num_tokens": 8971518.0, + "reward": 7.242187976837158, + "reward_std": 1.1841943264007568, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.37018877267837524, + "rewards/kidney_reward/std": 1.434311032295227, + "rewards/length2tails_reward/mean": 0.7551816701889038, + "rewards/length2tails_reward/std": 0.2715857923030853, + "rewards/thermo_reward/mean": 0.7599782347679138, + "rewards/thermo_reward/std": 1.4652702808380127, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "entropy": 0.1406329283490777, + "epoch": 2.054, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7526205778121948, + "learning_rate": 4.863038315772098e-07, + "loss": -0.0081, + "num_tokens": 8980212.0, + "reward": 6.749802589416504, + "reward_std": 2.3985395431518555, + "rewards/fitness_reward/mean": 6.165776252746582, + "rewards/fitness_reward/std": 1.897290825843811, + "rewards/kidney_reward/mean": 0.34197118878364563, + "rewards/kidney_reward/std": 1.4800212383270264, + "rewards/length2tails_reward/mean": 0.7930814027786255, + "rewards/length2tails_reward/std": 0.3019961714744568, + "rewards/thermo_reward/mean": 0.42954063415527344, + "rewards/thermo_reward/std": 1.6752840280532837, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.14669146947562695, + "epoch": 2.056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.730690598487854, + "learning_rate": 4.844461428229781e-07, + "loss": -0.0, + "num_tokens": 8988909.0, + "reward": 7.121438503265381, + "reward_std": 1.1037371158599854, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.37236011028289795, + "rewards/kidney_reward/std": 1.4141560792922974, + "rewards/length2tails_reward/mean": 0.8111867308616638, + "rewards/length2tails_reward/std": 0.24467790126800537, + "rewards/thermo_reward/mean": 0.28232720494270325, + "rewards/thermo_reward/std": 1.6805909872055054, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.14412975870072842, + "epoch": 2.058, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5269250273704529, + "learning_rate": 4.825908741927075e-07, + "loss": 0.0009, + "num_tokens": 8997608.0, + "reward": 7.224687099456787, + "reward_std": 1.6029869318008423, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.22163981199264526, + "rewards/kidney_reward/std": 1.3422000408172607, + "rewards/length2tails_reward/mean": 0.8514060974121094, + "rewards/length2tails_reward/std": 0.1646190881729126, + "rewards/thermo_reward/mean": 0.825412392616272, + "rewards/thermo_reward/std": 1.6672929525375366, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1454497305676341, + "epoch": 2.06, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5121469497680664, + "learning_rate": 4.807380343954394e-07, + "loss": 0.0049, + "num_tokens": 9006323.0, + "reward": 7.05455207824707, + "reward_std": 1.2464817762374878, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.06355097889900208, + "rewards/kidney_reward/std": 1.3287028074264526, + "rewards/length2tails_reward/mean": 0.7980195879936218, + "rewards/length2tails_reward/std": 0.26677194237709045, + "rewards/thermo_reward/mean": 0.5910481214523315, + "rewards/thermo_reward/std": 1.912475347518921, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 277.875, + "completions/mean_terminated_length": 277.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.15601969044655561, + "epoch": 2.062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6845623850822449, + "learning_rate": 4.78887632128814e-07, + "loss": -0.0268, + "num_tokens": 9015247.0, + "reward": 7.127263069152832, + "reward_std": 1.262903094291687, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.5093056559562683, + "rewards/kidney_reward/std": 1.1974871158599854, + "rewards/length2tails_reward/mean": 0.8209215998649597, + "rewards/length2tails_reward/std": 0.19257116317749023, + "rewards/thermo_reward/mean": 0.35814082622528076, + "rewards/thermo_reward/std": 1.6598974466323853, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 278.0625, + "completions/mean_terminated_length": 278.0625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.15400866977870464, + "epoch": 2.064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4840766489505768, + "learning_rate": 4.770396760790289e-07, + "loss": 0.0097, + "num_tokens": 9024177.0, + "reward": 7.5246381759643555, + "reward_std": 0.8974347114562988, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.3858160376548767, + "rewards/kidney_reward/std": 1.2422213554382324, + "rewards/length2tails_reward/mean": 0.805591344833374, + "rewards/length2tails_reward/std": 0.2311784327030182, + "rewards/thermo_reward/mean": 1.0780693292617798, + "rewards/thermo_reward/std": 1.3589732646942139, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.1875, + "completions/mean_terminated_length": 270.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1455208994448185, + "epoch": 2.066, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6580639481544495, + "learning_rate": 4.751941749207995e-07, + "loss": -0.003, + "num_tokens": 9032855.0, + "reward": 6.987330436706543, + "reward_std": 2.369500160217285, + "rewards/fitness_reward/mean": 6.312489986419678, + "rewards/fitness_reward/std": 1.577176570892334, + "rewards/kidney_reward/mean": 0.26905617117881775, + "rewards/kidney_reward/std": 1.5094667673110962, + "rewards/length2tails_reward/mean": 0.7001799941062927, + "rewards/length2tails_reward/std": 0.29981929063796997, + "rewards/thermo_reward/mean": 0.7305335402488708, + "rewards/thermo_reward/std": 1.5335500240325928, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 277.90625, + "completions/mean_terminated_length": 277.90625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.20435741264373064, + "epoch": 2.068, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.110538005828857, + "learning_rate": 4.7335113731731647e-07, + "loss": 0.1045, + "num_tokens": 9041780.0, + "reward": 6.7514543533325195, + "reward_std": 2.0884392261505127, + "rewards/fitness_reward/mean": 6.251962661743164, + "rewards/fitness_reward/std": 1.9195725917816162, + "rewards/kidney_reward/mean": 0.448459267616272, + "rewards/kidney_reward/std": 1.2899540662765503, + "rewards/length2tails_reward/mean": 0.7787557244300842, + "rewards/length2tails_reward/std": 0.28578972816467285, + "rewards/thermo_reward/mean": 0.16114729642868042, + "rewards/thermo_reward/std": 1.622487187385559, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 277.6875, + "completions/mean_terminated_length": 277.6875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15604436211287975, + "epoch": 2.07, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5699193477630615, + "learning_rate": 4.7151057192020704e-07, + "loss": 0.1168, + "num_tokens": 9050698.0, + "reward": 6.861456871032715, + "reward_std": 1.9871493577957153, + "rewards/fitness_reward/mean": 6.284705638885498, + "rewards/fitness_reward/std": 1.734348177909851, + "rewards/kidney_reward/mean": 0.0003357790410518646, + "rewards/kidney_reward/std": 1.4158393144607544, + "rewards/length2tails_reward/mean": 0.8217647075653076, + "rewards/length2tails_reward/std": 0.21845299005508423, + "rewards/thermo_reward/mean": 0.7422834038734436, + "rewards/thermo_reward/std": 1.6536952257156372, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.1875, + "completions/mean_terminated_length": 269.1875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.12805430870503187, + "epoch": 2.072, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42426878213882446, + "learning_rate": 4.6967248736949225e-07, + "loss": 0.0006, + "num_tokens": 9059344.0, + "reward": 6.356057167053223, + "reward_std": 2.8867955207824707, + "rewards/fitness_reward/mean": 5.916177749633789, + "rewards/fitness_reward/std": 2.2993054389953613, + "rewards/kidney_reward/mean": -0.041913919150829315, + "rewards/kidney_reward/std": 1.2173104286193848, + "rewards/length2tails_reward/mean": 0.6832467317581177, + "rewards/length2tails_reward/std": 0.3525755703449249, + "rewards/thermo_reward/mean": 0.5800493955612183, + "rewards/thermo_reward/std": 1.6294580698013306, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1418345794081688, + "epoch": 2.074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2959369719028473, + "learning_rate": 4.678368922935477e-07, + "loss": -0.0021, + "num_tokens": 9068109.0, + "reward": 6.880620002746582, + "reward_std": 1.9925130605697632, + "rewards/fitness_reward/mean": 6.315413475036621, + "rewards/fitness_reward/std": 1.5606398582458496, + "rewards/kidney_reward/mean": 0.1010267361998558, + "rewards/kidney_reward/std": 1.34903883934021, + "rewards/length2tails_reward/mean": 0.8397651314735413, + "rewards/length2tails_reward/std": 0.20872771739959717, + "rewards/thermo_reward/mean": 0.6095039248466492, + "rewards/thermo_reward/std": 1.655917763710022, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.14336485415697098, + "epoch": 2.076, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.512673556804657, + "learning_rate": 4.6600379530906387e-07, + "loss": -0.0001, + "num_tokens": 9076841.0, + "reward": 7.265508651733398, + "reward_std": 1.050194501876831, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.29955047369003296, + "rewards/kidney_reward/std": 1.2635717391967773, + "rewards/length2tails_reward/mean": 0.8542990684509277, + "rewards/length2tails_reward/std": 0.17856182157993317, + "rewards/thermo_reward/mean": 0.8276984691619873, + "rewards/thermo_reward/std": 1.4903786182403564, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.71875, + "completions/mean_terminated_length": 269.71875, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.14059667382389307, + "epoch": 2.078, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0099124908447266, + "learning_rate": 4.641732050210031e-07, + "loss": -0.0126, + "num_tokens": 9085504.0, + "reward": 6.333549499511719, + "reward_std": 2.951432704925537, + "rewards/fitness_reward/mean": 5.879779815673828, + "rewards/fitness_reward/std": 2.4490509033203125, + "rewards/kidney_reward/mean": 0.3323780298233032, + "rewards/kidney_reward/std": 1.2020114660263062, + "rewards/length2tails_reward/mean": 0.7716349363327026, + "rewards/length2tails_reward/std": 0.2995572090148926, + "rewards/thermo_reward/mean": 0.18934288620948792, + "rewards/thermo_reward/std": 1.8173725605010986, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13878013286739588, + "epoch": 2.08, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6474786996841431, + "learning_rate": 4.623451300225627e-07, + "loss": 0.0047, + "num_tokens": 9094200.0, + "reward": 6.6942548751831055, + "reward_std": 2.107776641845703, + "rewards/fitness_reward/mean": 6.179112911224365, + "rewards/fitness_reward/std": 1.8256361484527588, + "rewards/kidney_reward/mean": 0.168605774641037, + "rewards/kidney_reward/std": 1.2290352582931519, + "rewards/length2tails_reward/mean": 0.7750190496444702, + "rewards/length2tails_reward/std": 0.2799600064754486, + "rewards/thermo_reward/mean": 0.4741685390472412, + "rewards/thermo_reward/std": 1.8475559949874878, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 274.375, + "completions/mean_terminated_length": 274.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1635871296748519, + "epoch": 2.082, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.860499918460846, + "learning_rate": 4.605195788951299e-07, + "loss": 0.0128, + "num_tokens": 9103012.0, + "reward": 6.698207855224609, + "reward_std": 1.5176252126693726, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": 0.10077647119760513, + "rewards/kidney_reward/std": 1.3980679512023926, + "rewards/length2tails_reward/mean": 0.8218721151351929, + "rewards/length2tails_reward/std": 0.2692514955997467, + "rewards/thermo_reward/mean": 0.320041298866272, + "rewards/thermo_reward/std": 1.713622808456421, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 275.875, + "completions/mean_terminated_length": 275.875, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.16335375607013702, + "epoch": 2.084, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40944766998291, + "learning_rate": 4.586965602082472e-07, + "loss": 0.0521, + "num_tokens": 9111872.0, + "reward": 5.924317836761475, + "reward_std": 3.1954243183135986, + "rewards/fitness_reward/mean": 5.727382183074951, + "rewards/fitness_reward/std": 3.024923324584961, + "rewards/kidney_reward/mean": 0.09149547666311264, + "rewards/kidney_reward/std": 1.3352373838424683, + "rewards/length2tails_reward/mean": 0.8347629308700562, + "rewards/length2tails_reward/std": 0.2557745575904846, + "rewards/thermo_reward/mean": -0.11500567197799683, + "rewards/thermo_reward/std": 1.9267534017562866, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 263.5625, + "completions/mean_terminated_length": 263.5625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.13376879505813122, + "epoch": 2.086, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5651292204856873, + "learning_rate": 4.568760825195671e-07, + "loss": -0.1013, + "num_tokens": 9120338.0, + "reward": 6.283308982849121, + "reward_std": 2.4769811630249023, + "rewards/fitness_reward/mean": 5.924806594848633, + "rewards/fitness_reward/std": 2.1942782402038574, + "rewards/kidney_reward/mean": 0.013618044555187225, + "rewards/kidney_reward/std": 1.1817971467971802, + "rewards/length2tails_reward/mean": 0.7464162111282349, + "rewards/length2tails_reward/std": 0.30035126209259033, + "rewards/thermo_reward/mean": 0.33017855882644653, + "rewards/thermo_reward/std": 1.8270559310913086, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 281.75, + "completions/mean_terminated_length": 281.75, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "entropy": 0.1582850143313408, + "epoch": 2.088, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.323350429534912, + "learning_rate": 4.550581543748161e-07, + "loss": 0.2044, + "num_tokens": 9129386.0, + "reward": 6.346465587615967, + "reward_std": 2.581454277038574, + "rewards/fitness_reward/mean": 6.041278839111328, + "rewards/fitness_reward/std": 2.073233127593994, + "rewards/kidney_reward/mean": 0.2528759837150574, + "rewards/kidney_reward/std": 1.4060814380645752, + "rewards/length2tails_reward/mean": 0.7778013944625854, + "rewards/length2tails_reward/std": 0.27179858088493347, + "rewards/thermo_reward/mean": -0.03140313923358917, + "rewards/thermo_reward/std": 1.7661538124084473, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1321554696187377, + "epoch": 2.09, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7698079347610474, + "learning_rate": 4.532427843077499e-07, + "loss": -0.0072, + "num_tokens": 9138074.0, + "reward": 6.875519752502441, + "reward_std": 1.209743857383728, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.1187310740351677, + "rewards/kidney_reward/std": 1.248063564300537, + "rewards/length2tails_reward/mean": 0.6648907661437988, + "rewards/length2tails_reward/std": 0.3353273570537567, + "rewards/thermo_reward/mean": 0.5292226076126099, + "rewards/thermo_reward/std": 1.6276687383651733, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1388536784797907, + "epoch": 2.092, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9407142400741577, + "learning_rate": 4.514299808401184e-07, + "loss": 0.0082, + "num_tokens": 9146804.0, + "reward": 6.539276123046875, + "reward_std": 2.3540897369384766, + "rewards/fitness_reward/mean": 6.254793643951416, + "rewards/fitness_reward/std": 1.9035568237304688, + "rewards/kidney_reward/mean": 0.24786502122879028, + "rewards/kidney_reward/std": 1.319318413734436, + "rewards/length2tails_reward/mean": 0.7805852890014648, + "rewards/length2tails_reward/std": 0.32910311222076416, + "rewards/thermo_reward/mean": -0.0691927969455719, + "rewards/thermo_reward/std": 2.0602610111236572, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 269.71875, + "completions/mean_terminated_length": 269.71875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.14801343716681004, + "epoch": 2.094, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1686818599700928, + "learning_rate": 4.4961975248162285e-07, + "loss": -0.0077, + "num_tokens": 9155467.0, + "reward": 7.095392227172852, + "reward_std": 2.0793514251708984, + "rewards/fitness_reward/mean": 6.2586493492126465, + "rewards/fitness_reward/std": 1.881745457649231, + "rewards/kidney_reward/mean": 0.48106059432029724, + "rewards/kidney_reward/std": 1.4661235809326172, + "rewards/length2tails_reward/mean": 0.7982726693153381, + "rewards/length2tails_reward/std": 0.2307547628879547, + "rewards/thermo_reward/mean": 0.7932881116867065, + "rewards/thermo_reward/std": 1.5425667762756348, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 272.71875, + "completions/mean_terminated_length": 272.71875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1529099326580763, + "epoch": 2.096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8789366483688354, + "learning_rate": 4.478121077298751e-07, + "loss": -0.0037, + "num_tokens": 9164226.0, + "reward": 6.994852066040039, + "reward_std": 2.0529611110687256, + "rewards/fitness_reward/mean": 6.308650016784668, + "rewards/fitness_reward/std": 1.5988991260528564, + "rewards/kidney_reward/mean": 0.3753259778022766, + "rewards/kidney_reward/std": 1.5336309671401978, + "rewards/length2tails_reward/mean": 0.7723013162612915, + "rewards/length2tails_reward/std": 0.25798913836479187, + "rewards/thermo_reward/mean": 0.6109280586242676, + "rewards/thermo_reward/std": 1.6980129480361938, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1489074667915702, + "epoch": 2.098, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.108884572982788, + "learning_rate": 4.460070550703612e-07, + "loss": -0.0024, + "num_tokens": 9172986.0, + "reward": 6.931676387786865, + "reward_std": 2.9280824661254883, + "rewards/fitness_reward/mean": 6.1156768798828125, + "rewards/fitness_reward/std": 2.1687655448913574, + "rewards/kidney_reward/mean": 0.6510825157165527, + "rewards/kidney_reward/std": 1.371528148651123, + "rewards/length2tails_reward/mean": 0.7745834589004517, + "rewards/length2tails_reward/std": 0.2728259861469269, + "rewards/thermo_reward/mean": 0.5936248898506165, + "rewards/thermo_reward/std": 1.6564568281173706, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.09375, + "completions/mean_terminated_length": 269.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11904279608279467, + "epoch": 2.1, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.929303765296936, + "learning_rate": 4.442046029763964e-07, + "loss": -0.0005, + "num_tokens": 9181629.0, + "reward": 7.031017303466797, + "reward_std": 1.5965406894683838, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.731277346611023, + "rewards/kidney_reward/std": 1.3532460927963257, + "rewards/length2tails_reward/mean": 0.6934307813644409, + "rewards/length2tails_reward/std": 0.32266533374786377, + "rewards/thermo_reward/mean": 0.21340101957321167, + "rewards/thermo_reward/std": 1.7386853694915771, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1478506438434124, + "epoch": 2.102, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2824796438217163, + "learning_rate": 4.4240475990909097e-07, + "loss": -0.0032, + "num_tokens": 9190355.0, + "reward": 6.377861976623535, + "reward_std": 1.7648645639419556, + "rewards/fitness_reward/mean": 5.973363876342773, + "rewards/fitness_reward/std": 1.3069151639938354, + "rewards/kidney_reward/mean": 0.016602501273155212, + "rewards/kidney_reward/std": 1.252076268196106, + "rewards/length2tails_reward/mean": 0.6819515228271484, + "rewards/length2tails_reward/std": 0.35075509548187256, + "rewards/thermo_reward/mean": 0.4514170289039612, + "rewards/thermo_reward/std": 1.693482518196106, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 269.90625, + "completions/mean_terminated_length": 269.90625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1303664706647396, + "epoch": 2.104, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.654961109161377, + "learning_rate": 4.4060753431730625e-07, + "loss": -0.0049, + "num_tokens": 9199024.0, + "reward": 7.219334602355957, + "reward_std": 1.0902973413467407, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.547929048538208, + "rewards/kidney_reward/std": 1.2476272583007812, + "rewards/length2tails_reward/mean": 0.6591386795043945, + "rewards/length2tails_reward/std": 0.3145107924938202, + "rewards/thermo_reward/mean": 0.3785743713378906, + "rewards/thermo_reward/std": 1.622428297996521, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 273.09375, + "completions/mean_terminated_length": 273.09375, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.14913088083267212, + "epoch": 2.106, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7827045917510986, + "learning_rate": 4.388129346376177e-07, + "loss": 0.0336, + "num_tokens": 9207795.0, + "reward": 5.995694637298584, + "reward_std": 3.1616592407226562, + "rewards/fitness_reward/mean": 5.664544105529785, + "rewards/fitness_reward/std": 2.899955987930298, + "rewards/kidney_reward/mean": 0.3902604877948761, + "rewards/kidney_reward/std": 1.205466628074646, + "rewards/length2tails_reward/mean": 0.7591409683227539, + "rewards/length2tails_reward/std": 0.3001365661621094, + "rewards/thermo_reward/mean": -0.10752996802330017, + "rewards/thermo_reward/std": 2.1284523010253906, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 279.0625, + "completions/mean_terminated_length": 279.0625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.1522047258913517, + "epoch": 2.108, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.136735200881958, + "learning_rate": 4.370209692942732e-07, + "loss": 0.0449, + "num_tokens": 9216757.0, + "reward": 6.766518592834473, + "reward_std": 2.9295687675476074, + "rewards/fitness_reward/mean": 5.870096683502197, + "rewards/fitness_reward/std": 2.8411402702331543, + "rewards/kidney_reward/mean": 0.47631680965423584, + "rewards/kidney_reward/std": 1.3965466022491455, + "rewards/length2tails_reward/mean": 0.7997990846633911, + "rewards/length2tails_reward/std": 0.2738811671733856, + "rewards/thermo_reward/mean": 0.9166274070739746, + "rewards/thermo_reward/std": 1.5720754861831665, + "step": 1054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1497397581115365, + "epoch": 2.11, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3835788667201996, + "learning_rate": 4.352316466991549e-07, + "loss": 0.0002, + "num_tokens": 9225467.0, + "reward": 7.368613243103027, + "reward_std": 0.9548452496528625, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.09489268809556961, + "rewards/kidney_reward/std": 1.223229169845581, + "rewards/length2tails_reward/mean": 0.773529052734375, + "rewards/length2tails_reward/std": 0.3031114339828491, + "rewards/thermo_reward/mean": 1.0729725360870361, + "rewards/thermo_reward/std": 1.3538482189178467, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14060856774449348, + "epoch": 2.112, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2172234058380127, + "learning_rate": 4.33444975251739e-07, + "loss": 0.0093, + "num_tokens": 9234169.0, + "reward": 6.5072150230407715, + "reward_std": 3.0782957077026367, + "rewards/fitness_reward/mean": 5.865594387054443, + "rewards/fitness_reward/std": 2.8563311100006104, + "rewards/kidney_reward/mean": 0.17112207412719727, + "rewards/kidney_reward/std": 1.348432183265686, + "rewards/length2tails_reward/mean": 0.7674511075019836, + "rewards/length2tails_reward/std": 0.2756768465042114, + "rewards/thermo_reward/mean": 0.7283921837806702, + "rewards/thermo_reward/std": 1.3867164850234985, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12399530690163374, + "epoch": 2.114, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5940439701080322, + "learning_rate": 4.3166096333905743e-07, + "loss": -0.0007, + "num_tokens": 9242883.0, + "reward": 6.542070388793945, + "reward_std": 1.4841806888580322, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.07396991550922394, + "rewards/kidney_reward/std": 1.2442100048065186, + "rewards/length2tails_reward/mean": 0.840146541595459, + "rewards/length2tails_reward/std": 0.21365833282470703, + "rewards/thermo_reward/mean": -0.3865211009979248, + "rewards/thermo_reward/std": 1.8812166452407837, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 285.0625, + "completions/mean_terminated_length": 285.0625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.190596092492342, + "epoch": 2.116, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.351926326751709, + "learning_rate": 4.2987961933565653e-07, + "loss": 0.1273, + "num_tokens": 9252037.0, + "reward": 6.373692512512207, + "reward_std": 3.046121120452881, + "rewards/fitness_reward/mean": 5.859956741333008, + "rewards/fitness_reward/std": 2.877810478210449, + "rewards/kidney_reward/mean": 0.06937101483345032, + "rewards/kidney_reward/std": 1.4734275341033936, + "rewards/length2tails_reward/mean": 0.7976754903793335, + "rewards/length2tails_reward/std": 0.31712231040000916, + "rewards/thermo_reward/mean": 0.5592616200447083, + "rewards/thermo_reward/std": 1.6919496059417725, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 273.5, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.16870292089879513, + "epoch": 2.118, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5312401056289673, + "learning_rate": 4.281009516035602e-07, + "loss": -0.0762, + "num_tokens": 9260821.0, + "reward": 6.317414283752441, + "reward_std": 3.3071558475494385, + "rewards/fitness_reward/mean": 5.676151752471924, + "rewards/fitness_reward/std": 2.855370283126831, + "rewards/kidney_reward/mean": 0.3555218279361725, + "rewards/kidney_reward/std": 1.412030577659607, + "rewards/length2tails_reward/mean": 0.8570415377616882, + "rewards/length2tails_reward/std": 0.24160458147525787, + "rewards/thermo_reward/mean": 0.49848252534866333, + "rewards/thermo_reward/std": 1.7229719161987305, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13013274781405926, + "epoch": 2.12, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6609305143356323, + "learning_rate": 4.263249684922281e-07, + "loss": 0.0069, + "num_tokens": 9269534.0, + "reward": 6.0955095291137695, + "reward_std": 2.933845281600952, + "rewards/fitness_reward/mean": 5.685418605804443, + "rewards/fitness_reward/std": 2.4676737785339355, + "rewards/kidney_reward/mean": -0.24257102608680725, + "rewards/kidney_reward/std": 1.2613190412521362, + "rewards/length2tails_reward/mean": 0.7583092451095581, + "rewards/length2tails_reward/std": 0.32557952404022217, + "rewards/thermo_reward/mean": 0.6835988759994507, + "rewards/thermo_reward/std": 1.4879592657089233, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.03125, + "completions/mean_terminated_length": 271.03125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14275843556970358, + "epoch": 2.122, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4426646828651428, + "learning_rate": 4.24551678338518e-07, + "loss": -0.0028, + "num_tokens": 9278239.0, + "reward": 6.526113033294678, + "reward_std": 1.5891051292419434, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.14219215512275696, + "rewards/kidney_reward/std": 1.4604437351226807, + "rewards/length2tails_reward/mean": 0.8299112319946289, + "rewards/length2tails_reward/std": 0.25633201003074646, + "rewards/thermo_reward/mean": -0.19715654850006104, + "rewards/thermo_reward/std": 1.9208016395568848, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 285.8125, + "completions/mean_terminated_length": 285.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.18135285377502441, + "epoch": 2.124, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.010969161987305, + "learning_rate": 4.2278108946664714e-07, + "loss": 0.1759, + "num_tokens": 9287417.0, + "reward": 6.668601989746094, + "reward_std": 3.064962387084961, + "rewards/fitness_reward/mean": 5.763694763183594, + "rewards/fitness_reward/std": 2.885802984237671, + "rewards/kidney_reward/mean": 0.5370638966560364, + "rewards/kidney_reward/std": 1.1695818901062012, + "rewards/length2tails_reward/mean": 0.7554937601089478, + "rewards/length2tails_reward/std": 0.2684459984302521, + "rewards/thermo_reward/mean": 0.8950035572052002, + "rewards/thermo_reward/std": 1.482263207435608, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1311612306162715, + "epoch": 2.126, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7452146410942078, + "learning_rate": 4.210132101881515e-07, + "loss": 0.0045, + "num_tokens": 9296110.0, + "reward": 6.8865790367126465, + "reward_std": 1.4528340101242065, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.030711792409420013, + "rewards/kidney_reward/std": 1.4708689451217651, + "rewards/length2tails_reward/mean": 0.7890916466712952, + "rewards/length2tails_reward/std": 0.26916781067848206, + "rewards/thermo_reward/mean": 0.1653042733669281, + "rewards/thermo_reward/std": 1.922910213470459, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 273.8125, + "completions/mean_terminated_length": 273.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.16036415845155716, + "epoch": 2.128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6204653978347778, + "learning_rate": 4.192480488018477e-07, + "loss": 0.0095, + "num_tokens": 9304904.0, + "reward": 6.976659774780273, + "reward_std": 1.1679787635803223, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.07483702898025513, + "rewards/kidney_reward/std": 1.328316330909729, + "rewards/length2tails_reward/mean": 0.7744359970092773, + "rewards/length2tails_reward/std": 0.2793138325214386, + "rewards/thermo_reward/mean": 0.514646053314209, + "rewards/thermo_reward/std": 1.537448525428772, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 278.09375, + "completions/mean_terminated_length": 278.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.18581388425081968, + "epoch": 2.13, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.675136923789978, + "learning_rate": 4.1748561359379395e-07, + "loss": 0.0236, + "num_tokens": 9313835.0, + "reward": 7.2237958908081055, + "reward_std": 1.074021339416504, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.3321935534477234, + "rewards/kidney_reward/std": 1.2390544414520264, + "rewards/length2tails_reward/mean": 0.8383061289787292, + "rewards/length2tails_reward/std": 0.22828836739063263, + "rewards/thermo_reward/mean": 0.5136481523513794, + "rewards/thermo_reward/std": 1.7810319662094116, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.12908640317618847, + "epoch": 2.132, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45260632038116455, + "learning_rate": 4.15725912837252e-07, + "loss": 0.0028, + "num_tokens": 9322511.0, + "reward": 6.571354389190674, + "reward_std": 1.3069359064102173, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.1280260980129242, + "rewards/kidney_reward/std": 1.4540979862213135, + "rewards/length2tails_reward/mean": 0.7143810987472534, + "rewards/length2tails_reward/std": 0.32245200872421265, + "rewards/thermo_reward/mean": -0.5251045227050781, + "rewards/thermo_reward/std": 1.8070964813232422, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.09375, + "completions/mean_terminated_length": 269.09375, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.13585994392633438, + "epoch": 2.134, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4435308873653412, + "learning_rate": 4.1396895479264603e-07, + "loss": 0.0028, + "num_tokens": 9331154.0, + "reward": 7.2967000007629395, + "reward_std": 1.451534390449524, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.6194279193878174, + "rewards/kidney_reward/std": 1.3039216995239258, + "rewards/length2tails_reward/mean": 0.7233110070228577, + "rewards/length2tails_reward/std": 0.29798802733421326, + "rewards/thermo_reward/mean": 0.8416768908500671, + "rewards/thermo_reward/std": 1.4932119846343994, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.16679678577929735, + "epoch": 2.136, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.622966766357422, + "learning_rate": 4.1221474770752696e-07, + "loss": -0.0152, + "num_tokens": 9339901.0, + "reward": 5.919442176818848, + "reward_std": 3.924933910369873, + "rewards/fitness_reward/mean": 5.175980567932129, + "rewards/fitness_reward/std": 3.8182079792022705, + "rewards/kidney_reward/mean": 0.16596902906894684, + "rewards/kidney_reward/std": 1.2695493698120117, + "rewards/length2tails_reward/mean": 0.8167648911476135, + "rewards/length2tails_reward/std": 0.2352745532989502, + "rewards/thermo_reward/mean": 0.9125717878341675, + "rewards/thermo_reward/std": 1.2919182777404785, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 273.03125, + "completions/mean_terminated_length": 273.03125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1500767981633544, + "epoch": 2.138, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6282027959823608, + "learning_rate": 4.1046329981653085e-07, + "loss": 0.0021, + "num_tokens": 9348670.0, + "reward": 6.912932872772217, + "reward_std": 1.4704110622406006, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.3062431216239929, + "rewards/kidney_reward/std": 1.4361602067947388, + "rewards/length2tails_reward/mean": 0.7091963291168213, + "rewards/length2tails_reward/std": 0.3606708347797394, + "rewards/thermo_reward/mean": 0.1884058117866516, + "rewards/thermo_reward/std": 1.8143515586853027, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 273.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15921393502503633, + "epoch": 2.14, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.351198434829712, + "learning_rate": 4.0871461934134156e-07, + "loss": -0.0056, + "num_tokens": 9357450.0, + "reward": 5.995494842529297, + "reward_std": 3.5377626419067383, + "rewards/fitness_reward/mean": 5.550123691558838, + "rewards/fitness_reward/std": 3.3063604831695557, + "rewards/kidney_reward/mean": 0.27938735485076904, + "rewards/kidney_reward/std": 1.4755336046218872, + "rewards/length2tails_reward/mean": 0.8054314851760864, + "rewards/length2tails_reward/std": 0.29145053029060364, + "rewards/thermo_reward/mean": 0.20863863825798035, + "rewards/thermo_reward/std": 1.7887850999832153, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13788711186498404, + "epoch": 2.142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4235988259315491, + "learning_rate": 4.069687144906532e-07, + "loss": -0.0004, + "num_tokens": 9366166.0, + "reward": 6.857729911804199, + "reward_std": 1.6800484657287598, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.11377771198749542, + "rewards/kidney_reward/std": 1.411139726638794, + "rewards/length2tails_reward/mean": 0.8295034170150757, + "rewards/length2tails_reward/std": 0.25271517038345337, + "rewards/thermo_reward/mean": 0.4162905812263489, + "rewards/thermo_reward/std": 1.919075608253479, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.1441486943513155, + "epoch": 2.144, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9590158462524414, + "learning_rate": 4.05225593460129e-07, + "loss": -0.0028, + "num_tokens": 9374850.0, + "reward": 6.457082748413086, + "reward_std": 3.3118820190429688, + "rewards/fitness_reward/mean": 5.8487443923950195, + "rewards/fitness_reward/std": 2.923922300338745, + "rewards/kidney_reward/mean": 0.16359946131706238, + "rewards/kidney_reward/std": 1.2666887044906616, + "rewards/length2tails_reward/mean": 0.8118967413902283, + "rewards/length2tails_reward/std": 0.258676141500473, + "rewards/thermo_reward/mean": 0.6471285820007324, + "rewards/thermo_reward/std": 1.5702543258666992, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13847491145133972, + "epoch": 2.146, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.343187928199768, + "learning_rate": 4.0348526443236606e-07, + "loss": 0.0258, + "num_tokens": 9383600.0, + "reward": 7.186614036560059, + "reward_std": 2.465003728866577, + "rewards/fitness_reward/mean": 6.223491668701172, + "rewards/fitness_reward/std": 2.0806260108947754, + "rewards/kidney_reward/mean": 0.7199074029922485, + "rewards/kidney_reward/std": 1.3911585807800293, + "rewards/length2tails_reward/mean": 0.8268277049064636, + "rewards/length2tails_reward/std": 0.23231400549411774, + "rewards/thermo_reward/mean": 0.792922854423523, + "rewards/thermo_reward/std": 1.5512793064117432, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.3125, + "completions/mean_terminated_length": 270.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1332150986418128, + "epoch": 2.148, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44490352272987366, + "learning_rate": 4.0174773557685283e-07, + "loss": 0.0008, + "num_tokens": 9392282.0, + "reward": 7.190145969390869, + "reward_std": 1.1982930898666382, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.3189336955547333, + "rewards/kidney_reward/std": 1.3217718601226807, + "rewards/length2tails_reward/mean": 0.8060603141784668, + "rewards/length2tails_reward/std": 0.20847615599632263, + "rewards/thermo_reward/mean": 0.6817101836204529, + "rewards/thermo_reward/std": 1.6188526153564453, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 263.21875, + "completions/mean_terminated_length": 263.21875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.13945792708545923, + "epoch": 2.15, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7005243897438049, + "learning_rate": 4.000130150499355e-07, + "loss": -0.1119, + "num_tokens": 9400737.0, + "reward": 6.276940822601318, + "reward_std": 2.83335542678833, + "rewards/fitness_reward/mean": 5.829273223876953, + "rewards/fitness_reward/std": 2.6423723697662354, + "rewards/kidney_reward/mean": 0.43410342931747437, + "rewards/kidney_reward/std": 1.3488463163375854, + "rewards/length2tails_reward/mean": 0.7987202405929565, + "rewards/length2tails_reward/std": 0.25400641560554504, + "rewards/thermo_reward/mean": 0.06187206506729126, + "rewards/thermo_reward/std": 1.9264873266220093, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 268.375, + "completions/mean_terminated_length": 268.375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "entropy": 0.14116911962628365, + "epoch": 2.152, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5681545734405518, + "learning_rate": 3.9828111099477544e-07, + "loss": -0.0276, + "num_tokens": 9409357.0, + "reward": 6.884678840637207, + "reward_std": 2.441939353942871, + "rewards/fitness_reward/mean": 6.239100933074951, + "rewards/fitness_reward/std": 1.9923285245895386, + "rewards/kidney_reward/mean": 0.40994834899902344, + "rewards/kidney_reward/std": 1.3353699445724487, + "rewards/length2tails_reward/mean": 0.7378535270690918, + "rewards/length2tails_reward/std": 0.27312368154525757, + "rewards/thermo_reward/mean": 0.5122808218002319, + "rewards/thermo_reward/std": 1.5303328037261963, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14342812448740005, + "epoch": 2.154, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1632758378982544, + "learning_rate": 3.965520315413149e-07, + "loss": 0.0035, + "num_tokens": 9418050.0, + "reward": 6.916120529174805, + "reward_std": 2.487173557281494, + "rewards/fitness_reward/mean": 6.017578601837158, + "rewards/fitness_reward/std": 2.197279930114746, + "rewards/kidney_reward/mean": 0.41827717423439026, + "rewards/kidney_reward/std": 1.3443899154663086, + "rewards/length2tails_reward/mean": 0.7684807777404785, + "rewards/length2tails_reward/std": 0.32451915740966797, + "rewards/thermo_reward/mean": 0.9945659637451172, + "rewards/thermo_reward/std": 1.3899034261703491, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 277.4375, + "completions/mean_terminated_length": 277.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.19726300425827503, + "epoch": 2.156, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7620906829833984, + "learning_rate": 3.948257848062351e-07, + "loss": 0.0852, + "num_tokens": 9426960.0, + "reward": 6.373462200164795, + "reward_std": 2.476536750793457, + "rewards/fitness_reward/mean": 5.795304775238037, + "rewards/fitness_reward/std": 2.770754814147949, + "rewards/kidney_reward/mean": 0.26029521226882935, + "rewards/kidney_reward/std": 1.3003872632980347, + "rewards/length2tails_reward/mean": 0.7188592553138733, + "rewards/length2tails_reward/std": 0.33041077852249146, + "rewards/thermo_reward/mean": 0.5365903377532959, + "rewards/thermo_reward/std": 1.6458954811096191, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14220660645514727, + "epoch": 2.158, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.58907151222229, + "learning_rate": 3.9310237889292e-07, + "loss": 0.0097, + "num_tokens": 9435706.0, + "reward": 5.9869384765625, + "reward_std": 3.8137810230255127, + "rewards/fitness_reward/mean": 5.333393096923828, + "rewards/fitness_reward/std": 3.3923747539520264, + "rewards/kidney_reward/mean": 0.4614916145801544, + "rewards/kidney_reward/std": 1.2004516124725342, + "rewards/length2tails_reward/mean": 0.8033137321472168, + "rewards/length2tails_reward/std": 0.270455926656723, + "rewards/thermo_reward/mean": 0.4439413845539093, + "rewards/thermo_reward/std": 1.864505648612976, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 275.15625, + "completions/mean_terminated_length": 275.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1509252917021513, + "epoch": 2.16, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.746227979660034, + "learning_rate": 3.913818218914193e-07, + "loss": 0.0795, + "num_tokens": 9444543.0, + "reward": 6.469642162322998, + "reward_std": 3.120642900466919, + "rewards/fitness_reward/mean": 5.863880157470703, + "rewards/fitness_reward/std": 2.8623623847961426, + "rewards/kidney_reward/mean": 0.47719845175743103, + "rewards/kidney_reward/std": 1.3986775875091553, + "rewards/length2tails_reward/mean": 0.7543799877166748, + "rewards/length2tails_reward/std": 0.2777114510536194, + "rewards/thermo_reward/mean": 0.35713544487953186, + "rewards/thermo_reward/std": 1.7676435708999634, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 280.40625, + "completions/mean_terminated_length": 280.40625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14305216632783413, + "epoch": 2.162, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6391445398330688, + "learning_rate": 3.8966412187840803e-07, + "loss": 0.0411, + "num_tokens": 9453548.0, + "reward": 6.351337432861328, + "reward_std": 2.7923762798309326, + "rewards/fitness_reward/mean": 6.104561805725098, + "rewards/fitness_reward/std": 2.229396343231201, + "rewards/kidney_reward/mean": 0.2081633359193802, + "rewards/kidney_reward/std": 1.2935794591903687, + "rewards/length2tails_reward/mean": 0.7403035759925842, + "rewards/length2tails_reward/std": 0.28035199642181396, + "rewards/thermo_reward/mean": -0.08476439118385315, + "rewards/thermo_reward/std": 1.6950254440307617, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 285.90625, + "completions/mean_terminated_length": 270.80645751953125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1517588933929801, + "epoch": 2.164, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.950101852416992, + "learning_rate": 3.879492869171512e-07, + "loss": 0.245, + "num_tokens": 9462729.0, + "reward": 6.259055137634277, + "reward_std": 2.3865511417388916, + "rewards/fitness_reward/mean": 6.216929912567139, + "rewards/fitness_reward/std": 2.117746591567993, + "rewards/kidney_reward/mean": -0.21039927005767822, + "rewards/kidney_reward/std": 1.415784239768982, + "rewards/length2tails_reward/mean": 0.7745667099952698, + "rewards/length2tails_reward/std": 0.2926861345767975, + "rewards/thermo_reward/mean": -0.09263443946838379, + "rewards/thermo_reward/std": 2.0424654483795166, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14166564494371414, + "epoch": 2.166, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5536825656890869, + "learning_rate": 3.8623732505746254e-07, + "loss": 0.0039, + "num_tokens": 9471428.0, + "reward": 6.850225925445557, + "reward_std": 2.6650822162628174, + "rewards/fitness_reward/mean": 6.235195159912109, + "rewards/fitness_reward/std": 2.0144236087799072, + "rewards/kidney_reward/mean": 0.5434186458587646, + "rewards/kidney_reward/std": 1.3087977170944214, + "rewards/length2tails_reward/mean": 0.7975249290466309, + "rewards/length2tails_reward/std": 0.2540622651576996, + "rewards/thermo_reward/mean": 0.2878812551498413, + "rewards/thermo_reward/std": 1.7817225456237793, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.3125, + "completions/mean_terminated_length": 270.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13525793887674809, + "epoch": 2.168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4625759720802307, + "learning_rate": 3.845282443356711e-07, + "loss": -0.003, + "num_tokens": 9480110.0, + "reward": 7.222718238830566, + "reward_std": 1.3861621618270874, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.3318258225917816, + "rewards/kidney_reward/std": 1.230282187461853, + "rewards/length2tails_reward/mean": 0.7717716097831726, + "rewards/length2tails_reward/std": 0.2539806067943573, + "rewards/thermo_reward/mean": 0.7511072158813477, + "rewards/thermo_reward/std": 1.6587716341018677, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 275.90625, + "completions/mean_terminated_length": 275.90625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1521181659772992, + "epoch": 2.17, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4035966396331787, + "learning_rate": 3.8282205277457956e-07, + "loss": 0.067, + "num_tokens": 9488971.0, + "reward": 6.2151923179626465, + "reward_std": 2.7291853427886963, + "rewards/fitness_reward/mean": 5.726511001586914, + "rewards/fitness_reward/std": 2.6746671199798584, + "rewards/kidney_reward/mean": 0.3242270350456238, + "rewards/kidney_reward/std": 1.081840991973877, + "rewards/length2tails_reward/mean": 0.7839233875274658, + "rewards/length2tails_reward/std": 0.2741847634315491, + "rewards/thermo_reward/mean": 0.2611736059188843, + "rewards/thermo_reward/std": 1.8001337051391602, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 273.75, + "completions/mean_terminated_length": 273.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13877547718584538, + "epoch": 2.172, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.343034029006958, + "learning_rate": 3.8111875838342954e-07, + "loss": 0.0037, + "num_tokens": 9497763.0, + "reward": 6.740494728088379, + "reward_std": 2.1605401039123535, + "rewards/fitness_reward/mean": 6.289388656616211, + "rewards/fitness_reward/std": 1.7078571319580078, + "rewards/kidney_reward/mean": 0.2869397699832916, + "rewards/kidney_reward/std": 1.3796329498291016, + "rewards/length2tails_reward/mean": 0.7512634992599487, + "rewards/length2tails_reward/std": 0.2976720929145813, + "rewards/thermo_reward/mean": 0.23964014649391174, + "rewards/thermo_reward/std": 1.946712851524353, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.71875, + "completions/mean_terminated_length": 269.71875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13112556841224432, + "epoch": 2.174, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3061606884002686, + "learning_rate": 3.7941836915786175e-07, + "loss": -0.0004, + "num_tokens": 9506426.0, + "reward": 7.144230365753174, + "reward_std": 1.202386736869812, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.5496199727058411, + "rewards/kidney_reward/std": 1.501986026763916, + "rewards/length2tails_reward/mean": 0.7209525108337402, + "rewards/length2tails_reward/std": 0.31163740158081055, + "rewards/thermo_reward/mean": 0.19576820731163025, + "rewards/thermo_reward/std": 1.7851938009262085, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 273.875, + "completions/mean_terminated_length": 273.875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1460349252447486, + "epoch": 2.176, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34133243560791, + "learning_rate": 3.777208930798793e-07, + "loss": 0.0391, + "num_tokens": 9515222.0, + "reward": 6.785855293273926, + "reward_std": 2.4483165740966797, + "rewards/fitness_reward/mean": 6.125822067260742, + "rewards/fitness_reward/std": 2.113539695739746, + "rewards/kidney_reward/mean": 0.6374526023864746, + "rewards/kidney_reward/std": 1.2205655574798584, + "rewards/length2tails_reward/mean": 0.8312607407569885, + "rewards/length2tails_reward/std": 0.3061812222003937, + "rewards/thermo_reward/mean": 0.2669835686683655, + "rewards/thermo_reward/std": 1.924043893814087, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1438216408714652, + "epoch": 2.178, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8875055909156799, + "learning_rate": 3.760263381178116e-07, + "loss": -0.0033, + "num_tokens": 9523902.0, + "reward": 7.184237480163574, + "reward_std": 0.8892032504081726, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.051064785569906235, + "rewards/kidney_reward/std": 1.3600190877914429, + "rewards/length2tails_reward/mean": 0.8025528788566589, + "rewards/length2tails_reward/std": 0.2329147458076477, + "rewards/thermo_reward/mean": 0.8356671333312988, + "rewards/thermo_reward/std": 1.301501750946045, + "step": 1089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 268.75, + "completions/mean_terminated_length": 268.75, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 0.14491372276097536, + "epoch": 2.18, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6386216878890991, + "learning_rate": 3.743347122262741e-07, + "loss": -0.0432, + "num_tokens": 9532534.0, + "reward": 6.409244537353516, + "reward_std": 2.512324810028076, + "rewards/fitness_reward/mean": 6.131818771362305, + "rewards/fitness_reward/std": 2.080949306488037, + "rewards/kidney_reward/mean": -0.0422045961022377, + "rewards/kidney_reward/std": 1.2203233242034912, + "rewards/length2tails_reward/mean": 0.8081647157669067, + "rewards/length2tails_reward/std": 0.26469770073890686, + "rewards/thermo_reward/mean": 0.1929737627506256, + "rewards/thermo_reward/std": 1.919042944908142, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.78125, + "completions/mean_terminated_length": 269.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1531241275370121, + "epoch": 2.182, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.765179395675659, + "learning_rate": 3.7264602334613384e-07, + "loss": -0.0, + "num_tokens": 9541199.0, + "reward": 6.996410369873047, + "reward_std": 2.2399685382843018, + "rewards/fitness_reward/mean": 6.243576526641846, + "rewards/fitness_reward/std": 1.9670097827911377, + "rewards/kidney_reward/mean": 0.5287564396858215, + "rewards/kidney_reward/std": 1.1935288906097412, + "rewards/length2tails_reward/mean": 0.7260756492614746, + "rewards/length2tails_reward/std": 0.3149781823158264, + "rewards/thermo_reward/mean": 0.6138721704483032, + "rewards/thermo_reward/std": 1.7304421663284302, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 279.46875, + "completions/mean_terminated_length": 279.46875, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.2065716851502657, + "epoch": 2.184, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1193106174468994, + "learning_rate": 3.709602794044702e-07, + "loss": -0.0257, + "num_tokens": 9550174.0, + "reward": 6.248079299926758, + "reward_std": 2.8515186309814453, + "rewards/fitness_reward/mean": 5.888064384460449, + "rewards/fitness_reward/std": 2.7744197845458984, + "rewards/kidney_reward/mean": 0.32587796449661255, + "rewards/kidney_reward/std": 1.29982590675354, + "rewards/length2tails_reward/mean": 0.818302571773529, + "rewards/length2tails_reward/std": 0.23894526064395905, + "rewards/thermo_reward/mean": -0.014998994767665863, + "rewards/thermo_reward/std": 1.8893496990203857, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.40625, + "completions/mean_terminated_length": 270.40625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13967485073953867, + "epoch": 2.186, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5013249516487122, + "learning_rate": 3.692774883145383e-07, + "loss": 0.0027, + "num_tokens": 9558859.0, + "reward": 7.048587322235107, + "reward_std": 1.5385327339172363, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": 0.48319754004478455, + "rewards/kidney_reward/std": 1.5139840841293335, + "rewards/length2tails_reward/mean": 0.7465558052062988, + "rewards/length2tails_reward/std": 0.29535987973213196, + "rewards/thermo_reward/mean": 0.676037073135376, + "rewards/thermo_reward/std": 1.6419163942337036, + "step": 1093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13858924712985754, + "epoch": 2.188, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.48351436853408813, + "learning_rate": 3.6759765797573204e-07, + "loss": 0.0006, + "num_tokens": 9567557.0, + "reward": 6.970153331756592, + "reward_std": 1.2168828248977661, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.393543004989624, + "rewards/kidney_reward/std": 1.1136780977249146, + "rewards/length2tails_reward/mean": 0.786371111869812, + "rewards/length2tails_reward/std": 0.2911362648010254, + "rewards/thermo_reward/mean": 0.3829381465911865, + "rewards/thermo_reward/std": 1.7513307332992554, + "step": 1094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13385429698973894, + "epoch": 2.19, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6461921334266663, + "learning_rate": 3.6592079627354745e-07, + "loss": -0.0053, + "num_tokens": 9576229.0, + "reward": 7.154108047485352, + "reward_std": 1.340390682220459, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.49607527256011963, + "rewards/kidney_reward/std": 1.3398762941360474, + "rewards/length2tails_reward/mean": 0.7494277954101562, + "rewards/length2tails_reward/std": 0.27494508028030396, + "rewards/thermo_reward/mean": 0.4608082175254822, + "rewards/thermo_reward/std": 1.5341694355010986, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 263.25, + "completions/mean_terminated_length": 263.25, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.12642492912709713, + "epoch": 2.192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9419066309928894, + "learning_rate": 3.642469110795444e-07, + "loss": -0.0904, + "num_tokens": 9584685.0, + "reward": 6.67075252532959, + "reward_std": 2.8817691802978516, + "rewards/fitness_reward/mean": 5.9534687995910645, + "rewards/fitness_reward/std": 2.5222983360290527, + "rewards/kidney_reward/mean": 0.21264344453811646, + "rewards/kidney_reward/std": 1.2178295850753784, + "rewards/length2tails_reward/mean": 0.6898723840713501, + "rewards/length2tails_reward/std": 0.30631107091903687, + "rewards/thermo_reward/mean": 0.8769875764846802, + "rewards/thermo_reward/std": 1.5118794441223145, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 266.84375, + "completions/mean_terminated_length": 266.84375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.13914308696985245, + "epoch": 2.194, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7642078399658203, + "learning_rate": 3.625760102513102e-07, + "loss": -0.0547, + "num_tokens": 9593256.0, + "reward": 7.222716331481934, + "reward_std": 2.2101850509643555, + "rewards/fitness_reward/mean": 6.2409820556640625, + "rewards/fitness_reward/std": 1.981685996055603, + "rewards/kidney_reward/mean": 0.7720296382904053, + "rewards/kidney_reward/std": 1.0629948377609253, + "rewards/length2tails_reward/mean": 0.7280055284500122, + "rewards/length2tails_reward/std": 0.29946184158325195, + "rewards/thermo_reward/mean": 0.8274354934692383, + "rewards/thermo_reward/std": 1.3762067556381226, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 277.40625, + "completions/mean_terminated_length": 277.40625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.16253278031945229, + "epoch": 2.196, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9272940754890442, + "learning_rate": 3.6090810163242425e-07, + "loss": -0.0187, + "num_tokens": 9602165.0, + "reward": 7.166613578796387, + "reward_std": 1.167151927947998, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.19176791608333588, + "rewards/kidney_reward/std": 1.3263390064239502, + "rewards/length2tails_reward/mean": 0.8280547857284546, + "rewards/length2tails_reward/std": 0.2669452428817749, + "rewards/thermo_reward/mean": 0.7508133053779602, + "rewards/thermo_reward/std": 1.8895303010940552, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14848066587001085, + "epoch": 2.198, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7057750225067139, + "learning_rate": 3.592431930524179e-07, + "loss": 0.0008, + "num_tokens": 9610899.0, + "reward": 7.100525856018066, + "reward_std": 1.3476861715316772, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.10337407886981964, + "rewards/kidney_reward/std": 1.3951269388198853, + "rewards/length2tails_reward/mean": 0.8227828741073608, + "rewards/length2tails_reward/std": 0.24749942123889923, + "rewards/thermo_reward/mean": 0.709667444229126, + "rewards/thermo_reward/std": 1.6096922159194946, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 278.0, + "completions/mean_terminated_length": 278.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1508835256099701, + "epoch": 2.2, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7836153507232666, + "learning_rate": 3.575812923267415e-07, + "loss": -0.002, + "num_tokens": 9619827.0, + "reward": 7.060356140136719, + "reward_std": 1.2770882844924927, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.29777950048446655, + "rewards/kidney_reward/std": 1.372957706451416, + "rewards/length2tails_reward/mean": 0.7647676467895508, + "rewards/length2tails_reward/std": 0.2822011709213257, + "rewards/thermo_reward/mean": 0.2579517960548401, + "rewards/thermo_reward/std": 1.9245244264602661, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.4375, + "completions/mean_terminated_length": 269.4375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "entropy": 0.15346932224929333, + "epoch": 2.202, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5173766613006592, + "learning_rate": 3.5592240725672475e-07, + "loss": -0.0251, + "num_tokens": 9628481.0, + "reward": 6.752187728881836, + "reward_std": 2.4208948612213135, + "rewards/fitness_reward/mean": 6.136200904846191, + "rewards/fitness_reward/std": 2.057163953781128, + "rewards/kidney_reward/mean": 0.336108922958374, + "rewards/kidney_reward/std": 1.4907042980194092, + "rewards/length2tails_reward/mean": 0.8159559369087219, + "rewards/length2tails_reward/std": 0.24038255214691162, + "rewards/thermo_reward/mean": 0.4878869652748108, + "rewards/thermo_reward/std": 1.7251250743865967, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 276.75, + "completions/mean_terminated_length": 276.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.17597046587616205, + "epoch": 2.204, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9278579950332642, + "learning_rate": 3.542665456295415e-07, + "loss": 0.0529, + "num_tokens": 9637369.0, + "reward": 5.954558372497559, + "reward_std": 3.3535354137420654, + "rewards/fitness_reward/mean": 5.490334510803223, + "rewards/fitness_reward/std": 3.190995454788208, + "rewards/kidney_reward/mean": 0.2634471654891968, + "rewards/kidney_reward/std": 1.4119513034820557, + "rewards/length2tails_reward/mean": 0.8064690828323364, + "rewards/length2tails_reward/std": 0.3227982819080353, + "rewards/thermo_reward/mean": 0.26176607608795166, + "rewards/thermo_reward/std": 1.696439266204834, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 275.875, + "completions/mean_terminated_length": 275.875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.16252513974905014, + "epoch": 2.206, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43885520100593567, + "learning_rate": 3.526137152181724e-07, + "loss": 0.0002, + "num_tokens": 9646229.0, + "reward": 6.859776973724365, + "reward_std": 1.0955801010131836, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.07548826187849045, + "rewards/kidney_reward/std": 1.4456524848937988, + "rewards/length2tails_reward/mean": 0.8495239019393921, + "rewards/length2tails_reward/std": 0.21926330029964447, + "rewards/thermo_reward/mean": 0.3936619162559509, + "rewards/thermo_reward/std": 1.724249243736267, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14238363411277533, + "epoch": 2.208, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2649893760681152, + "learning_rate": 3.5096392378137006e-07, + "loss": 0.0061, + "num_tokens": 9654925.0, + "reward": 6.9731764793396, + "reward_std": 1.158166527748108, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.003428816795349121, + "rewards/kidney_reward/std": 1.2014305591583252, + "rewards/length2tails_reward/mean": 0.7851458787918091, + "rewards/length2tails_reward/std": 0.2875399887561798, + "rewards/thermo_reward/mean": 0.5805901885032654, + "rewards/thermo_reward/std": 1.6769318580627441, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.3125, + "completions/mean_terminated_length": 270.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14654514379799366, + "epoch": 2.21, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43511831760406494, + "learning_rate": 3.493171790636202e-07, + "loss": 0.0043, + "num_tokens": 9663607.0, + "reward": 7.261923789978027, + "reward_std": 1.084303379058838, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.2344544231891632, + "rewards/kidney_reward/std": 1.25545072555542, + "rewards/length2tails_reward/mean": 0.7774105668067932, + "rewards/length2tails_reward/std": 0.2298664152622223, + "rewards/thermo_reward/mean": 1.187000036239624, + "rewards/thermo_reward/std": 1.407812476158142, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 283.6875, + "completions/mean_terminated_length": 283.6875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14620005246251822, + "epoch": 2.212, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0439192056655884, + "learning_rate": 3.476734887951078e-07, + "loss": -0.0325, + "num_tokens": 9672717.0, + "reward": 6.861195087432861, + "reward_std": 2.3683199882507324, + "rewards/fitness_reward/mean": 6.016753196716309, + "rewards/fitness_reward/std": 2.201620578765869, + "rewards/kidney_reward/mean": 0.5505621433258057, + "rewards/kidney_reward/std": 1.2925567626953125, + "rewards/length2tails_reward/mean": 0.8337277173995972, + "rewards/length2tails_reward/std": 0.275934100151062, + "rewards/thermo_reward/mean": 0.7214579582214355, + "rewards/thermo_reward/std": 1.6141769886016846, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 284.84375, + "completions/mean_terminated_length": 284.84375, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.19888399075716734, + "epoch": 2.214, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8729356527328491, + "learning_rate": 3.460328606916787e-07, + "loss": -0.0116, + "num_tokens": 9681864.0, + "reward": 5.157289505004883, + "reward_std": 4.282762050628662, + "rewards/fitness_reward/mean": 4.8189544677734375, + "rewards/fitness_reward/std": 4.188338756561279, + "rewards/kidney_reward/mean": 0.01907338947057724, + "rewards/kidney_reward/std": 1.3484104871749878, + "rewards/length2tails_reward/mean": 0.7960008382797241, + "rewards/length2tails_reward/std": 0.2947131395339966, + "rewards/thermo_reward/mean": 0.25959593057632446, + "rewards/thermo_reward/std": 1.9093317985534668, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13348298985511065, + "epoch": 2.216, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2781744599342346, + "learning_rate": 3.4439530245480396e-07, + "loss": 0.0008, + "num_tokens": 9690528.0, + "reward": 6.853076934814453, + "reward_std": 2.3068912029266357, + "rewards/fitness_reward/mean": 6.255723476409912, + "rewards/fitness_reward/std": 1.8982963562011719, + "rewards/kidney_reward/mean": 0.3211251497268677, + "rewards/kidney_reward/std": 1.3331384658813477, + "rewards/length2tails_reward/mean": 0.7651264667510986, + "rewards/length2tails_reward/std": 0.2786720097064972, + "rewards/thermo_reward/mean": 0.49101772904396057, + "rewards/thermo_reward/std": 1.6684995889663696, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.16157339233905077, + "epoch": 2.218, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5176587104797363, + "learning_rate": 3.427608217715453e-07, + "loss": -0.0064, + "num_tokens": 9699240.0, + "reward": 6.732970714569092, + "reward_std": 2.2612903118133545, + "rewards/fitness_reward/mean": 6.186356544494629, + "rewards/fitness_reward/std": 1.7868486642837524, + "rewards/kidney_reward/mean": 0.43403953313827515, + "rewards/kidney_reward/std": 1.3948837518692017, + "rewards/length2tails_reward/mean": 0.8456356525421143, + "rewards/length2tails_reward/std": 0.2207704484462738, + "rewards/thermo_reward/mean": 0.23637159168720245, + "rewards/thermo_reward/std": 1.9618502855300903, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 262.9375, + "completions/mean_terminated_length": 262.9375, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.13817725982517004, + "epoch": 2.22, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.817155659198761, + "learning_rate": 3.411294263145166e-07, + "loss": -0.1237, + "num_tokens": 9707686.0, + "reward": 7.067214012145996, + "reward_std": 2.0182809829711914, + "rewards/fitness_reward/mean": 6.259011745452881, + "rewards/fitness_reward/std": 1.8796956539154053, + "rewards/kidney_reward/mean": 0.5211696624755859, + "rewards/kidney_reward/std": 1.4164283275604248, + "rewards/length2tails_reward/mean": 0.7623540163040161, + "rewards/length2tails_reward/std": 0.2749207615852356, + "rewards/thermo_reward/mean": 0.7140581607818604, + "rewards/thermo_reward/std": 1.5527390241622925, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.03125, + "completions/mean_terminated_length": 270.03125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1406607497483492, + "epoch": 2.222, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37816351652145386, + "learning_rate": 3.3950112374184934e-07, + "loss": 0.0009, + "num_tokens": 9716359.0, + "reward": 6.766355514526367, + "reward_std": 1.9650306701660156, + "rewards/fitness_reward/mean": 6.203991889953613, + "rewards/fitness_reward/std": 1.6928445100784302, + "rewards/kidney_reward/mean": 0.16239503026008606, + "rewards/kidney_reward/std": 1.4602676630020142, + "rewards/length2tails_reward/mean": 0.7919414639472961, + "rewards/length2tails_reward/std": 0.25844424962997437, + "rewards/thermo_reward/mean": 0.5663615465164185, + "rewards/thermo_reward/std": 1.6233750581741333, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 263.3125, + "completions/mean_terminated_length": 263.3125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.14786599576473236, + "epoch": 2.224, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9131942987442017, + "learning_rate": 3.3787592169715606e-07, + "loss": -0.0976, + "num_tokens": 9724817.0, + "reward": 6.523420333862305, + "reward_std": 2.302664279937744, + "rewards/fitness_reward/mean": 6.1267876625061035, + "rewards/fitness_reward/std": 2.108288049697876, + "rewards/kidney_reward/mean": -0.35021859407424927, + "rewards/kidney_reward/std": 1.269707441329956, + "rewards/length2tails_reward/mean": 0.7799986600875854, + "rewards/length2tails_reward/std": 0.2437790036201477, + "rewards/thermo_reward/mean": 0.7534854412078857, + "rewards/thermo_reward/std": 1.5090445280075073, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 275.03125, + "completions/mean_terminated_length": 275.03125, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.16802167426794767, + "epoch": 2.226, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5088887214660645, + "learning_rate": 3.3625382780949575e-07, + "loss": 0.0145, + "num_tokens": 9733650.0, + "reward": 6.382846355438232, + "reward_std": 2.161682605743408, + "rewards/fitness_reward/mean": 6.159786701202393, + "rewards/fitness_reward/std": 1.9295669794082642, + "rewards/kidney_reward/mean": 0.10734623670578003, + "rewards/kidney_reward/std": 1.3029874563217163, + "rewards/length2tails_reward/mean": 0.8087965250015259, + "rewards/length2tails_reward/std": 0.2802841365337372, + "rewards/thermo_reward/mean": -0.06562555581331253, + "rewards/thermo_reward/std": 1.8809643983840942, + "step": 1113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 269.375, + "completions/mean_terminated_length": 269.375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.14004128519445658, + "epoch": 2.228, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5020992755889893, + "learning_rate": 3.346348496933354e-07, + "loss": -0.0119, + "num_tokens": 9742302.0, + "reward": 6.610579490661621, + "reward_std": 2.528299570083618, + "rewards/fitness_reward/mean": 6.11955451965332, + "rewards/fitness_reward/std": 2.147643804550171, + "rewards/kidney_reward/mean": 0.07609789073467255, + "rewards/kidney_reward/std": 1.254422903060913, + "rewards/length2tails_reward/mean": 0.744000256061554, + "rewards/length2tails_reward/std": 0.31254515051841736, + "rewards/thermo_reward/mean": 0.5339514017105103, + "rewards/thermo_reward/std": 1.8591021299362183, + "step": 1114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13200193643569946, + "epoch": 2.23, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5952981114387512, + "learning_rate": 3.330189949485176e-07, + "loss": -0.0066, + "num_tokens": 9750998.0, + "reward": 6.893965721130371, + "reward_std": 2.284776449203491, + "rewards/fitness_reward/mean": 6.276169776916504, + "rewards/fitness_reward/std": 1.782636284828186, + "rewards/kidney_reward/mean": 0.10508643835783005, + "rewards/kidney_reward/std": 1.3219181299209595, + "rewards/length2tails_reward/mean": 0.774809718132019, + "rewards/length2tails_reward/std": 0.2963670790195465, + "rewards/thermo_reward/mean": 0.7431015968322754, + "rewards/thermo_reward/std": 1.5359654426574707, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 268.3125, + "completions/mean_terminated_length": 268.3125, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "entropy": 0.1448118006810546, + "epoch": 2.232, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4387843608856201, + "learning_rate": 3.314062711602219e-07, + "loss": 0.0086, + "num_tokens": 9759616.0, + "reward": 6.527230262756348, + "reward_std": 2.4334514141082764, + "rewards/fitness_reward/mean": 6.101813316345215, + "rewards/fitness_reward/std": 2.2444071769714355, + "rewards/kidney_reward/mean": -0.0010488182306289673, + "rewards/kidney_reward/std": 1.397708773612976, + "rewards/length2tails_reward/mean": 0.7927632331848145, + "rewards/length2tails_reward/std": 0.2553251087665558, + "rewards/thermo_reward/mean": 0.45550161600112915, + "rewards/thermo_reward/std": 1.629749059677124, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14702480472624302, + "epoch": 2.234, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0844124555587769, + "learning_rate": 3.297966858989306e-07, + "loss": 0.0149, + "num_tokens": 9768356.0, + "reward": 7.20062255859375, + "reward_std": 1.3701928853988647, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.3820679783821106, + "rewards/kidney_reward/std": 1.3558621406555176, + "rewards/length2tails_reward/mean": 0.7445996999740601, + "rewards/length2tails_reward/std": 0.26277369260787964, + "rewards/thermo_reward/mean": 0.6702588200569153, + "rewards/thermo_reward/std": 1.55410897731781, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 278.5625, + "completions/mean_terminated_length": 278.5625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.17868938017636538, + "epoch": 2.2359999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359875202178955, + "learning_rate": 3.2819024672039397e-07, + "loss": 0.0189, + "num_tokens": 9777302.0, + "reward": 6.515703201293945, + "reward_std": 1.8160476684570312, + "rewards/fitness_reward/mean": 6.107875347137451, + "rewards/fitness_reward/std": 1.7326784133911133, + "rewards/kidney_reward/mean": 0.27751773595809937, + "rewards/kidney_reward/std": 1.513783574104309, + "rewards/length2tails_reward/mean": 0.7718020677566528, + "rewards/length2tails_reward/std": 0.3188757300376892, + "rewards/thermo_reward/mean": 0.15223775804042816, + "rewards/thermo_reward/std": 2.026869535446167, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 275.34375, + "completions/mean_terminated_length": 275.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1252358015626669, + "epoch": 2.238, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.221222162246704, + "learning_rate": 3.2658696116559246e-07, + "loss": 0.0278, + "num_tokens": 9786145.0, + "reward": 7.290962219238281, + "reward_std": 1.0013223886489868, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.490835964679718, + "rewards/kidney_reward/std": 1.2843295335769653, + "rewards/length2tails_reward/mean": 0.7329350709915161, + "rewards/length2tails_reward/std": 0.30049464106559753, + "rewards/thermo_reward/mean": 0.5420238971710205, + "rewards/thermo_reward/std": 1.814892292022705, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13905903976410627, + "epoch": 2.24, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44827377796173096, + "learning_rate": 3.249868367607046e-07, + "loss": 0.0013, + "num_tokens": 9794848.0, + "reward": 6.8745927810668945, + "reward_std": 1.7337543964385986, + "rewards/fitness_reward/mean": 6.314060688018799, + "rewards/fitness_reward/std": 1.5682917833328247, + "rewards/kidney_reward/mean": 0.1424044370651245, + "rewards/kidney_reward/std": 1.142085075378418, + "rewards/length2tails_reward/mean": 0.8225421905517578, + "rewards/length2tails_reward/std": 0.2541775703430176, + "rewards/thermo_reward/mean": 0.5673877000808716, + "rewards/thermo_reward/std": 1.4604969024658203, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1330010173842311, + "epoch": 2.242, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3640967905521393, + "learning_rate": 3.233898810170672e-07, + "loss": -0.0014, + "num_tokens": 9803545.0, + "reward": 6.383605003356934, + "reward_std": 2.4613773822784424, + "rewards/fitness_reward/mean": 6.014791965484619, + "rewards/fitness_reward/std": 2.270177125930786, + "rewards/kidney_reward/mean": 0.05119447410106659, + "rewards/kidney_reward/std": 1.379941701889038, + "rewards/length2tails_reward/mean": 0.7586827874183655, + "rewards/length2tails_reward/std": 0.2753525972366333, + "rewards/thermo_reward/mean": 0.30708959698677063, + "rewards/thermo_reward/std": 1.9418647289276123, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 279.0, + "completions/mean_terminated_length": 279.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.18136304151266813, + "epoch": 2.2439999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.3095173835754395, + "learning_rate": 3.217961014311451e-07, + "loss": 0.0221, + "num_tokens": 9812505.0, + "reward": 6.982551574707031, + "reward_std": 2.1172678470611572, + "rewards/fitness_reward/mean": 6.2517900466918945, + "rewards/fitness_reward/std": 1.9205467700958252, + "rewards/kidney_reward/mean": 0.31968259811401367, + "rewards/kidney_reward/std": 1.2079168558120728, + "rewards/length2tails_reward/mean": 0.8015772700309753, + "rewards/length2tails_reward/std": 0.23252037167549133, + "rewards/thermo_reward/mean": 0.7410516738891602, + "rewards/thermo_reward/std": 1.396042823791504, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 264.90625, + "completions/mean_terminated_length": 264.90625, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "entropy": 0.14939401112496853, + "epoch": 2.246, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7552655935287476, + "learning_rate": 3.202055054844921e-07, + "loss": -0.121, + "num_tokens": 9821014.0, + "reward": 6.213498592376709, + "reward_std": 2.918909788131714, + "rewards/fitness_reward/mean": 5.943560600280762, + "rewards/fitness_reward/std": 2.0997536182403564, + "rewards/kidney_reward/mean": -0.0921488031744957, + "rewards/kidney_reward/std": 1.3236479759216309, + "rewards/length2tails_reward/mean": 0.734432578086853, + "rewards/length2tails_reward/std": 0.3230380117893219, + "rewards/thermo_reward/mean": 0.26480913162231445, + "rewards/thermo_reward/std": 1.7639621496200562, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 274.21875, + "completions/mean_terminated_length": 274.21875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1501129064708948, + "epoch": 2.248, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7631993293762207, + "learning_rate": 3.1861810064371817e-07, + "loss": 0.0761, + "num_tokens": 9829821.0, + "reward": 6.711342811584473, + "reward_std": 2.2005629539489746, + "rewards/fitness_reward/mean": 6.134913444519043, + "rewards/fitness_reward/std": 2.0641496181488037, + "rewards/kidney_reward/mean": 0.06089578568935394, + "rewards/kidney_reward/std": 1.4127472639083862, + "rewards/length2tails_reward/mean": 0.7560282945632935, + "rewards/length2tails_reward/std": 0.2609768211841583, + "rewards/thermo_reward/mean": 0.713949978351593, + "rewards/thermo_reward/std": 1.5502530336380005, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.46875, + "completions/mean_terminated_length": 269.46875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.15148264076560736, + "epoch": 2.25, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.407090663909912, + "learning_rate": 3.1703389436045304e-07, + "loss": -0.0184, + "num_tokens": 9838476.0, + "reward": 5.6651177406311035, + "reward_std": 3.9739935398101807, + "rewards/fitness_reward/mean": 5.512838840484619, + "rewards/fitness_reward/std": 3.415465831756592, + "rewards/kidney_reward/mean": -0.15681803226470947, + "rewards/kidney_reward/std": 1.3865338563919067, + "rewards/length2tails_reward/mean": 0.823821485042572, + "rewards/length2tails_reward/std": 0.21812903881072998, + "rewards/thermo_reward/mean": 0.04946497082710266, + "rewards/thermo_reward/std": 2.058556318283081, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 274.21875, + "completions/mean_terminated_length": 274.21875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1313493186607957, + "epoch": 2.252, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.443270444869995, + "learning_rate": 3.154528940713113e-07, + "loss": 0.0839, + "num_tokens": 9847283.0, + "reward": 6.752986907958984, + "reward_std": 2.4876766204833984, + "rewards/fitness_reward/mean": 6.119589328765869, + "rewards/fitness_reward/std": 2.1474530696868896, + "rewards/kidney_reward/mean": 0.3571246862411499, + "rewards/kidney_reward/std": 1.1678204536437988, + "rewards/length2tails_reward/mean": 0.7066287398338318, + "rewards/length2tails_reward/std": 0.3493049740791321, + "rewards/thermo_reward/mean": 0.5563567876815796, + "rewards/thermo_reward/std": 1.5851272344589233, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.40625, + "completions/mean_terminated_length": 270.40625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12769818119704723, + "epoch": 2.254, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5218151807785034, + "learning_rate": 3.1387510719785905e-07, + "loss": 0.0009, + "num_tokens": 9855968.0, + "reward": 7.2624006271362305, + "reward_std": 1.3915042877197266, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.7155039310455322, + "rewards/kidney_reward/std": 1.4847861528396606, + "rewards/length2tails_reward/mean": 0.7721256613731384, + "rewards/length2tails_reward/std": 0.290002703666687, + "rewards/thermo_reward/mean": 0.4466163218021393, + "rewards/thermo_reward/std": 1.6794196367263794, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 266.21875, + "completions/mean_terminated_length": 266.21875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.14394528977572918, + "epoch": 2.2560000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7367030382156372, + "learning_rate": 3.123005411465766e-07, + "loss": -0.0961, + "num_tokens": 9864519.0, + "reward": 6.965503692626953, + "reward_std": 2.262582778930664, + "rewards/fitness_reward/mean": 6.247040271759033, + "rewards/fitness_reward/std": 1.9474167823791504, + "rewards/kidney_reward/mean": 0.1800956130027771, + "rewards/kidney_reward/std": 1.3833717107772827, + "rewards/length2tails_reward/mean": 0.7764920592308044, + "rewards/length2tails_reward/std": 0.25169482827186584, + "rewards/thermo_reward/mean": 0.8685853481292725, + "rewards/thermo_reward/std": 1.2544481754302979, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 281.4375, + "completions/mean_terminated_length": 281.4375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.17429085541516542, + "epoch": 2.258, + "frac_reward_zero_std": 0.0, + "grad_norm": 14.2538480758667, + "learning_rate": 3.1072920330882646e-07, + "loss": 0.1443, + "num_tokens": 9873557.0, + "reward": 6.403508186340332, + "reward_std": 2.932964563369751, + "rewards/fitness_reward/mean": 5.69378662109375, + "rewards/fitness_reward/std": 2.7939910888671875, + "rewards/kidney_reward/mean": 0.2700694799423218, + "rewards/kidney_reward/std": 1.2925341129302979, + "rewards/length2tails_reward/mean": 0.7251986265182495, + "rewards/length2tails_reward/std": 0.30658435821533203, + "rewards/thermo_reward/mean": 0.786773681640625, + "rewards/thermo_reward/std": 1.2907570600509644, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 274.65625, + "completions/mean_terminated_length": 274.65625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.14690136443823576, + "epoch": 2.26, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4197208881378174, + "learning_rate": 3.091611010608154e-07, + "loss": 0.0451, + "num_tokens": 9882378.0, + "reward": 5.54134464263916, + "reward_std": 3.5005712509155273, + "rewards/fitness_reward/mean": 5.627751350402832, + "rewards/fitness_reward/std": 3.036644220352173, + "rewards/kidney_reward/mean": -0.13826271891593933, + "rewards/kidney_reward/std": 1.1495153903961182, + "rewards/length2tails_reward/mean": 0.8459690809249878, + "rewards/length2tails_reward/std": 0.1850215494632721, + "rewards/thermo_reward/mean": -0.4575344920158386, + "rewards/thermo_reward/std": 2.021501064300537, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 276.0, + "completions/mean_terminated_length": 276.0, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.15107757970690727, + "epoch": 2.262, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8602656126022339, + "learning_rate": 3.0759624176356335e-07, + "loss": 0.0831, + "num_tokens": 9891242.0, + "reward": 6.555501937866211, + "reward_std": 3.2635607719421387, + "rewards/fitness_reward/mean": 5.755517959594727, + "rewards/fitness_reward/std": 2.9152650833129883, + "rewards/kidney_reward/mean": 0.5503237247467041, + "rewards/kidney_reward/std": 1.1461389064788818, + "rewards/length2tails_reward/mean": 0.8111335039138794, + "rewards/length2tails_reward/std": 0.23243021965026855, + "rewards/thermo_reward/mean": 0.6440776586532593, + "rewards/thermo_reward/std": 1.562057614326477, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 276.46875, + "completions/mean_terminated_length": 276.46875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.1685433927923441, + "epoch": 2.2640000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.130997657775879, + "learning_rate": 3.060346327628657e-07, + "loss": 0.0672, + "num_tokens": 9900121.0, + "reward": 7.0970587730407715, + "reward_std": 2.550595760345459, + "rewards/fitness_reward/mean": 6.118082046508789, + "rewards/fitness_reward/std": 2.1556596755981445, + "rewards/kidney_reward/mean": 0.5129806399345398, + "rewards/kidney_reward/std": 1.4165408611297607, + "rewards/length2tails_reward/mean": 0.8275707364082336, + "rewards/length2tails_reward/std": 0.19341854751110077, + "rewards/thermo_reward/mean": 1.031186580657959, + "rewards/thermo_reward/std": 1.2018721103668213, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.3125, + "completions/mean_terminated_length": 270.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1363239772617817, + "epoch": 2.266, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5947807431221008, + "learning_rate": 3.044762813892615e-07, + "loss": 0.001, + "num_tokens": 9908803.0, + "reward": 6.945971488952637, + "reward_std": 1.3877177238464355, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": 0.47257179021835327, + "rewards/kidney_reward/std": 1.2036300897598267, + "rewards/length2tails_reward/mean": 0.7745300531387329, + "rewards/length2tails_reward/std": 0.26186293363571167, + "rewards/thermo_reward/mean": 0.46744269132614136, + "rewards/thermo_reward/std": 1.7544705867767334, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14486193470656872, + "epoch": 2.268, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4698144495487213, + "learning_rate": 3.0292119495799697e-07, + "loss": -0.0061, + "num_tokens": 9917498.0, + "reward": 7.37183952331543, + "reward_std": 1.326612949371338, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.4981863796710968, + "rewards/kidney_reward/std": 1.4668222665786743, + "rewards/length2tails_reward/mean": 0.7544223070144653, + "rewards/length2tails_reward/std": 0.34047284722328186, + "rewards/thermo_reward/mean": 0.8916619420051575, + "rewards/thermo_reward/std": 1.4170936346054077, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13047132920473814, + "epoch": 2.27, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.024953007698059, + "learning_rate": 3.0136938076899165e-07, + "loss": 0.0047, + "num_tokens": 9926210.0, + "reward": 6.963624000549316, + "reward_std": 1.2276817560195923, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.6006568074226379, + "rewards/kidney_reward/std": 1.296803593635559, + "rewards/length2tails_reward/mean": 0.7975627183914185, + "rewards/length2tails_reward/std": 0.2886776030063629, + "rewards/thermo_reward/mean": -0.048808708786964417, + "rewards/thermo_reward/std": 1.9422856569290161, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.1875, + "completions/mean_terminated_length": 270.1875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13062628731131554, + "epoch": 2.2720000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.29659104347229, + "learning_rate": 2.998208461068057e-07, + "loss": 0.0001, + "num_tokens": 9934888.0, + "reward": 6.898680210113525, + "reward_std": 1.4114975929260254, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.4596465229988098, + "rewards/kidney_reward/std": 1.4191508293151855, + "rewards/length2tails_reward/mean": 0.7444043159484863, + "rewards/length2tails_reward/std": 0.32103368639945984, + "rewards/thermo_reward/mean": -0.011106722056865692, + "rewards/thermo_reward/std": 1.873618483543396, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 274.21875, + "completions/mean_terminated_length": 274.21875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13647643569856882, + "epoch": 2.274, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1076407432556152, + "learning_rate": 2.982755982406031e-07, + "loss": 0.0549, + "num_tokens": 9943695.0, + "reward": 6.9025044441223145, + "reward_std": 2.53261399269104, + "rewards/fitness_reward/mean": 6.096895217895508, + "rewards/fitness_reward/std": 2.2712855339050293, + "rewards/kidney_reward/mean": 0.23024243116378784, + "rewards/kidney_reward/std": 1.316430926322937, + "rewards/length2tails_reward/mean": 0.8103106021881104, + "rewards/length2tails_reward/std": 0.25244706869125366, + "rewards/thermo_reward/mean": 0.9758197665214539, + "rewards/thermo_reward/std": 1.5387763977050781, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.40625, + "completions/mean_terminated_length": 271.40625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13468873128294945, + "epoch": 2.276, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5033718943595886, + "learning_rate": 2.9673364442411995e-07, + "loss": 0.0044, + "num_tokens": 9952412.0, + "reward": 6.83325719833374, + "reward_std": 1.0275230407714844, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.21671253442764282, + "rewards/kidney_reward/std": 1.2953304052352905, + "rewards/length2tails_reward/mean": 0.8311097025871277, + "rewards/length2tails_reward/std": 0.20803603529930115, + "rewards/thermo_reward/mean": 0.28507527709007263, + "rewards/thermo_reward/std": 1.809030532836914, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.09375, + "completions/mean_terminated_length": 270.09375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13675993122160435, + "epoch": 2.278, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6384673714637756, + "learning_rate": 2.951949918956288e-07, + "loss": 0.0013, + "num_tokens": 9961087.0, + "reward": 7.165763854980469, + "reward_std": 1.2653238773345947, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.012888401746749878, + "rewards/kidney_reward/std": 1.5287208557128906, + "rewards/length2tails_reward/mean": 0.7900882363319397, + "rewards/length2tails_reward/std": 0.25474897027015686, + "rewards/thermo_reward/mean": 0.7409989833831787, + "rewards/thermo_reward/std": 1.6703959703445435, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.14245271496474743, + "epoch": 2.2800000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7950602769851685, + "learning_rate": 2.936596478779051e-07, + "loss": 0.002, + "num_tokens": 9969790.0, + "reward": 6.70292329788208, + "reward_std": 2.7593190670013428, + "rewards/fitness_reward/mean": 6.225475311279297, + "rewards/fitness_reward/std": 2.0694053173065186, + "rewards/kidney_reward/mean": 0.2980394661426544, + "rewards/kidney_reward/std": 1.2852166891098022, + "rewards/length2tails_reward/mean": 0.8485820293426514, + "rewards/length2tails_reward/std": 0.24332080781459808, + "rewards/thermo_reward/mean": 0.2325647473335266, + "rewards/thermo_reward/std": 2.0339057445526123, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14089277666062117, + "epoch": 2.282, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35755231976509094, + "learning_rate": 2.921276195781934e-07, + "loss": -0.0022, + "num_tokens": 9978467.0, + "reward": 6.993310928344727, + "reward_std": 2.2121758460998535, + "rewards/fitness_reward/mean": 6.187580108642578, + "rewards/fitness_reward/std": 1.7803035974502563, + "rewards/kidney_reward/mean": 0.5924762487411499, + "rewards/kidney_reward/std": 1.1589877605438232, + "rewards/length2tails_reward/mean": 0.7593258619308472, + "rewards/length2tails_reward/std": 0.2890802323818207, + "rewards/thermo_reward/mean": 0.63932204246521, + "rewards/thermo_reward/std": 1.4457496404647827, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 275.90625, + "completions/mean_terminated_length": 275.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.16248072870075703, + "epoch": 2.284, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.812143087387085, + "learning_rate": 2.905989141881745e-07, + "loss": 0.074, + "num_tokens": 9987328.0, + "reward": 6.438668251037598, + "reward_std": 2.949441909790039, + "rewards/fitness_reward/mean": 5.766383171081543, + "rewards/fitness_reward/std": 2.5283544063568115, + "rewards/kidney_reward/mean": 0.4625045657157898, + "rewards/kidney_reward/std": 1.5662392377853394, + "rewards/length2tails_reward/mean": 0.750740647315979, + "rewards/length2tails_reward/std": 0.31301507353782654, + "rewards/thermo_reward/mean": 0.506695568561554, + "rewards/thermo_reward/std": 1.6849232912063599, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 275.625, + "completions/mean_terminated_length": 275.625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1548888711258769, + "epoch": 2.286, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1333611011505127, + "learning_rate": 2.890735388839295e-07, + "loss": 0.092, + "num_tokens": 9996180.0, + "reward": 6.49191951751709, + "reward_std": 2.1634879112243652, + "rewards/fitness_reward/mean": 6.126874923706055, + "rewards/fitness_reward/std": 2.1078131198883057, + "rewards/kidney_reward/mean": 0.28500813245773315, + "rewards/kidney_reward/std": 1.3601386547088623, + "rewards/length2tails_reward/mean": 0.7393205761909485, + "rewards/length2tails_reward/std": 0.3163299560546875, + "rewards/thermo_reward/mean": 0.07542131841182709, + "rewards/thermo_reward/std": 1.817592978477478, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 262.59375, + "completions/mean_terminated_length": 262.59375, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "entropy": 0.14647703152149916, + "epoch": 2.288, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43914714455604553, + "learning_rate": 2.8755150082590783e-07, + "loss": -0.1284, + "num_tokens": 10004615.0, + "reward": 6.247187614440918, + "reward_std": 2.8103861808776855, + "rewards/fitness_reward/mean": 5.751132011413574, + "rewards/fitness_reward/std": 2.5774388313293457, + "rewards/kidney_reward/mean": 0.2969706952571869, + "rewards/kidney_reward/std": 1.332061767578125, + "rewards/length2tails_reward/mean": 0.7766377925872803, + "rewards/length2tails_reward/std": 0.25333479046821594, + "rewards/thermo_reward/mean": 0.3068217635154724, + "rewards/thermo_reward/std": 1.726491928100586, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 278.46875, + "completions/mean_terminated_length": 278.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14597509801387787, + "epoch": 2.29, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.909884452819824, + "learning_rate": 2.86032807158893e-07, + "loss": 0.1121, + "num_tokens": 10013558.0, + "reward": 6.522914409637451, + "reward_std": 2.8379745483398438, + "rewards/fitness_reward/mean": 5.811135292053223, + "rewards/fitness_reward/std": 2.7047131061553955, + "rewards/kidney_reward/mean": 0.17545320093631744, + "rewards/kidney_reward/std": 1.260187029838562, + "rewards/length2tails_reward/mean": 0.7454816102981567, + "rewards/length2tails_reward/std": 0.24381452798843384, + "rewards/thermo_reward/mean": 0.8753640651702881, + "rewards/thermo_reward/std": 1.8125619888305664, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1433148104697466, + "epoch": 2.292, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7803038954734802, + "learning_rate": 2.845174650119699e-07, + "loss": -0.0045, + "num_tokens": 10022241.0, + "reward": 7.159573554992676, + "reward_std": 1.2367409467697144, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.17590183019638062, + "rewards/kidney_reward/std": 1.2380211353302002, + "rewards/length2tails_reward/mean": 0.7624531388282776, + "rewards/length2tails_reward/std": 0.25679370760917664, + "rewards/thermo_reward/mean": 0.7854005098342896, + "rewards/thermo_reward/std": 1.5559492111206055, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.875, + "completions/mean_terminated_length": 270.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.143145683221519, + "epoch": 2.294, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7482324838638306, + "learning_rate": 2.830054814984895e-07, + "loss": 0.0029, + "num_tokens": 10030941.0, + "reward": 7.165678024291992, + "reward_std": 1.2209903001785278, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.3634495437145233, + "rewards/kidney_reward/std": 1.4516096115112305, + "rewards/length2tails_reward/mean": 0.8056307435035706, + "rewards/length2tails_reward/std": 0.2336663007736206, + "rewards/thermo_reward/mean": 0.7944506406784058, + "rewards/thermo_reward/std": 1.5578457117080688, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 276.75, + "completions/mean_terminated_length": 276.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.17255778145045042, + "epoch": 2.296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.461302638053894, + "learning_rate": 2.8149686371603764e-07, + "loss": 0.0366, + "num_tokens": 10039829.0, + "reward": 7.177928924560547, + "reward_std": 1.0039269924163818, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.2968369722366333, + "rewards/kidney_reward/std": 1.2228924036026, + "rewards/length2tails_reward/mean": 0.7555817365646362, + "rewards/length2tails_reward/std": 0.3429839015007019, + "rewards/thermo_reward/mean": 0.49863356351852417, + "rewards/thermo_reward/std": 1.6687123775482178, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13889530394226313, + "epoch": 2.298, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.48695307970046997, + "learning_rate": 2.799916187464002e-07, + "loss": -0.0049, + "num_tokens": 10048545.0, + "reward": 7.308387756347656, + "reward_std": 1.3392767906188965, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.4232565760612488, + "rewards/kidney_reward/std": 1.2407385110855103, + "rewards/length2tails_reward/mean": 0.8234907388687134, + "rewards/length2tails_reward/std": 0.2724844217300415, + "rewards/thermo_reward/mean": 0.8051555156707764, + "rewards/thermo_reward/std": 1.6539466381072998, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14189641643315554, + "epoch": 2.3, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.567063570022583, + "learning_rate": 2.7848975365552987e-07, + "loss": 0.0006, + "num_tokens": 10057268.0, + "reward": 6.469785690307617, + "reward_std": 3.2542052268981934, + "rewards/fitness_reward/mean": 5.685840129852295, + "rewards/fitness_reward/std": 2.8670804500579834, + "rewards/kidney_reward/mean": 0.006724223494529724, + "rewards/kidney_reward/std": 1.522915005683899, + "rewards/length2tails_reward/mean": 0.7490498423576355, + "rewards/length2tails_reward/std": 0.2965738773345947, + "rewards/thermo_reward/mean": 1.186640977859497, + "rewards/thermo_reward/std": 1.204087257385254, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.71875, + "completions/mean_terminated_length": 269.71875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12426550779491663, + "epoch": 2.302, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.488528251647949, + "learning_rate": 2.7699127549351455e-07, + "loss": -0.0027, + "num_tokens": 10065931.0, + "reward": 6.852128028869629, + "reward_std": 2.106959104537964, + "rewards/fitness_reward/mean": 6.086380958557129, + "rewards/fitness_reward/std": 1.8410536050796509, + "rewards/kidney_reward/mean": 0.31044989824295044, + "rewards/kidney_reward/std": 1.367721438407898, + "rewards/length2tails_reward/mean": 0.7637225389480591, + "rewards/length2tails_reward/std": 0.2808883786201477, + "rewards/thermo_reward/mean": 0.8391827344894409, + "rewards/thermo_reward/std": 1.4314473867416382, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 273.25, + "completions/mean_terminated_length": 273.25, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14551053754985332, + "epoch": 2.304, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.514233112335205, + "learning_rate": 2.7549619129454205e-07, + "loss": 0.0551, + "num_tokens": 10074707.0, + "reward": 6.92626428604126, + "reward_std": 2.2399752140045166, + "rewards/fitness_reward/mean": 6.146989822387695, + "rewards/fitness_reward/std": 1.9987030029296875, + "rewards/kidney_reward/mean": 0.3568127453327179, + "rewards/kidney_reward/std": 1.4845975637435913, + "rewards/length2tails_reward/mean": 0.7753641605377197, + "rewards/length2tails_reward/std": 0.31029778718948364, + "rewards/thermo_reward/mean": 0.8140542507171631, + "rewards/thermo_reward/std": 1.629408836364746, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 279.375, + "completions/mean_terminated_length": 279.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.15098983887583017, + "epoch": 2.306, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4000041484832764, + "learning_rate": 2.740045080768694e-07, + "loss": 0.1174, + "num_tokens": 10083679.0, + "reward": 7.016284942626953, + "reward_std": 2.6789653301239014, + "rewards/fitness_reward/mean": 6.223532199859619, + "rewards/fitness_reward/std": 2.0803983211517334, + "rewards/kidney_reward/mean": 0.26236528158187866, + "rewards/kidney_reward/std": 1.2897542715072632, + "rewards/length2tails_reward/mean": 0.8820062279701233, + "rewards/length2tails_reward/std": 0.14290156960487366, + "rewards/thermo_reward/mean": 0.8821379542350769, + "rewards/thermo_reward/std": 1.5455716848373413, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13005751743912697, + "epoch": 2.308, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4123654067516327, + "learning_rate": 2.725162328427868e-07, + "loss": -0.0024, + "num_tokens": 10092350.0, + "reward": 7.120850563049316, + "reward_std": 1.2107441425323486, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.4530317485332489, + "rewards/kidney_reward/std": 1.4273309707641602, + "rewards/length2tails_reward/mean": 0.7901430726051331, + "rewards/length2tails_reward/std": 0.238103985786438, + "rewards/thermo_reward/mean": 0.41697895526885986, + "rewards/thermo_reward/std": 1.6289472579956055, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 281.40625, + "completions/mean_terminated_length": 281.40625, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "entropy": 0.1934234071522951, + "epoch": 2.31, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.138711929321289, + "learning_rate": 2.7103137257858863e-07, + "loss": 0.0683, + "num_tokens": 10101387.0, + "reward": 5.405478477478027, + "reward_std": 4.390636444091797, + "rewards/fitness_reward/mean": 4.621011734008789, + "rewards/fitness_reward/std": 4.17106294631958, + "rewards/kidney_reward/mean": 0.20216971635818481, + "rewards/kidney_reward/std": 1.4290484189987183, + "rewards/length2tails_reward/mean": 0.8238368034362793, + "rewards/length2tails_reward/std": 0.2821885645389557, + "rewards/thermo_reward/mean": 0.9548454284667969, + "rewards/thermo_reward/std": 1.355315923690796, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.135171290487051, + "epoch": 2.312, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9934455156326294, + "learning_rate": 2.695499342545372e-07, + "loss": 0.0099, + "num_tokens": 10110091.0, + "reward": 6.883745193481445, + "reward_std": 0.9391064047813416, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.09617552906274796, + "rewards/kidney_reward/std": 1.430647611618042, + "rewards/length2tails_reward/mean": 0.7151154279708862, + "rewards/length2tails_reward/std": 0.3018137812614441, + "rewards/thermo_reward/mean": 0.33713847398757935, + "rewards/thermo_reward/std": 1.5491206645965576, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13093197625130415, + "epoch": 2.314, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5965590476989746, + "learning_rate": 2.680719248248324e-07, + "loss": -0.0048, + "num_tokens": 10118792.0, + "reward": 7.082731246948242, + "reward_std": 1.3311866521835327, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.18591594696044922, + "rewards/kidney_reward/std": 1.3101365566253662, + "rewards/length2tails_reward/mean": 0.7724572420120239, + "rewards/length2tails_reward/std": 0.29545989632606506, + "rewards/thermo_reward/mean": 0.6166998744010925, + "rewards/thermo_reward/std": 1.6128522157669067, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.16137641575187445, + "epoch": 2.316, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7524734735488892, + "learning_rate": 2.665973512275778e-07, + "loss": -0.0057, + "num_tokens": 10127548.0, + "reward": 7.302036762237549, + "reward_std": 1.6519207954406738, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.945309042930603, + "rewards/kidney_reward/std": 1.3692522048950195, + "rewards/length2tails_reward/mean": 0.8006963729858398, + "rewards/length2tails_reward/std": 0.30125951766967773, + "rewards/thermo_reward/mean": 0.48777538537979126, + "rewards/thermo_reward/std": 1.6223019361495972, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13204739429056644, + "epoch": 2.318, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7398700714111328, + "learning_rate": 2.6512622038474777e-07, + "loss": 0.0016, + "num_tokens": 10136224.0, + "reward": 7.105605602264404, + "reward_std": 1.1830435991287231, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.30743616819381714, + "rewards/kidney_reward/std": 1.4318221807479858, + "rewards/length2tails_reward/mean": 0.738235354423523, + "rewards/length2tails_reward/std": 0.30172234773635864, + "rewards/thermo_reward/mean": 0.35206061601638794, + "rewards/thermo_reward/std": 1.8971922397613525, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 279.0625, + "completions/mean_terminated_length": 279.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.16808363143354654, + "epoch": 2.32, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.423923969268799, + "learning_rate": 2.6365853920215697e-07, + "loss": 0.1457, + "num_tokens": 10145186.0, + "reward": 6.135985374450684, + "reward_std": 2.684715986251831, + "rewards/fitness_reward/mean": 6.040144443511963, + "rewards/fitness_reward/std": 2.0791406631469727, + "rewards/kidney_reward/mean": -0.024739481508731842, + "rewards/kidney_reward/std": 1.3711817264556885, + "rewards/length2tails_reward/mean": 0.7351970672607422, + "rewards/length2tails_reward/std": 0.34171009063720703, + "rewards/thermo_reward/mean": -0.15117761492729187, + "rewards/thermo_reward/std": 1.868726372718811, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 274.71875, + "completions/mean_terminated_length": 274.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14421281218528748, + "epoch": 2.322, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6817800998687744, + "learning_rate": 2.6219431456942534e-07, + "loss": 0.0829, + "num_tokens": 10154009.0, + "reward": 6.730805397033691, + "reward_std": 2.322556257247925, + "rewards/fitness_reward/mean": 6.2560133934021, + "rewards/fitness_reward/std": 1.8966573476791382, + "rewards/kidney_reward/mean": 0.23124201595783234, + "rewards/kidney_reward/std": 1.4578009843826294, + "rewards/length2tails_reward/mean": 0.7351087331771851, + "rewards/length2tails_reward/std": 0.2897837460041046, + "rewards/thermo_reward/mean": 0.3507865369319916, + "rewards/thermo_reward/std": 1.6739004850387573, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14327289443463087, + "epoch": 2.324, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3221757411956787, + "learning_rate": 2.607335533599482e-07, + "loss": 0.0017, + "num_tokens": 10162711.0, + "reward": 6.812370300292969, + "reward_std": 1.8919966220855713, + "rewards/fitness_reward/mean": 6.31130838394165, + "rewards/fitness_reward/std": 1.5838603973388672, + "rewards/kidney_reward/mean": -0.035795390605926514, + "rewards/kidney_reward/std": 1.489255428314209, + "rewards/length2tails_reward/mean": 0.8602060079574585, + "rewards/length2tails_reward/std": 0.18451331555843353, + "rewards/thermo_reward/mean": 0.6078147888183594, + "rewards/thermo_reward/std": 1.736184000968933, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.0, + "completions/max_terminated_length": 560.0, + "completions/mean_length": 279.3125, + "completions/mean_terminated_length": 279.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15709468722343445, + "epoch": 2.326, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0949467420578003, + "learning_rate": 2.5927626243086096e-07, + "loss": -0.0192, + "num_tokens": 10171681.0, + "reward": 6.538366317749023, + "reward_std": 2.906224489212036, + "rewards/fitness_reward/mean": 5.960531234741211, + "rewards/fitness_reward/std": 2.4845924377441406, + "rewards/kidney_reward/mean": 0.10158563405275345, + "rewards/kidney_reward/std": 1.453756332397461, + "rewards/length2tails_reward/mean": 0.81050705909729, + "rewards/length2tails_reward/std": 0.23362566530704498, + "rewards/thermo_reward/mean": 0.6488308906555176, + "rewards/thermo_reward/std": 1.668876051902771, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 266.875, + "completions/mean_terminated_length": 266.875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.14008358772844076, + "epoch": 2.328, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.509313702583313, + "learning_rate": 2.5782244862301095e-07, + "loss": -0.062, + "num_tokens": 10180253.0, + "reward": 6.314507007598877, + "reward_std": 2.6059350967407227, + "rewards/fitness_reward/mean": 6.029257297515869, + "rewards/fitness_reward/std": 2.1360023021698, + "rewards/kidney_reward/mean": 0.07857340574264526, + "rewards/kidney_reward/std": 1.463223934173584, + "rewards/length2tails_reward/mean": 0.8019428253173828, + "rewards/length2tails_reward/std": 0.27328240871429443, + "rewards/thermo_reward/mean": 0.09095507860183716, + "rewards/thermo_reward/std": 1.8658111095428467, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.14028505142778158, + "epoch": 2.33, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3969097137451172, + "learning_rate": 2.5637211876092137e-07, + "loss": -0.0129, + "num_tokens": 10188951.0, + "reward": 6.396058082580566, + "reward_std": 2.9196786880493164, + "rewards/fitness_reward/mean": 6.200154781341553, + "rewards/fitness_reward/std": 2.2126405239105225, + "rewards/kidney_reward/mean": 0.13743309676647186, + "rewards/kidney_reward/std": 1.4949477910995483, + "rewards/length2tails_reward/mean": 0.7746238708496094, + "rewards/length2tails_reward/std": 0.29806816577911377, + "rewards/thermo_reward/mean": -0.13293835520744324, + "rewards/thermo_reward/std": 2.1223702430725098, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 267.28125, + "completions/mean_terminated_length": 267.28125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "entropy": 0.16052438039332628, + "epoch": 2.332, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.768127202987671, + "learning_rate": 2.5492527965276244e-07, + "loss": -0.0619, + "num_tokens": 10197536.0, + "reward": 6.725530624389648, + "reward_std": 2.6515731811523438, + "rewards/fitness_reward/mean": 6.023658275604248, + "rewards/fitness_reward/std": 2.165344476699829, + "rewards/kidney_reward/mean": 0.4591490924358368, + "rewards/kidney_reward/std": 1.3041044473648071, + "rewards/length2tails_reward/mean": 0.7860908508300781, + "rewards/length2tails_reward/std": 0.3125840425491333, + "rewards/thermo_reward/mean": 0.5515505075454712, + "rewards/thermo_reward/std": 1.685711145401001, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.0625, + "completions/mean_terminated_length": 269.0625, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.15762784983962774, + "epoch": 2.334, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2458972930908203, + "learning_rate": 2.534819380903169e-07, + "loss": -0.0173, + "num_tokens": 10206178.0, + "reward": 6.825693130493164, + "reward_std": 2.421786308288574, + "rewards/fitness_reward/mean": 6.036433696746826, + "rewards/fitness_reward/std": 2.0984911918640137, + "rewards/kidney_reward/mean": 0.4622931182384491, + "rewards/kidney_reward/std": 1.2436137199401855, + "rewards/length2tails_reward/mean": 0.7606848478317261, + "rewards/length2tails_reward/std": 0.2535126507282257, + "rewards/thermo_reward/mean": 0.735883355140686, + "rewards/thermo_reward/std": 1.4157614707946777, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.59375, + "completions/mean_terminated_length": 269.59375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "entropy": 0.144895083270967, + "epoch": 2.336, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9818193912506104, + "learning_rate": 2.520421008489494e-07, + "loss": -0.0317, + "num_tokens": 10214837.0, + "reward": 6.600707530975342, + "reward_std": 2.3616011142730713, + "rewards/fitness_reward/mean": 6.108759880065918, + "rewards/fitness_reward/std": 2.2064812183380127, + "rewards/kidney_reward/mean": 0.21070554852485657, + "rewards/kidney_reward/std": 1.3787206411361694, + "rewards/length2tails_reward/mean": 0.8537660241127014, + "rewards/length2tails_reward/std": 0.23176810145378113, + "rewards/thermo_reward/mean": 0.3463066816329956, + "rewards/thermo_reward/std": 1.789523959159851, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13474767562001944, + "epoch": 2.338, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.629767894744873, + "learning_rate": 2.5060577468757525e-07, + "loss": 0.0012, + "num_tokens": 10223525.0, + "reward": 6.506112575531006, + "reward_std": 2.1205883026123047, + "rewards/fitness_reward/mean": 6.103724479675293, + "rewards/fitness_reward/std": 1.7534701824188232, + "rewards/kidney_reward/mean": 0.21151992678642273, + "rewards/kidney_reward/std": 1.3654052019119263, + "rewards/length2tails_reward/mean": 0.7133897542953491, + "rewards/length2tails_reward/std": 0.3494080901145935, + "rewards/thermo_reward/mean": 0.23656029999256134, + "rewards/thermo_reward/std": 1.871753215789795, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 264.125, + "completions/mean_terminated_length": 264.125, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.13885567523539066, + "epoch": 2.34, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1258292198181152, + "learning_rate": 2.4917296634862697e-07, + "loss": -0.1356, + "num_tokens": 10232009.0, + "reward": 6.651339054107666, + "reward_std": 2.509489059448242, + "rewards/fitness_reward/mean": 6.155874252319336, + "rewards/fitness_reward/std": 1.950677514076233, + "rewards/kidney_reward/mean": 0.21324065327644348, + "rewards/kidney_reward/std": 1.4928628206253052, + "rewards/length2tails_reward/mean": 0.8334078788757324, + "rewards/length2tails_reward/std": 0.24128709733486176, + "rewards/thermo_reward/mean": 0.36098411679267883, + "rewards/thermo_reward/std": 1.7862483263015747, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.14994161296635866, + "epoch": 2.342, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9447140097618103, + "learning_rate": 2.477436825580248e-07, + "loss": -0.0027, + "num_tokens": 10240767.0, + "reward": 6.521512985229492, + "reward_std": 3.2114169597625732, + "rewards/fitness_reward/mean": 5.63543176651001, + "rewards/fitness_reward/std": 3.0584099292755127, + "rewards/kidney_reward/mean": 0.44677281379699707, + "rewards/kidney_reward/std": 1.4299124479293823, + "rewards/length2tails_reward/mean": 0.833909809589386, + "rewards/length2tails_reward/std": 0.19332611560821533, + "rewards/thermo_reward/mean": 0.9084339141845703, + "rewards/thermo_reward/std": 1.313307523727417, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 280.3125, + "completions/mean_terminated_length": 280.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14267082139849663, + "epoch": 2.344, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8712918162345886, + "learning_rate": 2.463179300251429e-07, + "loss": -0.0319, + "num_tokens": 10249769.0, + "reward": 7.132099151611328, + "reward_std": 1.214680552482605, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.2600446343421936, + "rewards/kidney_reward/std": 1.4254100322723389, + "rewards/length2tails_reward/mean": 0.6544859409332275, + "rewards/length2tails_reward/std": 0.33333057165145874, + "rewards/thermo_reward/mean": 0.49431395530700684, + "rewards/thermo_reward/std": 1.9180667400360107, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 267.53125, + "completions/mean_terminated_length": 267.53125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.11769244819879532, + "epoch": 2.346, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7912671566009521, + "learning_rate": 2.4489571544277943e-07, + "loss": -0.0143, + "num_tokens": 10258362.0, + "reward": 6.051982879638672, + "reward_std": 3.009906053543091, + "rewards/fitness_reward/mean": 5.629584789276123, + "rewards/fitness_reward/std": 2.7008590698242188, + "rewards/kidney_reward/mean": 0.29773762822151184, + "rewards/kidney_reward/std": 1.3560631275177002, + "rewards/length2tails_reward/mean": 0.6026171445846558, + "rewards/length2tails_reward/std": 0.3598494827747345, + "rewards/thermo_reward/mean": 0.24575097858905792, + "rewards/thermo_reward/std": 1.6942367553710938, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 286.0, + "completions/mean_terminated_length": 286.0, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "entropy": 0.20299713220447302, + "epoch": 2.348, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4531784057617188, + "learning_rate": 2.4347704548712434e-07, + "loss": 0.0996, + "num_tokens": 10267546.0, + "reward": 5.53247594833374, + "reward_std": 3.9168713092803955, + "rewards/fitness_reward/mean": 5.195167064666748, + "rewards/fitness_reward/std": 3.475686550140381, + "rewards/kidney_reward/mean": 0.006807215511798859, + "rewards/kidney_reward/std": 1.386347770690918, + "rewards/length2tails_reward/mean": 0.7619220018386841, + "rewards/length2tails_reward/std": 0.30291324853897095, + "rewards/thermo_reward/mean": 0.2868500351905823, + "rewards/thermo_reward/std": 1.6460710763931274, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 270.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13557896949350834, + "epoch": 2.35, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4185727834701538, + "learning_rate": 2.42061926817729e-07, + "loss": -0.0002, + "num_tokens": 10276220.0, + "reward": 6.594975471496582, + "reward_std": 2.83086895942688, + "rewards/fitness_reward/mean": 6.009364128112793, + "rewards/fitness_reward/std": 2.240542411804199, + "rewards/kidney_reward/mean": 0.2377687394618988, + "rewards/kidney_reward/std": 1.431420922279358, + "rewards/length2tails_reward/mean": 0.7403048276901245, + "rewards/length2tails_reward/std": 0.2908630073070526, + "rewards/thermo_reward/mean": 0.5633013248443604, + "rewards/thermo_reward/std": 1.5832595825195312, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 296.3125, + "completions/mean_terminated_length": 281.5483703613281, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.18004830460995436, + "epoch": 2.352, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.299211263656616, + "learning_rate": 2.406503660774735e-07, + "loss": 0.1171, + "num_tokens": 10285734.0, + "reward": 6.081445693969727, + "reward_std": 3.0942792892456055, + "rewards/fitness_reward/mean": 5.688260555267334, + "rewards/fitness_reward/std": 2.8182294368743896, + "rewards/kidney_reward/mean": 0.0362309068441391, + "rewards/kidney_reward/std": 1.2177637815475464, + "rewards/length2tails_reward/mean": 0.7353615760803223, + "rewards/length2tails_reward/std": 0.33561962842941284, + "rewards/thermo_reward/mean": 0.3824585974216461, + "rewards/thermo_reward/std": 1.8383256196975708, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 275.75, + "completions/mean_terminated_length": 275.75, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.17096226196736097, + "epoch": 2.354, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5118963718414307, + "learning_rate": 2.3924236989253623e-07, + "loss": 0.0368, + "num_tokens": 10294590.0, + "reward": 6.513851165771484, + "reward_std": 2.838120698928833, + "rewards/fitness_reward/mean": 5.801671504974365, + "rewards/fitness_reward/std": 2.7391364574432373, + "rewards/kidney_reward/mean": 0.5089992880821228, + "rewards/kidney_reward/std": 1.289876937866211, + "rewards/length2tails_reward/mean": 0.7929275035858154, + "rewards/length2tails_reward/std": 0.2507036328315735, + "rewards/thermo_reward/mean": 0.5188964009284973, + "rewards/thermo_reward/std": 1.4083348512649536, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 276.71875, + "completions/mean_terminated_length": 276.71875, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.15524767246097326, + "epoch": 2.356, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.261232852935791, + "learning_rate": 2.3783794487236365e-07, + "loss": 0.1132, + "num_tokens": 10303477.0, + "reward": 6.799949645996094, + "reward_std": 2.0617637634277344, + "rewards/fitness_reward/mean": 6.2409162521362305, + "rewards/fitness_reward/std": 1.9820597171783447, + "rewards/kidney_reward/mean": 0.32192671298980713, + "rewards/kidney_reward/std": 1.2968709468841553, + "rewards/length2tails_reward/mean": 0.7741771936416626, + "rewards/length2tails_reward/std": 0.3204895555973053, + "rewards/thermo_reward/mean": 0.40905165672302246, + "rewards/thermo_reward/std": 1.5724427700042725, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12975471280515194, + "epoch": 2.358, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0848793983459473, + "learning_rate": 2.3643709760963714e-07, + "loss": 0.0115, + "num_tokens": 10312193.0, + "reward": 6.577237606048584, + "reward_std": 2.452291965484619, + "rewards/fitness_reward/mean": 6.127921104431152, + "rewards/fitness_reward/std": 2.102125883102417, + "rewards/kidney_reward/mean": 0.3501453101634979, + "rewards/kidney_reward/std": 1.5519590377807617, + "rewards/length2tails_reward/mean": 0.7213736772537231, + "rewards/length2tails_reward/std": 0.3108052909374237, + "rewards/thermo_reward/mean": 0.18780046701431274, + "rewards/thermo_reward/std": 1.601993441581726, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.03125, + "completions/mean_terminated_length": 269.03125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12359000463038683, + "epoch": 2.36, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28357765078544617, + "learning_rate": 2.3503983468024456e-07, + "loss": 0.0011, + "num_tokens": 10320834.0, + "reward": 7.2601318359375, + "reward_std": 1.4297986030578613, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.7733430862426758, + "rewards/kidney_reward/std": 1.3763601779937744, + "rewards/length2tails_reward/mean": 0.6684156656265259, + "rewards/length2tails_reward/std": 0.33225399255752563, + "rewards/thermo_reward/mean": 0.4360945224761963, + "rewards/thermo_reward/std": 1.786563515663147, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12958783563226461, + "epoch": 2.362, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6146897077560425, + "learning_rate": 2.336461626432472e-07, + "loss": 0.0007, + "num_tokens": 10329546.0, + "reward": 7.084027290344238, + "reward_std": 1.2848799228668213, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.22326971590518951, + "rewards/kidney_reward/std": 1.2795557975769043, + "rewards/length2tails_reward/mean": 0.8084720373153687, + "rewards/length2tails_reward/std": 0.28825873136520386, + "rewards/thermo_reward/mean": 0.3579515814781189, + "rewards/thermo_reward/std": 1.8931461572647095, + "step": 1181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 273.5, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.15453569684177637, + "epoch": 2.364, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6941280364990234, + "learning_rate": 2.3225608804085016e-07, + "loss": 0.0278, + "num_tokens": 10338330.0, + "reward": 6.719433784484863, + "reward_std": 2.814514636993408, + "rewards/fitness_reward/mean": 6.112170219421387, + "rewards/fitness_reward/std": 2.1878795623779297, + "rewards/kidney_reward/mean": 0.509868860244751, + "rewards/kidney_reward/std": 1.2663058042526245, + "rewards/length2tails_reward/mean": 0.7951368093490601, + "rewards/length2tails_reward/std": 0.2764343023300171, + "rewards/thermo_reward/mean": 0.30709001421928406, + "rewards/thermo_reward/std": 1.7216320037841797, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.1600744165480137, + "epoch": 2.366, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.719894528388977, + "learning_rate": 2.3086961739837107e-07, + "loss": -0.0145, + "num_tokens": 10347076.0, + "reward": 6.698928356170654, + "reward_std": 3.4363529682159424, + "rewards/fitness_reward/mean": 5.8348283767700195, + "rewards/fitness_reward/std": 2.976780891418457, + "rewards/kidney_reward/mean": 0.34624993801116943, + "rewards/kidney_reward/std": 1.490804672241211, + "rewards/length2tails_reward/mean": 0.7678369879722595, + "rewards/length2tails_reward/std": 0.2874162197113037, + "rewards/thermo_reward/mean": 0.9980307817459106, + "rewards/thermo_reward/std": 1.5384925603866577, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1424051383510232, + "epoch": 2.368, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2329249382019043, + "learning_rate": 2.2948675722421085e-07, + "loss": 0.0062, + "num_tokens": 10355763.0, + "reward": 6.569212913513184, + "reward_std": 2.796764373779297, + "rewards/fitness_reward/mean": 5.933171272277832, + "rewards/fitness_reward/std": 2.608490467071533, + "rewards/kidney_reward/mean": 0.30225175619125366, + "rewards/kidney_reward/std": 1.2117165327072144, + "rewards/length2tails_reward/mean": 0.7657819986343384, + "rewards/length2tails_reward/std": 0.2733713984489441, + "rewards/thermo_reward/mean": 0.5869405269622803, + "rewards/thermo_reward/std": 1.5722503662109375, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 275.1875, + "completions/mean_terminated_length": 275.1875, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "entropy": 0.1428227387368679, + "epoch": 2.37, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9525351524353027, + "learning_rate": 2.2810751400982077e-07, + "loss": 0.1094, + "num_tokens": 10364601.0, + "reward": 6.857797622680664, + "reward_std": 2.5363237857818604, + "rewards/fitness_reward/mean": 6.234767436981201, + "rewards/fitness_reward/std": 2.016842842102051, + "rewards/kidney_reward/mean": 0.4224570691585541, + "rewards/kidney_reward/std": 1.2973368167877197, + "rewards/length2tails_reward/mean": 0.8310712575912476, + "rewards/length2tails_reward/std": 0.21810440719127655, + "rewards/thermo_reward/mean": 0.40806788206100464, + "rewards/thermo_reward/std": 1.809707522392273, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 268.90625, + "completions/mean_terminated_length": 268.90625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12416747957468033, + "epoch": 2.372, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44372621178627014, + "learning_rate": 2.267318942296742e-07, + "loss": 0.0006, + "num_tokens": 10373238.0, + "reward": 6.433513641357422, + "reward_std": 2.3816165924072266, + "rewards/fitness_reward/mean": 6.237121105194092, + "rewards/fitness_reward/std": 2.0035276412963867, + "rewards/kidney_reward/mean": 0.19951285421848297, + "rewards/kidney_reward/std": 1.2154597043991089, + "rewards/length2tails_reward/mean": 0.7037875652313232, + "rewards/length2tails_reward/std": 0.2711665630340576, + "rewards/thermo_reward/mean": -0.15862132608890533, + "rewards/thermo_reward/std": 1.7928681373596191, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12712588999420404, + "epoch": 2.374, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5299016237258911, + "learning_rate": 2.2535990434123498e-07, + "loss": 0.0032, + "num_tokens": 10381915.0, + "reward": 6.953973293304443, + "reward_std": 2.02205228805542, + "rewards/fitness_reward/mean": 6.312586307525635, + "rewards/fitness_reward/std": 1.5766324996948242, + "rewards/kidney_reward/mean": 0.46650612354278564, + "rewards/kidney_reward/std": 1.295060396194458, + "rewards/length2tails_reward/mean": 0.7391375303268433, + "rewards/length2tails_reward/std": 0.2649126648902893, + "rewards/thermo_reward/mean": 0.446698933839798, + "rewards/thermo_reward/std": 1.7481553554534912, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1344369240105152, + "epoch": 2.376, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0505989789962769, + "learning_rate": 2.2399155078492692e-07, + "loss": -0.0007, + "num_tokens": 10390616.0, + "reward": 7.035079002380371, + "reward_std": 1.4662327766418457, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.30914032459259033, + "rewards/kidney_reward/std": 1.3180829286575317, + "rewards/length2tails_reward/mean": 0.8030677437782288, + "rewards/length2tails_reward/std": 0.24346104264259338, + "rewards/thermo_reward/mean": 0.38286587595939636, + "rewards/thermo_reward/std": 1.8458991050720215, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 270.03125, + "completions/mean_terminated_length": 270.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13711410947144032, + "epoch": 2.378, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3108275532722473, + "learning_rate": 2.2262683998410548e-07, + "loss": -0.0003, + "num_tokens": 10399289.0, + "reward": 7.1430983543396, + "reward_std": 1.3162767887115479, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.1994307041168213, + "rewards/kidney_reward/std": 1.4668281078338623, + "rewards/length2tails_reward/mean": 0.8067271709442139, + "rewards/length2tails_reward/std": 0.24364596605300903, + "rewards/thermo_reward/mean": 0.7067838311195374, + "rewards/thermo_reward/std": 1.5949379205703735, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14859586395323277, + "epoch": 2.38, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.531777024269104, + "learning_rate": 2.212657783450247e-07, + "loss": 0.0247, + "num_tokens": 10408045.0, + "reward": 6.657247066497803, + "reward_std": 2.1735312938690186, + "rewards/fitness_reward/mean": 6.270033359527588, + "rewards/fitness_reward/std": 1.8173484802246094, + "rewards/kidney_reward/mean": 0.12885035574436188, + "rewards/kidney_reward/std": 1.504292368888855, + "rewards/length2tails_reward/mean": 0.7863860130310059, + "rewards/length2tails_reward/std": 0.288088858127594, + "rewards/thermo_reward/mean": 0.2523841857910156, + "rewards/thermo_reward/std": 1.8280649185180664, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 273.03125, + "completions/mean_terminated_length": 273.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14634728711098433, + "epoch": 2.382, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.40649887919425964, + "learning_rate": 2.1990837225680947e-07, + "loss": -0.0089, + "num_tokens": 10416814.0, + "reward": 7.3467254638671875, + "reward_std": 1.0463664531707764, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.4199780821800232, + "rewards/kidney_reward/std": 1.436300277709961, + "rewards/length2tails_reward/mean": 0.8436079025268555, + "rewards/length2tails_reward/std": 0.2537226974964142, + "rewards/thermo_reward/mean": 0.669072151184082, + "rewards/thermo_reward/std": 1.415181040763855, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13807967212051153, + "epoch": 2.384, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0209078788757324, + "learning_rate": 2.1855462809142422e-07, + "loss": -0.0053, + "num_tokens": 10425509.0, + "reward": 6.752335071563721, + "reward_std": 2.331373453140259, + "rewards/fitness_reward/mean": 6.307352066040039, + "rewards/fitness_reward/std": 1.60624098777771, + "rewards/kidney_reward/mean": 0.3081025779247284, + "rewards/kidney_reward/std": 1.319443941116333, + "rewards/length2tails_reward/mean": 0.8190740346908569, + "rewards/length2tails_reward/std": 0.24252653121948242, + "rewards/thermo_reward/mean": 0.17232581973075867, + "rewards/thermo_reward/std": 1.8544268608093262, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 277.21875, + "completions/mean_terminated_length": 277.21875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14882122911512852, + "epoch": 2.386, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7406237125396729, + "learning_rate": 2.172045522036444e-07, + "loss": 0.0601, + "num_tokens": 10434412.0, + "reward": 6.383779525756836, + "reward_std": 2.6961283683776855, + "rewards/fitness_reward/mean": 5.965703010559082, + "rewards/fitness_reward/std": 2.4804646968841553, + "rewards/kidney_reward/mean": 0.25591757893562317, + "rewards/kidney_reward/std": 1.3128076791763306, + "rewards/length2tails_reward/mean": 0.7789655923843384, + "rewards/length2tails_reward/std": 0.3052135705947876, + "rewards/thermo_reward/mean": 0.1907517910003662, + "rewards/thermo_reward/std": 1.8690615892410278, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 265.59375, + "completions/mean_terminated_length": 265.59375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.1397678665816784, + "epoch": 2.388, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.610418438911438, + "learning_rate": 2.1585815093102456e-07, + "loss": -0.0839, + "num_tokens": 10442943.0, + "reward": 6.817094326019287, + "reward_std": 2.286875009536743, + "rewards/fitness_reward/mean": 6.220857620239258, + "rewards/fitness_reward/std": 2.0955264568328857, + "rewards/kidney_reward/mean": 0.4688721299171448, + "rewards/kidney_reward/std": 1.2381889820098877, + "rewards/length2tails_reward/mean": 0.8027282953262329, + "rewards/length2tails_reward/std": 0.22238974273204803, + "rewards/thermo_reward/mean": 0.3222363591194153, + "rewards/thermo_reward/std": 2.045714855194092, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 274.75, + "completions/mean_terminated_length": 274.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15239833388477564, + "epoch": 2.39, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.259833812713623, + "learning_rate": 2.145154305938709e-07, + "loss": 0.0904, + "num_tokens": 10451767.0, + "reward": 6.841379165649414, + "reward_std": 2.064190626144409, + "rewards/fitness_reward/mean": 6.248695373535156, + "rewards/fitness_reward/std": 1.9380545616149902, + "rewards/kidney_reward/mean": 0.36406832933425903, + "rewards/kidney_reward/std": 1.182694911956787, + "rewards/length2tails_reward/mean": 0.6944384574890137, + "rewards/length2tails_reward/std": 0.3012729585170746, + "rewards/thermo_reward/mean": 0.4740801453590393, + "rewards/thermo_reward/std": 1.6848740577697754, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14045276027172804, + "epoch": 2.392, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0979050397872925, + "learning_rate": 2.1317639749520988e-07, + "loss": -0.0016, + "num_tokens": 10460485.0, + "reward": 7.112707614898682, + "reward_std": 1.1237766742706299, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.3315911889076233, + "rewards/kidney_reward/std": 1.2447819709777832, + "rewards/length2tails_reward/mean": 0.7930424213409424, + "rewards/length2tails_reward/std": 0.2631719410419464, + "rewards/thermo_reward/mean": 0.5206843614578247, + "rewards/thermo_reward/std": 1.5175108909606934, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 261.65625, + "completions/mean_terminated_length": 261.65625, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "entropy": 0.13022050261497498, + "epoch": 2.394, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6739642024040222, + "learning_rate": 2.118410579207589e-07, + "loss": -0.142, + "num_tokens": 10468890.0, + "reward": 6.4762187004089355, + "reward_std": 2.584693431854248, + "rewards/fitness_reward/mean": 6.0280537605285645, + "rewards/fitness_reward/std": 2.142303705215454, + "rewards/kidney_reward/mean": 0.04269711673259735, + "rewards/kidney_reward/std": 1.222676396369934, + "rewards/length2tails_reward/mean": 0.7052397727966309, + "rewards/length2tails_reward/std": 0.32516634464263916, + "rewards/thermo_reward/mean": 0.501013457775116, + "rewards/thermo_reward/std": 1.816394329071045, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 283.40625, + "completions/mean_terminated_length": 283.40625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.18277012929320335, + "epoch": 2.396, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0985469818115234, + "learning_rate": 2.1050941813889832e-07, + "loss": 0.1114, + "num_tokens": 10477991.0, + "reward": 6.7816619873046875, + "reward_std": 2.568706512451172, + "rewards/fitness_reward/mean": 5.935724258422852, + "rewards/fitness_reward/std": 2.585078716278076, + "rewards/kidney_reward/mean": 0.552318811416626, + "rewards/kidney_reward/std": 1.337119221687317, + "rewards/length2tails_reward/mean": 0.7825872898101807, + "rewards/length2tails_reward/std": 0.30334389209747314, + "rewards/thermo_reward/mean": 0.7482626438140869, + "rewards/thermo_reward/std": 1.4354112148284912, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 274.59375, + "completions/mean_terminated_length": 274.59375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.15069701336324215, + "epoch": 2.398, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3031301498413086, + "learning_rate": 2.0918148440063942e-07, + "loss": 0.0698, + "num_tokens": 10486810.0, + "reward": 6.8844828605651855, + "reward_std": 2.1888115406036377, + "rewards/fitness_reward/mean": 6.243119239807129, + "rewards/fitness_reward/std": 1.969598412513733, + "rewards/kidney_reward/mean": 0.33939287066459656, + "rewards/kidney_reward/std": 1.308741569519043, + "rewards/length2tails_reward/mean": 0.7855542302131653, + "rewards/length2tails_reward/std": 0.2671493589878082, + "rewards/thermo_reward/mean": 0.5505576729774475, + "rewards/thermo_reward/std": 1.7787920236587524, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 274.59375, + "completions/mean_terminated_length": 274.59375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15555760636925697, + "epoch": 2.4, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1634215116500854, + "learning_rate": 2.0785726293959804e-07, + "loss": 0.011, + "num_tokens": 10495629.0, + "reward": 6.892745018005371, + "reward_std": 1.5912784337997437, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.13091489672660828, + "rewards/kidney_reward/std": 1.5787826776504517, + "rewards/length2tails_reward/mean": 0.7597600221633911, + "rewards/length2tails_reward/std": 0.2910412549972534, + "rewards/thermo_reward/mean": 0.2980766296386719, + "rewards/thermo_reward/std": 1.7125396728515625, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13757058698683977, + "epoch": 2.402, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984132170677185, + "learning_rate": 2.0653675997196207e-07, + "loss": -0.0009, + "num_tokens": 10504326.0, + "reward": 6.986088752746582, + "reward_std": 1.0979586839675903, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.036840811371803284, + "rewards/kidney_reward/std": 1.276766061782837, + "rewards/length2tails_reward/mean": 0.8175562024116516, + "rewards/length2tails_reward/std": 0.210645392537117, + "rewards/thermo_reward/mean": 0.41764307022094727, + "rewards/thermo_reward/std": 1.7940781116485596, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13410961348563433, + "epoch": 2.404, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7312173247337341, + "learning_rate": 2.0521998169646526e-07, + "loss": -0.0131, + "num_tokens": 10513061.0, + "reward": 6.632888317108154, + "reward_std": 1.5381122827529907, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.1616494357585907, + "rewards/kidney_reward/std": 1.336515188217163, + "rewards/length2tails_reward/mean": 0.6866886615753174, + "rewards/length2tails_reward/std": 0.34856536984443665, + "rewards/thermo_reward/mean": -0.009857065975666046, + "rewards/thermo_reward/std": 1.9256420135498047, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 278.28125, + "completions/mean_terminated_length": 278.28125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1562620848417282, + "epoch": 2.406, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3384287357330322, + "learning_rate": 2.0390693429435623e-07, + "loss": 0.097, + "num_tokens": 10521998.0, + "reward": 6.14390754699707, + "reward_std": 3.2956759929656982, + "rewards/fitness_reward/mean": 5.695949554443359, + "rewards/fitness_reward/std": 2.779526948928833, + "rewards/kidney_reward/mean": -0.21876198053359985, + "rewards/kidney_reward/std": 1.3387104272842407, + "rewards/length2tails_reward/mean": 0.807336688041687, + "rewards/length2tails_reward/std": 0.262485146522522, + "rewards/thermo_reward/mean": 0.7110099792480469, + "rewards/thermo_reward/std": 1.4583662748336792, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 276.03125, + "completions/mean_terminated_length": 276.03125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15935837477445602, + "epoch": 2.408, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2186410427093506, + "learning_rate": 2.025976239293705e-07, + "loss": -0.0023, + "num_tokens": 10530863.0, + "reward": 6.752180099487305, + "reward_std": 2.8392486572265625, + "rewards/fitness_reward/mean": 6.0168962478637695, + "rewards/fitness_reward/std": 2.2008705139160156, + "rewards/kidney_reward/mean": 0.4526037275791168, + "rewards/kidney_reward/std": 1.2687206268310547, + "rewards/length2tails_reward/mean": 0.7034296989440918, + "rewards/length2tails_reward/std": 0.3169019818305969, + "rewards/thermo_reward/mean": 0.6662502288818359, + "rewards/thermo_reward/std": 1.781776785850525, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 265.3125, + "completions/mean_terminated_length": 265.3125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.13818510342389345, + "epoch": 2.41, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0207407474517822, + "learning_rate": 2.0129205674770067e-07, + "loss": -0.0805, + "num_tokens": 10539385.0, + "reward": 6.519955635070801, + "reward_std": 2.271798610687256, + "rewards/fitness_reward/mean": 6.029547691345215, + "rewards/fitness_reward/std": 2.1344797611236572, + "rewards/kidney_reward/mean": -0.047320008277893066, + "rewards/kidney_reward/std": 1.5888068675994873, + "rewards/length2tails_reward/mean": 0.8059554100036621, + "rewards/length2tails_reward/std": 0.26203134655952454, + "rewards/thermo_reward/mean": 0.6251572966575623, + "rewards/thermo_reward/std": 1.7402573823928833, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 272.8125, + "completions/mean_terminated_length": 272.8125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.1479767682030797, + "epoch": 2.412, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.646166205406189, + "learning_rate": 1.9999023887796796e-07, + "loss": 0.0164, + "num_tokens": 10548147.0, + "reward": 7.182011604309082, + "reward_std": 1.3523036241531372, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.7770439386367798, + "rewards/kidney_reward/std": 1.4609756469726562, + "rewards/length2tails_reward/mean": 0.7573673725128174, + "rewards/length2tails_reward/std": 0.3173437714576721, + "rewards/thermo_reward/mean": 0.4376543462276459, + "rewards/thermo_reward/std": 1.778507113456726, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1731070727109909, + "epoch": 2.414, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7692461013793945, + "learning_rate": 1.9869217643119408e-07, + "loss": 0.0181, + "num_tokens": 10556904.0, + "reward": 6.966423034667969, + "reward_std": 2.349330425262451, + "rewards/fitness_reward/mean": 6.236401557922363, + "rewards/fitness_reward/std": 2.007596492767334, + "rewards/kidney_reward/mean": 0.20747965574264526, + "rewards/kidney_reward/std": 1.3964067697525024, + "rewards/length2tails_reward/mean": 0.9009748697280884, + "rewards/length2tails_reward/std": 0.11851488798856735, + "rewards/thermo_reward/mean": 0.8020747303962708, + "rewards/thermo_reward/std": 1.5707067251205444, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13845635578036308, + "epoch": 2.416, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45231375098228455, + "learning_rate": 1.9739787550077115e-07, + "loss": -0.0015, + "num_tokens": 10565613.0, + "reward": 6.963863849639893, + "reward_std": 1.673593282699585, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": 0.09308783710002899, + "rewards/kidney_reward/std": 1.5062453746795654, + "rewards/length2tails_reward/mean": 0.7864948511123657, + "rewards/length2tails_reward/std": 0.26666387915611267, + "rewards/thermo_reward/mean": 0.8767297267913818, + "rewards/thermo_reward/std": 1.2607451677322388, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13331259787082672, + "epoch": 2.418, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0724098682403564, + "learning_rate": 1.9610734216243518e-07, + "loss": -0.0046, + "num_tokens": 10574314.0, + "reward": 6.870992183685303, + "reward_std": 1.432407259941101, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.4920026361942291, + "rewards/kidney_reward/std": 1.3550288677215576, + "rewards/length2tails_reward/mean": 0.8295128345489502, + "rewards/length2tails_reward/std": 0.19401250779628754, + "rewards/thermo_reward/mean": -0.14139311015605927, + "rewards/thermo_reward/std": 1.952734351158142, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.8125, + "completions/mean_terminated_length": 269.8125, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.13072874303907156, + "epoch": 2.42, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6706132888793945, + "learning_rate": 1.9482058247423417e-07, + "loss": -0.0075, + "num_tokens": 10582980.0, + "reward": 7.080522537231445, + "reward_std": 2.253013849258423, + "rewards/fitness_reward/mean": 6.242404937744141, + "rewards/fitness_reward/std": 1.9736372232437134, + "rewards/kidney_reward/mean": 0.2913457751274109, + "rewards/kidney_reward/std": 1.3546520471572876, + "rewards/length2tails_reward/mean": 0.7990288138389587, + "rewards/length2tails_reward/std": 0.277334600687027, + "rewards/thermo_reward/mean": 0.9853748083114624, + "rewards/thermo_reward/std": 1.296433448791504, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 270.3125, + "completions/mean_terminated_length": 270.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13770211208611727, + "epoch": 2.422, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5283219218254089, + "learning_rate": 1.9353760247650398e-07, + "loss": -0.008, + "num_tokens": 10591662.0, + "reward": 7.38553524017334, + "reward_std": 1.223103404045105, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.7584617137908936, + "rewards/kidney_reward/std": 1.1852236986160278, + "rewards/length2tails_reward/mean": 0.7940632104873657, + "rewards/length2tails_reward/std": 0.2601299285888672, + "rewards/thermo_reward/mean": 0.6389591097831726, + "rewards/thermo_reward/std": 1.3715565204620361, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 268.3125, + "completions/mean_terminated_length": 268.3125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.1340145654976368, + "epoch": 2.424, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5603392124176025, + "learning_rate": 1.9225840819183626e-07, + "loss": 0.0004, + "num_tokens": 10600280.0, + "reward": 7.140044212341309, + "reward_std": 1.290602445602417, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.3139425814151764, + "rewards/kidney_reward/std": 1.2883977890014648, + "rewards/length2tails_reward/mean": 0.7378472089767456, + "rewards/length2tails_reward/std": 0.30753093957901, + "rewards/thermo_reward/mean": 0.6206037402153015, + "rewards/thermo_reward/std": 1.598232388496399, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.1407667240127921, + "epoch": 2.426, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8380615711212158, + "learning_rate": 1.9098300562505264e-07, + "loss": -0.0137, + "num_tokens": 10608951.0, + "reward": 6.557684898376465, + "reward_std": 2.4457712173461914, + "rewards/fitness_reward/mean": 6.233341217041016, + "rewards/fitness_reward/std": 2.024909496307373, + "rewards/kidney_reward/mean": -0.13064411282539368, + "rewards/kidney_reward/std": 1.4267821311950684, + "rewards/length2tails_reward/mean": 0.778062641620636, + "rewards/length2tails_reward/std": 0.2736484110355377, + "rewards/thermo_reward/mean": 0.39030036330223083, + "rewards/thermo_reward/std": 1.7196393013000488, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13650969322770834, + "epoch": 2.428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7355565428733826, + "learning_rate": 1.8971140076317493e-07, + "loss": -0.0096, + "num_tokens": 10617671.0, + "reward": 7.194827079772949, + "reward_std": 1.454677939414978, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.7479823231697083, + "rewards/kidney_reward/std": 1.1718031167984009, + "rewards/length2tails_reward/mean": 0.7393836975097656, + "rewards/length2tails_reward/std": 0.30609890818595886, + "rewards/thermo_reward/mean": 0.5013396143913269, + "rewards/thermo_reward/std": 1.6923469305038452, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 269.28125, + "completions/mean_terminated_length": 269.28125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.153823166154325, + "epoch": 2.43, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.126293182373047, + "learning_rate": 1.8844359957539778e-07, + "loss": -0.0492, + "num_tokens": 10626320.0, + "reward": 6.934659481048584, + "reward_std": 2.416071653366089, + "rewards/fitness_reward/mean": 6.236845970153809, + "rewards/fitness_reward/std": 2.0050840377807617, + "rewards/kidney_reward/mean": 0.47866952419281006, + "rewards/kidney_reward/std": 1.3772536516189575, + "rewards/length2tails_reward/mean": 0.8610520362854004, + "rewards/length2tails_reward/std": 0.18673007190227509, + "rewards/thermo_reward/mean": 0.4864315688610077, + "rewards/thermo_reward/std": 1.825042724609375, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.03125, + "completions/mean_terminated_length": 270.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1310725463554263, + "epoch": 2.432, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7593275308609009, + "learning_rate": 1.8717960801306098e-07, + "loss": 0.0029, + "num_tokens": 10634993.0, + "reward": 6.8131914138793945, + "reward_std": 1.2631603479385376, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.4057963192462921, + "rewards/kidney_reward/std": 1.2763725519180298, + "rewards/length2tails_reward/mean": 0.7397881150245667, + "rewards/length2tails_reward/std": 0.2975977063179016, + "rewards/thermo_reward/mean": 0.08005198836326599, + "rewards/thermo_reward/std": 1.98447847366333, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 266.65625, + "completions/mean_terminated_length": 266.65625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.17833027057349682, + "epoch": 2.434, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7265028953552246, + "learning_rate": 1.8591943200962056e-07, + "loss": -0.044, + "num_tokens": 10643558.0, + "reward": 6.312701225280762, + "reward_std": 3.492217779159546, + "rewards/fitness_reward/mean": 5.532222270965576, + "rewards/fitness_reward/std": 3.3477907180786133, + "rewards/kidney_reward/mean": 0.08220860362052917, + "rewards/kidney_reward/std": 1.2023855447769165, + "rewards/length2tails_reward/mean": 0.7690292596817017, + "rewards/length2tails_reward/std": 0.2859770655632019, + "rewards/thermo_reward/mean": 1.0942351818084717, + "rewards/thermo_reward/std": 1.12253737449646, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 266.5625, + "completions/mean_terminated_length": 266.5625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.15765101741999388, + "epoch": 2.436, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.937720775604248, + "learning_rate": 1.8466307748062204e-07, + "loss": -0.0207, + "num_tokens": 10652120.0, + "reward": 6.443487167358398, + "reward_std": 3.236635684967041, + "rewards/fitness_reward/mean": 5.859430313110352, + "rewards/fitness_reward/std": 2.8865418434143066, + "rewards/kidney_reward/mean": 0.41441404819488525, + "rewards/kidney_reward/std": 1.3339735269546509, + "rewards/length2tails_reward/mean": 0.7043296098709106, + "rewards/length2tails_reward/std": 0.33452701568603516, + "rewards/thermo_reward/mean": 0.40153470635414124, + "rewards/thermo_reward/std": 1.8202341794967651, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.15754753723740578, + "epoch": 2.438, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1962196826934814, + "learning_rate": 1.8341055032367147e-07, + "loss": 0.0072, + "num_tokens": 10660824.0, + "reward": 6.212782859802246, + "reward_std": 2.783827543258667, + "rewards/fitness_reward/mean": 5.878316879272461, + "rewards/fitness_reward/std": 2.4449498653411865, + "rewards/kidney_reward/mean": -0.013125769793987274, + "rewards/kidney_reward/std": 1.363172173500061, + "rewards/length2tails_reward/mean": 0.7736055850982666, + "rewards/length2tails_reward/std": 0.2994263172149658, + "rewards/thermo_reward/mean": 0.2952558696269989, + "rewards/thermo_reward/std": 1.6406909227371216, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 273.8125, + "completions/mean_terminated_length": 273.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14537839498370886, + "epoch": 2.44, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9369338154792786, + "learning_rate": 1.8216185641840875e-07, + "loss": -0.011, + "num_tokens": 10669618.0, + "reward": 7.15630578994751, + "reward_std": 1.10826575756073, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.24216413497924805, + "rewards/kidney_reward/std": 1.1586021184921265, + "rewards/length2tails_reward/mean": 0.7849744558334351, + "rewards/length2tails_reward/std": 0.2518375813961029, + "rewards/thermo_reward/mean": 0.4953639507293701, + "rewards/thermo_reward/std": 1.6769899129867554, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.03125, + "completions/mean_terminated_length": 270.03125, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.13397904578596354, + "epoch": 2.442, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7844238877296448, + "learning_rate": 1.809170016264794e-07, + "loss": 0.0042, + "num_tokens": 10678291.0, + "reward": 7.054842948913574, + "reward_std": 1.279569149017334, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.29341426491737366, + "rewards/kidney_reward/std": 1.3664143085479736, + "rewards/length2tails_reward/mean": 0.8060706853866577, + "rewards/length2tails_reward/std": 0.2437075674533844, + "rewards/thermo_reward/mean": 0.4366181492805481, + "rewards/thermo_reward/std": 1.5780274868011475, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 276.0, + "completions/mean_terminated_length": 276.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.16437082830816507, + "epoch": 2.444, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6893082857131958, + "learning_rate": 1.7967599179150784e-07, + "loss": 0.0131, + "num_tokens": 10687155.0, + "reward": 6.478178024291992, + "reward_std": 2.566878318786621, + "rewards/fitness_reward/mean": 6.062989234924316, + "rewards/fitness_reward/std": 1.960758090019226, + "rewards/kidney_reward/mean": -0.04700006544589996, + "rewards/kidney_reward/std": 1.4023725986480713, + "rewards/length2tails_reward/mean": 0.7702991962432861, + "rewards/length2tails_reward/std": 0.25465264916419983, + "rewards/thermo_reward/mean": 0.4922274649143219, + "rewards/thermo_reward/std": 1.8267821073532104, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 286.75, + "completions/mean_terminated_length": 286.75, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.19675271213054657, + "epoch": 2.446, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5938661098480225, + "learning_rate": 1.7843883273906867e-07, + "loss": 0.1489, + "num_tokens": 10696363.0, + "reward": 5.992588043212891, + "reward_std": 3.5342519283294678, + "rewards/fitness_reward/mean": 5.412173271179199, + "rewards/fitness_reward/std": 3.4160234928131104, + "rewards/kidney_reward/mean": 0.4962921440601349, + "rewards/kidney_reward/std": 1.149397611618042, + "rewards/length2tails_reward/mean": 0.8492158055305481, + "rewards/length2tails_reward/std": 0.19678044319152832, + "rewards/thermo_reward/mean": 0.23992903530597687, + "rewards/thermo_reward/std": 1.9339115619659424, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 276.0625, + "completions/mean_terminated_length": 276.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14384982362389565, + "epoch": 2.448, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3760433197021484, + "learning_rate": 1.772055302766604e-07, + "loss": -0.0097, + "num_tokens": 10705229.0, + "reward": 6.719727039337158, + "reward_std": 2.6368460655212402, + "rewards/fitness_reward/mean": 6.111083984375, + "rewards/fitness_reward/std": 2.193800926208496, + "rewards/kidney_reward/mean": 0.3884790539741516, + "rewards/kidney_reward/std": 1.240945816040039, + "rewards/length2tails_reward/mean": 0.786147952079773, + "rewards/length2tails_reward/std": 0.3011171221733093, + "rewards/thermo_reward/mean": 0.4357318878173828, + "rewards/thermo_reward/std": 1.7666430473327637, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 266.53125, + "completions/mean_terminated_length": 266.53125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.14180399384349585, + "epoch": 2.45, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.989327907562256, + "learning_rate": 1.759760901936782e-07, + "loss": -0.0827, + "num_tokens": 10713790.0, + "reward": 6.621981143951416, + "reward_std": 2.5199010372161865, + "rewards/fitness_reward/mean": 6.113877296447754, + "rewards/fitness_reward/std": 2.178572416305542, + "rewards/kidney_reward/mean": 0.34021058678627014, + "rewards/kidney_reward/std": 1.3446274995803833, + "rewards/length2tails_reward/mean": 0.8318252563476562, + "rewards/length2tails_reward/std": 0.23337016999721527, + "rewards/thermo_reward/mean": 0.26008450984954834, + "rewards/thermo_reward/std": 1.9035552740097046, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 295.5, + "completions/mean_terminated_length": 295.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.17390831280499697, + "epoch": 2.452, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.292157173156738, + "learning_rate": 1.7475051826138553e-07, + "loss": 0.1699, + "num_tokens": 10723278.0, + "reward": 6.861367225646973, + "reward_std": 2.4555370807647705, + "rewards/fitness_reward/mean": 6.045530796051025, + "rewards/fitness_reward/std": 2.0511116981506348, + "rewards/kidney_reward/mean": 0.10334709286689758, + "rewards/kidney_reward/std": 1.262412428855896, + "rewards/length2tails_reward/mean": 0.813460111618042, + "rewards/length2tails_reward/std": 0.2190786600112915, + "rewards/thermo_reward/mean": 1.1215959787368774, + "rewards/thermo_reward/std": 1.3993767499923706, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 268.09375, + "completions/mean_terminated_length": 268.09375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.133622027002275, + "epoch": 2.454, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3739473819732666, + "learning_rate": 1.7352882023288906e-07, + "loss": -0.0044, + "num_tokens": 10731889.0, + "reward": 7.042867660522461, + "reward_std": 1.3168541193008423, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.6638315320014954, + "rewards/kidney_reward/std": 1.2620795965194702, + "rewards/length2tails_reward/mean": 0.5839706063270569, + "rewards/length2tails_reward/std": 0.37432390451431274, + "rewards/thermo_reward/mean": 0.15330010652542114, + "rewards/thermo_reward/std": 1.714321494102478, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 270.53125, + "completions/mean_terminated_length": 270.53125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.1388418935239315, + "epoch": 2.456, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8404762744903564, + "learning_rate": 1.7231100184310953e-07, + "loss": -0.0187, + "num_tokens": 10740578.0, + "reward": 6.7227067947387695, + "reward_std": 2.84771728515625, + "rewards/fitness_reward/mean": 6.11498498916626, + "rewards/fitness_reward/std": 2.172534704208374, + "rewards/kidney_reward/mean": 0.451224684715271, + "rewards/kidney_reward/std": 1.3972952365875244, + "rewards/length2tails_reward/mean": 0.7879926562309265, + "rewards/length2tails_reward/std": 0.30947035551071167, + "rewards/thermo_reward/mean": 0.37022292613983154, + "rewards/thermo_reward/std": 1.9970158338546753, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.14591133315116167, + "epoch": 2.458, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3854591846466064, + "learning_rate": 1.7109706880875608e-07, + "loss": 0.0274, + "num_tokens": 10749348.0, + "reward": 6.129911422729492, + "reward_std": 3.350022554397583, + "rewards/fitness_reward/mean": 5.774148941040039, + "rewards/fitness_reward/std": 2.8437979221343994, + "rewards/kidney_reward/mean": 0.20148763060569763, + "rewards/kidney_reward/std": 1.4032697677612305, + "rewards/length2tails_reward/mean": 0.842609703540802, + "rewards/length2tails_reward/std": 0.2193181812763214, + "rewards/thermo_reward/mean": 0.08873233944177628, + "rewards/thermo_reward/std": 1.973818063735962, + "step": 1229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1543856943026185, + "epoch": 2.46, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7218741178512573, + "learning_rate": 1.6988702682829891e-07, + "loss": 0.0495, + "num_tokens": 10758109.0, + "reward": 6.939650535583496, + "reward_std": 2.4639525413513184, + "rewards/fitness_reward/mean": 6.1247878074646, + "rewards/fitness_reward/std": 2.119162082672119, + "rewards/kidney_reward/mean": 0.21883797645568848, + "rewards/kidney_reward/std": 1.5272595882415771, + "rewards/length2tails_reward/mean": 0.7354306578636169, + "rewards/length2tails_reward/std": 0.2757667303085327, + "rewards/thermo_reward/mean": 1.0431718826293945, + "rewards/thermo_reward/std": 1.4568425416946411, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.875, + "completions/mean_terminated_length": 269.875, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "entropy": 0.13925810065120459, + "epoch": 2.462, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3526345491409302, + "learning_rate": 1.686808815819435e-07, + "loss": -0.0195, + "num_tokens": 10766777.0, + "reward": 6.827254295349121, + "reward_std": 2.6964027881622314, + "rewards/fitness_reward/mean": 6.110751152038574, + "rewards/fitness_reward/std": 2.1956191062927246, + "rewards/kidney_reward/mean": 0.2765055000782013, + "rewards/kidney_reward/std": 1.186026692390442, + "rewards/length2tails_reward/mean": 0.7792638540267944, + "rewards/length2tails_reward/std": 0.27930328249931335, + "rewards/thermo_reward/mean": 0.766869068145752, + "rewards/thermo_reward/std": 1.7156330347061157, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.9375, + "completions/mean_terminated_length": 269.9375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "entropy": 0.13446869980543852, + "epoch": 2.464, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6933838725090027, + "learning_rate": 1.6747863873160229e-07, + "loss": 0.0002, + "num_tokens": 10775447.0, + "reward": 7.049829483032227, + "reward_std": 2.1705031394958496, + "rewards/fitness_reward/mean": 6.2614850997924805, + "rewards/fitness_reward/std": 1.8657041788101196, + "rewards/kidney_reward/mean": 0.2895672023296356, + "rewards/kidney_reward/std": 1.2855594158172607, + "rewards/length2tails_reward/mean": 0.7912293672561646, + "rewards/length2tails_reward/std": 0.2643392086029053, + "rewards/thermo_reward/mean": 0.8915072083473206, + "rewards/thermo_reward/std": 1.4952034950256348, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13300312496721745, + "epoch": 2.466, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29931145906448364, + "learning_rate": 1.6628030392087e-07, + "loss": -0.0016, + "num_tokens": 10784167.0, + "reward": 7.045505046844482, + "reward_std": 1.1196742057800293, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.38945239782333374, + "rewards/kidney_reward/std": 1.3009051084518433, + "rewards/length2tails_reward/mean": 0.8276556730270386, + "rewards/length2tails_reward/std": 0.26540234684944153, + "rewards/thermo_reward/mean": 0.1051337718963623, + "rewards/thermo_reward/std": 1.9927774667739868, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13917091581970453, + "epoch": 2.468, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9544405937194824, + "learning_rate": 1.650858827749948e-07, + "loss": 0.0098, + "num_tokens": 10792897.0, + "reward": 6.902583122253418, + "reward_std": 1.2768148183822632, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.15653064846992493, + "rewards/kidney_reward/std": 1.4467251300811768, + "rewards/length2tails_reward/mean": 0.7423674464225769, + "rewards/length2tails_reward/std": 0.3344264626502991, + "rewards/thermo_reward/mean": 0.5068111419677734, + "rewards/thermo_reward/std": 1.6707382202148438, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 272.8125, + "completions/mean_terminated_length": 272.8125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.16430540196597576, + "epoch": 2.4699999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3516767024993896, + "learning_rate": 1.6389538090085464e-07, + "loss": 0.0352, + "num_tokens": 10801659.0, + "reward": 6.88657283782959, + "reward_std": 2.46352481842041, + "rewards/fitness_reward/mean": 5.910763263702393, + "rewards/fitness_reward/std": 2.265733242034912, + "rewards/kidney_reward/mean": 0.5301785469055176, + "rewards/kidney_reward/std": 1.3465850353240967, + "rewards/length2tails_reward/mean": 0.7707039713859558, + "rewards/length2tails_reward/std": 0.2768467664718628, + "rewards/thermo_reward/mean": 1.0360894203186035, + "rewards/thermo_reward/std": 1.0366123914718628, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13207094091922045, + "epoch": 2.472, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5908097624778748, + "learning_rate": 1.627088038869283e-07, + "loss": 0.0025, + "num_tokens": 10810336.0, + "reward": 7.3425188064575195, + "reward_std": 1.04904043674469, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.43143215775489807, + "rewards/kidney_reward/std": 1.3173141479492188, + "rewards/length2tails_reward/mean": 0.8217315673828125, + "rewards/length2tails_reward/std": 0.17892324924468994, + "rewards/thermo_reward/mean": 0.6601424217224121, + "rewards/thermo_reward/std": 1.5319675207138062, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 268.875, + "completions/mean_terminated_length": 268.875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12334689777344465, + "epoch": 2.474, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3092116117477417, + "learning_rate": 1.6152615730327157e-07, + "loss": -0.0063, + "num_tokens": 10818972.0, + "reward": 7.132142066955566, + "reward_std": 1.2887277603149414, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.526396632194519, + "rewards/kidney_reward/std": 1.157102346420288, + "rewards/length2tails_reward/mean": 0.6743857264518738, + "rewards/length2tails_reward/std": 0.334469735622406, + "rewards/thermo_reward/mean": 0.42407551407814026, + "rewards/thermo_reward/std": 1.6632673740386963, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14229581132531166, + "epoch": 2.476, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8910076022148132, + "learning_rate": 1.603474467014897e-07, + "loss": -0.0024, + "num_tokens": 10827662.0, + "reward": 7.033144950866699, + "reward_std": 1.306127905845642, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.09438171982765198, + "rewards/kidney_reward/std": 1.437690019607544, + "rewards/length2tails_reward/mean": 0.7472254037857056, + "rewards/length2tails_reward/std": 0.29949691891670227, + "rewards/thermo_reward/mean": 0.621677577495575, + "rewards/thermo_reward/std": 1.7310734987258911, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 268.46875, + "completions/mean_terminated_length": 268.46875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "entropy": 0.14316700119525194, + "epoch": 2.4779999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.828761100769043, + "learning_rate": 1.5917267761471022e-07, + "loss": 0.001, + "num_tokens": 10836285.0, + "reward": 6.7781524658203125, + "reward_std": 2.383754014968872, + "rewards/fitness_reward/mean": 6.18259334564209, + "rewards/fitness_reward/std": 1.806985855102539, + "rewards/kidney_reward/mean": 0.2779150903224945, + "rewards/kidney_reward/std": 1.4220167398452759, + "rewards/length2tails_reward/mean": 0.7584176063537598, + "rewards/length2tails_reward/std": 0.32442399859428406, + "rewards/thermo_reward/mean": 0.5339927077293396, + "rewards/thermo_reward/std": 1.7273677587509155, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 279.9375, + "completions/mean_terminated_length": 279.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1631597252562642, + "epoch": 2.48, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.68607234954834, + "learning_rate": 1.5800185555756006e-07, + "loss": 0.1181, + "num_tokens": 10845275.0, + "reward": 6.613602638244629, + "reward_std": 3.190074920654297, + "rewards/fitness_reward/mean": 5.882923126220703, + "rewards/fitness_reward/std": 2.788001298904419, + "rewards/kidney_reward/mean": 0.3340502381324768, + "rewards/kidney_reward/std": 1.421899437904358, + "rewards/length2tails_reward/mean": 0.7944344878196716, + "rewards/length2tails_reward/std": 0.2668580710887909, + "rewards/thermo_reward/mean": 0.7300912737846375, + "rewards/thermo_reward/std": 1.5367861986160278, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.5625, + "completions/mean_terminated_length": 269.5625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1231886362656951, + "epoch": 2.482, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7178840637207031, + "learning_rate": 1.5683498602613689e-07, + "loss": -0.0003, + "num_tokens": 10853933.0, + "reward": 7.069680690765381, + "reward_std": 1.0834349393844604, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.4987313747406006, + "rewards/kidney_reward/std": 1.1319782733917236, + "rewards/length2tails_reward/mean": 0.7345043420791626, + "rewards/length2tails_reward/std": 0.32117703557014465, + "rewards/thermo_reward/mean": 0.2967594265937805, + "rewards/thermo_reward/std": 1.721886396408081, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 274.75, + "completions/mean_terminated_length": 274.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.136326402425766, + "epoch": 2.484, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.227076530456543, + "learning_rate": 1.5567207449798515e-07, + "loss": -0.0028, + "num_tokens": 10862757.0, + "reward": 6.868923664093018, + "reward_std": 1.7804642915725708, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": 0.5362035036087036, + "rewards/kidney_reward/std": 1.3113242387771606, + "rewards/length2tails_reward/mean": 0.7428072690963745, + "rewards/length2tails_reward/std": 0.34601032733917236, + "rewards/thermo_reward/mean": 0.26557812094688416, + "rewards/thermo_reward/std": 1.7056077718734741, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.375, + "completions/mean_terminated_length": 269.375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.15498424042016268, + "epoch": 2.4859999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8364877700805664, + "learning_rate": 1.5451312643206826e-07, + "loss": -0.0097, + "num_tokens": 10871409.0, + "reward": 6.704320430755615, + "reward_std": 2.2609105110168457, + "rewards/fitness_reward/mean": 6.082125186920166, + "rewards/fitness_reward/std": 1.8627053499221802, + "rewards/kidney_reward/mean": 0.3334283232688904, + "rewards/kidney_reward/std": 1.3245787620544434, + "rewards/length2tails_reward/mean": 0.7651385068893433, + "rewards/length2tails_reward/std": 0.30930861830711365, + "rewards/thermo_reward/mean": 0.5283927917480469, + "rewards/thermo_reward/std": 1.6263214349746704, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13635238073766232, + "epoch": 2.488, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3313847780227661, + "learning_rate": 1.5335814726874573e-07, + "loss": -0.0053, + "num_tokens": 10880123.0, + "reward": 6.245105743408203, + "reward_std": 2.899134397506714, + "rewards/fitness_reward/mean": 5.791745185852051, + "rewards/fitness_reward/std": 2.4210124015808105, + "rewards/kidney_reward/mean": -0.13411888480186462, + "rewards/kidney_reward/std": 1.2954943180084229, + "rewards/length2tails_reward/mean": 0.7182813882827759, + "rewards/length2tails_reward/std": 0.34886518120765686, + "rewards/thermo_reward/mean": 0.681698203086853, + "rewards/thermo_reward/std": 1.5872745513916016, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 268.46875, + "completions/mean_terminated_length": 268.46875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.14791954308748245, + "epoch": 2.49, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.83034610748291, + "learning_rate": 1.5220714242974507e-07, + "loss": -0.0214, + "num_tokens": 10888746.0, + "reward": 5.824936866760254, + "reward_std": 3.9612488746643066, + "rewards/fitness_reward/mean": 5.15399694442749, + "rewards/fitness_reward/std": 3.8683125972747803, + "rewards/kidney_reward/mean": 0.38159358501434326, + "rewards/kidney_reward/std": 1.2706904411315918, + "rewards/length2tails_reward/mean": 0.8645726442337036, + "rewards/length2tails_reward/std": 0.23124291002750397, + "rewards/thermo_reward/mean": 0.5279995203018188, + "rewards/thermo_reward/std": 1.6427781581878662, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1450122306123376, + "epoch": 2.492, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9722148776054382, + "learning_rate": 1.5106011731813827e-07, + "loss": -0.0009, + "num_tokens": 10897476.0, + "reward": 7.187895774841309, + "reward_std": 1.3326863050460815, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.5791944265365601, + "rewards/kidney_reward/std": 1.1656816005706787, + "rewards/length2tails_reward/mean": 0.8004372119903564, + "rewards/length2tails_reward/std": 0.27950695157051086, + "rewards/thermo_reward/mean": 0.419760137796402, + "rewards/thermo_reward/std": 1.796669602394104, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 273.1875, + "completions/mean_terminated_length": 273.1875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13320942502468824, + "epoch": 2.4939999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2254817485809326, + "learning_rate": 1.4991707731831492e-07, + "loss": 0.0814, + "num_tokens": 10906250.0, + "reward": 6.861900329589844, + "reward_std": 2.2667813301086426, + "rewards/fitness_reward/mean": 5.945378303527832, + "rewards/fitness_reward/std": 2.090650796890259, + "rewards/kidney_reward/mean": 0.7168327569961548, + "rewards/kidney_reward/std": 1.2535909414291382, + "rewards/length2tails_reward/mean": 0.6355811357498169, + "rewards/length2tails_reward/std": 0.3203444182872772, + "rewards/thermo_reward/mean": 0.7984210252761841, + "rewards/thermo_reward/std": 1.3946181535720825, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 275.96875, + "completions/mean_terminated_length": 275.96875, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.16348459385335445, + "epoch": 2.496, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5615265369415283, + "learning_rate": 1.487780277959576e-07, + "loss": 0.0915, + "num_tokens": 10915113.0, + "reward": 6.307394027709961, + "reward_std": 2.683405637741089, + "rewards/fitness_reward/mean": 6.008951187133789, + "rewards/fitness_reward/std": 2.242719888687134, + "rewards/kidney_reward/mean": 0.0701042115688324, + "rewards/kidney_reward/std": 1.3624838590621948, + "rewards/length2tails_reward/mean": 0.7319860458374023, + "rewards/length2tails_reward/std": 0.33009132742881775, + "rewards/thermo_reward/mean": 0.1607879400253296, + "rewards/thermo_reward/std": 1.9410432577133179, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14280677307397127, + "epoch": 2.498, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8168588280677795, + "learning_rate": 1.4764297409801763e-07, + "loss": -0.002, + "num_tokens": 10923800.0, + "reward": 7.078860282897949, + "reward_std": 1.0025743246078491, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.42494362592697144, + "rewards/kidney_reward/std": 1.0538948774337769, + "rewards/length2tails_reward/mean": 0.76559978723526, + "rewards/length2tails_reward/std": 0.3061850070953369, + "rewards/thermo_reward/mean": 0.37335893511772156, + "rewards/thermo_reward/std": 1.7233487367630005, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 288.3125, + "completions/mean_terminated_length": 288.3125, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.22077888809144497, + "epoch": 2.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.544058322906494, + "learning_rate": 1.4651192155268766e-07, + "loss": 0.1503, + "num_tokens": 10933058.0, + "reward": 6.144808769226074, + "reward_std": 3.7819676399230957, + "rewards/fitness_reward/mean": 5.242587566375732, + "rewards/fitness_reward/std": 3.6278676986694336, + "rewards/kidney_reward/mean": 0.2942799925804138, + "rewards/kidney_reward/std": 1.354179859161377, + "rewards/length2tails_reward/mean": 0.8237448930740356, + "rewards/length2tails_reward/std": 0.23254713416099548, + "rewards/thermo_reward/mean": 1.0982896089553833, + "rewards/thermo_reward/std": 1.337283968925476, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 274.65625, + "completions/mean_terminated_length": 274.65625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.16482182685285807, + "epoch": 2.502, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0216729640960693, + "learning_rate": 1.453848754693795e-07, + "loss": 0.0583, + "num_tokens": 10941879.0, + "reward": 6.771227836608887, + "reward_std": 2.306549549102783, + "rewards/fitness_reward/mean": 6.049060821533203, + "rewards/fitness_reward/std": 2.03277850151062, + "rewards/kidney_reward/mean": 0.384724885225296, + "rewards/kidney_reward/std": 1.3810142278671265, + "rewards/length2tails_reward/mean": 0.8267073631286621, + "rewards/length2tails_reward/std": 0.24235688149929047, + "rewards/thermo_reward/mean": 0.646255373954773, + "rewards/thermo_reward/std": 1.6654709577560425, + "step": 1251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 275.625, + "completions/mean_terminated_length": 275.625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.17248440254479647, + "epoch": 2.504, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.394607067108154, + "learning_rate": 1.4426184113869667e-07, + "loss": 0.1146, + "num_tokens": 10950731.0, + "reward": 6.672453880310059, + "reward_std": 2.0241599082946777, + "rewards/fitness_reward/mean": 6.236730575561523, + "rewards/fitness_reward/std": 2.0057373046875, + "rewards/kidney_reward/mean": 0.08068785071372986, + "rewards/kidney_reward/std": 1.3135042190551758, + "rewards/length2tails_reward/mean": 0.7176831960678101, + "rewards/length2tails_reward/std": 0.3168506920337677, + "rewards/thermo_reward/mean": 0.4319174885749817, + "rewards/thermo_reward/std": 1.648193359375, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.1875, + "completions/mean_terminated_length": 269.1875, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 0.13867691345512867, + "epoch": 2.5060000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.050812244415283, + "learning_rate": 1.4314282383241095e-07, + "loss": -0.0108, + "num_tokens": 10959377.0, + "reward": 6.458885192871094, + "reward_std": 3.445922613143921, + "rewards/fitness_reward/mean": 5.707927703857422, + "rewards/fitness_reward/std": 2.748197555541992, + "rewards/kidney_reward/mean": 0.5447063446044922, + "rewards/kidney_reward/std": 1.4535712003707886, + "rewards/length2tails_reward/mean": 0.7592275142669678, + "rewards/length2tails_reward/std": 0.28678154945373535, + "rewards/thermo_reward/mean": 0.5775938630104065, + "rewards/thermo_reward/std": 1.5720056295394897, + "step": 1253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1333228461444378, + "epoch": 2.508, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7781708836555481, + "learning_rate": 1.420278288034371e-07, + "loss": -0.0053, + "num_tokens": 10968057.0, + "reward": 7.300905704498291, + "reward_std": 1.5097392797470093, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.743131697177887, + "rewards/kidney_reward/std": 1.2453510761260986, + "rewards/length2tails_reward/mean": 0.772466778755188, + "rewards/length2tails_reward/std": 0.3110933005809784, + "rewards/thermo_reward/mean": 0.701805591583252, + "rewards/thermo_reward/std": 1.467189908027649, + "step": 1254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 265.65625, + "completions/mean_terminated_length": 265.65625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.15651994105428457, + "epoch": 2.51, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3056888580322266, + "learning_rate": 1.4091686128580926e-07, + "loss": -0.1036, + "num_tokens": 10976590.0, + "reward": 6.690296173095703, + "reward_std": 2.116696834564209, + "rewards/fitness_reward/mean": 6.23317813873291, + "rewards/fitness_reward/std": 2.0258333683013916, + "rewards/kidney_reward/mean": 0.30987870693206787, + "rewards/kidney_reward/std": 1.18710458278656, + "rewards/length2tails_reward/mean": 0.7709380388259888, + "rewards/length2tails_reward/std": 0.2829013466835022, + "rewards/thermo_reward/mean": 0.21888795495033264, + "rewards/thermo_reward/std": 1.8321727514266968, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.78125, + "completions/mean_terminated_length": 269.78125, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.13938054535537958, + "epoch": 2.512, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1305482387542725, + "learning_rate": 1.3980992649465472e-07, + "loss": 0.0059, + "num_tokens": 10985255.0, + "reward": 6.853122711181641, + "reward_std": 2.085076093673706, + "rewards/fitness_reward/mean": 6.186428070068359, + "rewards/fitness_reward/std": 1.7864642143249512, + "rewards/kidney_reward/mean": 0.45179492235183716, + "rewards/kidney_reward/std": 1.4332520961761475, + "rewards/length2tails_reward/mean": 0.7064170837402344, + "rewards/length2tails_reward/std": 0.3224819302558899, + "rewards/thermo_reward/mean": 0.5283844470977783, + "rewards/thermo_reward/std": 1.6610950231552124, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 271.40625, + "completions/mean_terminated_length": 271.40625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14205337781459093, + "epoch": 2.5140000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5105639100074768, + "learning_rate": 1.3870702962617008e-07, + "loss": 0.0024, + "num_tokens": 10993972.0, + "reward": 7.00224494934082, + "reward_std": 1.2950648069381714, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.0970381647348404, + "rewards/kidney_reward/std": 1.472644329071045, + "rewards/length2tails_reward/mean": 0.7643052935600281, + "rewards/length2tails_reward/std": 0.2566501796245575, + "rewards/thermo_reward/mean": 0.9487347602844238, + "rewards/thermo_reward/std": 1.5298305749893188, + "step": 1257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 274.21875, + "completions/mean_terminated_length": 274.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.16099613811820745, + "epoch": 2.516, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.41198992729187, + "learning_rate": 1.376081758575981e-07, + "loss": 0.0403, + "num_tokens": 11002779.0, + "reward": 6.760746002197266, + "reward_std": 2.6566321849823, + "rewards/fitness_reward/mean": 6.113055229187012, + "rewards/fitness_reward/std": 2.183053731918335, + "rewards/kidney_reward/mean": 0.6422954797744751, + "rewards/kidney_reward/std": 1.3250433206558228, + "rewards/length2tails_reward/mean": 0.8097543716430664, + "rewards/length2tails_reward/std": 0.2507774233818054, + "rewards/thermo_reward/mean": 0.2482084333896637, + "rewards/thermo_reward/std": 1.6853443384170532, + "step": 1258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 279.0, + "completions/mean_terminated_length": 279.0, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.15969465114176273, + "epoch": 2.518, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5062834024429321, + "learning_rate": 1.3651337034720113e-07, + "loss": -0.02, + "num_tokens": 11011739.0, + "reward": 7.165556907653809, + "reward_std": 1.2174426317214966, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.30569979548454285, + "rewards/kidney_reward/std": 1.3607003688812256, + "rewards/length2tails_reward/mean": 0.8306646347045898, + "rewards/length2tails_reward/std": 0.24817925691604614, + "rewards/thermo_reward/mean": 0.42748451232910156, + "rewards/thermo_reward/std": 1.7862064838409424, + "step": 1259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13787450827658176, + "epoch": 2.52, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2863463163375854, + "learning_rate": 1.3542261823423907e-07, + "loss": 0.0008, + "num_tokens": 11020447.0, + "reward": 7.186847686767578, + "reward_std": 2.235790967941284, + "rewards/fitness_reward/mean": 6.1153035163879395, + "rewards/fitness_reward/std": 2.1707992553710938, + "rewards/kidney_reward/mean": 0.32901477813720703, + "rewards/kidney_reward/std": 1.4136900901794434, + "rewards/length2tails_reward/mean": 0.8276294469833374, + "rewards/length2tails_reward/std": 0.24257804453372955, + "rewards/thermo_reward/mean": 1.4002583026885986, + "rewards/thermo_reward/std": 0.9610304832458496, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 265.15625, + "completions/mean_terminated_length": 265.15625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "entropy": 0.14817991759628057, + "epoch": 2.5220000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.353713870048523, + "learning_rate": 1.3433592463894372e-07, + "loss": -0.0452, + "num_tokens": 11028964.0, + "reward": 5.634735107421875, + "reward_std": 4.194916725158691, + "rewards/fitness_reward/mean": 5.238389015197754, + "rewards/fitness_reward/std": 3.657475709915161, + "rewards/kidney_reward/mean": 0.30517953634262085, + "rewards/kidney_reward/std": 1.2891331911087036, + "rewards/length2tails_reward/mean": 0.7029399871826172, + "rewards/length2tails_reward/std": 0.3876851797103882, + "rewards/thermo_reward/mean": 0.13604281842708588, + "rewards/thermo_reward/std": 1.7282103300094604, + "step": 1261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 281.28125, + "completions/mean_terminated_length": 281.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.16873465664684772, + "epoch": 2.524, + "frac_reward_zero_std": 0.0, + "grad_norm": 11.664103507995605, + "learning_rate": 1.3325329466249535e-07, + "loss": 0.1986, + "num_tokens": 11037997.0, + "reward": 7.174631118774414, + "reward_std": 2.6238505840301514, + "rewards/fitness_reward/mean": 6.2383575439453125, + "rewards/fitness_reward/std": 1.9965349435806274, + "rewards/kidney_reward/mean": 0.5765241980552673, + "rewards/kidney_reward/std": 1.1775901317596436, + "rewards/length2tails_reward/mean": 0.7843703031539917, + "rewards/length2tails_reward/std": 0.25142306089401245, + "rewards/thermo_reward/mean": 0.9038380980491638, + "rewards/thermo_reward/std": 1.4799734354019165, + "step": 1262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 275.15625, + "completions/mean_terminated_length": 275.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1589777208864689, + "epoch": 2.526, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.106027126312256, + "learning_rate": 1.321747333869986e-07, + "loss": 0.0526, + "num_tokens": 11046834.0, + "reward": 6.602090358734131, + "reward_std": 2.962291955947876, + "rewards/fitness_reward/mean": 5.915340423583984, + "rewards/fitness_reward/std": 2.674147605895996, + "rewards/kidney_reward/mean": 0.5103333592414856, + "rewards/kidney_reward/std": 1.3865878582000732, + "rewards/length2tails_reward/mean": 0.8465596437454224, + "rewards/length2tails_reward/std": 0.2365303486585617, + "rewards/thermo_reward/mean": 0.43988561630249023, + "rewards/thermo_reward/std": 1.9005239009857178, + "step": 1263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13879275601357222, + "epoch": 2.528, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6873528957366943, + "learning_rate": 1.311002458754593e-07, + "loss": -0.0022, + "num_tokens": 11055498.0, + "reward": 6.7083659172058105, + "reward_std": 1.7778831720352173, + "rewards/fitness_reward/mean": 6.179342269897461, + "rewards/fitness_reward/std": 1.1073734760284424, + "rewards/kidney_reward/mean": 0.05778620019555092, + "rewards/kidney_reward/std": 1.268420934677124, + "rewards/length2tails_reward/mean": 0.7357010841369629, + "rewards/length2tails_reward/std": 0.322184294462204, + "rewards/thermo_reward/mean": 0.6324108839035034, + "rewards/thermo_reward/std": 1.6303297281265259, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 265.71875, + "completions/mean_terminated_length": 265.71875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.12634800001978874, + "epoch": 2.5300000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3306564092636108, + "learning_rate": 1.3002983717175954e-07, + "loss": -0.0832, + "num_tokens": 11064033.0, + "reward": 6.963728904724121, + "reward_std": 2.3005309104919434, + "rewards/fitness_reward/mean": 6.23915433883667, + "rewards/fitness_reward/std": 1.9920251369476318, + "rewards/kidney_reward/mean": 0.346341073513031, + "rewards/kidney_reward/std": 1.3824703693389893, + "rewards/length2tails_reward/mean": 0.7443707585334778, + "rewards/length2tails_reward/std": 0.31186145544052124, + "rewards/thermo_reward/mean": 0.730621337890625, + "rewards/thermo_reward/std": 1.622398018836975, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.13730474468320608, + "epoch": 2.532, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.463104248046875, + "learning_rate": 1.2896351230063496e-07, + "loss": 0.0032, + "num_tokens": 11072742.0, + "reward": 6.968493461608887, + "reward_std": 1.557864785194397, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.33006221055984497, + "rewards/kidney_reward/std": 1.5677642822265625, + "rewards/length2tails_reward/mean": 0.846727728843689, + "rewards/length2tails_reward/std": 0.16502907872200012, + "rewards/thermo_reward/mean": 0.20694246888160706, + "rewards/thermo_reward/std": 1.8171974420547485, + "step": 1266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.46875, + "completions/mean_terminated_length": 269.46875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "entropy": 0.14338538609445095, + "epoch": 2.534, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8924700021743774, + "learning_rate": 1.2790127626765047e-07, + "loss": -0.0245, + "num_tokens": 11081397.0, + "reward": 6.5691938400268555, + "reward_std": 2.644094705581665, + "rewards/fitness_reward/mean": 5.845990180969238, + "rewards/fitness_reward/std": 2.5938191413879395, + "rewards/kidney_reward/mean": 0.36223363876342773, + "rewards/kidney_reward/std": 1.5675878524780273, + "rewards/length2tails_reward/mean": 0.7985920906066895, + "rewards/length2tails_reward/std": 0.2833617925643921, + "rewards/thermo_reward/mean": 0.6848776340484619, + "rewards/thermo_reward/std": 1.7325043678283691, + "step": 1267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 282.5625, + "completions/mean_terminated_length": 282.5625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1525559900328517, + "epoch": 2.536, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.082820177078247, + "learning_rate": 1.26843134059177e-07, + "loss": 0.1373, + "num_tokens": 11090471.0, + "reward": 6.443244457244873, + "reward_std": 3.021930694580078, + "rewards/fitness_reward/mean": 5.767224311828613, + "rewards/fitness_reward/std": 2.8718364238739014, + "rewards/kidney_reward/mean": 0.42932993173599243, + "rewards/kidney_reward/std": 1.168616771697998, + "rewards/length2tails_reward/mean": 0.8396263122558594, + "rewards/length2tails_reward/std": 0.22616492211818695, + "rewards/thermo_reward/mean": 0.5028963088989258, + "rewards/thermo_reward/std": 1.8243727684020996, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13960972987115383, + "epoch": 2.5380000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6062420010566711, + "learning_rate": 1.2578909064236887e-07, + "loss": 0.0056, + "num_tokens": 11099174.0, + "reward": 6.696412086486816, + "reward_std": 1.409006118774414, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.19778165221214294, + "rewards/kidney_reward/std": 1.2680705785751343, + "rewards/length2tails_reward/mean": 0.779447078704834, + "rewards/length2tails_reward/std": 0.2503746747970581, + "rewards/thermo_reward/mean": -0.17129836976528168, + "rewards/thermo_reward/std": 2.166795253753662, + "step": 1269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.40625, + "completions/mean_terminated_length": 271.40625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.16253331303596497, + "epoch": 2.54, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4970827102661133, + "learning_rate": 1.2473915096513888e-07, + "loss": 0.0012, + "num_tokens": 11107891.0, + "reward": 6.726602554321289, + "reward_std": 2.4670677185058594, + "rewards/fitness_reward/mean": 6.028541564941406, + "rewards/fitness_reward/std": 2.1397509574890137, + "rewards/kidney_reward/mean": 0.5297418236732483, + "rewards/kidney_reward/std": 1.5598334074020386, + "rewards/length2tails_reward/mean": 0.8497204184532166, + "rewards/length2tails_reward/std": 0.20643417537212372, + "rewards/thermo_reward/mean": 0.4415205717086792, + "rewards/thermo_reward/std": 1.6006488800048828, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 277.40625, + "completions/mean_terminated_length": 277.40625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.16540326736867428, + "epoch": 2.542, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2988548278808594, + "learning_rate": 1.2369331995613663e-07, + "loss": 0.0277, + "num_tokens": 11116800.0, + "reward": 6.583467483520508, + "reward_std": 2.3983428478240967, + "rewards/fitness_reward/mean": 6.209179401397705, + "rewards/fitness_reward/std": 2.1615891456604004, + "rewards/kidney_reward/mean": -0.1511869728565216, + "rewards/kidney_reward/std": 1.3648273944854736, + "rewards/length2tails_reward/mean": 0.8468163013458252, + "rewards/length2tails_reward/std": 0.1635502427816391, + "rewards/thermo_reward/mean": 0.4763551950454712, + "rewards/thermo_reward/std": 1.7167506217956543, + "step": 1271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 264.03125, + "completions/mean_terminated_length": 264.03125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.1943329405039549, + "epoch": 2.544, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8937320709228516, + "learning_rate": 1.2265160252472396e-07, + "loss": -0.0644, + "num_tokens": 11125281.0, + "reward": 5.662491321563721, + "reward_std": 4.140250205993652, + "rewards/fitness_reward/mean": 5.092027187347412, + "rewards/fitness_reward/std": 3.764228105545044, + "rewards/kidney_reward/mean": 0.6592483520507812, + "rewards/kidney_reward/std": 1.1888881921768188, + "rewards/length2tails_reward/mean": 0.7980906367301941, + "rewards/length2tails_reward/std": 0.317594438791275, + "rewards/thermo_reward/mean": 0.08263438940048218, + "rewards/thermo_reward/std": 2.132211446762085, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 266.03125, + "completions/mean_terminated_length": 266.03125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.15273398533463478, + "epoch": 2.5460000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4597315788269043, + "learning_rate": 1.2161400356095374e-07, + "loss": -0.0524, + "num_tokens": 11133826.0, + "reward": 6.416354179382324, + "reward_std": 2.8398849964141846, + "rewards/fitness_reward/mean": 5.8878021240234375, + "rewards/fitness_reward/std": 2.772123336791992, + "rewards/kidney_reward/mean": 0.4200093746185303, + "rewards/kidney_reward/std": 1.4113739728927612, + "rewards/length2tails_reward/mean": 0.7279560565948486, + "rewards/length2tails_reward/std": 0.30436235666275024, + "rewards/thermo_reward/mean": 0.2731165289878845, + "rewards/thermo_reward/std": 1.896885633468628, + "step": 1273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 275.375, + "completions/mean_terminated_length": 275.375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.17170690465718508, + "epoch": 2.548, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.124152421951294, + "learning_rate": 1.2058052793554473e-07, + "loss": -0.0918, + "num_tokens": 11142670.0, + "reward": 6.151288986206055, + "reward_std": 2.769615650177002, + "rewards/fitness_reward/mean": 5.644631385803223, + "rewards/fitness_reward/std": 2.6252248287200928, + "rewards/kidney_reward/mean": -0.09959825128316879, + "rewards/kidney_reward/std": 1.1967122554779053, + "rewards/length2tails_reward/mean": 0.7944548726081848, + "rewards/length2tails_reward/std": 0.3183787763118744, + "rewards/thermo_reward/mean": 0.7156860828399658, + "rewards/thermo_reward/std": 1.4565240144729614, + "step": 1274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 278.4375, + "completions/mean_terminated_length": 278.4375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.17168503254652023, + "epoch": 2.55, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9615895748138428, + "learning_rate": 1.195511804998609e-07, + "loss": 0.0982, + "num_tokens": 11151612.0, + "reward": 6.976527214050293, + "reward_std": 2.3976783752441406, + "rewards/fitness_reward/mean": 6.235381126403809, + "rewards/fitness_reward/std": 2.013369560241699, + "rewards/kidney_reward/mean": 0.4650770127773285, + "rewards/kidney_reward/std": 1.3732489347457886, + "rewards/length2tails_reward/mean": 0.858077883720398, + "rewards/length2tails_reward/std": 0.2123798131942749, + "rewards/thermo_reward/mean": 0.5881752967834473, + "rewards/thermo_reward/std": 1.781854510307312, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 281.40625, + "completions/mean_terminated_length": 281.40625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.15420350432395935, + "epoch": 2.552, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8219046592712402, + "learning_rate": 1.1852596608588671e-07, + "loss": 0.0991, + "num_tokens": 11160649.0, + "reward": 6.860178470611572, + "reward_std": 2.296172618865967, + "rewards/fitness_reward/mean": 6.230461120605469, + "rewards/fitness_reward/std": 2.0412020683288574, + "rewards/kidney_reward/mean": 0.4495287239551544, + "rewards/kidney_reward/std": 1.3362219333648682, + "rewards/length2tails_reward/mean": 0.791000247001648, + "rewards/length2tails_reward/std": 0.22419226169586182, + "rewards/thermo_reward/mean": 0.4144059419631958, + "rewards/thermo_reward/std": 1.5571057796478271, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.16235809493809938, + "epoch": 2.5540000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0311994552612305, + "learning_rate": 1.175048895062054e-07, + "loss": 0.0159, + "num_tokens": 11169391.0, + "reward": 6.27305269241333, + "reward_std": 2.25542950630188, + "rewards/fitness_reward/mean": 6.127653121948242, + "rewards/fitness_reward/std": 2.1035826206207275, + "rewards/kidney_reward/mean": -0.028336012735962868, + "rewards/kidney_reward/std": 1.3013705015182495, + "rewards/length2tails_reward/mean": 0.7887170314788818, + "rewards/length2tails_reward/std": 0.29202377796173096, + "rewards/thermo_reward/mean": -0.0752229169011116, + "rewards/thermo_reward/std": 2.071074962615967, + "step": 1277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 265.5625, + "completions/mean_terminated_length": 265.5625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.14668810553848743, + "epoch": 2.556, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2922263145446777, + "learning_rate": 1.1648795555397716e-07, + "loss": -0.0407, + "num_tokens": 11177921.0, + "reward": 6.2823991775512695, + "reward_std": 2.8683526515960693, + "rewards/fitness_reward/mean": 5.939599990844727, + "rewards/fitness_reward/std": 2.5724868774414062, + "rewards/kidney_reward/mean": 0.17976199090480804, + "rewards/kidney_reward/std": 1.3973599672317505, + "rewards/length2tails_reward/mean": 0.8208662867546082, + "rewards/length2tails_reward/std": 0.24103957414627075, + "rewards/thermo_reward/mean": 0.09540346264839172, + "rewards/thermo_reward/std": 1.6673074960708618, + "step": 1278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 262.90625, + "completions/mean_terminated_length": 262.90625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.147798758931458, + "epoch": 2.558, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5885316133499146, + "learning_rate": 1.1547516900291443e-07, + "loss": -0.0999, + "num_tokens": 11186366.0, + "reward": 6.605360984802246, + "reward_std": 2.0892367362976074, + "rewards/fitness_reward/mean": 5.962489128112793, + "rewards/fitness_reward/std": 2.0055184364318848, + "rewards/kidney_reward/mean": 0.3093305230140686, + "rewards/kidney_reward/std": 1.501753330230713, + "rewards/length2tails_reward/mean": 0.705924391746521, + "rewards/length2tails_reward/std": 0.3187275230884552, + "rewards/thermo_reward/mean": 0.6234513521194458, + "rewards/thermo_reward/std": 1.423208236694336, + "step": 1279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 268.875, + "completions/mean_terminated_length": 268.875, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "entropy": 0.13173521868884563, + "epoch": 2.56, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3490428626537323, + "learning_rate": 1.144665346072623e-07, + "loss": -0.0031, + "num_tokens": 11195002.0, + "reward": 6.792850494384766, + "reward_std": 1.777622103691101, + "rewards/fitness_reward/mean": 6.187403678894043, + "rewards/fitness_reward/std": 1.7812501192092896, + "rewards/kidney_reward/mean": 0.002899991348385811, + "rewards/kidney_reward/std": 1.2121065855026245, + "rewards/length2tails_reward/mean": 0.7328965067863464, + "rewards/length2tails_reward/std": 0.26981785893440247, + "rewards/thermo_reward/mean": 0.8415464162826538, + "rewards/thermo_reward/std": 1.3412245512008667, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.65625, + "completions/mean_terminated_length": 269.65625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.12782933376729488, + "epoch": 2.5620000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7457025647163391, + "learning_rate": 1.1346205710177303e-07, + "loss": -0.0023, + "num_tokens": 11203663.0, + "reward": 6.979855060577393, + "reward_std": 1.1470857858657837, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.220194011926651, + "rewards/kidney_reward/std": 1.360392689704895, + "rewards/length2tails_reward/mean": 0.6909193992614746, + "rewards/length2tails_reward/std": 0.3535473644733429, + "rewards/thermo_reward/mean": 0.21146000921726227, + "rewards/thermo_reward/std": 1.7288585901260376, + "step": 1281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.6875, + "completions/mean_terminated_length": 269.6875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.14081147126853466, + "epoch": 2.564, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6050026416778564, + "learning_rate": 1.1246174120168727e-07, + "loss": -0.0065, + "num_tokens": 11212325.0, + "reward": 6.674251556396484, + "reward_std": 2.5174691677093506, + "rewards/fitness_reward/mean": 6.024764537811279, + "rewards/fitness_reward/std": 2.159541606903076, + "rewards/kidney_reward/mean": 0.6854661107063293, + "rewards/kidney_reward/std": 1.0346031188964844, + "rewards/length2tails_reward/mean": 0.7680181264877319, + "rewards/length2tails_reward/std": 0.3201650381088257, + "rewards/thermo_reward/mean": 0.229498490691185, + "rewards/thermo_reward/std": 1.8448776006698608, + "step": 1282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.59375, + "completions/mean_terminated_length": 270.59375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14766631368547678, + "epoch": 2.566, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6355104446411133, + "learning_rate": 1.1146559160270874e-07, + "loss": 0.0041, + "num_tokens": 11221016.0, + "reward": 6.580097198486328, + "reward_std": 2.527644395828247, + "rewards/fitness_reward/mean": 6.212012767791748, + "rewards/fitness_reward/std": 2.1455624103546143, + "rewards/kidney_reward/mean": 0.4586600065231323, + "rewards/kidney_reward/std": 1.2884141206741333, + "rewards/length2tails_reward/mean": 0.7166929244995117, + "rewards/length2tails_reward/std": 0.33269643783569336, + "rewards/thermo_reward/mean": -0.08083853870630264, + "rewards/thermo_reward/std": 1.8933789730072021, + "step": 1283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13667545095086098, + "epoch": 2.568, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3018914461135864, + "learning_rate": 1.1047361298098468e-07, + "loss": -0.0006, + "num_tokens": 11229710.0, + "reward": 6.9851789474487305, + "reward_std": 2.4701969623565674, + "rewards/fitness_reward/mean": 6.2334794998168945, + "rewards/fitness_reward/std": 2.0241284370422363, + "rewards/kidney_reward/mean": 0.43473029136657715, + "rewards/kidney_reward/std": 1.492920994758606, + "rewards/length2tails_reward/mean": 0.7245880961418152, + "rewards/length2tails_reward/std": 0.2910359501838684, + "rewards/thermo_reward/mean": 0.7063749432563782, + "rewards/thermo_reward/std": 1.4896142482757568, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 268.25, + "completions/mean_terminated_length": 268.25, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.12583217304199934, + "epoch": 2.57, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3929624557495117, + "learning_rate": 1.0948580999308233e-07, + "loss": -0.0029, + "num_tokens": 11238326.0, + "reward": 6.701354026794434, + "reward_std": 1.4116487503051758, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.25893306732177734, + "rewards/kidney_reward/std": 1.3018968105316162, + "rewards/length2tails_reward/mean": 0.6769523620605469, + "rewards/length2tails_reward/std": 0.3186993896961212, + "rewards/thermo_reward/mean": -0.17131927609443665, + "rewards/thermo_reward/std": 1.9665024280548096, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 285.46875, + "completions/mean_terminated_length": 285.46875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.17031805217266083, + "epoch": 2.572, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.774345874786377, + "learning_rate": 1.0850218727596716e-07, + "loss": 0.2012, + "num_tokens": 11247493.0, + "reward": 6.535714626312256, + "reward_std": 2.429759979248047, + "rewards/fitness_reward/mean": 6.118473052978516, + "rewards/fitness_reward/std": 2.153531551361084, + "rewards/kidney_reward/mean": 0.4238532781600952, + "rewards/kidney_reward/std": 1.2699496746063232, + "rewards/length2tails_reward/mean": 0.663884162902832, + "rewards/length2tails_reward/std": 0.37357431650161743, + "rewards/thermo_reward/mean": 0.07868780195713043, + "rewards/thermo_reward/std": 1.8400204181671143, + "step": 1286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 274.375, + "completions/mean_terminated_length": 274.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13812145683914423, + "epoch": 2.574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7433780431747437, + "learning_rate": 1.0752274944698259e-07, + "loss": 0.025, + "num_tokens": 11256305.0, + "reward": 7.3515167236328125, + "reward_std": 1.2924505472183228, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.5429353713989258, + "rewards/kidney_reward/std": 1.3698012828826904, + "rewards/length2tails_reward/mean": 0.7748067378997803, + "rewards/length2tails_reward/std": 0.2981312870979309, + "rewards/thermo_reward/mean": 0.5900977849960327, + "rewards/thermo_reward/std": 1.8615976572036743, + "step": 1287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 268.96875, + "completions/mean_terminated_length": 268.96875, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "entropy": 0.1437450535595417, + "epoch": 2.576, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.083304762840271, + "learning_rate": 1.0654750110382627e-07, + "loss": 0.007, + "num_tokens": 11264944.0, + "reward": 6.704545021057129, + "reward_std": 2.5547947883605957, + "rewards/fitness_reward/mean": 6.256975173950195, + "rewards/fitness_reward/std": 1.891215443611145, + "rewards/kidney_reward/mean": 0.17183798551559448, + "rewards/kidney_reward/std": 1.267488718032837, + "rewards/length2tails_reward/mean": 0.7747859954833984, + "rewards/length2tails_reward/std": 0.3014908730983734, + "rewards/thermo_reward/mean": 0.3359082341194153, + "rewards/thermo_reward/std": 1.7451852560043335, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 269.34375, + "completions/mean_terminated_length": 269.34375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 0.13253707345575094, + "epoch": 2.578, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0129252672195435, + "learning_rate": 1.0557644682453037e-07, + "loss": -0.0236, + "num_tokens": 11273595.0, + "reward": 6.433099746704102, + "reward_std": 3.1920018196105957, + "rewards/fitness_reward/mean": 5.624979019165039, + "rewards/fitness_reward/std": 3.0691933631896973, + "rewards/kidney_reward/mean": 0.42991939187049866, + "rewards/kidney_reward/std": 1.3700929880142212, + "rewards/length2tails_reward/mean": 0.7174723148345947, + "rewards/length2tails_reward/std": 0.3021564185619354, + "rewards/thermo_reward/mean": 0.8275867700576782, + "rewards/thermo_reward/std": 1.3774175643920898, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.6875, + "completions/mean_terminated_length": 269.6875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.14543141331523657, + "epoch": 2.58, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.49910205602645874, + "learning_rate": 1.0460959116743807e-07, + "loss": 0.0032, + "num_tokens": 11282257.0, + "reward": 6.711872100830078, + "reward_std": 2.3827362060546875, + "rewards/fitness_reward/mean": 6.313725471496582, + "rewards/fitness_reward/std": 1.570186972618103, + "rewards/kidney_reward/mean": 0.1728067696094513, + "rewards/kidney_reward/std": 1.3351936340332031, + "rewards/length2tails_reward/mean": 0.8178122639656067, + "rewards/length2tails_reward/std": 0.25317269563674927, + "rewards/thermo_reward/mean": 0.21458041667938232, + "rewards/thermo_reward/std": 1.954363226890564, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 270.3125, + "completions/mean_terminated_length": 270.3125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.15918637812137604, + "epoch": 2.582, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4545961618423462, + "learning_rate": 1.0364693867118424e-07, + "loss": 0.0035, + "num_tokens": 11290939.0, + "reward": 7.231572151184082, + "reward_std": 1.077547311782837, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.42022648453712463, + "rewards/kidney_reward/std": 1.2498514652252197, + "rewards/length2tails_reward/mean": 0.8012136220932007, + "rewards/length2tails_reward/std": 0.26833879947662354, + "rewards/thermo_reward/mean": 0.6656925678253174, + "rewards/thermo_reward/std": 1.5421216487884521, + "step": 1291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 269.375, + "completions/mean_terminated_length": 269.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1271618753671646, + "epoch": 2.584, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6567250490188599, + "learning_rate": 1.026884938546726e-07, + "loss": 0.0026, + "num_tokens": 11299591.0, + "reward": 7.086432456970215, + "reward_std": 1.041763424873352, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.39575546979904175, + "rewards/kidney_reward/std": 1.30070161819458, + "rewards/length2tails_reward/mean": 0.6470104455947876, + "rewards/length2tails_reward/std": 0.30362820625305176, + "rewards/thermo_reward/mean": 0.2710069715976715, + "rewards/thermo_reward/std": 1.816556692123413, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.15106998197734356, + "epoch": 2.586, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7921992540359497, + "learning_rate": 1.0173426121705575e-07, + "loss": -0.0483, + "num_tokens": 11308275.0, + "reward": 6.769179821014404, + "reward_std": 2.708894968032837, + "rewards/fitness_reward/mean": 6.008904457092285, + "rewards/fitness_reward/std": 2.2429678440093994, + "rewards/kidney_reward/mean": 0.15658941864967346, + "rewards/kidney_reward/std": 1.310433268547058, + "rewards/length2tails_reward/mean": 0.817096471786499, + "rewards/length2tails_reward/std": 0.2115204632282257, + "rewards/thermo_reward/mean": 0.9554123878479004, + "rewards/thermo_reward/std": 1.524361491203308, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 268.71875, + "completions/mean_terminated_length": 268.71875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "entropy": 0.15611295960843563, + "epoch": 2.588, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4831995964050293, + "learning_rate": 1.0078424523771268e-07, + "loss": -0.038, + "num_tokens": 11316906.0, + "reward": 6.805905342102051, + "reward_std": 2.766227960586548, + "rewards/fitness_reward/mean": 6.002234935760498, + "rewards/fitness_reward/std": 2.278193950653076, + "rewards/kidney_reward/mean": 0.3718709945678711, + "rewards/kidney_reward/std": 1.2618319988250732, + "rewards/length2tails_reward/mean": 0.8136824369430542, + "rewards/length2tails_reward/std": 0.20324693620204926, + "rewards/thermo_reward/mean": 0.8286288976669312, + "rewards/thermo_reward/std": 1.5846936702728271, + "step": 1294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 263.34375, + "completions/mean_terminated_length": 263.34375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.18121715355664492, + "epoch": 2.59, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.285388469696045, + "learning_rate": 9.983845037622873e-08, + "loss": -0.0794, + "num_tokens": 11325365.0, + "reward": 6.3965959548950195, + "reward_std": 3.248755931854248, + "rewards/fitness_reward/mean": 5.596299648284912, + "rewards/fitness_reward/std": 3.149874687194824, + "rewards/kidney_reward/mean": 0.5928501486778259, + "rewards/kidney_reward/std": 1.2771762609481812, + "rewards/length2tails_reward/mean": 0.784777045249939, + "rewards/length2tails_reward/std": 0.2656571567058563, + "rewards/thermo_reward/mean": 0.6153541803359985, + "rewards/thermo_reward/std": 1.7248305082321167, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.141491892747581, + "epoch": 2.592, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8633207082748413, + "learning_rate": 9.889688107237459e-08, + "loss": 0.003, + "num_tokens": 11334076.0, + "reward": 6.9637651443481445, + "reward_std": 1.1524386405944824, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.07243490219116211, + "rewards/kidney_reward/std": 1.4863405227661133, + "rewards/length2tails_reward/mean": 0.8195652365684509, + "rewards/length2tails_reward/std": 0.22382794320583344, + "rewards/thermo_reward/mean": 0.613563597202301, + "rewards/thermo_reward/std": 1.4932918548583984, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 278.65625, + "completions/mean_terminated_length": 278.65625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15627904795110226, + "epoch": 2.594, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.522937059402466, + "learning_rate": 9.79595417460849e-08, + "loss": 0.0927, + "num_tokens": 11343025.0, + "reward": 6.162182331085205, + "reward_std": 3.7544608116149902, + "rewards/fitness_reward/mean": 5.443257808685303, + "rewards/fitness_reward/std": 3.6266250610351562, + "rewards/kidney_reward/mean": 0.36185890436172485, + "rewards/kidney_reward/std": 1.2872637510299683, + "rewards/length2tails_reward/mean": 0.7766433954238892, + "rewards/length2tails_reward/std": 0.27492356300354004, + "rewards/thermo_reward/mean": 0.6876682043075562, + "rewards/thermo_reward/std": 1.5193746089935303, + "step": 1297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 267.3125, + "completions/mean_terminated_length": 267.3125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "entropy": 0.1454916587099433, + "epoch": 2.596, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4790854454040527, + "learning_rate": 9.702643679743815e-08, + "loss": -0.0258, + "num_tokens": 11351611.0, + "reward": 6.653799057006836, + "reward_std": 2.12656831741333, + "rewards/fitness_reward/mean": 6.26751708984375, + "rewards/fitness_reward/std": 1.831581473350525, + "rewards/kidney_reward/mean": 0.016674287617206573, + "rewards/kidney_reward/std": 1.304878830909729, + "rewards/length2tails_reward/mean": 0.7734169960021973, + "rewards/length2tails_reward/std": 0.2881260812282562, + "rewards/thermo_reward/mean": 0.3691806197166443, + "rewards/thermo_reward/std": 1.6688036918640137, + "step": 1298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 272.40625, + "completions/mean_terminated_length": 272.40625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1550305224955082, + "epoch": 2.598, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1952439546585083, + "learning_rate": 9.609757060663559e-08, + "loss": -0.0093, + "num_tokens": 11360360.0, + "reward": 6.782873630523682, + "reward_std": 2.4512579441070557, + "rewards/fitness_reward/mean": 6.021415710449219, + "rewards/fitness_reward/std": 2.177117109298706, + "rewards/kidney_reward/mean": 0.6395650506019592, + "rewards/kidney_reward/std": 1.1912813186645508, + "rewards/length2tails_reward/mean": 0.8013156652450562, + "rewards/length2tails_reward/std": 0.3027305603027344, + "rewards/thermo_reward/mean": 0.4826931059360504, + "rewards/thermo_reward/std": 1.742495059967041, + "step": 1299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 278.3125, + "completions/mean_terminated_length": 278.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14959668926894665, + "epoch": 2.6, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.488746166229248, + "learning_rate": 9.517294753398064e-08, + "loss": 0.087, + "num_tokens": 11369298.0, + "reward": 6.230415344238281, + "reward_std": 3.3283166885375977, + "rewards/fitness_reward/mean": 5.877053260803223, + "rewards/fitness_reward/std": 2.8109235763549805, + "rewards/kidney_reward/mean": 0.04613116383552551, + "rewards/kidney_reward/std": 1.4171053171157837, + "rewards/length2tails_reward/mean": 0.797641396522522, + "rewards/length2tails_reward/std": 0.2692441940307617, + "rewards/thermo_reward/mean": 0.26177284121513367, + "rewards/thermo_reward/std": 1.8704382181167603, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 279.3125, + "completions/mean_terminated_length": 279.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.17876174859702587, + "epoch": 2.602, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.030090808868408, + "learning_rate": 9.425257191985859e-08, + "loss": 0.0698, + "num_tokens": 11378268.0, + "reward": 6.170225620269775, + "reward_std": 3.5844838619232178, + "rewards/fitness_reward/mean": 5.498432159423828, + "rewards/fitness_reward/std": 3.4530649185180664, + "rewards/kidney_reward/mean": 0.20462659001350403, + "rewards/kidney_reward/std": 1.3121966123580933, + "rewards/length2tails_reward/mean": 0.8861352801322937, + "rewards/length2tails_reward/std": 0.18705448508262634, + "rewards/thermo_reward/mean": 0.6958932876586914, + "rewards/thermo_reward/std": 1.6568849086761475, + "step": 1301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.6875, + "completions/mean_terminated_length": 269.6875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.13294328935444355, + "epoch": 2.604, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44752955436706543, + "learning_rate": 9.333644808471675e-08, + "loss": -0.0023, + "num_tokens": 11386930.0, + "reward": 7.302291393280029, + "reward_std": 1.2112901210784912, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.28392332792282104, + "rewards/kidney_reward/std": 1.3022162914276123, + "rewards/length2tails_reward/mean": 0.8085978031158447, + "rewards/length2tails_reward/std": 0.21578450500965118, + "rewards/thermo_reward/mean": 0.7337634563446045, + "rewards/thermo_reward/std": 1.6511890888214111, + "step": 1302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 275.78125, + "completions/mean_terminated_length": 275.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.16998404636979103, + "epoch": 2.606, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.385599613189697, + "learning_rate": 9.24245803290431e-08, + "loss": 0.0766, + "num_tokens": 11395787.0, + "reward": 6.8666253089904785, + "reward_std": 2.591313123703003, + "rewards/fitness_reward/mean": 6.241159439086914, + "rewards/fitness_reward/std": 1.9806838035583496, + "rewards/kidney_reward/mean": 0.27050507068634033, + "rewards/kidney_reward/std": 1.4988094568252563, + "rewards/length2tails_reward/mean": 0.7789663672447205, + "rewards/length2tails_reward/std": 0.26763203740119934, + "rewards/thermo_reward/mean": 0.590943455696106, + "rewards/thermo_reward/std": 1.671730399131775, + "step": 1303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "entropy": 0.1440442530438304, + "epoch": 2.608, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5852406024932861, + "learning_rate": 9.151697293334648e-08, + "loss": -0.0032, + "num_tokens": 11404477.0, + "reward": 6.973286151885986, + "reward_std": 1.5978879928588867, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.3772517740726471, + "rewards/kidney_reward/std": 1.4440059661865234, + "rewards/length2tails_reward/mean": 0.7856607437133789, + "rewards/length2tails_reward/std": 0.3061576783657074, + "rewards/thermo_reward/mean": 0.1998719722032547, + "rewards/thermo_reward/std": 2.012409210205078, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 271.0625, + "completions/mean_terminated_length": 271.0625, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "entropy": 0.15813359152525663, + "epoch": 2.61, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9172699451446533, + "learning_rate": 9.061363015813761e-08, + "loss": -0.0087, + "num_tokens": 11413183.0, + "reward": 6.844547271728516, + "reward_std": 2.3700084686279297, + "rewards/fitness_reward/mean": 6.202705383300781, + "rewards/fitness_reward/std": 2.1982128620147705, + "rewards/kidney_reward/mean": 0.10016083717346191, + "rewards/kidney_reward/std": 1.43922758102417, + "rewards/length2tails_reward/mean": 0.8350353240966797, + "rewards/length2tails_reward/std": 0.21139366924762726, + "rewards/thermo_reward/mean": 0.7660048007965088, + "rewards/thermo_reward/std": 1.3410688638687134, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1475679101422429, + "epoch": 2.612, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8431895971298218, + "learning_rate": 8.971455624390677e-08, + "loss": 0.0018, + "num_tokens": 11421881.0, + "reward": 6.34879207611084, + "reward_std": 3.048671007156372, + "rewards/fitness_reward/mean": 5.8180084228515625, + "rewards/fitness_reward/std": 2.714780330657959, + "rewards/kidney_reward/mean": 0.3688226044178009, + "rewards/kidney_reward/std": 1.2881696224212646, + "rewards/length2tails_reward/mean": 0.7111343145370483, + "rewards/length2tails_reward/std": 0.31402069330215454, + "rewards/thermo_reward/mean": 0.33717772364616394, + "rewards/thermo_reward/std": 1.6664175987243652, + "step": 1306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 274.59375, + "completions/mean_terminated_length": 274.59375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "entropy": 0.16211927123367786, + "epoch": 2.614, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.941136598587036, + "learning_rate": 8.881975541110664e-08, + "loss": 0.0457, + "num_tokens": 11430700.0, + "reward": 6.379903793334961, + "reward_std": 2.833163022994995, + "rewards/fitness_reward/mean": 5.796462059020996, + "rewards/fitness_reward/std": 2.757930278778076, + "rewards/kidney_reward/mean": 0.5540950298309326, + "rewards/kidney_reward/std": 1.26094651222229, + "rewards/length2tails_reward/mean": 0.7833702564239502, + "rewards/length2tails_reward/std": 0.26671385765075684, + "rewards/thermo_reward/mean": 0.22110264003276825, + "rewards/thermo_reward/std": 1.9693902730941772, + "step": 1307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13269296754151583, + "epoch": 2.616, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0597643852233887, + "learning_rate": 8.792923186013023e-08, + "loss": -0.0011, + "num_tokens": 11439380.0, + "reward": 6.808428764343262, + "reward_std": 1.2131792306900024, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.07270577549934387, + "rewards/kidney_reward/std": 1.2641968727111816, + "rewards/length2tails_reward/mean": 0.7487291097640991, + "rewards/length2tails_reward/std": 0.26070666313171387, + "rewards/thermo_reward/mean": 0.33857959508895874, + "rewards/thermo_reward/std": 1.7062650918960571, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13820412196218967, + "epoch": 2.618, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29976531863212585, + "learning_rate": 8.70429897712921e-08, + "loss": -0.0049, + "num_tokens": 11448073.0, + "reward": 7.059915065765381, + "reward_std": 1.3082730770111084, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.25037407875061035, + "rewards/kidney_reward/std": 1.325573205947876, + "rewards/length2tails_reward/mean": 0.7799757719039917, + "rewards/length2tails_reward/std": 0.25871542096138, + "rewards/thermo_reward/mean": 0.5028499364852905, + "rewards/thermo_reward/std": 1.5315024852752686, + "step": 1309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.15091517101973295, + "epoch": 2.62, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9314122200012207, + "learning_rate": 8.616103330480884e-08, + "loss": 0.0072, + "num_tokens": 11456819.0, + "reward": 7.221317291259766, + "reward_std": 1.1511346101760864, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.46290361881256104, + "rewards/kidney_reward/std": 1.2806140184402466, + "rewards/length2tails_reward/mean": 0.7854551076889038, + "rewards/length2tails_reward/std": 0.3004823923110962, + "rewards/thermo_reward/mean": 0.4044070839881897, + "rewards/thermo_reward/std": 1.7189313173294067, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12267510499805212, + "epoch": 2.622, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8689975142478943, + "learning_rate": 8.528336660077973e-08, + "loss": -0.0028, + "num_tokens": 11465483.0, + "reward": 6.2420196533203125, + "reward_std": 1.7789921760559082, + "rewards/fitness_reward/mean": 6.10767936706543, + "rewards/fitness_reward/std": 1.7336585521697998, + "rewards/kidney_reward/mean": 0.17017222940921783, + "rewards/kidney_reward/std": 1.405943751335144, + "rewards/length2tails_reward/mean": 0.7286784648895264, + "rewards/length2tails_reward/std": 0.3074788749217987, + "rewards/thermo_reward/mean": -0.2658313512802124, + "rewards/thermo_reward/std": 1.8454989194869995, + "step": 1311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1428207764402032, + "epoch": 2.624, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.67376971244812, + "learning_rate": 8.440999377916613e-08, + "loss": 0.0473, + "num_tokens": 11474238.0, + "reward": 7.1986870765686035, + "reward_std": 2.356982707977295, + "rewards/fitness_reward/mean": 6.235002517700195, + "rewards/fitness_reward/std": 2.015511989593506, + "rewards/kidney_reward/mean": 0.8282337784767151, + "rewards/kidney_reward/std": 1.2971330881118774, + "rewards/length2tails_reward/mean": 0.7973774671554565, + "rewards/length2tails_reward/std": 0.25914981961250305, + "rewards/thermo_reward/mean": 0.7004466652870178, + "rewards/thermo_reward/std": 1.6007194519042969, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 274.15625, + "completions/mean_terminated_length": 274.15625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1439397530630231, + "epoch": 2.626, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5685398578643799, + "learning_rate": 8.3540918939774e-08, + "loss": 0.0136, + "num_tokens": 11483043.0, + "reward": 6.8435893058776855, + "reward_std": 1.4647371768951416, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.04395785927772522, + "rewards/kidney_reward/std": 1.5680180788040161, + "rewards/length2tails_reward/mean": 0.7202156186103821, + "rewards/length2tails_reward/std": 0.3144298195838928, + "rewards/thermo_reward/mean": 0.30649372935295105, + "rewards/thermo_reward/std": 1.7472255229949951, + "step": 1313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.14158189296722412, + "epoch": 2.628, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6362239718437195, + "learning_rate": 8.267614616223273e-08, + "loss": 0.0015, + "num_tokens": 11491752.0, + "reward": 7.182564735412598, + "reward_std": 1.4124342203140259, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.20992687344551086, + "rewards/kidney_reward/std": 1.4904730319976807, + "rewards/length2tails_reward/mean": 0.7557791471481323, + "rewards/length2tails_reward/std": 0.253665566444397, + "rewards/thermo_reward/mean": 0.8006943464279175, + "rewards/thermo_reward/std": 1.7870943546295166, + "step": 1314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12287769094109535, + "epoch": 2.63, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5174239873886108, + "learning_rate": 8.181567950597713e-08, + "loss": -0.0039, + "num_tokens": 11500423.0, + "reward": 6.945135116577148, + "reward_std": 1.9738068580627441, + "rewards/fitness_reward/mean": 6.188211441040039, + "rewards/fitness_reward/std": 1.7769315242767334, + "rewards/kidney_reward/mean": 0.48789387941360474, + "rewards/kidney_reward/std": 1.3209964036941528, + "rewards/length2tails_reward/mean": 0.7339325547218323, + "rewards/length2tails_reward/std": 0.31823745369911194, + "rewards/thermo_reward/mean": 0.6589872241020203, + "rewards/thermo_reward/std": 1.2833772897720337, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 270.0625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.13281282410025597, + "epoch": 2.632, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33253607153892517, + "learning_rate": 8.095952301022846e-08, + "loss": 0.0063, + "num_tokens": 11509097.0, + "reward": 6.832671165466309, + "reward_std": 1.3681811094284058, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.5004919767379761, + "rewards/kidney_reward/std": 1.231303334236145, + "rewards/length2tails_reward/mean": 0.7518715858459473, + "rewards/length2tails_reward/std": 0.2443847358226776, + "rewards/thermo_reward/mean": -0.18770401179790497, + "rewards/thermo_reward/std": 1.894903540611267, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 269.5625, + "completions/mean_terminated_length": 269.5625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14613137021660805, + "epoch": 2.634, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3056740760803223, + "learning_rate": 8.010768069397455e-08, + "loss": 0.0106, + "num_tokens": 11517755.0, + "reward": 6.717390537261963, + "reward_std": 2.4455068111419678, + "rewards/fitness_reward/mean": 6.121276378631592, + "rewards/fitness_reward/std": 2.1382689476013184, + "rewards/kidney_reward/mean": 0.5556734204292297, + "rewards/kidney_reward/std": 1.4169511795043945, + "rewards/length2tails_reward/mean": 0.6154686212539673, + "rewards/length2tails_reward/std": 0.34208738803863525, + "rewards/thermo_reward/mean": 0.32882094383239746, + "rewards/thermo_reward/std": 1.6126601696014404, + "step": 1317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.15625, + "completions/mean_terminated_length": 269.15625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.13918451964855194, + "epoch": 2.636, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.260502338409424, + "learning_rate": 7.926015655595253e-08, + "loss": -0.0219, + "num_tokens": 11526400.0, + "reward": 6.543846607208252, + "reward_std": 2.767101526260376, + "rewards/fitness_reward/mean": 5.692856311798096, + "rewards/fitness_reward/std": 2.7915756702423096, + "rewards/kidney_reward/mean": 0.7697734832763672, + "rewards/kidney_reward/std": 1.1631016731262207, + "rewards/length2tails_reward/mean": 0.8018798828125, + "rewards/length2tails_reward/std": 0.299836665391922, + "rewards/thermo_reward/mean": 0.5312666893005371, + "rewards/thermo_reward/std": 1.5642306804656982, + "step": 1318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.03125, + "completions/mean_terminated_length": 271.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13573483377695084, + "epoch": 2.638, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.669712245464325, + "learning_rate": 7.84169545746275e-08, + "loss": 0.003, + "num_tokens": 11535105.0, + "reward": 6.988643646240234, + "reward_std": 1.1130317449569702, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.2840263843536377, + "rewards/kidney_reward/std": 1.295556902885437, + "rewards/length2tails_reward/mean": 0.7693678140640259, + "rewards/length2tails_reward/std": 0.2954905927181244, + "rewards/thermo_reward/mean": 0.1259807050228119, + "rewards/thermo_reward/std": 1.9685497283935547, + "step": 1319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.53125, + "completions/mean_terminated_length": 269.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11833838000893593, + "epoch": 2.64, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0643484592437744, + "learning_rate": 7.75780787081769e-08, + "loss": 0.0018, + "num_tokens": 11543762.0, + "reward": 6.1620869636535645, + "reward_std": 3.317458391189575, + "rewards/fitness_reward/mean": 5.7596211433410645, + "rewards/fitness_reward/std": 2.5500731468200684, + "rewards/kidney_reward/mean": 0.26211708784103394, + "rewards/kidney_reward/std": 1.404579758644104, + "rewards/length2tails_reward/mean": 0.6791694164276123, + "rewards/length2tails_reward/std": 0.31338968873023987, + "rewards/thermo_reward/mean": 0.2032301127910614, + "rewards/thermo_reward/std": 1.8497836589813232, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.15640322398394346, + "epoch": 2.642, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.055444717407227, + "learning_rate": 7.674353289446944e-08, + "loss": -0.012, + "num_tokens": 11552499.0, + "reward": 6.838186740875244, + "reward_std": 2.389883279800415, + "rewards/fitness_reward/mean": 6.219882965087891, + "rewards/fitness_reward/std": 2.101041555404663, + "rewards/kidney_reward/mean": 0.45063361525535583, + "rewards/kidney_reward/std": 1.3532819747924805, + "rewards/length2tails_reward/mean": 0.7730776071548462, + "rewards/length2tails_reward/std": 0.2830614149570465, + "rewards/thermo_reward/mean": 0.3994349241256714, + "rewards/thermo_reward/std": 1.7491199970245361, + "step": 1321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.14921415597200394, + "epoch": 2.644, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.056000828742981, + "learning_rate": 7.59133210510483e-08, + "loss": 0.0179, + "num_tokens": 11561243.0, + "reward": 7.034969329833984, + "reward_std": 2.0938680171966553, + "rewards/fitness_reward/mean": 6.173101425170898, + "rewards/fitness_reward/std": 1.8578987121582031, + "rewards/kidney_reward/mean": 0.2449251115322113, + "rewards/kidney_reward/std": 1.2185057401657104, + "rewards/length2tails_reward/mean": 0.7973313331604004, + "rewards/length2tails_reward/std": 0.22914522886276245, + "rewards/thermo_reward/mean": 1.080146312713623, + "rewards/thermo_reward/std": 1.220781683921814, + "step": 1322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 268.9375, + "completions/mean_terminated_length": 268.9375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "entropy": 0.13990407064557076, + "epoch": 2.646, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2591614723205566, + "learning_rate": 7.508744707511116e-08, + "loss": -0.0059, + "num_tokens": 11569881.0, + "reward": 6.409752368927002, + "reward_std": 3.4788880348205566, + "rewards/fitness_reward/mean": 5.593265533447266, + "rewards/fitness_reward/std": 3.1884958744049072, + "rewards/kidney_reward/mean": 0.7759151458740234, + "rewards/kidney_reward/std": 1.4832079410552979, + "rewards/length2tails_reward/mean": 0.7381703853607178, + "rewards/length2tails_reward/std": 0.3154587745666504, + "rewards/thermo_reward/mean": 0.4879724681377411, + "rewards/thermo_reward/std": 1.7019803524017334, + "step": 1323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 273.25, + "completions/mean_terminated_length": 273.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.15936814062297344, + "epoch": 2.648, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5091710090637207, + "learning_rate": 7.426591484349388e-08, + "loss": 0.01, + "num_tokens": 11578657.0, + "reward": 6.631004333496094, + "reward_std": 2.2312073707580566, + "rewards/fitness_reward/mean": 5.931850433349609, + "rewards/fitness_reward/std": 2.158649444580078, + "rewards/kidney_reward/mean": 0.3921756446361542, + "rewards/kidney_reward/std": 1.3076163530349731, + "rewards/length2tails_reward/mean": 0.7677497267723083, + "rewards/length2tails_reward/std": 0.3126368224620819, + "rewards/thermo_reward/mean": 0.6222577691078186, + "rewards/thermo_reward/std": 1.6855151653289795, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1282121865078807, + "epoch": 2.65, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3864172697067261, + "learning_rate": 7.344872821265025e-08, + "loss": -0.0026, + "num_tokens": 11587366.0, + "reward": 6.911219596862793, + "reward_std": 1.7677754163742065, + "rewards/fitness_reward/mean": 6.30252742767334, + "rewards/fitness_reward/std": 1.6335350275039673, + "rewards/kidney_reward/mean": 0.22030992805957794, + "rewards/kidney_reward/std": 1.3374607563018799, + "rewards/length2tails_reward/mean": 0.7820232510566711, + "rewards/length2tails_reward/std": 0.29547762870788574, + "rewards/thermo_reward/mean": 0.6060633659362793, + "rewards/thermo_reward/std": 1.6638108491897583, + "step": 1325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14613053295761347, + "epoch": 2.652, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7953296899795532, + "learning_rate": 7.263589101863543e-08, + "loss": -0.0005, + "num_tokens": 11596098.0, + "reward": 7.209301471710205, + "reward_std": 0.9483944177627563, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.12361828237771988, + "rewards/kidney_reward/std": 1.1958742141723633, + "rewards/length2tails_reward/mean": 0.8695204257965088, + "rewards/length2tails_reward/std": 0.19919784367084503, + "rewards/thermo_reward/mean": 0.6776284575462341, + "rewards/thermo_reward/std": 1.5123670101165771, + "step": 1326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1463224794715643, + "epoch": 2.654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8098628520965576, + "learning_rate": 7.182740707708756e-08, + "loss": 0.0025, + "num_tokens": 11604809.0, + "reward": 6.926301956176758, + "reward_std": 1.3545396327972412, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.45776429772377014, + "rewards/kidney_reward/std": 1.4779555797576904, + "rewards/length2tails_reward/mean": 0.7788654565811157, + "rewards/length2tails_reward/std": 0.2807927131652832, + "rewards/thermo_reward/mean": -0.17719003558158875, + "rewards/thermo_reward/std": 1.7955461740493774, + "step": 1327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.13200621120631695, + "epoch": 2.656, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0559370517730713, + "learning_rate": 7.102328018320858e-08, + "loss": 0.0058, + "num_tokens": 11613480.0, + "reward": 6.919334411621094, + "reward_std": 2.29030179977417, + "rewards/fitness_reward/mean": 6.212594032287598, + "rewards/fitness_reward/std": 1.6472396850585938, + "rewards/kidney_reward/mean": 0.0781867504119873, + "rewards/kidney_reward/std": 1.5406161546707153, + "rewards/length2tails_reward/mean": 0.7613270282745361, + "rewards/length2tails_reward/std": 0.3006376326084137, + "rewards/thermo_reward/mean": 0.9546313881874084, + "rewards/thermo_reward/std": 1.3636603355407715, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14988171216100454, + "epoch": 2.658, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.758458137512207, + "learning_rate": 7.022351411174865e-08, + "loss": -0.0002, + "num_tokens": 11622170.0, + "reward": 6.642274856567383, + "reward_std": 2.442777156829834, + "rewards/fitness_reward/mean": 5.922212600708008, + "rewards/fitness_reward/std": 2.2074363231658936, + "rewards/kidney_reward/mean": 0.23649896681308746, + "rewards/kidney_reward/std": 1.3439737558364868, + "rewards/length2tails_reward/mean": 0.8255376815795898, + "rewards/length2tails_reward/std": 0.2498103529214859, + "rewards/thermo_reward/mean": 0.7908563017845154, + "rewards/thermo_reward/std": 1.4177896976470947, + "step": 1329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13416806422173977, + "epoch": 2.66, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5418772101402283, + "learning_rate": 6.942811261698656e-08, + "loss": -0.0023, + "num_tokens": 11630842.0, + "reward": 7.211859226226807, + "reward_std": 1.0514270067214966, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.13740774989128113, + "rewards/kidney_reward/std": 1.366906762123108, + "rewards/length2tails_reward/mean": 0.7191427946090698, + "rewards/length2tails_reward/std": 0.3311437964439392, + "rewards/thermo_reward/mean": 0.7441422939300537, + "rewards/thermo_reward/std": 1.4894217252731323, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13656179141253233, + "epoch": 2.662, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7222561836242676, + "learning_rate": 6.863707943271324e-08, + "loss": 0.0083, + "num_tokens": 11639541.0, + "reward": 7.088050365447998, + "reward_std": 1.2890034914016724, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.010663531720638275, + "rewards/kidney_reward/std": 1.3522653579711914, + "rewards/length2tails_reward/mean": 0.7375493049621582, + "rewards/length2tails_reward/std": 0.306835800409317, + "rewards/thermo_reward/mean": 0.8200441598892212, + "rewards/thermo_reward/std": 1.6034045219421387, + "step": 1331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1408556867390871, + "epoch": 2.664, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1318938732147217, + "learning_rate": 6.785041827221361e-08, + "loss": -0.0045, + "num_tokens": 11648263.0, + "reward": 6.90576171875, + "reward_std": 2.0183310508728027, + "rewards/fitness_reward/mean": 6.297613143920898, + "rewards/fitness_reward/std": 1.66133451461792, + "rewards/kidney_reward/mean": 0.44088125228881836, + "rewards/kidney_reward/std": 1.3548799753189087, + "rewards/length2tails_reward/mean": 0.746877133846283, + "rewards/length2tails_reward/std": 0.27125084400177, + "rewards/thermo_reward/mean": 0.4019775688648224, + "rewards/thermo_reward/std": 1.7240796089172363, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 272.96875, + "completions/mean_terminated_length": 272.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.17034148424863815, + "epoch": 2.666, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0803120136260986, + "learning_rate": 6.706813282824919e-08, + "loss": 0.0263, + "num_tokens": 11657030.0, + "reward": 6.501561164855957, + "reward_std": 2.751814603805542, + "rewards/fitness_reward/mean": 5.837723255157471, + "rewards/fitness_reward/std": 2.6072683334350586, + "rewards/kidney_reward/mean": 0.410721093416214, + "rewards/kidney_reward/std": 1.297775149345398, + "rewards/length2tails_reward/mean": 0.8405717611312866, + "rewards/length2tails_reward/std": 0.17893728613853455, + "rewards/thermo_reward/mean": 0.49666929244995117, + "rewards/thermo_reward/std": 1.6130120754241943, + "step": 1333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 275.28125, + "completions/mean_terminated_length": 275.28125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.15753164887428284, + "epoch": 2.668, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.274548888206482, + "learning_rate": 6.629022677304097e-08, + "loss": -0.0249, + "num_tokens": 11665871.0, + "reward": 6.239099502563477, + "reward_std": 2.7463698387145996, + "rewards/fitness_reward/mean": 5.863887310028076, + "rewards/fitness_reward/std": 2.509000301361084, + "rewards/kidney_reward/mean": 0.09711159765720367, + "rewards/kidney_reward/std": 1.2334481477737427, + "rewards/length2tails_reward/mean": 0.7769403457641602, + "rewards/length2tails_reward/std": 0.2995745539665222, + "rewards/thermo_reward/mean": 0.2648426592350006, + "rewards/thermo_reward/std": 1.8378100395202637, + "step": 1334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13816013559699059, + "epoch": 2.67, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1341289281845093, + "learning_rate": 6.551670375825258e-08, + "loss": 0.0015, + "num_tokens": 11674548.0, + "reward": 7.198455810546875, + "reward_std": 1.2638441324234009, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.29033222794532776, + "rewards/kidney_reward/std": 1.2571746110916138, + "rewards/length2tails_reward/mean": 0.7745969295501709, + "rewards/length2tails_reward/std": 0.27354735136032104, + "rewards/thermo_reward/mean": 0.5366846919059753, + "rewards/thermo_reward/std": 1.6593728065490723, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.6875, + "completions/mean_terminated_length": 269.6875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1319007845595479, + "epoch": 2.672, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.092196464538574, + "learning_rate": 6.474756741497212e-08, + "loss": -0.0006, + "num_tokens": 11683210.0, + "reward": 7.187248229980469, + "reward_std": 1.0114020109176636, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.2664000689983368, + "rewards/kidney_reward/std": 1.0829427242279053, + "rewards/length2tails_reward/mean": 0.7531062364578247, + "rewards/length2tails_reward/std": 0.27370181679725647, + "rewards/thermo_reward/mean": 0.548947274684906, + "rewards/thermo_reward/std": 1.7245827913284302, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 284.09375, + "completions/mean_terminated_length": 284.09375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1799558512866497, + "epoch": 2.674, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.348093032836914, + "learning_rate": 6.39828213536957e-08, + "loss": 0.1671, + "num_tokens": 11692333.0, + "reward": 6.723752975463867, + "reward_std": 3.076402425765991, + "rewards/fitness_reward/mean": 5.864776611328125, + "rewards/fitness_reward/std": 2.860888957977295, + "rewards/kidney_reward/mean": 0.5752601623535156, + "rewards/kidney_reward/std": 1.2405906915664673, + "rewards/length2tails_reward/mean": 0.7794958353042603, + "rewards/length2tails_reward/std": 0.29706573486328125, + "rewards/thermo_reward/mean": 0.7529445290565491, + "rewards/thermo_reward/std": 1.6066298484802246, + "step": 1337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.40625, + "completions/mean_terminated_length": 271.40625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1299219885841012, + "epoch": 2.676, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4071826934814453, + "learning_rate": 6.322246916431107e-08, + "loss": 0.0039, + "num_tokens": 11701050.0, + "reward": 7.077556133270264, + "reward_std": 1.1443108320236206, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.2756168246269226, + "rewards/kidney_reward/std": 1.3764065504074097, + "rewards/length2tails_reward/mean": 0.7562018632888794, + "rewards/length2tails_reward/std": 0.2866836488246918, + "rewards/thermo_reward/mean": 0.3187977075576782, + "rewards/thermo_reward/std": 1.951348066329956, + "step": 1338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 287.0, + "completions/mean_terminated_length": 287.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.19946142937988043, + "epoch": 2.678, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.392458915710449, + "learning_rate": 6.246651441607931e-08, + "loss": 0.1447, + "num_tokens": 11710266.0, + "reward": 6.865114212036133, + "reward_std": 1.9148271083831787, + "rewards/fitness_reward/mean": 6.257325172424316, + "rewards/fitness_reward/std": 1.8892370462417603, + "rewards/kidney_reward/mean": 0.3624148368835449, + "rewards/kidney_reward/std": 1.3455159664154053, + "rewards/length2tails_reward/mean": 0.7681008577346802, + "rewards/length2tails_reward/std": 0.28909823298454285, + "rewards/thermo_reward/mean": 0.4691126048564911, + "rewards/thermo_reward/std": 1.8528997898101807, + "step": 1339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12626391276717186, + "epoch": 2.68, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1862074136734009, + "learning_rate": 6.171496065761983e-08, + "loss": 0.0049, + "num_tokens": 11718953.0, + "reward": 6.733834266662598, + "reward_std": 1.2381398677825928, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.03897637128829956, + "rewards/kidney_reward/std": 1.4643480777740479, + "rewards/length2tails_reward/mean": 0.7131487727165222, + "rewards/length2tails_reward/std": 0.2919859290122986, + "rewards/thermo_reward/mean": -0.03252635523676872, + "rewards/thermo_reward/std": 1.9319521188735962, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13573364913463593, + "epoch": 2.682, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5241156220436096, + "learning_rate": 6.096781141689222e-08, + "loss": -0.0031, + "num_tokens": 11727654.0, + "reward": 6.673149108886719, + "reward_std": 1.2449573278427124, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.08099489659070969, + "rewards/kidney_reward/std": 1.4221594333648682, + "rewards/length2tails_reward/mean": 0.7790415287017822, + "rewards/length2tails_reward/std": 0.275463342666626, + "rewards/thermo_reward/mean": 0.06115467846393585, + "rewards/thermo_reward/std": 2.0408029556274414, + "step": 1341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1470300955697894, + "epoch": 2.684, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3671325743198395, + "learning_rate": 6.02250702011804e-08, + "loss": -0.0069, + "num_tokens": 11736350.0, + "reward": 7.459644317626953, + "reward_std": 1.002334475517273, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.49883562326431274, + "rewards/kidney_reward/std": 1.3434993028640747, + "rewards/length2tails_reward/mean": 0.7546119689941406, + "rewards/length2tails_reward/std": 0.2913599908351898, + "rewards/thermo_reward/mean": 0.8605509996414185, + "rewards/thermo_reward/std": 1.5172303915023804, + "step": 1342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14443789143115282, + "epoch": 2.686, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4144374132156372, + "learning_rate": 5.948674049707603e-08, + "loss": -0.0021, + "num_tokens": 11745092.0, + "reward": 6.86317253112793, + "reward_std": 1.340377926826477, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.2224719226360321, + "rewards/kidney_reward/std": 1.2887970209121704, + "rewards/length2tails_reward/mean": 0.7998417615890503, + "rewards/length2tails_reward/std": 0.28657302260398865, + "rewards/thermo_reward/mean": 0.1273343861103058, + "rewards/thermo_reward/std": 1.7813448905944824, + "step": 1343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14203498512506485, + "epoch": 2.6879999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23765619099140167, + "learning_rate": 5.8752825770462476e-08, + "loss": -0.0031, + "num_tokens": 11753782.0, + "reward": 7.132554054260254, + "reward_std": 0.96638023853302, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.11732001602649689, + "rewards/kidney_reward/std": 1.1222326755523682, + "rewards/length2tails_reward/mean": 0.8079357147216797, + "rewards/length2tails_reward/std": 0.21169044077396393, + "rewards/thermo_reward/mean": 0.795863151550293, + "rewards/thermo_reward/std": 1.4343836307525635, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.8125, + "completions/mean_terminated_length": 269.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1269581038504839, + "epoch": 2.69, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5333405137062073, + "learning_rate": 5.802332946649757e-08, + "loss": 0.0001, + "num_tokens": 11762448.0, + "reward": 7.254700660705566, + "reward_std": 1.0646439790725708, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.41175907850265503, + "rewards/kidney_reward/std": 1.197927474975586, + "rewards/length2tails_reward/mean": 0.705089271068573, + "rewards/length2tails_reward/std": 0.32775092124938965, + "rewards/thermo_reward/mean": 0.5625014305114746, + "rewards/thermo_reward/std": 1.6212550401687622, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14380416460335255, + "epoch": 2.692, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2502833604812622, + "learning_rate": 5.729825500959884e-08, + "loss": -0.0046, + "num_tokens": 11771141.0, + "reward": 6.842307090759277, + "reward_std": 2.212759494781494, + "rewards/fitness_reward/mean": 6.272851943969727, + "rewards/fitness_reward/std": 1.8014047145843506, + "rewards/kidney_reward/mean": 0.5223565101623535, + "rewards/kidney_reward/std": 1.2939668893814087, + "rewards/length2tails_reward/mean": 0.811774492263794, + "rewards/length2tails_reward/std": 0.25121423602104187, + "rewards/thermo_reward/mean": 0.21066659688949585, + "rewards/thermo_reward/std": 1.9078917503356934, + "step": 1346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13658114336431026, + "epoch": 2.694, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6625174283981323, + "learning_rate": 5.65776058034263e-08, + "loss": 0.0038, + "num_tokens": 11779867.0, + "reward": 6.960326194763184, + "reward_std": 1.8195083141326904, + "rewards/fitness_reward/mean": 6.17750883102417, + "rewards/fitness_reward/std": 1.8342394828796387, + "rewards/kidney_reward/mean": 0.12361077964305878, + "rewards/kidney_reward/std": 1.2412046194076538, + "rewards/length2tails_reward/mean": 0.8329655528068542, + "rewards/length2tails_reward/std": 0.2396375685930252, + "rewards/thermo_reward/mean": 1.0255413055419922, + "rewards/thermo_reward/std": 1.3548256158828735, + "step": 1347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 268.84375, + "completions/mean_terminated_length": 268.84375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.13464761432260275, + "epoch": 2.6959999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5395203828811646, + "learning_rate": 5.58613852308667e-08, + "loss": -0.04, + "num_tokens": 11788502.0, + "reward": 6.326935291290283, + "reward_std": 2.776808977127075, + "rewards/fitness_reward/mean": 5.778857707977295, + "rewards/fitness_reward/std": 2.82637882232666, + "rewards/kidney_reward/mean": 0.5551795363426208, + "rewards/kidney_reward/std": 1.222622036933899, + "rewards/length2tails_reward/mean": 0.7663679122924805, + "rewards/length2tails_reward/std": 0.2829931080341339, + "rewards/thermo_reward/mean": 0.1577915996313095, + "rewards/thermo_reward/std": 2.067166566848755, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 273.5, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15657263342291117, + "epoch": 2.698, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.376021146774292, + "learning_rate": 5.514959665401819e-08, + "loss": 0.0474, + "num_tokens": 11797286.0, + "reward": 6.372640609741211, + "reward_std": 2.9671270847320557, + "rewards/fitness_reward/mean": 5.9128594398498535, + "rewards/fitness_reward/std": 2.255034923553467, + "rewards/kidney_reward/mean": 0.09083954989910126, + "rewards/kidney_reward/std": 1.3195194005966187, + "rewards/length2tails_reward/mean": 0.776763916015625, + "rewards/length2tails_reward/std": 0.29985958337783813, + "rewards/thermo_reward/mean": 0.4403420090675354, + "rewards/thermo_reward/std": 1.8964015245437622, + "step": 1349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 267.0625, + "completions/mean_terminated_length": 267.0625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.160486807115376, + "epoch": 2.7, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2090129852294922, + "learning_rate": 5.444224341417392e-08, + "loss": -0.0989, + "num_tokens": 11805864.0, + "reward": 7.069103240966797, + "reward_std": 2.2392852306365967, + "rewards/fitness_reward/mean": 6.13365364074707, + "rewards/fitness_reward/std": 2.0709850788116455, + "rewards/kidney_reward/mean": 0.5347890257835388, + "rewards/kidney_reward/std": 1.354805827140808, + "rewards/length2tails_reward/mean": 0.803483247756958, + "rewards/length2tails_reward/std": 0.24051831662654877, + "rewards/thermo_reward/mean": 0.9343682527542114, + "rewards/thermo_reward/std": 1.4329479932785034, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 272.40625, + "completions/mean_terminated_length": 272.40625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.145858489908278, + "epoch": 2.702, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.404083490371704, + "learning_rate": 5.373932883180654e-08, + "loss": 0.0311, + "num_tokens": 11814613.0, + "reward": 6.961205005645752, + "reward_std": 2.7935259342193604, + "rewards/fitness_reward/mean": 6.236160755157471, + "rewards/fitness_reward/std": 2.0089597702026367, + "rewards/kidney_reward/mean": 0.3038008213043213, + "rewards/kidney_reward/std": 1.3177944421768188, + "rewards/length2tails_reward/mean": 0.7664103507995605, + "rewards/length2tails_reward/std": 0.2786445617675781, + "rewards/thermo_reward/mean": 0.7630823850631714, + "rewards/thermo_reward/std": 1.6050466299057007, + "step": 1351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 275.75, + "completions/mean_terminated_length": 275.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.18189921788871288, + "epoch": 2.7039999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.00908088684082, + "learning_rate": 5.304085620655263e-08, + "loss": 0.0751, + "num_tokens": 11823469.0, + "reward": 6.474761009216309, + "reward_std": 2.892199754714966, + "rewards/fitness_reward/mean": 5.851008415222168, + "rewards/fitness_reward/std": 2.914987325668335, + "rewards/kidney_reward/mean": 0.057575903832912445, + "rewards/kidney_reward/std": 1.2942359447479248, + "rewards/length2tails_reward/mean": 0.7830846309661865, + "rewards/length2tails_reward/std": 0.25321516394615173, + "rewards/thermo_reward/mean": 0.7983863353729248, + "rewards/thermo_reward/std": 1.527421474456787, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13791055604815483, + "epoch": 2.706, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6262022256851196, + "learning_rate": 5.234682881719765e-08, + "loss": -0.0033, + "num_tokens": 11832180.0, + "reward": 7.076658725738525, + "reward_std": 2.1574857234954834, + "rewards/fitness_reward/mean": 6.3067240715026855, + "rewards/fitness_reward/std": 1.6097934246063232, + "rewards/kidney_reward/mean": 0.6647336483001709, + "rewards/kidney_reward/std": 1.3057494163513184, + "rewards/length2tails_reward/mean": 0.8185875415802002, + "rewards/length2tails_reward/std": 0.2547774612903595, + "rewards/thermo_reward/mean": 0.4658415913581848, + "rewards/thermo_reward/std": 1.7345285415649414, + "step": 1353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.3125, + "completions/mean_terminated_length": 270.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13840149715542793, + "epoch": 2.708, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.336580276489258, + "learning_rate": 5.165724992165954e-08, + "loss": -0.0016, + "num_tokens": 11840862.0, + "reward": 6.373121738433838, + "reward_std": 2.3925251960754395, + "rewards/fitness_reward/mean": 6.018747329711914, + "rewards/fitness_reward/std": 2.191136121749878, + "rewards/kidney_reward/mean": 0.07362078130245209, + "rewards/kidney_reward/std": 1.2522526979446411, + "rewards/length2tails_reward/mean": 0.7948795557022095, + "rewards/length2tails_reward/std": 0.22825320065021515, + "rewards/thermo_reward/mean": 0.23768886923789978, + "rewards/thermo_reward/std": 1.736041784286499, + "step": 1354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 278.0625, + "completions/mean_terminated_length": 278.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.16176360473036766, + "epoch": 2.71, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0292506217956543, + "learning_rate": 5.09721227569746e-08, + "loss": 0.0339, + "num_tokens": 11849792.0, + "reward": 6.567011833190918, + "reward_std": 2.453436851501465, + "rewards/fitness_reward/mean": 6.113492965698242, + "rewards/fitness_reward/std": 2.1806681156158447, + "rewards/kidney_reward/mean": 0.22597810626029968, + "rewards/kidney_reward/std": 1.1912392377853394, + "rewards/length2tails_reward/mean": 0.733343243598938, + "rewards/length2tails_reward/std": 0.31190648674964905, + "rewards/thermo_reward/mean": 0.31438836455345154, + "rewards/thermo_reward/std": 1.6750223636627197, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.13236729428172112, + "epoch": 2.7119999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5011467933654785, + "learning_rate": 5.02914505392813e-08, + "loss": 0.002, + "num_tokens": 11858463.0, + "reward": 6.979236602783203, + "reward_std": 1.5250182151794434, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.12507648766040802, + "rewards/kidney_reward/std": 1.4478554725646973, + "rewards/length2tails_reward/mean": 0.6914030313491821, + "rewards/length2tails_reward/std": 0.31283634901046753, + "rewards/thermo_reward/mean": 0.717054009437561, + "rewards/thermo_reward/std": 1.5815379619598389, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 276.6875, + "completions/mean_terminated_length": 276.6875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.15863915253430605, + "epoch": 2.714, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.830939292907715, + "learning_rate": 4.961523646380561e-08, + "loss": 0.069, + "num_tokens": 11867349.0, + "reward": 6.5764336585998535, + "reward_std": 2.674323797225952, + "rewards/fitness_reward/mean": 5.960606575012207, + "rewards/fitness_reward/std": 2.500142812728882, + "rewards/kidney_reward/mean": 0.3769088387489319, + "rewards/kidney_reward/std": 1.3546323776245117, + "rewards/length2tails_reward/mean": 0.7789443731307983, + "rewards/length2tails_reward/std": 0.27722620964050293, + "rewards/thermo_reward/mean": 0.4652731418609619, + "rewards/thermo_reward/std": 1.6984999179840088, + "step": 1357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.21875, + "completions/mean_terminated_length": 270.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13640568871051073, + "epoch": 2.716, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4220622777938843, + "learning_rate": 4.8943483704846465e-08, + "loss": -0.004, + "num_tokens": 11876028.0, + "reward": 6.746973514556885, + "reward_std": 2.267575979232788, + "rewards/fitness_reward/mean": 5.982248783111572, + "rewards/fitness_reward/std": 1.9085924625396729, + "rewards/kidney_reward/mean": 0.5677748322486877, + "rewards/kidney_reward/std": 1.2933520078659058, + "rewards/length2tails_reward/mean": 0.7782508730888367, + "rewards/length2tails_reward/std": 0.25282543897628784, + "rewards/thermo_reward/mean": 0.5725493431091309, + "rewards/thermo_reward/std": 1.5764100551605225, + "step": 1358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.5625, + "completions/mean_terminated_length": 269.5625, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "entropy": 0.15007026121020317, + "epoch": 2.718, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.693939745426178, + "learning_rate": 4.827619541575967e-08, + "loss": 0.0021, + "num_tokens": 11884686.0, + "reward": 7.132630825042725, + "reward_std": 0.9852175116539001, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.03717784583568573, + "rewards/kidney_reward/std": 1.2284526824951172, + "rewards/length2tails_reward/mean": 0.8566329479217529, + "rewards/length2tails_reward/std": 0.18639251589775085, + "rewards/thermo_reward/mean": 0.8231485486030579, + "rewards/thermo_reward/std": 1.4900275468826294, + "step": 1359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12843878660351038, + "epoch": 2.7199999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9017809629440308, + "learning_rate": 4.761337472894478e-08, + "loss": -0.0038, + "num_tokens": 11893396.0, + "reward": 7.196900367736816, + "reward_std": 1.1897774934768677, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.2059190571308136, + "rewards/kidney_reward/std": 1.4282766580581665, + "rewards/length2tails_reward/mean": 0.7752040028572083, + "rewards/length2tails_reward/std": 0.30827951431274414, + "rewards/thermo_reward/mean": 0.8236618638038635, + "rewards/thermo_reward/std": 1.340000033378601, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 282.15625, + "completions/mean_terminated_length": 282.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.16562678385525942, + "epoch": 2.722, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2261569499969482, + "learning_rate": 4.695502475582813e-08, + "loss": 0.0327, + "num_tokens": 11902457.0, + "reward": 6.22656774520874, + "reward_std": 3.389939546585083, + "rewards/fitness_reward/mean": 5.920053482055664, + "rewards/fitness_reward/std": 2.668562650680542, + "rewards/kidney_reward/mean": -0.03889282047748566, + "rewards/kidney_reward/std": 1.2940609455108643, + "rewards/length2tails_reward/mean": 0.8277677297592163, + "rewards/length2tails_reward/std": 0.16781653463840485, + "rewards/thermo_reward/mean": 0.23803797364234924, + "rewards/thermo_reward/std": 1.9139777421951294, + "step": 1361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 284.34375, + "completions/mean_terminated_length": 284.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.16163287684321404, + "epoch": 2.724, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.093640327453613, + "learning_rate": 4.630114858685075e-08, + "loss": 0.1876, + "num_tokens": 11911588.0, + "reward": 6.713284492492676, + "reward_std": 2.1978862285614014, + "rewards/fitness_reward/mean": 6.158529281616211, + "rewards/fitness_reward/std": 1.9363502264022827, + "rewards/kidney_reward/mean": 0.29178085923194885, + "rewards/kidney_reward/std": 1.4394662380218506, + "rewards/length2tails_reward/mean": 0.8142867088317871, + "rewards/length2tails_reward/std": 0.2254250943660736, + "rewards/thermo_reward/mean": 0.4105863571166992, + "rewards/thermo_reward/std": 1.8308995962142944, + "step": 1362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13997112307697535, + "epoch": 2.726, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1205456256866455, + "learning_rate": 4.565174929145188e-08, + "loss": 0.0053, + "num_tokens": 11920324.0, + "reward": 7.037109375, + "reward_std": 1.1757491827011108, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.07398329675197601, + "rewards/kidney_reward/std": 1.360016942024231, + "rewards/length2tails_reward/mean": 0.8392004370689392, + "rewards/length2tails_reward/std": 0.20252938568592072, + "rewards/thermo_reward/mean": 0.39803916215896606, + "rewards/thermo_reward/std": 1.7302331924438477, + "step": 1363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 263.40625, + "completions/mean_terminated_length": 263.40625, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.1366047989577055, + "epoch": 2.7279999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3998750448226929, + "learning_rate": 4.5006829918055934e-08, + "loss": -0.109, + "num_tokens": 11928785.0, + "reward": 6.8421478271484375, + "reward_std": 2.3292365074157715, + "rewards/fitness_reward/mean": 6.0077409744262695, + "rewards/fitness_reward/std": 2.249107599258423, + "rewards/kidney_reward/mean": 0.6135181188583374, + "rewards/kidney_reward/std": 1.3728998899459839, + "rewards/length2tails_reward/mean": 0.8019629716873169, + "rewards/length2tails_reward/std": 0.2485700398683548, + "rewards/thermo_reward/mean": 0.6543141603469849, + "rewards/thermo_reward/std": 1.7695351839065552, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1359203588217497, + "epoch": 2.73, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4309622347354889, + "learning_rate": 4.436639349405713e-08, + "loss": -0.0016, + "num_tokens": 11937475.0, + "reward": 6.703495502471924, + "reward_std": 2.525789499282837, + "rewards/fitness_reward/mean": 6.167916297912598, + "rewards/fitness_reward/std": 1.8857755661010742, + "rewards/kidney_reward/mean": 0.32618939876556396, + "rewards/kidney_reward/std": 1.3779423236846924, + "rewards/length2tails_reward/mean": 0.7738263010978699, + "rewards/length2tails_reward/std": 0.27380529046058655, + "rewards/thermo_reward/mean": 0.35805609822273254, + "rewards/thermo_reward/std": 1.57369065284729, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14700028486549854, + "epoch": 2.732, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.748185634613037, + "learning_rate": 4.373044302580553e-08, + "loss": 0.0355, + "num_tokens": 11946193.0, + "reward": 6.904634475708008, + "reward_std": 2.523054361343384, + "rewards/fitness_reward/mean": 6.118351936340332, + "rewards/fitness_reward/std": 2.154191493988037, + "rewards/kidney_reward/mean": 0.6995481252670288, + "rewards/kidney_reward/std": 1.1936233043670654, + "rewards/length2tails_reward/mean": 0.716191291809082, + "rewards/length2tails_reward/std": 0.3228929340839386, + "rewards/thermo_reward/mean": 0.5149211883544922, + "rewards/thermo_reward/std": 1.6761047840118408, + "step": 1366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14697847422212362, + "epoch": 2.734, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8904600143432617, + "learning_rate": 4.3098981498593925e-08, + "loss": 0.0225, + "num_tokens": 11954931.0, + "reward": 6.661285400390625, + "reward_std": 2.401127338409424, + "rewards/fitness_reward/mean": 6.208806037902832, + "rewards/fitness_reward/std": 2.1637020111083984, + "rewards/kidney_reward/mean": 0.33133307099342346, + "rewards/kidney_reward/std": 1.3005090951919556, + "rewards/length2tails_reward/mean": 0.8409561514854431, + "rewards/length2tails_reward/std": 0.18553099036216736, + "rewards/thermo_reward/mean": 0.15314829349517822, + "rewards/thermo_reward/std": 1.9080272912979126, + "step": 1367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "entropy": 0.16296476125717163, + "epoch": 2.7359999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8008642196655273, + "learning_rate": 4.247201187664218e-08, + "loss": -0.0333, + "num_tokens": 11963629.0, + "reward": 6.82436466217041, + "reward_std": 2.5558340549468994, + "rewards/fitness_reward/mean": 6.197336196899414, + "rewards/fitness_reward/std": 2.2285847663879395, + "rewards/kidney_reward/mean": 0.242277592420578, + "rewards/kidney_reward/std": 1.3202170133590698, + "rewards/length2tails_reward/mean": 0.7877407073974609, + "rewards/length2tails_reward/std": 0.2843776345252991, + "rewards/thermo_reward/mean": 0.6179081797599792, + "rewards/thermo_reward/std": 1.7444396018981934, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 275.9375, + "completions/mean_terminated_length": 275.9375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.1906990958377719, + "epoch": 2.738, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4926505088806152, + "learning_rate": 4.184953710308492e-08, + "loss": 0.0357, + "num_tokens": 11972491.0, + "reward": 6.4579057693481445, + "reward_std": 3.318767547607422, + "rewards/fitness_reward/mean": 5.573611259460449, + "rewards/fitness_reward/std": 3.2241921424865723, + "rewards/kidney_reward/mean": 0.5792473554611206, + "rewards/kidney_reward/std": 1.259211540222168, + "rewards/length2tails_reward/mean": 0.8268831968307495, + "rewards/length2tails_reward/std": 0.2265845090150833, + "rewards/thermo_reward/mean": 0.7758994698524475, + "rewards/thermo_reward/std": 1.750178337097168, + "step": 1369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 278.625, + "completions/mean_terminated_length": 278.625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.20206231530755758, + "epoch": 2.74, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.437255859375, + "learning_rate": 4.1231560099956095e-08, + "loss": 0.1063, + "num_tokens": 11981439.0, + "reward": 6.66567850112915, + "reward_std": 2.4327332973480225, + "rewards/fitness_reward/mean": 6.116811275482178, + "rewards/fitness_reward/std": 2.162583589553833, + "rewards/kidney_reward/mean": 0.17771363258361816, + "rewards/kidney_reward/std": 1.2348171472549438, + "rewards/length2tails_reward/mean": 0.8009012937545776, + "rewards/length2tails_reward/std": 0.2830302119255066, + "rewards/thermo_reward/mean": 0.5195697546005249, + "rewards/thermo_reward/std": 1.7213000059127808, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.0, + "completions/mean_terminated_length": 269.0, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 0.1291458187624812, + "epoch": 2.742, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2061984539031982, + "learning_rate": 4.061808376817699e-08, + "loss": -0.0276, + "num_tokens": 11990079.0, + "reward": 6.479552268981934, + "reward_std": 2.4343228340148926, + "rewards/fitness_reward/mean": 6.000798225402832, + "rewards/fitness_reward/std": 2.2857935428619385, + "rewards/kidney_reward/mean": 0.061824701726436615, + "rewards/kidney_reward/std": 1.323279857635498, + "rewards/length2tails_reward/mean": 0.7987549304962158, + "rewards/length2tails_reward/std": 0.3239048719406128, + "rewards/thermo_reward/mean": 0.496306836605072, + "rewards/thermo_reward/std": 1.67344069480896, + "step": 1371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 283.9375, + "completions/mean_terminated_length": 268.7742004394531, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.18455950170755386, + "epoch": 2.7439999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.811509132385254, + "learning_rate": 4.0009110987540897e-08, + "loss": 0.1499, + "num_tokens": 11999197.0, + "reward": 6.038165092468262, + "reward_std": 2.731981039047241, + "rewards/fitness_reward/mean": 5.7914323806762695, + "rewards/fitness_reward/std": 2.777022123336792, + "rewards/kidney_reward/mean": 0.3378433585166931, + "rewards/kidney_reward/std": 1.2647459506988525, + "rewards/length2tails_reward/mean": 0.8329230546951294, + "rewards/length2tails_reward/std": 0.2584356665611267, + "rewards/thermo_reward/mean": -0.2608391344547272, + "rewards/thermo_reward/std": 2.0964627265930176, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 284.28125, + "completions/mean_terminated_length": 284.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.19760250486433506, + "epoch": 2.746, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.010517120361328, + "learning_rate": 3.940464461670134e-08, + "loss": 0.1119, + "num_tokens": 12008326.0, + "reward": 6.249445915222168, + "reward_std": 3.315345287322998, + "rewards/fitness_reward/mean": 5.612913608551025, + "rewards/fitness_reward/std": 3.0996086597442627, + "rewards/kidney_reward/mean": 0.6843425035476685, + "rewards/kidney_reward/std": 1.289900302886963, + "rewards/length2tails_reward/mean": 0.7207305431365967, + "rewards/length2tails_reward/std": 0.33688831329345703, + "rewards/thermo_reward/mean": 0.22835737466812134, + "rewards/thermo_reward/std": 1.8605788946151733, + "step": 1373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13593508768826723, + "epoch": 2.748, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8115772008895874, + "learning_rate": 3.8804687493157126e-08, + "loss": 0.0044, + "num_tokens": 12017044.0, + "reward": 6.815313339233398, + "reward_std": 2.307612419128418, + "rewards/fitness_reward/mean": 6.216263294219971, + "rewards/fitness_reward/std": 2.121516466140747, + "rewards/kidney_reward/mean": 0.42642927169799805, + "rewards/kidney_reward/std": 1.4143224954605103, + "rewards/length2tails_reward/mean": 0.7997623682022095, + "rewards/length2tails_reward/std": 0.2172497808933258, + "rewards/thermo_reward/mean": 0.3717886209487915, + "rewards/thermo_reward/std": 1.664808988571167, + "step": 1374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 269.9375, + "completions/mean_terminated_length": 269.9375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13157103024423122, + "epoch": 2.75, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44969069957733154, + "learning_rate": 3.820924243323975e-08, + "loss": 0.0026, + "num_tokens": 12025714.0, + "reward": 7.177164077758789, + "reward_std": 1.0633214712142944, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.20766639709472656, + "rewards/kidney_reward/std": 1.236377239227295, + "rewards/length2tails_reward/mean": 0.7373136878013611, + "rewards/length2tails_reward/std": 0.2896072566509247, + "rewards/thermo_reward/mean": 0.5954080820083618, + "rewards/thermo_reward/std": 1.5263421535491943, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 269.84375, + "completions/mean_terminated_length": 269.84375, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.15400750190019608, + "epoch": 2.752, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8164786100387573, + "learning_rate": 3.761831223210032e-08, + "loss": -0.0365, + "num_tokens": 12034381.0, + "reward": 6.845125198364258, + "reward_std": 2.610117197036743, + "rewards/fitness_reward/mean": 6.20970344543457, + "rewards/fitness_reward/std": 2.1586263179779053, + "rewards/kidney_reward/mean": 0.32128429412841797, + "rewards/kidney_reward/std": 1.3398067951202393, + "rewards/length2tails_reward/mean": 0.8446570038795471, + "rewards/length2tails_reward/std": 0.20664137601852417, + "rewards/thermo_reward/mean": 0.5272300243377686, + "rewards/thermo_reward/std": 1.5364784002304077, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 277.8125, + "completions/mean_terminated_length": 277.8125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.1391786402091384, + "epoch": 2.754, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6257619261741638, + "learning_rate": 3.70318996636958e-08, + "loss": -0.0068, + "num_tokens": 12043303.0, + "reward": 7.029253959655762, + "reward_std": 2.3377981185913086, + "rewards/fitness_reward/mean": 6.306334495544434, + "rewards/fitness_reward/std": 1.6119990348815918, + "rewards/kidney_reward/mean": 0.57759028673172, + "rewards/kidney_reward/std": 1.3743318319320679, + "rewards/length2tails_reward/mean": 0.8022366762161255, + "rewards/length2tails_reward/std": 0.24409647285938263, + "rewards/thermo_reward/mean": 0.467130184173584, + "rewards/thermo_reward/std": 1.8315050601959229, + "step": 1377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13867716770619154, + "epoch": 2.7560000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7776877284049988, + "learning_rate": 3.645000748077709e-08, + "loss": 0.0022, + "num_tokens": 12052015.0, + "reward": 6.741634368896484, + "reward_std": 1.5786770582199097, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.06209002435207367, + "rewards/kidney_reward/std": 1.2694224119186401, + "rewards/length2tails_reward/mean": 0.7945544719696045, + "rewards/length2tails_reward/std": 0.26558464765548706, + "rewards/thermo_reward/mean": 0.37744101881980896, + "rewards/thermo_reward/std": 1.7706300020217896, + "step": 1378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.875, + "completions/mean_terminated_length": 269.875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.14085942599922419, + "epoch": 2.758, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.00225830078125, + "learning_rate": 3.587263841487454e-08, + "loss": -0.019, + "num_tokens": 12060683.0, + "reward": 6.290828704833984, + "reward_std": 2.748077869415283, + "rewards/fitness_reward/mean": 6.014713287353516, + "rewards/fitness_reward/std": 2.212355852127075, + "rewards/kidney_reward/mean": 0.02659381926059723, + "rewards/kidney_reward/std": 1.2283685207366943, + "rewards/length2tails_reward/mean": 0.8128451108932495, + "rewards/length2tails_reward/std": 0.2597150504589081, + "rewards/thermo_reward/mean": 0.11921501904726028, + "rewards/thermo_reward/std": 2.010701894760132, + "step": 1379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1457380848005414, + "epoch": 2.76, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3292429447174072, + "learning_rate": 3.529979517628645e-08, + "loss": 0.0041, + "num_tokens": 12069373.0, + "reward": 6.995060443878174, + "reward_std": 1.3562374114990234, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.35094720125198364, + "rewards/kidney_reward/std": 1.2961034774780273, + "rewards/length2tails_reward/mean": 0.8212077617645264, + "rewards/length2tails_reward/std": 0.2185034602880478, + "rewards/thermo_reward/mean": 0.25195080041885376, + "rewards/thermo_reward/std": 1.8120089769363403, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 277.75, + "completions/mean_terminated_length": 277.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14618656039237976, + "epoch": 2.762, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1789528131484985, + "learning_rate": 3.473148045406582e-08, + "loss": 0.0266, + "num_tokens": 12078293.0, + "reward": 6.593124866485596, + "reward_std": 2.2084155082702637, + "rewards/fitness_reward/mean": 6.240286350250244, + "rewards/fitness_reward/std": 1.9856224060058594, + "rewards/kidney_reward/mean": 0.12119641900062561, + "rewards/kidney_reward/std": 1.3631325960159302, + "rewards/length2tails_reward/mean": 0.78724205493927, + "rewards/length2tails_reward/std": 0.25483232736587524, + "rewards/thermo_reward/mean": 0.19085893034934998, + "rewards/thermo_reward/std": 1.9360727071762085, + "step": 1381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.14091461338102818, + "epoch": 2.7640000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4482347965240479, + "learning_rate": 3.416769691600807e-08, + "loss": -0.0037, + "num_tokens": 12086992.0, + "reward": 7.242701053619385, + "reward_std": 1.267257809638977, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.6575721502304077, + "rewards/kidney_reward/std": 1.2137041091918945, + "rewards/length2tails_reward/mean": 0.8081504702568054, + "rewards/length2tails_reward/std": 0.29068055748939514, + "rewards/thermo_reward/mean": 0.2411583662033081, + "rewards/thermo_reward/std": 1.8590071201324463, + "step": 1382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 267.625, + "completions/mean_terminated_length": 267.625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "entropy": 0.14683069474995136, + "epoch": 2.766, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.383288621902466, + "learning_rate": 3.360844720863765e-08, + "loss": -0.0378, + "num_tokens": 12095588.0, + "reward": 6.943121910095215, + "reward_std": 2.0272014141082764, + "rewards/fitness_reward/mean": 6.256612777709961, + "rewards/fitness_reward/std": 1.8932645320892334, + "rewards/kidney_reward/mean": 0.5298535227775574, + "rewards/kidney_reward/std": 1.389146327972412, + "rewards/length2tails_reward/mean": 0.7345486879348755, + "rewards/length2tails_reward/std": 0.2960755228996277, + "rewards/thermo_reward/mean": 0.4758894443511963, + "rewards/thermo_reward/std": 1.7048187255859375, + "step": 1383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 275.9375, + "completions/mean_terminated_length": 275.9375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.1698553366586566, + "epoch": 2.768, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.133066177368164, + "learning_rate": 3.305373395719646e-08, + "loss": 0.0362, + "num_tokens": 12104450.0, + "reward": 5.9990997314453125, + "reward_std": 3.4583353996276855, + "rewards/fitness_reward/mean": 5.505334854125977, + "rewards/fitness_reward/std": 3.1463253498077393, + "rewards/kidney_reward/mean": 0.25862759351730347, + "rewards/kidney_reward/std": 1.5000079870224, + "rewards/length2tails_reward/mean": 0.790450930595398, + "rewards/length2tails_reward/std": 0.28898459672927856, + "rewards/thermo_reward/mean": 0.3336772322654724, + "rewards/thermo_reward/std": 1.8008426427841187, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 284.34375, + "completions/mean_terminated_length": 284.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.15228595305234194, + "epoch": 2.77, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.83298921585083, + "learning_rate": 3.250355976563157e-08, + "loss": 0.2327, + "num_tokens": 12113581.0, + "reward": 6.601200580596924, + "reward_std": 1.9666966199874878, + "rewards/fitness_reward/mean": 6.232110977172852, + "rewards/fitness_reward/std": 2.031869411468506, + "rewards/kidney_reward/mean": 0.07887567579746246, + "rewards/kidney_reward/std": 1.3335744142532349, + "rewards/length2tails_reward/mean": 0.7148258686065674, + "rewards/length2tails_reward/std": 0.2918531000614166, + "rewards/thermo_reward/mean": 0.30189049243927, + "rewards/thermo_reward/std": 1.894866943359375, + "step": 1385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13052846398204565, + "epoch": 2.7720000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0129870176315308, + "learning_rate": 3.19579272165823e-08, + "loss": 0.0052, + "num_tokens": 12122269.0, + "reward": 6.961553573608398, + "reward_std": 1.0991709232330322, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.24836504459381104, + "rewards/kidney_reward/std": 1.268311858177185, + "rewards/length2tails_reward/mean": 0.750042200088501, + "rewards/length2tails_reward/std": 0.30712106823921204, + "rewards/thermo_reward/mean": 0.11712482571601868, + "rewards/thermo_reward/std": 1.8461724519729614, + "step": 1386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13619148917496204, + "epoch": 2.774, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0559543371200562, + "learning_rate": 3.141683887136892e-08, + "loss": -0.0014, + "num_tokens": 12130977.0, + "reward": 7.015812873840332, + "reward_std": 1.262722134590149, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.23605507612228394, + "rewards/kidney_reward/std": 1.2778043746948242, + "rewards/length2tails_reward/mean": 0.7887672185897827, + "rewards/length2tails_reward/std": 0.30865004658699036, + "rewards/thermo_reward/mean": 0.42456838488578796, + "rewards/thermo_reward/std": 1.6570998430252075, + "step": 1387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1499113915488124, + "epoch": 2.776, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.430222749710083, + "learning_rate": 3.088029726997965e-08, + "loss": 0.025, + "num_tokens": 12139725.0, + "reward": 6.694531440734863, + "reward_std": 2.9209022521972656, + "rewards/fitness_reward/mean": 5.93999719619751, + "rewards/fitness_reward/std": 2.571115493774414, + "rewards/kidney_reward/mean": 0.3971708118915558, + "rewards/kidney_reward/std": 1.3934671878814697, + "rewards/length2tails_reward/mean": 0.7748485803604126, + "rewards/length2tails_reward/std": 0.283083438873291, + "rewards/thermo_reward/mean": 0.7244739532470703, + "rewards/thermo_reward/std": 1.552345633506775, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12718303967267275, + "epoch": 2.778, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5291167497634888, + "learning_rate": 3.0348304931059554e-08, + "loss": -0.0065, + "num_tokens": 12148413.0, + "reward": 7.442338943481445, + "reward_std": 1.2427260875701904, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.5419756770133972, + "rewards/kidney_reward/std": 1.307005524635315, + "rewards/length2tails_reward/mean": 0.7703384757041931, + "rewards/length2tails_reward/std": 0.29194435477256775, + "rewards/thermo_reward/mean": 0.9809144735336304, + "rewards/thermo_reward/std": 1.2797433137893677, + "step": 1389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.137383877299726, + "epoch": 2.7800000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4119565486907959, + "learning_rate": 2.982086435189857e-08, + "loss": 0.004, + "num_tokens": 12157109.0, + "reward": 7.2604522705078125, + "reward_std": 1.3057117462158203, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.29762351512908936, + "rewards/kidney_reward/std": 1.5208665132522583, + "rewards/length2tails_reward/mean": 0.7458254098892212, + "rewards/length2tails_reward/std": 0.30860573053359985, + "rewards/thermo_reward/mean": 0.6677724123001099, + "rewards/thermo_reward/std": 1.7590993642807007, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.34375, + "completions/mean_terminated_length": 269.34375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12867050804197788, + "epoch": 2.782, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6359583735466003, + "learning_rate": 2.92979780084196e-08, + "loss": 0.0005, + "num_tokens": 12165760.0, + "reward": 7.202739715576172, + "reward_std": 1.1364457607269287, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.29861557483673096, + "rewards/kidney_reward/std": 1.287266492843628, + "rewards/length2tails_reward/mean": 0.7322765588760376, + "rewards/length2tails_reward/std": 0.27144795656204224, + "rewards/thermo_reward/mean": 0.558128833770752, + "rewards/thermo_reward/std": 1.5960760116577148, + "step": 1391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 275.15625, + "completions/mean_terminated_length": 275.15625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.2022443925961852, + "epoch": 2.784, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9093527793884277, + "learning_rate": 2.8779648355166775e-08, + "loss": 0.0531, + "num_tokens": 12174597.0, + "reward": 6.351833343505859, + "reward_std": 3.174617290496826, + "rewards/fitness_reward/mean": 5.753119468688965, + "rewards/fitness_reward/std": 2.9245524406433105, + "rewards/kidney_reward/mean": 0.19309160113334656, + "rewards/kidney_reward/std": 1.240492820739746, + "rewards/length2tails_reward/mean": 0.7798339128494263, + "rewards/length2tails_reward/std": 0.26415783166885376, + "rewards/thermo_reward/mean": 0.6144194602966309, + "rewards/thermo_reward/std": 1.660841941833496, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 268.4375, + "completions/mean_terminated_length": 268.4375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.1267564482986927, + "epoch": 2.786, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7434492111206055, + "learning_rate": 2.8265877825294436e-08, + "loss": 0.0, + "num_tokens": 12183219.0, + "reward": 6.738852500915527, + "reward_std": 1.5002015829086304, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.2940756380558014, + "rewards/kidney_reward/std": 1.302889347076416, + "rewards/length2tails_reward/mean": 0.7471587657928467, + "rewards/length2tails_reward/std": 0.32901930809020996, + "rewards/thermo_reward/mean": -0.1665678322315216, + "rewards/thermo_reward/std": 2.0622611045837402, + "step": 1393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12958719301968813, + "epoch": 2.7880000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7161638736724854, + "learning_rate": 2.7756668830555164e-08, + "loss": -0.0016, + "num_tokens": 12191915.0, + "reward": 7.542000770568848, + "reward_std": 1.0370919704437256, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.8532626628875732, + "rewards/kidney_reward/std": 1.4079391956329346, + "rewards/length2tails_reward/mean": 0.7894229888916016, + "rewards/length2tails_reward/std": 0.2670515775680542, + "rewards/thermo_reward/mean": 0.6534300446510315, + "rewards/thermo_reward/std": 1.6680938005447388, + "step": 1394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 277.40625, + "completions/mean_terminated_length": 277.40625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13378724548965693, + "epoch": 2.79, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8356918096542358, + "learning_rate": 2.7252023761288545e-08, + "loss": 0.0526, + "num_tokens": 12200824.0, + "reward": 6.2811808586120605, + "reward_std": 2.6240885257720947, + "rewards/fitness_reward/mean": 5.86088228225708, + "rewards/fitness_reward/std": 2.524839162826538, + "rewards/kidney_reward/mean": 0.5898606181144714, + "rewards/kidney_reward/std": 1.1377627849578857, + "rewards/length2tails_reward/mean": 0.7692900896072388, + "rewards/length2tails_reward/std": 0.31870412826538086, + "rewards/thermo_reward/mean": -0.13390877842903137, + "rewards/thermo_reward/std": 2.1867923736572266, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 280.75, + "completions/mean_terminated_length": 280.75, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1583602288737893, + "epoch": 2.792, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.360617637634277, + "learning_rate": 2.6751944986410534e-08, + "loss": 0.1423, + "num_tokens": 12209840.0, + "reward": 6.132345199584961, + "reward_std": 3.3895745277404785, + "rewards/fitness_reward/mean": 5.9402971267700195, + "rewards/fitness_reward/std": 2.581296682357788, + "rewards/kidney_reward/mean": 0.1940900981426239, + "rewards/kidney_reward/std": 1.3034378290176392, + "rewards/length2tails_reward/mean": 0.722872257232666, + "rewards/length2tails_reward/std": 0.3282071053981781, + "rewards/thermo_reward/mean": -0.17143036425113678, + "rewards/thermo_reward/std": 2.0448641777038574, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13323568273335695, + "epoch": 2.794, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7109805345535278, + "learning_rate": 2.6256434853401454e-08, + "loss": -0.0028, + "num_tokens": 12218538.0, + "reward": 6.803411483764648, + "reward_std": 1.3989784717559814, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.022500373423099518, + "rewards/kidney_reward/std": 1.3823429346084595, + "rewards/length2tails_reward/mean": 0.7946922779083252, + "rewards/length2tails_reward/std": 0.2838398218154907, + "rewards/thermo_reward/mean": 0.4613376259803772, + "rewards/thermo_reward/std": 1.6504029035568237, + "step": 1397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 274.90625, + "completions/mean_terminated_length": 274.90625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14801400527358055, + "epoch": 2.7960000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2296959161758423, + "learning_rate": 2.576549568829578e-08, + "loss": 0.0136, + "num_tokens": 12227367.0, + "reward": 7.0101752281188965, + "reward_std": 1.5851190090179443, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": 0.46369969844818115, + "rewards/kidney_reward/std": 1.3782895803451538, + "rewards/length2tails_reward/mean": 0.7520022392272949, + "rewards/length2tails_reward/std": 0.2987624704837799, + "rewards/thermo_reward/mean": 0.6159876585006714, + "rewards/thermo_reward/std": 1.5296481847763062, + "step": 1398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14190233405679464, + "epoch": 2.798, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9372859597206116, + "learning_rate": 2.5279129795670374e-08, + "loss": 0.0, + "num_tokens": 12236062.0, + "reward": 6.936478614807129, + "reward_std": 1.4244898557662964, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.21702787280082703, + "rewards/kidney_reward/std": 1.4649279117584229, + "rewards/length2tails_reward/mean": 0.7701707482337952, + "rewards/length2tails_reward/std": 0.30488836765289307, + "rewards/thermo_reward/mean": 0.2942262887954712, + "rewards/thermo_reward/std": 1.9266523122787476, + "step": 1399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 274.46875, + "completions/mean_terminated_length": 274.46875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.15141561813652515, + "epoch": 2.8, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.337566375732422, + "learning_rate": 2.4797339458634714e-08, + "loss": 0.0526, + "num_tokens": 12244877.0, + "reward": 6.4765801429748535, + "reward_std": 2.632657766342163, + "rewards/fitness_reward/mean": 5.952982425689697, + "rewards/fitness_reward/std": 2.5230064392089844, + "rewards/kidney_reward/mean": 0.40527093410491943, + "rewards/kidney_reward/std": 1.2532072067260742, + "rewards/length2tails_reward/mean": 0.8602679967880249, + "rewards/length2tails_reward/std": 0.12169806659221649, + "rewards/thermo_reward/mean": 0.21179035305976868, + "rewards/thermo_reward/std": 1.5593935251235962, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 274.5, + "completions/mean_terminated_length": 274.5, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "entropy": 0.16778626758605242, + "epoch": 2.802, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.920297861099243, + "learning_rate": 2.4320126938819018e-08, + "loss": 0.0746, + "num_tokens": 12253693.0, + "reward": 6.534785270690918, + "reward_std": 3.377420663833618, + "rewards/fitness_reward/mean": 5.569459915161133, + "rewards/fitness_reward/std": 3.255430221557617, + "rewards/kidney_reward/mean": 0.47506052255630493, + "rewards/kidney_reward/std": 1.2883639335632324, + "rewards/length2tails_reward/mean": 0.7930936813354492, + "rewards/length2tails_reward/std": 0.25949952006340027, + "rewards/thermo_reward/mean": 1.059043526649475, + "rewards/thermo_reward/std": 1.2792052030563354, + "step": 1401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 274.5625, + "completions/mean_terminated_length": 274.5625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.15403460152447224, + "epoch": 2.8040000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4414594173431396, + "learning_rate": 2.3847494476364805e-08, + "loss": 0.0648, + "num_tokens": 12262511.0, + "reward": 6.508338928222656, + "reward_std": 2.739027976989746, + "rewards/fitness_reward/mean": 6.223488807678223, + "rewards/fitness_reward/std": 2.0806422233581543, + "rewards/kidney_reward/mean": 0.3197479844093323, + "rewards/kidney_reward/std": 1.200230360031128, + "rewards/length2tails_reward/mean": 0.7983761429786682, + "rewards/length2tails_reward/std": 0.2772468328475952, + "rewards/thermo_reward/mean": -0.14923696219921112, + "rewards/thermo_reward/std": 2.14689302444458, + "step": 1402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 277.53125, + "completions/mean_terminated_length": 277.53125, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.14836242888122797, + "epoch": 2.806, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7786521911621094, + "learning_rate": 2.337944428991334e-08, + "loss": 0.088, + "num_tokens": 12271424.0, + "reward": 6.458310604095459, + "reward_std": 2.7207114696502686, + "rewards/fitness_reward/mean": 5.911310195922852, + "rewards/fitness_reward/std": 2.682255983352661, + "rewards/kidney_reward/mean": 0.2806072235107422, + "rewards/kidney_reward/std": 1.3979253768920898, + "rewards/length2tails_reward/mean": 0.828407883644104, + "rewards/length2tails_reward/std": 0.19976463913917542, + "rewards/thermo_reward/mean": 0.3991893529891968, + "rewards/thermo_reward/std": 1.706862449645996, + "step": 1403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 265.125, + "completions/mean_terminated_length": 265.125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.16497320402413607, + "epoch": 2.808, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5587619543075562, + "learning_rate": 2.2915978576595885e-08, + "loss": -0.0788, + "num_tokens": 12279940.0, + "reward": 6.309449195861816, + "reward_std": 2.909780740737915, + "rewards/fitness_reward/mean": 5.844907283782959, + "rewards/fitness_reward/std": 2.5762150287628174, + "rewards/kidney_reward/mean": -0.04822586476802826, + "rewards/kidney_reward/std": 1.346443772315979, + "rewards/length2tails_reward/mean": 0.8379120826721191, + "rewards/length2tails_reward/std": 0.24232549965381622, + "rewards/thermo_reward/mean": 0.5583528280258179, + "rewards/thermo_reward/std": 1.6966190338134766, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 267.46875, + "completions/mean_terminated_length": 267.46875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.16020658891648054, + "epoch": 2.81, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3049333095550537, + "learning_rate": 2.2457099512023124e-08, + "loss": -0.066, + "num_tokens": 12288531.0, + "reward": 6.542539119720459, + "reward_std": 2.297462224960327, + "rewards/fitness_reward/mean": 6.245963096618652, + "rewards/fitness_reward/std": 1.95350980758667, + "rewards/kidney_reward/mean": 0.05394989252090454, + "rewards/kidney_reward/std": 1.2599544525146484, + "rewards/length2tails_reward/mean": 0.8381986021995544, + "rewards/length2tails_reward/std": 0.2220676988363266, + "rewards/thermo_reward/mean": 0.12010223418474197, + "rewards/thermo_reward/std": 1.84904146194458, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14450675528496504, + "epoch": 2.8120000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43417951464653015, + "learning_rate": 2.200280925027498e-08, + "loss": -0.0011, + "num_tokens": 12297227.0, + "reward": 7.2437944412231445, + "reward_std": 0.9547690749168396, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.11382076889276505, + "rewards/kidney_reward/std": 1.4074387550354004, + "rewards/length2tails_reward/mean": 0.8178130388259888, + "rewards/length2tails_reward/std": 0.22537489235401154, + "rewards/thermo_reward/mean": 0.7822650671005249, + "rewards/thermo_reward/std": 1.5678163766860962, + "step": 1406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.25, + "completions/mean_terminated_length": 269.25, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1362843131646514, + "epoch": 2.814, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6944310665130615, + "learning_rate": 2.15531099238907e-08, + "loss": 0.0004, + "num_tokens": 12305875.0, + "reward": 6.629751205444336, + "reward_std": 1.753425121307373, + "rewards/fitness_reward/mean": 6.293053150177002, + "rewards/fitness_reward/std": 1.6871278285980225, + "rewards/kidney_reward/mean": 0.29153525829315186, + "rewards/kidney_reward/std": 1.1796852350234985, + "rewards/length2tails_reward/mean": 0.6542400121688843, + "rewards/length2tails_reward/std": 0.3630537986755371, + "rewards/thermo_reward/mean": 0.0547400638461113, + "rewards/thermo_reward/std": 1.8363370895385742, + "step": 1407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 274.125, + "completions/mean_terminated_length": 274.125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.1543297441676259, + "epoch": 2.816, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8271833062171936, + "learning_rate": 2.110800364385812e-08, + "loss": -0.0063, + "num_tokens": 12314679.0, + "reward": 6.467617034912109, + "reward_std": 2.2177252769470215, + "rewards/fitness_reward/mean": 6.084590911865234, + "rewards/fitness_reward/std": 1.8501527309417725, + "rewards/kidney_reward/mean": -0.03308005630970001, + "rewards/kidney_reward/std": 1.3994696140289307, + "rewards/length2tails_reward/mean": 0.8121023178100586, + "rewards/length2tails_reward/std": 0.2523571848869324, + "rewards/thermo_reward/mean": 0.39308077096939087, + "rewards/thermo_reward/std": 1.8132723569869995, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.1487204898148775, + "epoch": 2.818, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4872162342071533, + "learning_rate": 2.0667492499604977e-08, + "loss": 0.0067, + "num_tokens": 12323373.0, + "reward": 6.872705459594727, + "reward_std": 1.279822826385498, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.21303850412368774, + "rewards/kidney_reward/std": 1.252140760421753, + "rewards/length2tails_reward/mean": 0.7515128254890442, + "rewards/length2tails_reward/std": 0.2889956533908844, + "rewards/thermo_reward/mean": 0.812053382396698, + "rewards/thermo_reward/std": 1.6321959495544434, + "step": 1409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 274.15625, + "completions/mean_terminated_length": 274.15625, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.1653293939307332, + "epoch": 2.82, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.761055588722229, + "learning_rate": 2.023157855898794e-08, + "loss": -0.0195, + "num_tokens": 12332178.0, + "reward": 7.164825439453125, + "reward_std": 2.2404799461364746, + "rewards/fitness_reward/mean": 6.1121063232421875, + "rewards/fitness_reward/std": 2.1882286071777344, + "rewards/kidney_reward/mean": 0.5506860017776489, + "rewards/kidney_reward/std": 1.3578096628189087, + "rewards/length2tails_reward/mean": 0.7617408037185669, + "rewards/length2tails_reward/std": 0.2711496353149414, + "rewards/thermo_reward/mean": 1.1738829612731934, + "rewards/thermo_reward/std": 1.1864036321640015, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.13870614860206842, + "epoch": 2.822, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5952174067497253, + "learning_rate": 1.9800263868283707e-08, + "loss": -0.0017, + "num_tokens": 12340893.0, + "reward": 6.992000579833984, + "reward_std": 1.3440306186676025, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.2195071578025818, + "rewards/kidney_reward/std": 1.300525188446045, + "rewards/length2tails_reward/mean": 0.8274776339530945, + "rewards/length2tails_reward/std": 0.19963759183883667, + "rewards/thermo_reward/mean": 0.3741362988948822, + "rewards/thermo_reward/std": 1.6312953233718872, + "step": 1411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.9375, + "completions/mean_terminated_length": 269.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14124279841780663, + "epoch": 2.824, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47751161456108093, + "learning_rate": 1.937355045217892e-08, + "loss": 0.0034, + "num_tokens": 12349563.0, + "reward": 6.788661479949951, + "reward_std": 1.5129404067993164, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.11229953169822693, + "rewards/kidney_reward/std": 1.5040749311447144, + "rewards/length2tails_reward/mean": 0.7822368741035461, + "rewards/length2tails_reward/std": 0.2576814889907837, + "rewards/thermo_reward/mean": 0.09728699922561646, + "rewards/thermo_reward/std": 1.8031628131866455, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13555827178061008, + "epoch": 2.826, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6221144795417786, + "learning_rate": 1.8951440313760836e-08, + "loss": 0.0045, + "num_tokens": 12358253.0, + "reward": 6.719213485717773, + "reward_std": 1.7934808731079102, + "rewards/fitness_reward/mean": 6.209277153015137, + "rewards/fitness_reward/std": 1.6648041009902954, + "rewards/kidney_reward/mean": 0.1541314423084259, + "rewards/kidney_reward/std": 1.4177526235580444, + "rewards/length2tails_reward/mean": 0.8040240406990051, + "rewards/length2tails_reward/std": 0.2245865911245346, + "rewards/thermo_reward/mean": 0.4637297987937927, + "rewards/thermo_reward/std": 1.6221325397491455, + "step": 1413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14387110993266106, + "epoch": 2.828, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7101742625236511, + "learning_rate": 1.8533935434507876e-08, + "loss": -0.0037, + "num_tokens": 12367010.0, + "reward": 7.285333156585693, + "reward_std": 1.122195839881897, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.2577993869781494, + "rewards/kidney_reward/std": 1.5215656757354736, + "rewards/length2tails_reward/mean": 0.824920117855072, + "rewards/length2tails_reward/std": 0.18817827105522156, + "rewards/thermo_reward/mean": 0.7178107500076294, + "rewards/thermo_reward/std": 1.4351853132247925, + "step": 1414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13892004918307066, + "epoch": 2.83, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.729738473892212, + "learning_rate": 1.8121037774280314e-08, + "loss": 0.0195, + "num_tokens": 12375768.0, + "reward": 6.714914321899414, + "reward_std": 2.297588348388672, + "rewards/fitness_reward/mean": 6.239622116088867, + "rewards/fitness_reward/std": 1.9893792867660522, + "rewards/kidney_reward/mean": 0.17417140305042267, + "rewards/kidney_reward/std": 1.349327564239502, + "rewards/length2tails_reward/mean": 0.8263484239578247, + "rewards/length2tails_reward/std": 0.21277137100696564, + "rewards/thermo_reward/mean": 0.36323752999305725, + "rewards/thermo_reward/std": 1.7828742265701294, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 0.16235277615487576, + "epoch": 2.832, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.621870994567871, + "learning_rate": 1.771274927131139e-08, + "loss": -0.0292, + "num_tokens": 12384469.0, + "reward": 6.7870354652404785, + "reward_std": 2.7964541912078857, + "rewards/fitness_reward/mean": 5.911896705627441, + "rewards/fitness_reward/std": 2.680070161819458, + "rewards/kidney_reward/mean": 0.34682175517082214, + "rewards/kidney_reward/std": 1.0858098268508911, + "rewards/length2tails_reward/mean": 0.8713294267654419, + "rewards/length2tails_reward/std": 0.21592596173286438, + "rewards/thermo_reward/mean": 0.9677914977073669, + "rewards/thermo_reward/std": 1.4624353647232056, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.78125, + "completions/mean_terminated_length": 269.78125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13292887713760138, + "epoch": 2.834, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8371862769126892, + "learning_rate": 1.7309071842197543e-08, + "loss": 0.0056, + "num_tokens": 12393134.0, + "reward": 6.806947708129883, + "reward_std": 2.350537061691284, + "rewards/fitness_reward/mean": 6.2250657081604, + "rewards/fitness_reward/std": 2.0717239379882812, + "rewards/kidney_reward/mean": 0.08140504360198975, + "rewards/kidney_reward/std": 1.4241693019866943, + "rewards/length2tails_reward/mean": 0.7418514490127563, + "rewards/length2tails_reward/std": 0.3014945387840271, + "rewards/thermo_reward/mean": 0.7114340662956238, + "rewards/thermo_reward/std": 1.4445003271102905, + "step": 1417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "entropy": 0.14086098410189152, + "epoch": 2.836, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.49018678069114685, + "learning_rate": 1.6910007381890078e-08, + "loss": -0.0061, + "num_tokens": 12401846.0, + "reward": 6.921857833862305, + "reward_std": 2.269130229949951, + "rewards/fitness_reward/mean": 6.186952590942383, + "rewards/fitness_reward/std": 1.7836624383926392, + "rewards/kidney_reward/mean": 0.37685301899909973, + "rewards/kidney_reward/std": 1.334747076034546, + "rewards/length2tails_reward/mean": 0.7858234643936157, + "rewards/length2tails_reward/std": 0.300854355096817, + "rewards/thermo_reward/mean": 0.7000460028648376, + "rewards/thermo_reward/std": 1.331705927848816, + "step": 1418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1357526509091258, + "epoch": 2.838, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0056061744689941, + "learning_rate": 1.6515557763685962e-08, + "loss": 0.0029, + "num_tokens": 12410529.0, + "reward": 6.92407751083374, + "reward_std": 1.1765470504760742, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.1278037428855896, + "rewards/kidney_reward/std": 1.2013734579086304, + "rewards/length2tails_reward/mean": 0.7471013069152832, + "rewards/length2tails_reward/std": 0.2978944778442383, + "rewards/thermo_reward/mean": 0.16420452296733856, + "rewards/thermo_reward/std": 1.8253247737884521, + "step": 1419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.14655262231826782, + "epoch": 2.84, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7096866369247437, + "learning_rate": 1.6125724839219035e-08, + "loss": -0.005, + "num_tokens": 12419201.0, + "reward": 6.932196617126465, + "reward_std": 2.394421339035034, + "rewards/fitness_reward/mean": 6.235196590423584, + "rewards/fitness_reward/std": 2.0144150257110596, + "rewards/kidney_reward/mean": 0.300119012594223, + "rewards/kidney_reward/std": 1.4254791736602783, + "rewards/length2tails_reward/mean": 0.7911103963851929, + "rewards/length2tails_reward/std": 0.30017784237861633, + "rewards/thermo_reward/mean": 0.6983258724212646, + "rewards/thermo_reward/std": 1.608771800994873, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 267.625, + "completions/mean_terminated_length": 267.625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.17857135366648436, + "epoch": 2.842, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.670794129371643, + "learning_rate": 1.574051043845137e-08, + "loss": -0.0435, + "num_tokens": 12427797.0, + "reward": 5.834834098815918, + "reward_std": 3.735726833343506, + "rewards/fitness_reward/mean": 5.490056037902832, + "rewards/fitness_reward/std": 3.176297187805176, + "rewards/kidney_reward/mean": 0.4961375296115875, + "rewards/kidney_reward/std": 1.4461764097213745, + "rewards/length2tails_reward/mean": 0.8003699779510498, + "rewards/length2tails_reward/std": 0.2881245017051697, + "rewards/thermo_reward/mean": -0.20676587522029877, + "rewards/thermo_reward/std": 2.2276103496551514, + "step": 1421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.125218759290874, + "epoch": 2.844, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6695669889450073, + "learning_rate": 1.5359916369664607e-08, + "loss": -0.0012, + "num_tokens": 12436481.0, + "reward": 7.43724250793457, + "reward_std": 1.19163179397583, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.5621359348297119, + "rewards/kidney_reward/std": 1.319887399673462, + "rewards/length2tails_reward/mean": 0.7570602297782898, + "rewards/length2tails_reward/std": 0.28190967440605164, + "rewards/thermo_reward/mean": 0.7512223720550537, + "rewards/thermo_reward/std": 1.505037784576416, + "step": 1422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 266.75, + "completions/mean_terminated_length": 266.75, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "entropy": 0.1462242305278778, + "epoch": 2.846, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.253634214401245, + "learning_rate": 1.498394441945161e-08, + "loss": -0.0045, + "num_tokens": 12445049.0, + "reward": 6.654323577880859, + "reward_std": 2.183377742767334, + "rewards/fitness_reward/mean": 6.1567816734313965, + "rewards/fitness_reward/std": 1.9457803964614868, + "rewards/kidney_reward/mean": 0.3069812059402466, + "rewards/kidney_reward/std": 1.575376272201538, + "rewards/length2tails_reward/mean": 0.721645712852478, + "rewards/length2tails_reward/std": 0.3427678644657135, + "rewards/thermo_reward/mean": 0.32727959752082825, + "rewards/thermo_reward/std": 1.682885766029358, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.15394303388893604, + "epoch": 2.848, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3760063648223877, + "learning_rate": 1.4612596352708061e-08, + "loss": -0.0436, + "num_tokens": 12453726.0, + "reward": 6.9324445724487305, + "reward_std": 2.6650846004486084, + "rewards/fitness_reward/mean": 6.035008430480957, + "rewards/fitness_reward/std": 2.105931520462036, + "rewards/kidney_reward/mean": 0.6347125768661499, + "rewards/kidney_reward/std": 1.5480690002441406, + "rewards/length2tails_reward/mean": 0.7982813715934753, + "rewards/length2tails_reward/std": 0.2665309011936188, + "rewards/thermo_reward/mean": 0.7610183954238892, + "rewards/thermo_reward/std": 1.3520636558532715, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 269.8125, + "completions/mean_terminated_length": 269.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11911861784756184, + "epoch": 2.85, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5578207969665527, + "learning_rate": 1.4245873912623774e-08, + "loss": -0.0045, + "num_tokens": 12462392.0, + "reward": 6.748727321624756, + "reward_std": 2.153978109359741, + "rewards/fitness_reward/mean": 6.093750953674316, + "rewards/fitness_reward/std": 1.803702473640442, + "rewards/kidney_reward/mean": 0.4472007751464844, + "rewards/kidney_reward/std": 1.362303614616394, + "rewards/length2tails_reward/mean": 0.7099416255950928, + "rewards/length2tails_reward/std": 0.31876564025878906, + "rewards/thermo_reward/mean": 0.5077807903289795, + "rewards/thermo_reward/std": 1.657357931137085, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14151615649461746, + "epoch": 2.852, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4380311369895935, + "learning_rate": 1.3883778820675263e-08, + "loss": 0.0031, + "num_tokens": 12471127.0, + "reward": 7.1902618408203125, + "reward_std": 1.3406881093978882, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.6640706062316895, + "rewards/kidney_reward/std": 1.2684632539749146, + "rewards/length2tails_reward/mean": 0.8558312654495239, + "rewards/length2tails_reward/std": 0.19466395676136017, + "rewards/thermo_reward/mean": 0.10594163089990616, + "rewards/thermo_reward/std": 1.9827232360839844, + "step": 1426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 276.0625, + "completions/mean_terminated_length": 276.0625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.13970914110541344, + "epoch": 2.854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7682489156723022, + "learning_rate": 1.3526312776617088e-08, + "loss": -0.0136, + "num_tokens": 12479993.0, + "reward": 6.941483497619629, + "reward_std": 1.218990683555603, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.34110909700393677, + "rewards/kidney_reward/std": 1.41777503490448, + "rewards/length2tails_reward/mean": 0.7707187533378601, + "rewards/length2tails_reward/std": 0.2676590085029602, + "rewards/thermo_reward/mean": 0.17987973988056183, + "rewards/thermo_reward/std": 1.8115893602371216, + "step": 1427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 276.65625, + "completions/mean_terminated_length": 276.65625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14775683544576168, + "epoch": 2.856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.730597198009491, + "learning_rate": 1.3173477458473859e-08, + "loss": -0.022, + "num_tokens": 12488878.0, + "reward": 7.058040618896484, + "reward_std": 1.3192298412322998, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.15102064609527588, + "rewards/kidney_reward/std": 1.2240631580352783, + "rewards/length2tails_reward/mean": 0.7774417400360107, + "rewards/length2tails_reward/std": 0.29362088441848755, + "rewards/thermo_reward/mean": 0.5997215509414673, + "rewards/thermo_reward/std": 1.765357494354248, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.13810347020626068, + "epoch": 2.858, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7822426557540894, + "learning_rate": 1.2825274522532792e-08, + "loss": -0.0246, + "num_tokens": 12497638.0, + "reward": 6.468446731567383, + "reward_std": 2.949239492416382, + "rewards/fitness_reward/mean": 5.919968128204346, + "rewards/fitness_reward/std": 2.6634886264801025, + "rewards/kidney_reward/mean": 0.3845357894897461, + "rewards/kidney_reward/std": 1.3434585332870483, + "rewards/length2tails_reward/mean": 0.6921239495277405, + "rewards/length2tails_reward/std": 0.33831697702407837, + "rewards/thermo_reward/mean": 0.36635881662368774, + "rewards/thermo_reward/std": 1.6304407119750977, + "step": 1429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.13473457004874945, + "epoch": 2.86, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6593548655509949, + "learning_rate": 1.2481705603335501e-08, + "loss": 0.0023, + "num_tokens": 12506350.0, + "reward": 7.10593318939209, + "reward_std": 1.6469697952270508, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.42517513036727905, + "rewards/kidney_reward/std": 1.2692887783050537, + "rewards/length2tails_reward/mean": 0.7936328649520874, + "rewards/length2tails_reward/std": 0.23213301599025726, + "rewards/thermo_reward/mean": 0.4132559597492218, + "rewards/thermo_reward/std": 1.9096404314041138, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13170104566961527, + "epoch": 2.862, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9489842057228088, + "learning_rate": 1.214277231367078e-08, + "loss": 0.0033, + "num_tokens": 12515038.0, + "reward": 7.002340793609619, + "reward_std": 1.231827974319458, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.01643287017941475, + "rewards/kidney_reward/std": 1.2471287250518799, + "rewards/length2tails_reward/mean": 0.7688412666320801, + "rewards/length2tails_reward/std": 0.25632792711257935, + "rewards/thermo_reward/mean": 0.45409780740737915, + "rewards/thermo_reward/std": 1.8726102113723755, + "step": 1431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 270.625, + "completions/mean_terminated_length": 270.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13555537443608046, + "epoch": 2.864, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1965844631195068, + "learning_rate": 1.1808476244566268e-08, + "loss": -0.0016, + "num_tokens": 12523730.0, + "reward": 7.308147430419922, + "reward_std": 1.1990786790847778, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.18481695652008057, + "rewards/kidney_reward/std": 1.4109687805175781, + "rewards/length2tails_reward/mean": 0.7541860342025757, + "rewards/length2tails_reward/std": 0.28595858812332153, + "rewards/thermo_reward/mean": 0.8717880845069885, + "rewards/thermo_reward/std": 1.522525429725647, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 267.78125, + "completions/mean_terminated_length": 267.78125, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "entropy": 0.12667144183069468, + "epoch": 2.866, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2155684232711792, + "learning_rate": 1.1478818965281911e-08, + "loss": -0.0025, + "num_tokens": 12532331.0, + "reward": 6.8437323570251465, + "reward_std": 1.9640074968338013, + "rewards/fitness_reward/mean": 6.286189079284668, + "rewards/fitness_reward/std": 1.7259576320648193, + "rewards/kidney_reward/mean": 0.3497192859649658, + "rewards/kidney_reward/std": 1.3481497764587402, + "rewards/length2tails_reward/mean": 0.7529022693634033, + "rewards/length2tails_reward/std": 0.27475279569625854, + "rewards/thermo_reward/mean": 0.3889157176017761, + "rewards/thermo_reward/std": 1.590811848640442, + "step": 1433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.17365591693669558, + "epoch": 2.868, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2700657844543457, + "learning_rate": 1.1153802023301739e-08, + "loss": 0.0251, + "num_tokens": 12541057.0, + "reward": 6.711121559143066, + "reward_std": 2.7307705879211426, + "rewards/fitness_reward/mean": 5.908278465270996, + "rewards/fitness_reward/std": 2.2784316539764404, + "rewards/kidney_reward/mean": 0.4974465072154999, + "rewards/kidney_reward/std": 1.3248136043548584, + "rewards/length2tails_reward/mean": 0.8110289573669434, + "rewards/length2tails_reward/std": 0.2779410779476166, + "rewards/thermo_reward/mean": 0.7027250528335571, + "rewards/thermo_reward/std": 1.4618151187896729, + "step": 1434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12462200038135052, + "epoch": 2.87, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0973374843597412, + "learning_rate": 1.083342694432754e-08, + "loss": -0.002, + "num_tokens": 12549721.0, + "reward": 6.924505233764648, + "reward_std": 1.9361951351165771, + "rewards/fitness_reward/mean": 6.004807472229004, + "rewards/fitness_reward/std": 1.800040602684021, + "rewards/kidney_reward/mean": 0.726659893989563, + "rewards/kidney_reward/std": 1.276934027671814, + "rewards/length2tails_reward/mean": 0.720918595790863, + "rewards/length2tails_reward/std": 0.2926802635192871, + "rewards/thermo_reward/mean": 0.7522759437561035, + "rewards/thermo_reward/std": 1.4213367700576782, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 290.1875, + "completions/mean_terminated_length": 290.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.18476249743252993, + "epoch": 2.872, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9985179901123047, + "learning_rate": 1.0517695232270752e-08, + "loss": 0.1886, + "num_tokens": 12559039.0, + "reward": 6.432023048400879, + "reward_std": 3.3190929889678955, + "rewards/fitness_reward/mean": 5.785904884338379, + "rewards/fitness_reward/std": 2.79811954498291, + "rewards/kidney_reward/mean": 0.18803167343139648, + "rewards/kidney_reward/std": 1.4805896282196045, + "rewards/length2tails_reward/mean": 0.7879659533500671, + "rewards/length2tails_reward/std": 0.24849654734134674, + "rewards/thermo_reward/mean": 0.7102215886116028, + "rewards/thermo_reward/std": 1.6738128662109375, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "entropy": 0.13494328875094652, + "epoch": 2.874, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7340171337127686, + "learning_rate": 1.0206608369245806e-08, + "loss": 0.0663, + "num_tokens": 12567800.0, + "reward": 6.821213245391846, + "reward_std": 2.2489399909973145, + "rewards/fitness_reward/mean": 6.233017921447754, + "rewards/fitness_reward/std": 2.0267374515533447, + "rewards/kidney_reward/mean": 0.14065757393836975, + "rewards/kidney_reward/std": 1.215805172920227, + "rewards/length2tails_reward/mean": 0.7271273136138916, + "rewards/length2tails_reward/std": 0.3262440264225006, + "rewards/thermo_reward/mean": 0.6721683740615845, + "rewards/thermo_reward/std": 1.7998899221420288, + "step": 1437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 268.46875, + "completions/mean_terminated_length": 268.46875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.13407836388796568, + "epoch": 2.876, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.962937355041504, + "learning_rate": 9.900167815563464e-09, + "loss": -0.0086, + "num_tokens": 12576423.0, + "reward": 6.002249717712402, + "reward_std": 3.873012065887451, + "rewards/fitness_reward/mean": 5.448002815246582, + "rewards/fitness_reward/std": 3.3158228397369385, + "rewards/kidney_reward/mean": 0.48057061433792114, + "rewards/kidney_reward/std": 1.3584699630737305, + "rewards/length2tails_reward/mean": 0.7161372900009155, + "rewards/length2tails_reward/std": 0.27225834131240845, + "rewards/thermo_reward/mean": 0.2698539197444916, + "rewards/thermo_reward/std": 1.8259103298187256, + "step": 1438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 270.21875, + "completions/mean_terminated_length": 270.21875, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.14022604562342167, + "epoch": 2.878, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5946846008300781, + "learning_rate": 9.598375009723602e-09, + "loss": -0.0095, + "num_tokens": 12585102.0, + "reward": 6.442773342132568, + "reward_std": 2.6569581031799316, + "rewards/fitness_reward/mean": 6.1227264404296875, + "rewards/fitness_reward/std": 2.130378484725952, + "rewards/kidney_reward/mean": 0.052609775215387344, + "rewards/kidney_reward/std": 1.4129655361175537, + "rewards/length2tails_reward/mean": 0.816627562046051, + "rewards/length2tails_reward/std": 0.23909829556941986, + "rewards/thermo_reward/mean": 0.17917059361934662, + "rewards/thermo_reward/std": 1.7966289520263672, + "step": 1439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.1875, + "completions/mean_terminated_length": 270.1875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "entropy": 0.13946919236332178, + "epoch": 2.88, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7692407369613647, + "learning_rate": 9.301231368408324e-09, + "loss": 0.0, + "num_tokens": 12593780.0, + "reward": 7.400271415710449, + "reward_std": 1.1493571996688843, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.36759209632873535, + "rewards/kidney_reward/std": 1.2643353939056396, + "rewards/length2tails_reward/mean": 0.8107408881187439, + "rewards/length2tails_reward/std": 0.21436426043510437, + "rewards/thermo_reward/mean": 0.8449838161468506, + "rewards/thermo_reward/std": 1.5716559886932373, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14793408382683992, + "epoch": 2.882, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6871620416641235, + "learning_rate": 9.008738286475748e-09, + "loss": 0.017, + "num_tokens": 12602531.0, + "reward": 6.514265060424805, + "reward_std": 2.082831621170044, + "rewards/fitness_reward/mean": 5.958763599395752, + "rewards/fitness_reward/std": 2.023963689804077, + "rewards/kidney_reward/mean": 0.38745027780532837, + "rewards/kidney_reward/std": 1.3079309463500977, + "rewards/length2tails_reward/mean": 0.8158104419708252, + "rewards/length2tails_reward/std": 0.25582075119018555, + "rewards/thermo_reward/mean": 0.3156468868255615, + "rewards/thermo_reward/std": 1.5188398361206055, + "step": 1441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "entropy": 0.15060217771679163, + "epoch": 2.884, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5781137347221375, + "learning_rate": 8.72089713695312e-09, + "loss": 0.0052, + "num_tokens": 12611279.0, + "reward": 7.3282928466796875, + "reward_std": 0.9550822973251343, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.20819398760795593, + "rewards/kidney_reward/std": 1.2715450525283813, + "rewards/length2tails_reward/mean": 0.7916375398635864, + "rewards/length2tails_reward/std": 0.2064710110425949, + "rewards/thermo_reward/mean": 0.869976282119751, + "rewards/thermo_reward/std": 1.2325146198272705, + "step": 1442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 268.71875, + "completions/mean_terminated_length": 268.71875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.14390655141323805, + "epoch": 2.886, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1622700691223145, + "learning_rate": 8.437709271030602e-09, + "loss": -0.0256, + "num_tokens": 12619910.0, + "reward": 6.496492385864258, + "reward_std": 2.8575756549835205, + "rewards/fitness_reward/mean": 6.00910758972168, + "rewards/fitness_reward/std": 2.2418978214263916, + "rewards/kidney_reward/mean": 0.38257062435150146, + "rewards/kidney_reward/std": 1.1316463947296143, + "rewards/length2tails_reward/mean": 0.7363088726997375, + "rewards/length2tails_reward/std": 0.3371676206588745, + "rewards/thermo_reward/mean": 0.22404572367668152, + "rewards/thermo_reward/std": 1.937560796737671, + "step": 1443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14074919559061527, + "epoch": 2.888, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5030248165130615, + "learning_rate": 8.159176018054714e-09, + "loss": 0.0047, + "num_tokens": 12628587.0, + "reward": 7.095517158508301, + "reward_std": 1.5111889839172363, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.42837631702423096, + "rewards/kidney_reward/std": 1.3469523191452026, + "rewards/length2tails_reward/mean": 0.7558008432388306, + "rewards/length2tails_reward/std": 0.28064581751823425, + "rewards/thermo_reward/mean": 0.40813910961151123, + "rewards/thermo_reward/std": 1.8127702474594116, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13455157168209553, + "epoch": 2.89, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6451231837272644, + "learning_rate": 7.885298685522235e-09, + "loss": -0.011, + "num_tokens": 12637334.0, + "reward": 7.2803802490234375, + "reward_std": 0.9431139230728149, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.2864418625831604, + "rewards/kidney_reward/std": 1.3635408878326416, + "rewards/length2tails_reward/mean": 0.7700046300888062, + "rewards/length2tails_reward/std": 0.27584001421928406, + "rewards/thermo_reward/mean": 0.7067201137542725, + "rewards/thermo_reward/std": 1.4738563299179077, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14169297832995653, + "epoch": 2.892, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0626550912857056, + "learning_rate": 7.616078559073868e-09, + "loss": 0.0046, + "num_tokens": 12646050.0, + "reward": 6.921530723571777, + "reward_std": 1.2653937339782715, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.07574322074651718, + "rewards/kidney_reward/std": 1.29454505443573, + "rewards/length2tails_reward/mean": 0.8175597190856934, + "rewards/length2tails_reward/std": 0.2737521231174469, + "rewards/thermo_reward/mean": 0.32742786407470703, + "rewards/thermo_reward/std": 1.8196754455566406, + "step": 1446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 276.09375, + "completions/mean_terminated_length": 276.09375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1375004155561328, + "epoch": 2.894, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.017873525619507, + "learning_rate": 7.351516902488697e-09, + "loss": 0.1036, + "num_tokens": 12654917.0, + "reward": 5.817282676696777, + "reward_std": 2.7030675411224365, + "rewards/fitness_reward/mean": 5.912364482879639, + "rewards/fitness_reward/std": 2.2575597763061523, + "rewards/kidney_reward/mean": -0.18207262456417084, + "rewards/kidney_reward/std": 1.3396426439285278, + "rewards/length2tails_reward/mean": 0.7266623973846436, + "rewards/length2tails_reward/std": 0.3229210376739502, + "rewards/thermo_reward/mean": -0.37142127752304077, + "rewards/thermo_reward/std": 1.8730424642562866, + "step": 1447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.0625, + "completions/mean_terminated_length": 271.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14180856104940176, + "epoch": 2.896, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7809602618217468, + "learning_rate": 7.091614957677516e-09, + "loss": -0.0015, + "num_tokens": 12663623.0, + "reward": 7.308341979980469, + "reward_std": 1.3192280530929565, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.37692025303840637, + "rewards/kidney_reward/std": 1.3429784774780273, + "rewards/length2tails_reward/mean": 0.8764455914497375, + "rewards/length2tails_reward/std": 0.15134894847869873, + "rewards/thermo_reward/mean": 0.6189439296722412, + "rewards/thermo_reward/std": 1.6462007761001587, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1516916686668992, + "epoch": 2.898, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5762298107147217, + "learning_rate": 6.836373944677953e-09, + "loss": 0.0096, + "num_tokens": 12672334.0, + "reward": 6.723543167114258, + "reward_std": 2.90553879737854, + "rewards/fitness_reward/mean": 6.109837055206299, + "rewards/fitness_reward/std": 2.2006046772003174, + "rewards/kidney_reward/mean": 0.5687717199325562, + "rewards/kidney_reward/std": 1.4330580234527588, + "rewards/length2tails_reward/mean": 0.7892628908157349, + "rewards/length2tails_reward/std": 0.24311278760433197, + "rewards/thermo_reward/mean": 0.2640092968940735, + "rewards/thermo_reward/std": 1.8858898878097534, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 269.34375, + "completions/mean_terminated_length": 269.34375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.1582466997206211, + "epoch": 2.9, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.391456961631775, + "learning_rate": 6.585795061647359e-09, + "loss": -0.0477, + "num_tokens": 12680985.0, + "reward": 6.445952415466309, + "reward_std": 3.293947696685791, + "rewards/fitness_reward/mean": 5.770969867706299, + "rewards/fitness_reward/std": 2.8558413982391357, + "rewards/kidney_reward/mean": 0.25056761503219604, + "rewards/kidney_reward/std": 1.2453705072402954, + "rewards/length2tails_reward/mean": 0.8212426900863647, + "rewards/length2tails_reward/std": 0.2772662341594696, + "rewards/thermo_reward/mean": 0.6887751221656799, + "rewards/thermo_reward/std": 1.6371608972549438, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 264.6875, + "completions/mean_terminated_length": 264.6875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.1541215404868126, + "epoch": 2.902, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1623852252960205, + "learning_rate": 6.3398794848589234e-09, + "loss": -0.0825, + "num_tokens": 12689487.0, + "reward": 6.433993339538574, + "reward_std": 2.615723133087158, + "rewards/fitness_reward/mean": 5.925602436065674, + "rewards/fitness_reward/std": 2.6288352012634277, + "rewards/kidney_reward/mean": 0.14905336499214172, + "rewards/kidney_reward/std": 1.0788512229919434, + "rewards/length2tails_reward/mean": 0.7655574083328247, + "rewards/length2tails_reward/std": 0.2662903368473053, + "rewards/thermo_reward/mean": 0.48495009541511536, + "rewards/thermo_reward/std": 1.595316767692566, + "step": 1451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 279.0, + "completions/mean_terminated_length": 279.0, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.15801042038947344, + "epoch": 2.904, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.236201047897339, + "learning_rate": 6.098628368694458e-09, + "loss": 0.0165, + "num_tokens": 12698447.0, + "reward": 7.014389991760254, + "reward_std": 2.270691394805908, + "rewards/fitness_reward/mean": 6.226443290710449, + "rewards/fitness_reward/std": 2.0639288425445557, + "rewards/kidney_reward/mean": 0.5009498000144958, + "rewards/kidney_reward/std": 1.3135877847671509, + "rewards/length2tails_reward/mean": 0.6947227716445923, + "rewards/length2tails_reward/std": 0.30098757147789, + "rewards/thermo_reward/mean": 0.7275813817977905, + "rewards/thermo_reward/std": 1.6598973274230957, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.15467301569879055, + "epoch": 2.906, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5553257465362549, + "learning_rate": 5.862042845640403e-09, + "loss": 0.0005, + "num_tokens": 12707161.0, + "reward": 6.831840515136719, + "reward_std": 2.0688636302948, + "rewards/fitness_reward/mean": 6.289404392242432, + "rewards/fitness_reward/std": 1.707769513130188, + "rewards/kidney_reward/mean": 0.21025590598583221, + "rewards/kidney_reward/std": 1.3362786769866943, + "rewards/length2tails_reward/mean": 0.8093304634094238, + "rewards/length2tails_reward/std": 0.266846626996994, + "rewards/thermo_reward/mean": 0.46995168924331665, + "rewards/thermo_reward/std": 1.6251285076141357, + "step": 1453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 273.6875, + "completions/mean_terminated_length": 273.6875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.16247744299471378, + "epoch": 2.908, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.19881010055542, + "learning_rate": 5.6301240262814906e-09, + "loss": 0.0435, + "num_tokens": 12715951.0, + "reward": 5.877201080322266, + "reward_std": 3.4015395641326904, + "rewards/fitness_reward/mean": 5.550357818603516, + "rewards/fitness_reward/std": 2.964296817779541, + "rewards/kidney_reward/mean": 0.07011695951223373, + "rewards/kidney_reward/std": 1.4410039186477661, + "rewards/length2tails_reward/mean": 0.7873591184616089, + "rewards/length2tails_reward/std": 0.28878599405288696, + "rewards/thermo_reward/mean": 0.18988877534866333, + "rewards/thermo_reward/std": 1.9826947450637817, + "step": 1454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 274.625, + "completions/mean_terminated_length": 274.625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.15738764591515064, + "epoch": 2.91, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5995246171951294, + "learning_rate": 5.402872999295871e-09, + "loss": -0.0129, + "num_tokens": 12724771.0, + "reward": 6.355648517608643, + "reward_std": 3.300568103790283, + "rewards/fitness_reward/mean": 5.623927116394043, + "rewards/fitness_reward/std": 3.079596996307373, + "rewards/kidney_reward/mean": 0.6385465860366821, + "rewards/kidney_reward/std": 1.4216870069503784, + "rewards/length2tails_reward/mean": 0.8031167984008789, + "rewards/length2tails_reward/std": 0.2625901699066162, + "rewards/thermo_reward/mean": 0.423337459564209, + "rewards/thermo_reward/std": 1.691360354423523, + "step": 1455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13713269773870707, + "epoch": 2.912, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33095431327819824, + "learning_rate": 5.18029083145044e-09, + "loss": 0.0025, + "num_tokens": 12733442.0, + "reward": 7.679625511169434, + "reward_std": 0.8440757393836975, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.7131369113922119, + "rewards/kidney_reward/std": 1.152511477470398, + "rewards/length2tails_reward/mean": 0.7369496822357178, + "rewards/length2tails_reward/std": 0.2672252655029297, + "rewards/thermo_reward/mean": 1.0950433015823364, + "rewards/thermo_reward/std": 1.3486934900283813, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 273.03125, + "completions/mean_terminated_length": 273.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.15165000595152378, + "epoch": 2.914, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.504826307296753, + "learning_rate": 4.96237856759496e-09, + "loss": 0.0503, + "num_tokens": 12742211.0, + "reward": 7.213679313659668, + "reward_std": 1.834496259689331, + "rewards/fitness_reward/mean": 6.275476455688477, + "rewards/fitness_reward/std": 1.7865570783615112, + "rewards/kidney_reward/mean": 0.3893027901649475, + "rewards/kidney_reward/std": 1.0794671773910522, + "rewards/length2tails_reward/mean": 0.7702823877334595, + "rewards/length2tails_reward/std": 0.2766615152359009, + "rewards/thermo_reward/mean": 1.101961612701416, + "rewards/thermo_reward/std": 1.2780330181121826, + "step": 1457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12490935996174812, + "epoch": 2.916, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5967586040496826, + "learning_rate": 4.749137230658062e-09, + "loss": -0.0006, + "num_tokens": 12750920.0, + "reward": 7.00774621963501, + "reward_std": 0.9727635383605957, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.18259793519973755, + "rewards/kidney_reward/std": 1.2373865842819214, + "rewards/length2tails_reward/mean": 0.8531543016433716, + "rewards/length2tails_reward/std": 0.16220837831497192, + "rewards/thermo_reward/mean": 0.5889166593551636, + "rewards/thermo_reward/std": 1.7927863597869873, + "step": 1458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13828979432582855, + "epoch": 2.918, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5980755090713501, + "learning_rate": 4.540567821641583e-09, + "loss": 0.0001, + "num_tokens": 12759610.0, + "reward": 6.783051490783691, + "reward_std": 2.335374116897583, + "rewards/fitness_reward/mean": 6.204986572265625, + "rewards/fitness_reward/std": 1.687563180923462, + "rewards/kidney_reward/mean": 0.20775341987609863, + "rewards/kidney_reward/std": 1.3032476902008057, + "rewards/length2tails_reward/mean": 0.8109624981880188, + "rewards/length2tails_reward/std": 0.2163923978805542, + "rewards/thermo_reward/mean": 0.5428949594497681, + "rewards/thermo_reward/std": 1.7521591186523438, + "step": 1459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 269.90625, + "completions/mean_terminated_length": 269.90625, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.13300664722919464, + "epoch": 2.92, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5044702887535095, + "learning_rate": 4.33667131961657e-09, + "loss": -0.0013, + "num_tokens": 12768279.0, + "reward": 7.189572334289551, + "reward_std": 1.2418421506881714, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.20704029500484467, + "rewards/kidney_reward/std": 1.3732868432998657, + "rewards/length2tails_reward/mean": 0.8197420835494995, + "rewards/length2tails_reward/std": 0.2270992547273636, + "rewards/thermo_reward/mean": 0.5796371698379517, + "rewards/thermo_reward/std": 1.671095848083496, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13399848714470863, + "epoch": 2.922, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6382166147232056, + "learning_rate": 4.137448681718392e-09, + "loss": 0.0008, + "num_tokens": 12776965.0, + "reward": 7.289574146270752, + "reward_std": 1.192184567451477, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.4481619596481323, + "rewards/kidney_reward/std": 1.2837142944335938, + "rewards/length2tails_reward/mean": 0.8042483329772949, + "rewards/length2tails_reward/std": 0.2358108013868332, + "rewards/thermo_reward/mean": 0.546265721321106, + "rewards/thermo_reward/std": 1.6167054176330566, + "step": 1461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14154429361224174, + "epoch": 2.924, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5424478054046631, + "learning_rate": 3.942900843142305e-09, + "loss": -0.0039, + "num_tokens": 12785673.0, + "reward": 7.24534797668457, + "reward_std": 1.043375015258789, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.7512714266777039, + "rewards/kidney_reward/std": 1.1718850135803223, + "rewards/length2tails_reward/mean": 0.7533770799636841, + "rewards/length2tails_reward/std": 0.2646474540233612, + "rewards/thermo_reward/mean": 0.18013982474803925, + "rewards/thermo_reward/std": 1.796936273574829, + "step": 1462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.90625, + "completions/mean_terminated_length": 269.90625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13358979113399982, + "epoch": 2.926, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9458522200584412, + "learning_rate": 3.753028717138784e-09, + "loss": 0.0026, + "num_tokens": 12794342.0, + "reward": 7.18497896194458, + "reward_std": 1.0747334957122803, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.41478344798088074, + "rewards/kidney_reward/std": 1.363236904144287, + "rewards/length2tails_reward/mean": 0.7006873488426208, + "rewards/length2tails_reward/std": 0.32383599877357483, + "rewards/thermo_reward/mean": 0.6282134056091309, + "rewards/thermo_reward/std": 1.717767357826233, + "step": 1463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 273.59375, + "completions/mean_terminated_length": 273.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.18609529174864292, + "epoch": 2.928, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5746772289276123, + "learning_rate": 3.5678331950096395e-09, + "loss": 0.0343, + "num_tokens": 12803129.0, + "reward": 6.027772426605225, + "reward_std": 3.386929988861084, + "rewards/fitness_reward/mean": 5.454624176025391, + "rewards/fitness_reward/std": 3.287524700164795, + "rewards/kidney_reward/mean": 0.26886725425720215, + "rewards/kidney_reward/std": 1.4162918329238892, + "rewards/length2tails_reward/mean": 0.8081879615783691, + "rewards/length2tails_reward/std": 0.2594278156757355, + "rewards/thermo_reward/mean": 0.47333452105522156, + "rewards/thermo_reward/std": 1.7276191711425781, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13801335915923119, + "epoch": 2.93, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7756869196891785, + "learning_rate": 3.3873151461037973e-09, + "loss": 0.013, + "num_tokens": 12811862.0, + "reward": 7.520531177520752, + "reward_std": 1.13658607006073, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.8265535831451416, + "rewards/kidney_reward/std": 1.3319242000579834, + "rewards/length2tails_reward/mean": 0.7390146255493164, + "rewards/length2tails_reward/std": 0.29337626695632935, + "rewards/thermo_reward/mean": 0.6624047756195068, + "rewards/thermo_reward/std": 1.5161179304122925, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14003800600767136, + "epoch": 2.932, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6539826989173889, + "learning_rate": 3.211475417812748e-09, + "loss": -0.0049, + "num_tokens": 12820552.0, + "reward": 7.051595211029053, + "reward_std": 1.0269882678985596, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.12804999947547913, + "rewards/kidney_reward/std": 1.2092504501342773, + "rewards/length2tails_reward/mean": 0.75218665599823, + "rewards/length2tails_reward/std": 0.277934730052948, + "rewards/thermo_reward/mean": 0.4164506793022156, + "rewards/thermo_reward/std": 1.5604767799377441, + "step": 1466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.142989844083786, + "epoch": 2.934, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8601912260055542, + "learning_rate": 3.040314835567326e-09, + "loss": 0.0052, + "num_tokens": 12829279.0, + "reward": 6.415733337402344, + "reward_std": 2.177215576171875, + "rewards/fitness_reward/mean": 6.004148483276367, + "rewards/fitness_reward/std": 1.803175926208496, + "rewards/kidney_reward/mean": 0.41347983479499817, + "rewards/kidney_reward/std": 1.2116390466690063, + "rewards/length2tails_reward/mean": 0.755215048789978, + "rewards/length2tails_reward/std": 0.3013410270214081, + "rewards/thermo_reward/mean": 0.03208187222480774, + "rewards/thermo_reward/std": 1.896373987197876, + "step": 1467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14948764815926552, + "epoch": 2.936, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7861649394035339, + "learning_rate": 2.8738342028331584e-09, + "loss": -0.0007, + "num_tokens": 12838027.0, + "reward": 6.963781356811523, + "reward_std": 1.1723902225494385, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.39005857706069946, + "rewards/kidney_reward/std": 1.4512755870819092, + "rewards/length2tails_reward/mean": 0.7364763021469116, + "rewards/length2tails_reward/std": 0.33358028531074524, + "rewards/thermo_reward/mean": 0.39862534403800964, + "rewards/thermo_reward/std": 1.5978807210922241, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 274.09375, + "completions/mean_terminated_length": 274.09375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.1500695589929819, + "epoch": 2.9379999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5259214639663696, + "learning_rate": 2.7120343011071133e-09, + "loss": 0.0012, + "num_tokens": 12846830.0, + "reward": 7.226581573486328, + "reward_std": 1.1350669860839844, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.33697712421417236, + "rewards/kidney_reward/std": 1.5126299858093262, + "rewards/length2tails_reward/mean": 0.7741527557373047, + "rewards/length2tails_reward/std": 0.3223993182182312, + "rewards/thermo_reward/mean": 0.5465137958526611, + "rewards/thermo_reward/std": 1.735576868057251, + "step": 1469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13979172240942717, + "epoch": 2.94, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4293266832828522, + "learning_rate": 2.5549158899137447e-09, + "loss": 0.0008, + "num_tokens": 12855545.0, + "reward": 6.8875627517700195, + "reward_std": 1.01347815990448, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.2681354880332947, + "rewards/kidney_reward/std": 1.1730067729949951, + "rewards/length2tails_reward/mean": 0.8164932131767273, + "rewards/length2tails_reward/std": 0.2584230601787567, + "rewards/thermo_reward/mean": -0.08385242521762848, + "rewards/thermo_reward/std": 1.9532597064971924, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 264.09375, + "completions/mean_terminated_length": 264.09375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.13745386339724064, + "epoch": 2.942, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8528796434402466, + "learning_rate": 2.4024797068017412e-09, + "loss": -0.075, + "num_tokens": 12864028.0, + "reward": 6.052642822265625, + "reward_std": 2.7144711017608643, + "rewards/fitness_reward/mean": 5.679010391235352, + "rewards/fitness_reward/std": 2.8438870906829834, + "rewards/kidney_reward/mean": 0.4132249057292938, + "rewards/kidney_reward/std": 1.2994471788406372, + "rewards/length2tails_reward/mean": 0.7700828313827515, + "rewards/length2tails_reward/std": 0.3196830749511719, + "rewards/thermo_reward/mean": -0.05100217089056969, + "rewards/thermo_reward/std": 1.753897786140442, + "step": 1471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.28125, + "completions/mean_terminated_length": 270.28125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11830961145460606, + "epoch": 2.944, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8448054194450378, + "learning_rate": 2.254726467340151e-09, + "loss": -0.0018, + "num_tokens": 12872709.0, + "reward": 6.7451863288879395, + "reward_std": 2.036768674850464, + "rewards/fitness_reward/mean": 6.309423923492432, + "rewards/fitness_reward/std": 1.594521403312683, + "rewards/kidney_reward/mean": -0.14827734231948853, + "rewards/kidney_reward/std": 1.3583015203475952, + "rewards/length2tails_reward/mean": 0.7265836000442505, + "rewards/length2tails_reward/std": 0.32428255677223206, + "rewards/thermo_reward/mean": 0.6565101146697998, + "rewards/thermo_reward/std": 1.7385002374649048, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13755716290324926, + "epoch": 2.9459999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8877173066139221, + "learning_rate": 2.1116568651156073e-09, + "loss": -0.0054, + "num_tokens": 12881408.0, + "reward": 7.135970115661621, + "reward_std": 1.9548718929290771, + "rewards/fitness_reward/mean": 6.212075233459473, + "rewards/fitness_reward/std": 1.6499855518341064, + "rewards/kidney_reward/mean": 0.7503572702407837, + "rewards/kidney_reward/std": 1.0756021738052368, + "rewards/length2tails_reward/mean": 0.7937425971031189, + "rewards/length2tails_reward/std": 0.2745879590511322, + "rewards/thermo_reward/mean": 0.700560986995697, + "rewards/thermo_reward/std": 1.700540542602539, + "step": 1473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 285.0625, + "completions/mean_terminated_length": 285.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.16017275676131248, + "epoch": 2.948, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.878459453582764, + "learning_rate": 1.973271571728441e-09, + "loss": 0.1639, + "num_tokens": 12890562.0, + "reward": 6.446413516998291, + "reward_std": 2.290283679962158, + "rewards/fitness_reward/mean": 6.015828609466553, + "rewards/fitness_reward/std": 2.20648455619812, + "rewards/kidney_reward/mean": 0.24198131263256073, + "rewards/kidney_reward/std": 1.5187300443649292, + "rewards/length2tails_reward/mean": 0.782778263092041, + "rewards/length2tails_reward/std": 0.2801356613636017, + "rewards/thermo_reward/mean": 0.2277996987104416, + "rewards/thermo_reward/std": 1.7942554950714111, + "step": 1474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 268.84375, + "completions/mean_terminated_length": 268.84375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1201626705005765, + "epoch": 2.95, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5637052655220032, + "learning_rate": 1.8395712367897942e-09, + "loss": -0.0002, + "num_tokens": 12899197.0, + "reward": 7.328199863433838, + "reward_std": 1.0085116624832153, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.5194838643074036, + "rewards/kidney_reward/std": 1.315508246421814, + "rewards/length2tails_reward/mean": 0.6775139570236206, + "rewards/length2tails_reward/std": 0.31684550642967224, + "rewards/thermo_reward/mean": 0.6155627369880676, + "rewards/thermo_reward/std": 1.471408486366272, + "step": 1475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 284.03125, + "completions/mean_terminated_length": 284.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1678213281556964, + "epoch": 2.952, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8360488414764404, + "learning_rate": 1.710556487918624e-09, + "loss": 0.0072, + "num_tokens": 12908318.0, + "reward": 7.245866298675537, + "reward_std": 0.979259192943573, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.22946840524673462, + "rewards/kidney_reward/std": 1.2244020700454712, + "rewards/length2tails_reward/mean": 0.7997192144393921, + "rewards/length2tails_reward/std": 0.22223548591136932, + "rewards/thermo_reward/mean": 0.6798077821731567, + "rewards/thermo_reward/std": 1.5259937047958374, + "step": 1476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.03125, + "completions/mean_terminated_length": 271.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13489855360239744, + "epoch": 2.9539999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5896024703979492, + "learning_rate": 1.5862279307388149e-09, + "loss": 0.0022, + "num_tokens": 12917023.0, + "reward": 7.4311089515686035, + "reward_std": 0.6194378733634949, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.21066860854625702, + "rewards/kidney_reward/std": 1.2731415033340454, + "rewards/length2tails_reward/mean": 0.8221845626831055, + "rewards/length2tails_reward/std": 0.18835249543190002, + "rewards/thermo_reward/mean": 1.0578607320785522, + "rewards/thermo_reward/std": 0.9952807426452637, + "step": 1477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14065147377550602, + "epoch": 2.956, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.607245922088623, + "learning_rate": 1.466586148876181e-09, + "loss": 0.0131, + "num_tokens": 12925756.0, + "reward": 5.897775650024414, + "reward_std": 3.274827241897583, + "rewards/fitness_reward/mean": 5.540828704833984, + "rewards/fitness_reward/std": 3.3381597995758057, + "rewards/kidney_reward/mean": -0.1542062610387802, + "rewards/kidney_reward/std": 1.2925556898117065, + "rewards/length2tails_reward/mean": 0.8041127920150757, + "rewards/length2tails_reward/std": 0.249794140458107, + "rewards/thermo_reward/mean": 0.4660438001155853, + "rewards/thermo_reward/std": 1.823898434638977, + "step": 1478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 262.375, + "completions/mean_terminated_length": 262.375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.13760568853467703, + "epoch": 2.958, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.317879557609558, + "learning_rate": 1.3516317039555803e-09, + "loss": -0.1278, + "num_tokens": 12934184.0, + "reward": 7.084677696228027, + "reward_std": 1.949188232421875, + "rewards/fitness_reward/mean": 6.237892150878906, + "rewards/fitness_reward/std": 1.9991644620895386, + "rewards/kidney_reward/mean": 0.3345673382282257, + "rewards/kidney_reward/std": 1.1622320413589478, + "rewards/length2tails_reward/mean": 0.7086790800094604, + "rewards/length2tails_reward/std": 0.28027620911598206, + "rewards/thermo_reward/mean": 1.0046638250350952, + "rewards/thermo_reward/std": 1.1032321453094482, + "step": 1479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 278.5, + "completions/mean_terminated_length": 278.5, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13646094594150782, + "epoch": 2.96, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8035528659820557, + "learning_rate": 1.2413651355986932e-09, + "loss": 0.0378, + "num_tokens": 12943128.0, + "reward": 6.995002746582031, + "reward_std": 0.952528178691864, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.5317928194999695, + "rewards/kidney_reward/std": 1.195467472076416, + "rewards/length2tails_reward/mean": 0.7140636444091797, + "rewards/length2tails_reward/std": 0.3175659477710724, + "rewards/thermo_reward/mean": 0.12456199526786804, + "rewards/thermo_reward/std": 1.7555283308029175, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.25, + "completions/mean_terminated_length": 269.25, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "entropy": 0.14361482299864292, + "epoch": 2.9619999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0149130821228027, + "learning_rate": 1.1357869614212478e-09, + "loss": -0.0342, + "num_tokens": 12951776.0, + "reward": 6.794735908508301, + "reward_std": 2.422370672225952, + "rewards/fitness_reward/mean": 6.226663589477539, + "rewards/fitness_reward/std": 2.06268572807312, + "rewards/kidney_reward/mean": 0.45078110694885254, + "rewards/kidney_reward/std": 1.3131287097930908, + "rewards/length2tails_reward/mean": 0.8022973537445068, + "rewards/length2tails_reward/std": 0.26289355754852295, + "rewards/thermo_reward/mean": 0.2842141389846802, + "rewards/thermo_reward/std": 1.751166820526123, + "step": 1481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14039535727351904, + "epoch": 2.964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4771628975868225, + "learning_rate": 1.0348976770305773e-09, + "loss": -0.0026, + "num_tokens": 12960471.0, + "reward": 7.01984977722168, + "reward_std": 1.2747634649276733, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.4522587060928345, + "rewards/kidney_reward/std": 1.1824373006820679, + "rewards/length2tails_reward/mean": 0.7214146256446838, + "rewards/length2tails_reward/std": 0.33009421825408936, + "rewards/thermo_reward/mean": 0.25011470913887024, + "rewards/thermo_reward/std": 1.793310284614563, + "step": 1482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14099798072129488, + "epoch": 2.966, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8739150166511536, + "learning_rate": 9.386977560232879e-10, + "loss": 0.0088, + "num_tokens": 12969202.0, + "reward": 6.931124687194824, + "reward_std": 1.0800544023513794, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.046764496713876724, + "rewards/kidney_reward/std": 1.2248467206954956, + "rewards/length2tails_reward/mean": 0.7640290260314941, + "rewards/length2tails_reward/std": 0.2615300416946411, + "rewards/thermo_reward/mean": 0.2508741319179535, + "rewards/thermo_reward/std": 1.8097829818725586, + "step": 1483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 276.0625, + "completions/mean_terminated_length": 276.0625, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "entropy": 0.15939642302691936, + "epoch": 2.968, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.808926582336426, + "learning_rate": 8.471876499830388e-10, + "loss": 0.0333, + "num_tokens": 12978068.0, + "reward": 6.8016510009765625, + "reward_std": 2.7129480838775635, + "rewards/fitness_reward/mean": 5.8593645095825195, + "rewards/fitness_reward/std": 2.8814096450805664, + "rewards/kidney_reward/mean": 0.6056361198425293, + "rewards/kidney_reward/std": 1.2611591815948486, + "rewards/length2tails_reward/mean": 0.7760035395622253, + "rewards/length2tails_reward/std": 0.3093658685684204, + "rewards/thermo_reward/mean": 0.8909339904785156, + "rewards/thermo_reward/std": 1.5391558408737183, + "step": 1484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 275.71875, + "completions/mean_terminated_length": 275.71875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14191575534641743, + "epoch": 2.9699999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.040797472000122, + "learning_rate": 7.603677884787663e-10, + "loss": 0.0891, + "num_tokens": 12986923.0, + "reward": 6.818601608276367, + "reward_std": 2.46020245552063, + "rewards/fitness_reward/mean": 6.212231636047363, + "rewards/fitness_reward/std": 2.1443235874176025, + "rewards/kidney_reward/mean": 0.47110024094581604, + "rewards/kidney_reward/std": 1.346298336982727, + "rewards/length2tails_reward/mean": 0.8158657550811768, + "rewards/length2tails_reward/std": 0.2665424048900604, + "rewards/thermo_reward/mean": 0.3337061405181885, + "rewards/thermo_reward/std": 1.5895960330963135, + "step": 1485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 288.3125, + "completions/mean_terminated_length": 273.2903137207031, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.17228106036782265, + "epoch": 2.972, + "frac_reward_zero_std": 0.0, + "grad_norm": 16.767419815063477, + "learning_rate": 6.782385790617961e-10, + "loss": 0.2529, + "num_tokens": 12996181.0, + "reward": 6.993387699127197, + "reward_std": 2.466411590576172, + "rewards/fitness_reward/mean": 6.204681396484375, + "rewards/fitness_reward/std": 2.1870360374450684, + "rewards/kidney_reward/mean": 0.4054473042488098, + "rewards/kidney_reward/std": 1.5106313228607178, + "rewards/length2tails_reward/mean": 0.7668466567993164, + "rewards/length2tails_reward/std": 0.31008175015449524, + "rewards/thermo_reward/mean": 0.7885421514511108, + "rewards/thermo_reward/std": 1.6364628076553345, + "step": 1486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.3125, + "completions/mean_terminated_length": 270.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15573112107813358, + "epoch": 2.974, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4071457386016846, + "learning_rate": 6.008004072650674e-10, + "loss": 0.0024, + "num_tokens": 13004863.0, + "reward": 6.719488620758057, + "reward_std": 2.2555627822875977, + "rewards/fitness_reward/mean": 6.114397048950195, + "rewards/fitness_reward/std": 2.17573881149292, + "rewards/kidney_reward/mean": 0.1855234056711197, + "rewards/kidney_reward/std": 1.3008853197097778, + "rewards/length2tails_reward/mean": 0.7519688010215759, + "rewards/length2tails_reward/std": 0.3069925606250763, + "rewards/thermo_reward/mean": 0.6486751437187195, + "rewards/thermo_reward/std": 1.4494085311889648, + "step": 1487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.78125, + "completions/mean_terminated_length": 269.78125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.15691445488482714, + "epoch": 2.976, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0619285106658936, + "learning_rate": 5.280536366004673e-10, + "loss": -0.008, + "num_tokens": 13013528.0, + "reward": 6.336668014526367, + "reward_std": 3.090419292449951, + "rewards/fitness_reward/mean": 5.842085361480713, + "rewards/fitness_reward/std": 2.95082950592041, + "rewards/kidney_reward/mean": -0.09133400022983551, + "rewards/kidney_reward/std": 1.2801560163497925, + "rewards/length2tails_reward/mean": 0.7124618291854858, + "rewards/length2tails_reward/std": 0.30824965238571167, + "rewards/thermo_reward/mean": 0.7242677211761475, + "rewards/thermo_reward/std": 1.6709953546524048, + "step": 1488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 277.125, + "completions/mean_terminated_length": 277.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1839960590004921, + "epoch": 2.9779999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1689226627349854, + "learning_rate": 4.5999860855738815e-10, + "loss": 0.0562, + "num_tokens": 13022428.0, + "reward": 6.6264729499816895, + "reward_std": 2.6910088062286377, + "rewards/fitness_reward/mean": 5.9253315925598145, + "rewards/fitness_reward/std": 2.620548963546753, + "rewards/kidney_reward/mean": 0.28081417083740234, + "rewards/kidney_reward/std": 1.1797038316726685, + "rewards/length2tails_reward/mean": 0.771510124206543, + "rewards/length2tails_reward/std": 0.24603815376758575, + "rewards/thermo_reward/mean": 0.7357128262519836, + "rewards/thermo_reward/std": 1.4072482585906982, + "step": 1489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.21875, + "completions/mean_terminated_length": 270.21875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14643418230116367, + "epoch": 2.98, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9222062826156616, + "learning_rate": 3.96635642601173e-10, + "loss": -0.0004, + "num_tokens": 13031107.0, + "reward": 6.780588150024414, + "reward_std": 1.2724297046661377, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": 0.7047371864318848, + "rewards/kidney_reward/std": 1.3323123455047607, + "rewards/length2tails_reward/mean": 0.743053674697876, + "rewards/length2tails_reward/std": 0.31944894790649414, + "rewards/thermo_reward/mean": -0.07974910736083984, + "rewards/thermo_reward/std": 1.8582299947738647, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.15591341443359852, + "epoch": 2.982, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0284464359283447, + "learning_rate": 3.3796503617167237e-10, + "loss": -0.0071, + "num_tokens": 13039822.0, + "reward": 6.5805206298828125, + "reward_std": 2.7856285572052, + "rewards/fitness_reward/mean": 5.933781623840332, + "rewards/fitness_reward/std": 2.6079893112182617, + "rewards/kidney_reward/mean": 0.4801759719848633, + "rewards/kidney_reward/std": 1.3672772645950317, + "rewards/length2tails_reward/mean": 0.7854573726654053, + "rewards/length2tails_reward/std": 0.2942323386669159, + "rewards/thermo_reward/mean": 0.4205727279186249, + "rewards/thermo_reward/std": 1.663835048675537, + "step": 1491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.8125, + "completions/mean_terminated_length": 269.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11399362049996853, + "epoch": 2.984, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1794707775115967, + "learning_rate": 2.83987064681801e-10, + "loss": -0.0037, + "num_tokens": 13048488.0, + "reward": 6.966623306274414, + "reward_std": 1.1887325048446655, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.027587972581386566, + "rewards/kidney_reward/std": 1.41304612159729, + "rewards/length2tails_reward/mean": 0.7413458824157715, + "rewards/length2tails_reward/std": 0.2729589641094208, + "rewards/thermo_reward/mean": 0.6135426759719849, + "rewards/thermo_reward/std": 1.5830315351486206, + "step": 1492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 265.21875, + "completions/mean_terminated_length": 265.21875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.13386992178857327, + "epoch": 2.9859999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0854442119598389, + "learning_rate": 2.347019815158724e-10, + "loss": -0.0863, + "num_tokens": 13057007.0, + "reward": 6.826526641845703, + "reward_std": 2.2675764560699463, + "rewards/fitness_reward/mean": 6.2466888427734375, + "rewards/fitness_reward/std": 1.9494024515151978, + "rewards/kidney_reward/mean": 0.566569447517395, + "rewards/kidney_reward/std": 1.3472126722335815, + "rewards/length2tails_reward/mean": 0.8220120668411255, + "rewards/length2tails_reward/std": 0.1419975459575653, + "rewards/thermo_reward/mean": 0.1820993721485138, + "rewards/thermo_reward/std": 1.5343799591064453, + "step": 1493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.14159501064568758, + "epoch": 2.988, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6729517579078674, + "learning_rate": 1.9011001802915483e-10, + "loss": 0.0004, + "num_tokens": 13065705.0, + "reward": 7.252538204193115, + "reward_std": 1.1515175104141235, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.4723457992076874, + "rewards/kidney_reward/std": 1.296017050743103, + "rewards/length2tails_reward/mean": 0.8068416714668274, + "rewards/length2tails_reward/std": 0.28029298782348633, + "rewards/thermo_reward/mean": 0.6526919007301331, + "rewards/thermo_reward/std": 1.6669418811798096, + "step": 1494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13220036029815674, + "epoch": 2.99, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3101279735565186, + "learning_rate": 1.5021138354609497e-10, + "loss": 0.001, + "num_tokens": 13074428.0, + "reward": 6.931380271911621, + "reward_std": 1.2030127048492432, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.3753945529460907, + "rewards/kidney_reward/std": 1.3288666009902954, + "rewards/length2tails_reward/mean": 0.761831521987915, + "rewards/length2tails_reward/std": 0.31949740648269653, + "rewards/thermo_reward/mean": -0.07614654302597046, + "rewards/thermo_reward/std": 1.989808440208435, + "step": 1495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 274.09375, + "completions/mean_terminated_length": 274.09375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1651429943740368, + "epoch": 2.992, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5398972034454346, + "learning_rate": 1.1500626535987379e-10, + "loss": 0.036, + "num_tokens": 13083231.0, + "reward": 6.410170078277588, + "reward_std": 2.9121439456939697, + "rewards/fitness_reward/mean": 5.732306480407715, + "rewards/fitness_reward/std": 2.660806179046631, + "rewards/kidney_reward/mean": 0.6740546822547913, + "rewards/kidney_reward/std": 1.2244349718093872, + "rewards/length2tails_reward/mean": 0.79660964012146, + "rewards/length2tails_reward/std": 0.30914241075515747, + "rewards/thermo_reward/mean": 0.2833682596683502, + "rewards/thermo_reward/std": 1.892885446548462, + "step": 1496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 273.71875, + "completions/mean_terminated_length": 273.71875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.15174808725714684, + "epoch": 2.9939999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8190692663192749, + "learning_rate": 8.449482873096325e-11, + "loss": 0.0114, + "num_tokens": 13092022.0, + "reward": 7.053735733032227, + "reward_std": 1.018854022026062, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.1245800331234932, + "rewards/kidney_reward/std": 1.032915711402893, + "rewards/length2tails_reward/mean": 0.8063247203826904, + "rewards/length2tails_reward/std": 0.26767203211784363, + "rewards/thermo_reward/mean": 0.6462920308113098, + "rewards/thermo_reward/std": 1.5545495748519897, + "step": 1497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.03125, + "completions/mean_terminated_length": 271.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1364625096321106, + "epoch": 2.996, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.38062989711761475, + "learning_rate": 5.86772168869043e-11, + "loss": -0.005, + "num_tokens": 13100727.0, + "reward": 6.96686315536499, + "reward_std": 1.962496280670166, + "rewards/fitness_reward/mean": 6.188211441040039, + "rewards/fitness_reward/std": 1.7769315242767334, + "rewards/kidney_reward/mean": 0.7297247648239136, + "rewards/kidney_reward/std": 1.239060401916504, + "rewards/length2tails_reward/mean": 0.8024195432662964, + "rewards/length2tails_reward/std": 0.25200769305229187, + "rewards/thermo_reward/mean": 0.42636924982070923, + "rewards/thermo_reward/std": 1.6208664178848267, + "step": 1498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 273.9375, + "completions/mean_terminated_length": 273.9375, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.13252860866487026, + "epoch": 2.998, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.886621117591858, + "learning_rate": 3.7553551021085593e-11, + "loss": -0.0157, + "num_tokens": 13109525.0, + "reward": 5.906866550445557, + "reward_std": 3.0820071697235107, + "rewards/fitness_reward/mean": 5.74906063079834, + "rewards/fitness_reward/std": 2.9400858879089355, + "rewards/kidney_reward/mean": -0.43569836020469666, + "rewards/kidney_reward/std": 1.1883536577224731, + "rewards/length2tails_reward/mean": 0.7798252701759338, + "rewards/length2tails_reward/std": 0.3008924722671509, + "rewards/thermo_reward/mean": 0.3613983988761902, + "rewards/thermo_reward/std": 1.8486649990081787, + "step": 1499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 267.34375, + "completions/mean_terminated_length": 267.34375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.14323666971176863, + "epoch": 3.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4840319156646729, + "learning_rate": 2.1123930292965554e-11, + "loss": -0.085, + "num_tokens": 13118112.0, + "reward": 6.759047508239746, + "reward_std": 2.3054842948913574, + "rewards/fitness_reward/mean": 6.216947555541992, + "rewards/fitness_reward/std": 2.1176464557647705, + "rewards/kidney_reward/mean": 0.24084295332431793, + "rewards/kidney_reward/std": 1.4039134979248047, + "rewards/length2tails_reward/mean": 0.789250910282135, + "rewards/length2tails_reward/std": 0.2723137140274048, + "rewards/thermo_reward/mean": 0.4487316310405731, + "rewards/thermo_reward/std": 1.526945948600769, + "step": 1500 + } + ], + "logging_steps": 1, + "max_steps": 1500, + "num_input_tokens_seen": 13118112, + "num_train_epochs": 3, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}