diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,2824 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.4577259475218658, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0322265625, + "completions/max_length": 244.3125, + "completions/max_terminated_length": 204.375, + "completions/mean_length": 64.1787109375, + "completions/mean_terminated_length": 57.63671565055847, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 0.15918307239189744, + "epoch": 0.015549076773566569, + "frac_reward_zero_std": 0.453125, + "grad_norm": 0.7093124389648438, + "learning_rate": 5e-05, + "loss": -0.4337, + "num_tokens": 1189183.0, + "reward": 3.017539083957672, + "reward_std": 1.1567719243466854, + "rewards/bm25_retrieval_reward_fn/mean": 0.3280859384685755, + "rewards/bm25_retrieval_reward_fn/std": 0.38037889264523983, + "rewards/event_reward_fn/mean": 2.2734375, + "rewards/event_reward_fn/std": 3.2615081816911697, + "rewards/format_reward_fn/mean": 0.4160156287252903, + "rewards/format_reward_fn/std": 0.33771974220871925, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0478515625, + "completions/max_length": 240.6875, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 96.623046875, + "completions/mean_terminated_length": 88.71417284011841, + "completions/min_length": 7.3125, + "completions/min_terminated_length": 7.3125, + "entropy": 0.0870705652050674, + "epoch": 0.031098153547133137, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.11729823052883148, + "learning_rate": 5e-05, + "loss": -0.1999, + "num_tokens": 2439389.0, + "reward": 6.020513415336609, + "reward_std": 1.318240948021412, + "rewards/bm25_retrieval_reward_fn/mean": 0.636932659894228, + "rewards/bm25_retrieval_reward_fn/std": 0.4330139197409153, + "rewards/event_reward_fn/mean": 4.67578125, + "rewards/event_reward_fn/std": 4.073968470096588, + "rewards/format_reward_fn/mean": 0.7077994756400585, + "rewards/format_reward_fn/std": 0.37246643379330635, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0498046875, + "completions/max_length": 249.75, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 108.2373046875, + "completions/mean_terminated_length": 100.33520174026489, + "completions/min_length": 21.9375, + "completions/min_terminated_length": 21.9375, + "entropy": 0.06846319953911006, + "epoch": 0.04664723032069971, + "frac_reward_zero_std": 0.39453125, + "grad_norm": 0.11189325153827667, + "learning_rate": 5e-05, + "loss": -0.0794, + "num_tokens": 3697760.0, + "reward": 6.618032068014145, + "reward_std": 1.1964115016162395, + "rewards/bm25_retrieval_reward_fn/mean": 0.8193489573895931, + "rewards/bm25_retrieval_reward_fn/std": 0.3472439646720886, + "rewards/event_reward_fn/mean": 4.94921875, + "rewards/event_reward_fn/std": 4.1042004227638245, + "rewards/format_reward_fn/mean": 0.8494642823934555, + "rewards/format_reward_fn/std": 0.30338616110384464, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.080078125, + "completions/max_length": 251.625, + "completions/max_terminated_length": 232.75, + "completions/mean_length": 114.8154296875, + "completions/mean_terminated_length": 102.74591159820557, + "completions/min_length": 36.3125, + "completions/min_terminated_length": 36.3125, + "entropy": 0.0698625217191875, + "epoch": 0.062196307094266275, + "frac_reward_zero_std": 0.4140625, + "grad_norm": 0.1286889612674713, + "learning_rate": 5e-05, + "loss": -0.0521, + "num_tokens": 4908167.0, + "reward": 7.294231742620468, + "reward_std": 1.1466168127954006, + "rewards/bm25_retrieval_reward_fn/mean": 0.833867184817791, + "rewards/bm25_retrieval_reward_fn/std": 0.3500053770840168, + "rewards/event_reward_fn/mean": 5.6142578125, + "rewards/event_reward_fn/std": 4.4990804344415665, + "rewards/format_reward_fn/mean": 0.8461067788302898, + "rewards/format_reward_fn/std": 0.3374804314225912, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0712890625, + "completions/max_length": 246.625, + "completions/max_terminated_length": 219.25, + "completions/mean_length": 113.826171875, + "completions/mean_terminated_length": 103.01660490036011, + "completions/min_length": 38.6875, + "completions/min_terminated_length": 38.6875, + "entropy": 0.0676732650026679, + "epoch": 0.07774538386783285, + "frac_reward_zero_std": 0.39453125, + "grad_norm": 0.08458422869443893, + "learning_rate": 5e-05, + "loss": -0.0159, + "num_tokens": 6128157.0, + "reward": 7.6867459416389465, + "reward_std": 0.9968220815062523, + "rewards/bm25_retrieval_reward_fn/mean": 0.8819040954113007, + "rewards/bm25_retrieval_reward_fn/std": 0.29652632866054773, + "rewards/event_reward_fn/mean": 5.904296875, + "rewards/event_reward_fn/std": 4.028907224535942, + "rewards/format_reward_fn/mean": 0.9005450159311295, + "rewards/format_reward_fn/std": 0.2789665600284934, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0361328125, + "completions/max_length": 247.125, + "completions/max_terminated_length": 236.1875, + "completions/mean_length": 112.671875, + "completions/mean_terminated_length": 107.51665830612183, + "completions/min_length": 37.375, + "completions/min_terminated_length": 37.375, + "entropy": 0.06292958417907357, + "epoch": 0.09329446064139942, + "frac_reward_zero_std": 0.453125, + "grad_norm": 0.09709884226322174, + "learning_rate": 5e-05, + "loss": -0.0353, + "num_tokens": 7275181.0, + "reward": 7.98236358165741, + "reward_std": 1.0672973282635212, + "rewards/bm25_retrieval_reward_fn/mean": 0.9419283382594585, + "rewards/bm25_retrieval_reward_fn/std": 0.18250679067568853, + "rewards/event_reward_fn/mean": 6.091796875, + "rewards/event_reward_fn/std": 4.740213438868523, + "rewards/format_reward_fn/mean": 0.9486383907496929, + "rewards/format_reward_fn/std": 0.16436113324016333, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1083984375, + "completions/max_length": 255.1875, + "completions/max_terminated_length": 238.3125, + "completions/mean_length": 129.126953125, + "completions/mean_terminated_length": 113.86429929733276, + "completions/min_length": 41.4375, + "completions/min_terminated_length": 41.4375, + "entropy": 0.06285874638706446, + "epoch": 0.10884353741496598, + "frac_reward_zero_std": 0.46484375, + "grad_norm": 0.17054593563079834, + "learning_rate": 5e-05, + "loss": -0.0124, + "num_tokens": 8523631.0, + "reward": 7.8958849012851715, + "reward_std": 1.0757801569998264, + "rewards/bm25_retrieval_reward_fn/mean": 0.8616694211959839, + "rewards/bm25_retrieval_reward_fn/std": 0.31423775386065245, + "rewards/event_reward_fn/mean": 6.1513671875, + "rewards/event_reward_fn/std": 4.978878691792488, + "rewards/format_reward_fn/mean": 0.8828483074903488, + "rewards/format_reward_fn/std": 0.30156402476131916, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0849609375, + "completions/max_length": 251.375, + "completions/max_terminated_length": 228.4375, + "completions/mean_length": 127.2109375, + "completions/mean_terminated_length": 115.21368026733398, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.0637968888040632, + "epoch": 0.12439261418853255, + "frac_reward_zero_std": 0.390625, + "grad_norm": 0.1357981413602829, + "learning_rate": 5e-05, + "loss": -0.0186, + "num_tokens": 9707755.0, + "reward": 8.13655748963356, + "reward_std": 1.0848342552781105, + "rewards/bm25_retrieval_reward_fn/mean": 0.8800471648573875, + "rewards/bm25_retrieval_reward_fn/std": 0.29066222277469933, + "rewards/event_reward_fn/mean": 6.36328125, + "rewards/event_reward_fn/std": 4.869352951645851, + "rewards/format_reward_fn/mean": 0.8932291641831398, + "rewards/format_reward_fn/std": 0.28600863087922335, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1123046875, + "completions/max_length": 253.1875, + "completions/max_terminated_length": 237.5, + "completions/mean_length": 140.646484375, + "completions/mean_terminated_length": 126.51056718826294, + "completions/min_length": 45.5, + "completions/min_terminated_length": 45.5, + "entropy": 0.06207763450220227, + "epoch": 0.13994169096209913, + "frac_reward_zero_std": 0.43359375, + "grad_norm": 0.18493860960006714, + "learning_rate": 5e-05, + "loss": -0.0196, + "num_tokens": 10967133.0, + "reward": 8.799611210823059, + "reward_std": 0.9760072641074657, + "rewards/bm25_retrieval_reward_fn/mean": 0.8509132824838161, + "rewards/bm25_retrieval_reward_fn/std": 0.3210932519286871, + "rewards/event_reward_fn/mean": 7.080078125, + "rewards/event_reward_fn/std": 4.870284929871559, + "rewards/format_reward_fn/mean": 0.868619792163372, + "rewards/format_reward_fn/std": 0.3133174767717719, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.158203125, + "completions/max_length": 256.0, + "completions/max_terminated_length": 243.6875, + "completions/mean_length": 147.4453125, + "completions/mean_terminated_length": 126.75604343414307, + "completions/min_length": 45.1875, + "completions/min_terminated_length": 45.1875, + "entropy": 0.07122921152040362, + "epoch": 0.1554907677356657, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.2770453989505768, + "learning_rate": 5e-05, + "loss": -0.0029, + "num_tokens": 12272289.0, + "reward": 8.378659665584564, + "reward_std": 1.1576978042721748, + "rewards/bm25_retrieval_reward_fn/mean": 0.814004722982645, + "rewards/bm25_retrieval_reward_fn/std": 0.3608710467815399, + "rewards/event_reward_fn/mean": 6.7333984375, + "rewards/event_reward_fn/std": 4.9148435443639755, + "rewards/format_reward_fn/mean": 0.8312565125524998, + "rewards/format_reward_fn/std": 0.3619283623993397, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.166015625, + "completions/max_length": 254.8125, + "completions/max_terminated_length": 245.5625, + "completions/mean_length": 147.28125, + "completions/mean_terminated_length": 126.25686740875244, + "completions/min_length": 47.875, + "completions/min_terminated_length": 47.875, + "entropy": 0.07063461863435805, + "epoch": 0.17103984450923226, + "frac_reward_zero_std": 0.359375, + "grad_norm": 0.20308320224285126, + "learning_rate": 5e-05, + "loss": -0.0226, + "num_tokens": 13517901.0, + "reward": 8.586736917495728, + "reward_std": 1.180733297020197, + "rewards/bm25_retrieval_reward_fn/mean": 0.7950377985835075, + "rewards/bm25_retrieval_reward_fn/std": 0.35726089123636484, + "rewards/event_reward_fn/mean": 6.9697265625, + "rewards/event_reward_fn/std": 4.731213182210922, + "rewards/format_reward_fn/mean": 0.8219726607203484, + "rewards/format_reward_fn/std": 0.35164750274270773, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1572265625, + "completions/max_length": 256.0, + "completions/max_terminated_length": 238.375, + "completions/mean_length": 139.818359375, + "completions/mean_terminated_length": 118.0954179763794, + "completions/min_length": 40.8125, + "completions/min_terminated_length": 40.8125, + "entropy": 0.06718740961514413, + "epoch": 0.18658892128279883, + "frac_reward_zero_std": 0.41015625, + "grad_norm": 0.10902810841798782, + "learning_rate": 5e-05, + "loss": -0.014, + "num_tokens": 14756091.0, + "reward": 8.501474261283875, + "reward_std": 1.0462469272315502, + "rewards/bm25_retrieval_reward_fn/mean": 0.7894043922424316, + "rewards/bm25_retrieval_reward_fn/std": 0.36320002656430006, + "rewards/event_reward_fn/mean": 6.900390625, + "rewards/event_reward_fn/std": 4.841355547308922, + "rewards/format_reward_fn/mean": 0.8116793744266033, + "rewards/format_reward_fn/std": 0.36351621337234974, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1650390625, + "completions/max_length": 256.0, + "completions/max_terminated_length": 240.75, + "completions/mean_length": 150.31640625, + "completions/mean_terminated_length": 129.93299293518066, + "completions/min_length": 47.3125, + "completions/min_terminated_length": 47.3125, + "entropy": 0.06464909669011831, + "epoch": 0.2021379980563654, + "frac_reward_zero_std": 0.37109375, + "grad_norm": 0.14535187184810638, + "learning_rate": 5e-05, + "loss": -0.0053, + "num_tokens": 16038167.0, + "reward": 9.072274684906006, + "reward_std": 1.216166764497757, + "rewards/bm25_retrieval_reward_fn/mean": 0.8119766861200333, + "rewards/bm25_retrieval_reward_fn/std": 0.3669638652354479, + "rewards/event_reward_fn/mean": 7.439453125, + "rewards/event_reward_fn/std": 5.443397417664528, + "rewards/format_reward_fn/mean": 0.8208449557423592, + "rewards/format_reward_fn/std": 0.3688422851264477, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.123046875, + "completions/max_length": 253.125, + "completions/max_terminated_length": 236.25, + "completions/mean_length": 134.2294921875, + "completions/mean_terminated_length": 117.79497241973877, + "completions/min_length": 37.875, + "completions/min_terminated_length": 37.875, + "entropy": 0.06634449982084334, + "epoch": 0.21768707482993196, + "frac_reward_zero_std": 0.35546875, + "grad_norm": 0.0848744735121727, + "learning_rate": 5e-05, + "loss": -0.0096, + "num_tokens": 17312310.0, + "reward": 8.67900961637497, + "reward_std": 1.2484335452318192, + "rewards/bm25_retrieval_reward_fn/mean": 0.8359074406325817, + "rewards/bm25_retrieval_reward_fn/std": 0.33306772634387016, + "rewards/event_reward_fn/mean": 6.998046875, + "rewards/event_reward_fn/std": 5.2749055325984955, + "rewards/format_reward_fn/mean": 0.8450553454458714, + "rewards/format_reward_fn/std": 0.3290289109572768, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1416015625, + "completions/max_length": 251.875, + "completions/max_terminated_length": 233.375, + "completions/mean_length": 138.087890625, + "completions/mean_terminated_length": 119.15170526504517, + "completions/min_length": 43.5, + "completions/min_terminated_length": 43.5, + "entropy": 0.06127542559988797, + "epoch": 0.23323615160349853, + "frac_reward_zero_std": 0.390625, + "grad_norm": 0.11555243283510208, + "learning_rate": 5e-05, + "loss": -0.0131, + "num_tokens": 18542780.0, + "reward": 8.50430566072464, + "reward_std": 0.9973306134343147, + "rewards/bm25_retrieval_reward_fn/mean": 0.8396431356668472, + "rewards/bm25_retrieval_reward_fn/std": 0.3250976144336164, + "rewards/event_reward_fn/mean": 6.8232421875, + "rewards/event_reward_fn/std": 4.391354620456696, + "rewards/format_reward_fn/mean": 0.8414202034473419, + "rewards/format_reward_fn/std": 0.32731985161080956, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.150390625, + "completions/max_length": 256.0, + "completions/max_terminated_length": 238.3125, + "completions/mean_length": 145.9013671875, + "completions/mean_terminated_length": 126.70299863815308, + "completions/min_length": 42.625, + "completions/min_terminated_length": 42.625, + "entropy": 0.06281583779491484, + "epoch": 0.2487852283770651, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.18120716512203217, + "learning_rate": 5e-05, + "loss": -0.0064, + "num_tokens": 19826255.0, + "reward": 8.296767592430115, + "reward_std": 1.2354702651500702, + "rewards/bm25_retrieval_reward_fn/mean": 0.8322994858026505, + "rewards/bm25_retrieval_reward_fn/std": 0.347878853790462, + "rewards/event_reward_fn/mean": 6.6259765625, + "rewards/event_reward_fn/std": 4.630821079015732, + "rewards/format_reward_fn/mean": 0.8384914398193359, + "rewards/format_reward_fn/std": 0.3501485912129283, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.193359375, + "completions/max_length": 255.875, + "completions/max_terminated_length": 238.25, + "completions/mean_length": 149.7919921875, + "completions/mean_terminated_length": 124.16296577453613, + "completions/min_length": 45.875, + "completions/min_terminated_length": 45.875, + "entropy": 0.06578910606913269, + "epoch": 0.26433430515063167, + "frac_reward_zero_std": 0.359375, + "grad_norm": 0.1267288774251938, + "learning_rate": 5e-05, + "loss": -0.0136, + "num_tokens": 21125750.0, + "reward": 8.781003445386887, + "reward_std": 1.1383938118815422, + "rewards/bm25_retrieval_reward_fn/mean": 0.7845699526369572, + "rewards/bm25_retrieval_reward_fn/std": 0.37547132885083556, + "rewards/event_reward_fn/mean": 7.2001953125, + "rewards/event_reward_fn/std": 5.128496631979942, + "rewards/format_reward_fn/mean": 0.7962380684912205, + "rewards/format_reward_fn/std": 0.3762207794934511, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1591796875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 240.5625, + "completions/mean_length": 153.1923828125, + "completions/mean_terminated_length": 134.3278865814209, + "completions/min_length": 47.875, + "completions/min_terminated_length": 47.875, + "entropy": 0.06297733471728861, + "epoch": 0.27988338192419826, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.1375938057899475, + "learning_rate": 5e-05, + "loss": -0.0002, + "num_tokens": 22340679.0, + "reward": 8.529892146587372, + "reward_std": 0.9637267738580704, + "rewards/bm25_retrieval_reward_fn/mean": 0.8117021955549717, + "rewards/bm25_retrieval_reward_fn/std": 0.36230491753667593, + "rewards/event_reward_fn/mean": 6.8916015625, + "rewards/event_reward_fn/std": 4.614271923899651, + "rewards/format_reward_fn/mean": 0.8265885375440121, + "rewards/format_reward_fn/std": 0.3658856125548482, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15234375, + "completions/max_length": 256.0, + "completions/max_terminated_length": 241.3125, + "completions/mean_length": 154.25, + "completions/mean_terminated_length": 135.76823568344116, + "completions/min_length": 48.5625, + "completions/min_terminated_length": 48.5625, + "entropy": 0.06781743955798447, + "epoch": 0.2954324586977648, + "frac_reward_zero_std": 0.37109375, + "grad_norm": 0.1462751030921936, + "learning_rate": 5e-05, + "loss": -0.0151, + "num_tokens": 23587243.0, + "reward": 8.23677259683609, + "reward_std": 1.0035298839211464, + "rewards/bm25_retrieval_reward_fn/mean": 0.8233126699924469, + "rewards/bm25_retrieval_reward_fn/std": 0.3517280900850892, + "rewards/event_reward_fn/mean": 6.5712890625, + "rewards/event_reward_fn/std": 4.795703008770943, + "rewards/format_reward_fn/mean": 0.8421707637608051, + "rewards/format_reward_fn/std": 0.35116075072437525, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1806640625, + "completions/max_length": 256.0, + "completions/max_terminated_length": 249.75, + "completions/mean_length": 166.966796875, + "completions/mean_terminated_length": 147.62712383270264, + "completions/min_length": 61.375, + "completions/min_terminated_length": 61.375, + "entropy": 0.06373983481898904, + "epoch": 0.3109815354713314, + "frac_reward_zero_std": 0.3984375, + "grad_norm": 0.11124490946531296, + "learning_rate": 5e-05, + "loss": 0.003, + "num_tokens": 24903841.0, + "reward": 9.053372412919998, + "reward_std": 0.9619283508509398, + "rewards/bm25_retrieval_reward_fn/mean": 0.7753774374723434, + "rewards/bm25_retrieval_reward_fn/std": 0.390325166285038, + "rewards/event_reward_fn/mean": 7.4970703125, + "rewards/event_reward_fn/std": 4.754469409584999, + "rewards/format_reward_fn/mean": 0.7809244766831398, + "rewards/format_reward_fn/std": 0.39305115677416325, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2158203125, + "completions/max_length": 256.0, + "completions/max_terminated_length": 242.0625, + "completions/mean_length": 168.8544921875, + "completions/mean_terminated_length": 144.7502179145813, + "completions/min_length": 54.75, + "completions/min_terminated_length": 54.75, + "entropy": 0.06674353126436472, + "epoch": 0.32653061224489793, + "frac_reward_zero_std": 0.3046875, + "grad_norm": 0.11352943629026413, + "learning_rate": 5e-05, + "loss": 0.0032, + "num_tokens": 26176048.0, + "reward": 8.903641551733017, + "reward_std": 1.0895367171615362, + "rewards/bm25_retrieval_reward_fn/mean": 0.7484599277377129, + "rewards/bm25_retrieval_reward_fn/std": 0.39963601250201464, + "rewards/event_reward_fn/mean": 7.390625, + "rewards/event_reward_fn/std": 5.010941222310066, + "rewards/format_reward_fn/mean": 0.7645566947758198, + "rewards/format_reward_fn/std": 0.40272433403879404, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.201171875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 169.0126953125, + "completions/mean_terminated_length": 147.5876121520996, + "completions/min_length": 54.375, + "completions/min_terminated_length": 54.375, + "entropy": 0.06429841788485646, + "epoch": 0.34207968901846453, + "frac_reward_zero_std": 0.3828125, + "grad_norm": 0.15351000428199768, + "learning_rate": 5e-05, + "loss": -0.0046, + "num_tokens": 27486521.0, + "reward": 9.137612909078598, + "reward_std": 1.0497351847589016, + "rewards/bm25_retrieval_reward_fn/mean": 0.7691521309316158, + "rewards/bm25_retrieval_reward_fn/std": 0.3831571042537689, + "rewards/event_reward_fn/mean": 7.5888671875, + "rewards/event_reward_fn/std": 5.132935270667076, + "rewards/format_reward_fn/mean": 0.7795935608446598, + "rewards/format_reward_fn/std": 0.3872489295899868, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.18359375, + "completions/max_length": 256.0, + "completions/max_terminated_length": 237.75, + "completions/mean_length": 163.7431640625, + "completions/mean_terminated_length": 143.10296440124512, + "completions/min_length": 51.8125, + "completions/min_terminated_length": 51.8125, + "entropy": 0.06813837168738246, + "epoch": 0.3576287657920311, + "frac_reward_zero_std": 0.3359375, + "grad_norm": 0.17290791869163513, + "learning_rate": 5e-05, + "loss": 0.0092, + "num_tokens": 28746262.0, + "reward": 9.347127586603165, + "reward_std": 1.1120197921991348, + "rewards/bm25_retrieval_reward_fn/mean": 0.7918755821883678, + "rewards/bm25_retrieval_reward_fn/std": 0.3813342722132802, + "rewards/event_reward_fn/mean": 7.75390625, + "rewards/event_reward_fn/std": 5.131016373634338, + "rewards/format_reward_fn/mean": 0.8013457953929901, + "rewards/format_reward_fn/std": 0.3841324523091316, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.169921875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 244.625, + "completions/mean_length": 163.630859375, + "completions/mean_terminated_length": 145.52464532852173, + "completions/min_length": 53.8125, + "completions/min_terminated_length": 53.8125, + "entropy": 0.06338186049833894, + "epoch": 0.37317784256559766, + "frac_reward_zero_std": 0.42578125, + "grad_norm": 0.15385830402374268, + "learning_rate": 5e-05, + "loss": 0.0024, + "num_tokens": 29971032.0, + "reward": 8.972355782985687, + "reward_std": 0.9110043197870255, + "rewards/bm25_retrieval_reward_fn/mean": 0.8088140487670898, + "rewards/bm25_retrieval_reward_fn/std": 0.35705708153545856, + "rewards/event_reward_fn/mean": 7.345703125, + "rewards/event_reward_fn/std": 4.8114437609910965, + "rewards/format_reward_fn/mean": 0.817838542163372, + "rewards/format_reward_fn/std": 0.35746027156710625, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1533203125, + "completions/max_length": 256.0, + "completions/max_terminated_length": 239.5, + "completions/mean_length": 163.6767578125, + "completions/mean_terminated_length": 147.22436618804932, + "completions/min_length": 49.375, + "completions/min_terminated_length": 49.375, + "entropy": 0.06737338448874652, + "epoch": 0.38872691933916426, + "frac_reward_zero_std": 0.3671875, + "grad_norm": 0.11516160517930984, + "learning_rate": 5e-05, + "loss": -0.0018, + "num_tokens": 31274657.0, + "reward": 9.454054236412048, + "reward_std": 1.1389728896319866, + "rewards/bm25_retrieval_reward_fn/mean": 0.8227716907858849, + "rewards/bm25_retrieval_reward_fn/std": 0.35767858382314444, + "rewards/event_reward_fn/mean": 7.80078125, + "rewards/event_reward_fn/std": 4.638232260942459, + "rewards/format_reward_fn/mean": 0.8305013030767441, + "rewards/format_reward_fn/std": 0.3585221981629729, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.212890625, + "completions/max_length": 255.9375, + "completions/max_terminated_length": 249.6875, + "completions/mean_length": 176.8935546875, + "completions/mean_terminated_length": 156.1867184638977, + "completions/min_length": 67.1875, + "completions/min_terminated_length": 67.1875, + "entropy": 0.07263953145593405, + "epoch": 0.4042759961127308, + "frac_reward_zero_std": 0.32421875, + "grad_norm": 0.12655992805957794, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_tokens": 32597096.0, + "reward": 9.407644420862198, + "reward_std": 1.1765358839184046, + "rewards/bm25_retrieval_reward_fn/mean": 0.7457954213023186, + "rewards/bm25_retrieval_reward_fn/std": 0.4088666429743171, + "rewards/event_reward_fn/mean": 7.8974609375, + "rewards/event_reward_fn/std": 5.147656410932541, + "rewards/format_reward_fn/mean": 0.764388021081686, + "rewards/format_reward_fn/std": 0.39705412182956934, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2021484375, + "completions/max_length": 256.0, + "completions/max_terminated_length": 245.125, + "completions/mean_length": 175.6865234375, + "completions/mean_terminated_length": 155.94458389282227, + "completions/min_length": 69.0625, + "completions/min_terminated_length": 69.0625, + "entropy": 0.0728312199935317, + "epoch": 0.4198250728862974, + "frac_reward_zero_std": 0.3203125, + "grad_norm": 0.14607630670070648, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_tokens": 33869635.0, + "reward": 8.863501250743866, + "reward_std": 1.1473261304199696, + "rewards/bm25_retrieval_reward_fn/mean": 0.7608970887959003, + "rewards/bm25_retrieval_reward_fn/std": 0.39875176921486855, + "rewards/event_reward_fn/mean": 7.32421875, + "rewards/event_reward_fn/std": 4.745256543159485, + "rewards/format_reward_fn/mean": 0.7783854156732559, + "rewards/format_reward_fn/std": 0.4040640462189913, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2216796875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 184.185546875, + "completions/mean_terminated_length": 164.68915843963623, + "completions/min_length": 79.1875, + "completions/min_terminated_length": 79.1875, + "entropy": 0.07330505712889135, + "epoch": 0.43537414965986393, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.11223267763853073, + "learning_rate": 5e-05, + "loss": 0.0046, + "num_tokens": 35198117.0, + "reward": 9.225192874670029, + "reward_std": 1.0972841531038284, + "rewards/bm25_retrieval_reward_fn/mean": 0.756638091057539, + "rewards/bm25_retrieval_reward_fn/std": 0.39518506824970245, + "rewards/event_reward_fn/mean": 7.6982421875, + "rewards/event_reward_fn/std": 4.850593596696854, + "rewards/format_reward_fn/mean": 0.7703125029802322, + "rewards/format_reward_fn/std": 0.39637050684541464, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.181640625, + "completions/max_length": 256.0, + "completions/max_terminated_length": 243.4375, + "completions/mean_length": 167.8017578125, + "completions/mean_terminated_length": 149.08932733535767, + "completions/min_length": 67.0625, + "completions/min_terminated_length": 67.0625, + "entropy": 0.06937285792082548, + "epoch": 0.4509232264334305, + "frac_reward_zero_std": 0.34765625, + "grad_norm": 0.277972549200058, + "learning_rate": 5e-05, + "loss": 0.005, + "num_tokens": 36479498.0, + "reward": 9.970476865768433, + "reward_std": 1.2180952616035938, + "rewards/bm25_retrieval_reward_fn/mean": 0.7747513987123966, + "rewards/bm25_retrieval_reward_fn/std": 0.3933687787503004, + "rewards/event_reward_fn/mean": 8.412109375, + "rewards/event_reward_fn/std": 5.536467835307121, + "rewards/format_reward_fn/mean": 0.7836160659790039, + "rewards/format_reward_fn/std": 0.3935097064822912, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1650390625, + "completions/max_length": 256.0, + "completions/max_terminated_length": 241.75, + "completions/mean_length": 165.2724609375, + "completions/mean_terminated_length": 147.48859310150146, + "completions/min_length": 61.3125, + "completions/min_terminated_length": 61.3125, + "entropy": 0.062284021405503154, + "epoch": 0.46647230320699706, + "frac_reward_zero_std": 0.3203125, + "grad_norm": 0.19548486173152924, + "learning_rate": 5e-05, + "loss": -0.0031, + "num_tokens": 37738681.0, + "reward": 9.709998965263367, + "reward_std": 1.080780379474163, + "rewards/bm25_retrieval_reward_fn/mean": 0.7859884761273861, + "rewards/bm25_retrieval_reward_fn/std": 0.3875921927392483, + "rewards/event_reward_fn/mean": 8.123046875, + "rewards/event_reward_fn/std": 5.239727973937988, + "rewards/format_reward_fn/mean": 0.8009635433554649, + "rewards/format_reward_fn/std": 0.3870681691914797, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.177734375, + "completions/max_length": 256.0, + "completions/max_terminated_length": 243.4375, + "completions/mean_length": 175.16015625, + "completions/mean_terminated_length": 158.66053676605225, + "completions/min_length": 70.4375, + "completions/min_terminated_length": 70.4375, + "entropy": 0.06954935006797314, + "epoch": 0.48202137998056366, + "frac_reward_zero_std": 0.3515625, + "grad_norm": 0.20231647789478302, + "learning_rate": 5e-05, + "loss": -0.0035, + "num_tokens": 39016317.0, + "reward": 9.523818492889404, + "reward_std": 1.1278588809072971, + "rewards/bm25_retrieval_reward_fn/mean": 0.7954656668007374, + "rewards/bm25_retrieval_reward_fn/std": 0.380647461861372, + "rewards/event_reward_fn/mean": 7.9248046875, + "rewards/event_reward_fn/std": 5.266398847103119, + "rewards/format_reward_fn/mean": 0.8035481758415699, + "rewards/format_reward_fn/std": 0.3839748175814748, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2158203125, + "completions/max_length": 255.375, + "completions/max_terminated_length": 247.625, + "completions/mean_length": 189.3232421875, + "completions/mean_terminated_length": 170.41810989379883, + "completions/min_length": 91.3125, + "completions/min_terminated_length": 91.3125, + "entropy": 0.07108506350778043, + "epoch": 0.4975704567541302, + "frac_reward_zero_std": 0.3046875, + "grad_norm": 0.05343855917453766, + "learning_rate": 5e-05, + "loss": 0.0085, + "num_tokens": 40322552.0, + "reward": 9.111008793115616, + "reward_std": 1.051011398434639, + "rewards/bm25_retrieval_reward_fn/mean": 0.7504944987595081, + "rewards/bm25_retrieval_reward_fn/std": 0.3889648839831352, + "rewards/event_reward_fn/mean": 7.58984375, + "rewards/event_reward_fn/std": 4.890510141849518, + "rewards/format_reward_fn/mean": 0.7706705778837204, + "rewards/format_reward_fn/std": 0.39065420906990767, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.185546875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 244.25, + "completions/mean_length": 181.51953125, + "completions/mean_terminated_length": 164.5301055908203, + "completions/min_length": 85.625, + "completions/min_terminated_length": 85.625, + "entropy": 0.07393265794962645, + "epoch": 0.5131195335276968, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.13203799724578857, + "learning_rate": 5e-05, + "loss": 0.0024, + "num_tokens": 41624024.0, + "reward": 9.416305720806122, + "reward_std": 1.0736836642026901, + "rewards/bm25_retrieval_reward_fn/mean": 0.749378640204668, + "rewards/bm25_retrieval_reward_fn/std": 0.39308065082877874, + "rewards/event_reward_fn/mean": 7.89453125, + "rewards/event_reward_fn/std": 5.149897053837776, + "rewards/format_reward_fn/mean": 0.7723958268761635, + "rewards/format_reward_fn/std": 0.3941022912040353, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1455078125, + "completions/max_length": 256.0, + "completions/max_terminated_length": 248.3125, + "completions/mean_length": 183.716796875, + "completions/mean_terminated_length": 171.55659580230713, + "completions/min_length": 93.8125, + "completions/min_terminated_length": 93.8125, + "entropy": 0.07057009753771126, + "epoch": 0.5286686103012633, + "frac_reward_zero_std": 0.30859375, + "grad_norm": 0.14103946089744568, + "learning_rate": 5e-05, + "loss": 0.0125, + "num_tokens": 42874182.0, + "reward": 9.582858800888062, + "reward_std": 0.9491388313472271, + "rewards/bm25_retrieval_reward_fn/mean": 0.8273874409496784, + "rewards/bm25_retrieval_reward_fn/std": 0.33826882019639015, + "rewards/event_reward_fn/mean": 7.9111328125, + "rewards/event_reward_fn/std": 4.9791994243860245, + "rewards/format_reward_fn/mean": 0.844338733702898, + "rewards/format_reward_fn/std": 0.3412060188129544, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1806640625, + "completions/max_length": 256.0, + "completions/max_terminated_length": 241.75, + "completions/mean_length": 182.1201171875, + "completions/mean_terminated_length": 165.9890251159668, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.07039901474490762, + "epoch": 0.54421768707483, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.1732136756181717, + "learning_rate": 5e-05, + "loss": -0.0085, + "num_tokens": 44221753.0, + "reward": 9.929603159427643, + "reward_std": 1.1261513829231262, + "rewards/bm25_retrieval_reward_fn/mean": 0.7931484319269657, + "rewards/bm25_retrieval_reward_fn/std": 0.3795010205358267, + "rewards/event_reward_fn/mean": 8.328125, + "rewards/event_reward_fn/std": 4.941879317164421, + "rewards/format_reward_fn/mean": 0.8083296120166779, + "rewards/format_reward_fn/std": 0.3848690167069435, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.130859375, + "completions/max_length": 255.75, + "completions/max_terminated_length": 248.5625, + "completions/mean_length": 183.498046875, + "completions/mean_terminated_length": 172.7189416885376, + "completions/min_length": 89.5, + "completions/min_terminated_length": 89.5, + "entropy": 0.06706819240935147, + "epoch": 0.5597667638483965, + "frac_reward_zero_std": 0.359375, + "grad_norm": 0.18046796321868896, + "learning_rate": 5e-05, + "loss": 0.0046, + "num_tokens": 45493775.0, + "reward": 9.964149117469788, + "reward_std": 1.0226014591753483, + "rewards/bm25_retrieval_reward_fn/mean": 0.8349823988974094, + "rewards/bm25_retrieval_reward_fn/std": 0.3266658801585436, + "rewards/event_reward_fn/mean": 8.2734375, + "rewards/event_reward_fn/std": 5.3706135004758835, + "rewards/format_reward_fn/mean": 0.855729166418314, + "rewards/format_reward_fn/std": 0.33050147350877523, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.123046875, + "completions/max_length": 254.125, + "completions/max_terminated_length": 240.3125, + "completions/mean_length": 157.6474609375, + "completions/mean_terminated_length": 144.27598762512207, + "completions/min_length": 70.4375, + "completions/min_terminated_length": 70.4375, + "entropy": 0.06692534498870373, + "epoch": 0.5753158406219631, + "frac_reward_zero_std": 0.41015625, + "grad_norm": 0.18845033645629883, + "learning_rate": 5e-05, + "loss": -0.0004, + "num_tokens": 46845374.0, + "reward": 9.60788244009018, + "reward_std": 1.077907931059599, + "rewards/bm25_retrieval_reward_fn/mean": 0.8490281663835049, + "rewards/bm25_retrieval_reward_fn/std": 0.3180042654275894, + "rewards/event_reward_fn/mean": 7.8984375, + "rewards/event_reward_fn/std": 5.3387322425842285, + "rewards/format_reward_fn/mean": 0.8604166693985462, + "rewards/format_reward_fn/std": 0.3179410183802247, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 253.375, + "completions/max_terminated_length": 237.9375, + "completions/mean_length": 141.9638671875, + "completions/mean_terminated_length": 132.3677864074707, + "completions/min_length": 62.875, + "completions/min_terminated_length": 62.875, + "entropy": 0.06903915433213115, + "epoch": 0.5908649173955296, + "frac_reward_zero_std": 0.3984375, + "grad_norm": 0.5333549380302429, + "learning_rate": 5e-05, + "loss": -0.0115, + "num_tokens": 48101641.0, + "reward": 9.540560752153397, + "reward_std": 0.925620548427105, + "rewards/bm25_retrieval_reward_fn/mean": 0.8527389727532864, + "rewards/bm25_retrieval_reward_fn/std": 0.3208633568137884, + "rewards/event_reward_fn/mean": 7.8310546875, + "rewards/event_reward_fn/std": 4.8098659962415695, + "rewards/format_reward_fn/mean": 0.8567671179771423, + "rewards/format_reward_fn/std": 0.3000659542158246, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09765625, + "completions/max_length": 255.0625, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 166.4150390625, + "completions/mean_terminated_length": 156.72006034851074, + "completions/min_length": 85.125, + "completions/min_terminated_length": 85.125, + "entropy": 0.0729338163509965, + "epoch": 0.6064139941690962, + "frac_reward_zero_std": 0.3359375, + "grad_norm": 0.24336110055446625, + "learning_rate": 5e-05, + "loss": -0.0083, + "num_tokens": 49369398.0, + "reward": 8.873536258935928, + "reward_std": 1.0537522435188293, + "rewards/bm25_retrieval_reward_fn/mean": 0.5898543912917376, + "rewards/bm25_retrieval_reward_fn/std": 0.43156279996037483, + "rewards/event_reward_fn/mean": 7.6513671875, + "rewards/event_reward_fn/std": 4.6099734753370285, + "rewards/format_reward_fn/mean": 0.6323146112263203, + "rewards/format_reward_fn/std": 0.3545870538800955, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1171875, + "completions/max_length": 255.875, + "completions/max_terminated_length": 247.125, + "completions/mean_length": 177.6005859375, + "completions/mean_terminated_length": 167.23325157165527, + "completions/min_length": 87.3125, + "completions/min_terminated_length": 87.3125, + "entropy": 0.07631410518661141, + "epoch": 0.6219630709426628, + "frac_reward_zero_std": 0.29296875, + "grad_norm": 0.20491930842399597, + "learning_rate": 5e-05, + "loss": -0.0025, + "num_tokens": 50684041.0, + "reward": 10.097908169031143, + "reward_std": 1.099338386207819, + "rewards/bm25_retrieval_reward_fn/mean": 0.8222074285149574, + "rewards/bm25_retrieval_reward_fn/std": 0.3467689296230674, + "rewards/event_reward_fn/mean": 8.4619140625, + "rewards/event_reward_fn/std": 5.3020381182432175, + "rewards/format_reward_fn/mean": 0.8137868903577328, + "rewards/format_reward_fn/std": 0.34406947437673807, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08203125, + "completions/max_length": 254.125, + "completions/max_terminated_length": 246.4375, + "completions/mean_length": 175.4794921875, + "completions/mean_terminated_length": 168.33812427520752, + "completions/min_length": 90.8125, + "completions/min_terminated_length": 90.8125, + "entropy": 0.0735539214219898, + "epoch": 0.6375121477162293, + "frac_reward_zero_std": 0.328125, + "grad_norm": 0.09572162479162216, + "learning_rate": 5e-05, + "loss": -0.0013, + "num_tokens": 52012688.0, + "reward": 10.082605361938477, + "reward_std": 0.8642270974814892, + "rewards/bm25_retrieval_reward_fn/mean": 0.8720975369215012, + "rewards/bm25_retrieval_reward_fn/std": 0.265175896929577, + "rewards/event_reward_fn/mean": 8.3173828125, + "rewards/event_reward_fn/std": 5.005625352263451, + "rewards/format_reward_fn/mean": 0.8931249976158142, + "rewards/format_reward_fn/std": 0.26579738268628716, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1337890625, + "completions/max_length": 254.5, + "completions/max_terminated_length": 248.5, + "completions/mean_length": 188.3662109375, + "completions/mean_terminated_length": 178.33765697479248, + "completions/min_length": 97.0625, + "completions/min_terminated_length": 97.0625, + "entropy": 0.07762870891019702, + "epoch": 0.6530612244897959, + "frac_reward_zero_std": 0.30078125, + "grad_norm": 0.12062438577413559, + "learning_rate": 5e-05, + "loss": -0.0038, + "num_tokens": 53302267.0, + "reward": 10.330396890640259, + "reward_std": 0.9276157356798649, + "rewards/bm25_retrieval_reward_fn/mean": 0.8132092356681824, + "rewards/bm25_retrieval_reward_fn/std": 0.33317599166184664, + "rewards/event_reward_fn/mean": 8.6669921875, + "rewards/event_reward_fn/std": 5.405571684241295, + "rewards/format_reward_fn/mean": 0.8501953110098839, + "rewards/format_reward_fn/std": 0.3391735916957259, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1806640625, + "completions/max_length": 255.5, + "completions/max_terminated_length": 252.0625, + "completions/mean_length": 195.8662109375, + "completions/mean_terminated_length": 182.70546627044678, + "completions/min_length": 118.8125, + "completions/min_terminated_length": 118.8125, + "entropy": 0.07736558141186833, + "epoch": 0.6686103012633625, + "frac_reward_zero_std": 0.29296875, + "grad_norm": 0.1318187564611435, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_tokens": 54548578.0, + "reward": 10.327336311340332, + "reward_std": 1.0552778337150812, + "rewards/bm25_retrieval_reward_fn/mean": 0.7777924984693527, + "rewards/bm25_retrieval_reward_fn/std": 0.36456546862609684, + "rewards/event_reward_fn/mean": 8.7490234375, + "rewards/event_reward_fn/std": 5.162845477461815, + "rewards/format_reward_fn/mean": 0.8005203679203987, + "rewards/format_reward_fn/std": 0.36596682760864496, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.169921875, + "completions/max_length": 254.9375, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 192.677734375, + "completions/mean_terminated_length": 180.396879196167, + "completions/min_length": 110.875, + "completions/min_terminated_length": 110.875, + "entropy": 0.07653255970217288, + "epoch": 0.6841593780369291, + "frac_reward_zero_std": 0.29296875, + "grad_norm": 0.11105561256408691, + "learning_rate": 5e-05, + "loss": 0.0111, + "num_tokens": 55838164.0, + "reward": 10.28305697441101, + "reward_std": 0.9923089742660522, + "rewards/bm25_retrieval_reward_fn/mean": 0.8106463178992271, + "rewards/bm25_retrieval_reward_fn/std": 0.33590539428405464, + "rewards/event_reward_fn/mean": 8.654296875, + "rewards/event_reward_fn/std": 5.304804667830467, + "rewards/format_reward_fn/mean": 0.8181138336658478, + "rewards/format_reward_fn/std": 0.33803721610456705, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.091796875, + "completions/max_length": 250.6875, + "completions/max_terminated_length": 234.9375, + "completions/mean_length": 174.64453125, + "completions/mean_terminated_length": 166.46386337280273, + "completions/min_length": 96.4375, + "completions/min_terminated_length": 96.4375, + "entropy": 0.0745530491694808, + "epoch": 0.6997084548104956, + "frac_reward_zero_std": 0.33984375, + "grad_norm": 0.23312747478485107, + "learning_rate": 5e-05, + "loss": -0.0036, + "num_tokens": 57159160.0, + "reward": 10.450001329183578, + "reward_std": 0.9695746805518866, + "rewards/bm25_retrieval_reward_fn/mean": 0.8385304771363735, + "rewards/bm25_retrieval_reward_fn/std": 0.32711231615394354, + "rewards/event_reward_fn/mean": 8.76953125, + "rewards/event_reward_fn/std": 5.160630002617836, + "rewards/format_reward_fn/mean": 0.841939639300108, + "rewards/format_reward_fn/std": 0.3311331504955888, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1201171875, + "completions/max_length": 252.625, + "completions/max_terminated_length": 244.75, + "completions/mean_length": 186.2763671875, + "completions/mean_terminated_length": 177.73645687103271, + "completions/min_length": 108.3125, + "completions/min_terminated_length": 108.3125, + "entropy": 0.06967416848056018, + "epoch": 0.7152575315840622, + "frac_reward_zero_std": 0.34765625, + "grad_norm": 0.18518145382404327, + "learning_rate": 5e-05, + "loss": 0.0029, + "num_tokens": 58498911.0, + "reward": 10.422120094299316, + "reward_std": 0.8677894007414579, + "rewards/bm25_retrieval_reward_fn/mean": 0.8313977345824242, + "rewards/bm25_retrieval_reward_fn/std": 0.3058948842808604, + "rewards/event_reward_fn/mean": 8.7490234375, + "rewards/event_reward_fn/std": 5.314541980624199, + "rewards/format_reward_fn/mean": 0.8416987583041191, + "rewards/format_reward_fn/std": 0.30255721998400986, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0556640625, + "completions/max_length": 250.3125, + "completions/max_terminated_length": 239.4375, + "completions/mean_length": 165.4521484375, + "completions/mean_terminated_length": 160.36469841003418, + "completions/min_length": 98.875, + "completions/min_terminated_length": 98.875, + "entropy": 0.07365260319784284, + "epoch": 0.7308066083576288, + "frac_reward_zero_std": 0.3515625, + "grad_norm": 0.1640687733888626, + "learning_rate": 5e-05, + "loss": -0.0024, + "num_tokens": 59787238.0, + "reward": 9.981308668851852, + "reward_std": 0.8546336572617292, + "rewards/bm25_retrieval_reward_fn/mean": 0.8880095556378365, + "rewards/bm25_retrieval_reward_fn/std": 0.26380802411586046, + "rewards/event_reward_fn/mean": 8.1982421875, + "rewards/event_reward_fn/std": 5.018857464194298, + "rewards/format_reward_fn/mean": 0.8950570411980152, + "rewards/format_reward_fn/std": 0.2686548628844321, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05078125, + "completions/max_length": 237.9375, + "completions/max_terminated_length": 218.5, + "completions/mean_length": 147.5947265625, + "completions/mean_terminated_length": 141.85034561157227, + "completions/min_length": 84.9375, + "completions/min_terminated_length": 84.9375, + "entropy": 0.0634845974855125, + "epoch": 0.7463556851311953, + "frac_reward_zero_std": 0.37109375, + "grad_norm": 0.20938096940517426, + "learning_rate": 5e-05, + "loss": -0.0135, + "num_tokens": 61026539.0, + "reward": 10.265896439552307, + "reward_std": 0.9515191409736872, + "rewards/bm25_retrieval_reward_fn/mean": 0.9135676696896553, + "rewards/bm25_retrieval_reward_fn/std": 0.21515763795468956, + "rewards/event_reward_fn/mean": 8.4326171875, + "rewards/event_reward_fn/std": 5.086499974131584, + "rewards/format_reward_fn/mean": 0.9197116829454899, + "rewards/format_reward_fn/std": 0.20483782514929771, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1162109375, + "completions/max_length": 253.375, + "completions/max_terminated_length": 245.1875, + "completions/mean_length": 187.802734375, + "completions/mean_terminated_length": 179.58960628509521, + "completions/min_length": 114.1875, + "completions/min_terminated_length": 114.1875, + "entropy": 0.0702137725893408, + "epoch": 0.7619047619047619, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.18974582850933075, + "learning_rate": 5e-05, + "loss": 0.0031, + "num_tokens": 62313941.0, + "reward": 10.483202993869781, + "reward_std": 0.943267323076725, + "rewards/bm25_retrieval_reward_fn/mean": 0.84468699619174, + "rewards/bm25_retrieval_reward_fn/std": 0.3197319367900491, + "rewards/event_reward_fn/mean": 8.7783203125, + "rewards/event_reward_fn/std": 5.225345551967621, + "rewards/format_reward_fn/mean": 0.8601957745850086, + "rewards/format_reward_fn/std": 0.32167423889040947, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1279296875, + "completions/max_length": 252.5625, + "completions/max_terminated_length": 243.5625, + "completions/mean_length": 186.220703125, + "completions/mean_terminated_length": 176.8216428756714, + "completions/min_length": 120.125, + "completions/min_terminated_length": 120.125, + "entropy": 0.07592986570671201, + "epoch": 0.7774538386783285, + "frac_reward_zero_std": 0.33203125, + "grad_norm": 0.1492370367050171, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_tokens": 63617551.0, + "reward": 9.786191403865814, + "reward_std": 0.9563806988298893, + "rewards/bm25_retrieval_reward_fn/mean": 0.8338383696973324, + "rewards/bm25_retrieval_reward_fn/std": 0.31574585498310626, + "rewards/event_reward_fn/mean": 8.1015625, + "rewards/event_reward_fn/std": 4.919276848435402, + "rewards/format_reward_fn/mean": 0.850790549069643, + "rewards/format_reward_fn/std": 0.31896755122579634, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0654296875, + "completions/max_length": 249.75, + "completions/max_terminated_length": 233.875, + "completions/mean_length": 160.158203125, + "completions/mean_terminated_length": 153.3260145187378, + "completions/min_length": 89.625, + "completions/min_terminated_length": 89.625, + "entropy": 0.06539753102697432, + "epoch": 0.793002915451895, + "frac_reward_zero_std": 0.33984375, + "grad_norm": 0.20876899361610413, + "learning_rate": 5e-05, + "loss": 0.0054, + "num_tokens": 64856573.0, + "reward": 9.658368825912476, + "reward_std": 0.9620554894208908, + "rewards/bm25_retrieval_reward_fn/mean": 0.895107377320528, + "rewards/bm25_retrieval_reward_fn/std": 0.25602476752828807, + "rewards/event_reward_fn/mean": 7.857421875, + "rewards/event_reward_fn/std": 4.493844509124756, + "rewards/format_reward_fn/mean": 0.9058398455381393, + "rewards/format_reward_fn/std": 0.2541873576119542, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0712890625, + "completions/max_length": 249.5, + "completions/max_terminated_length": 236.25, + "completions/mean_length": 169.5673828125, + "completions/mean_terminated_length": 163.07803535461426, + "completions/min_length": 100.75, + "completions/min_terminated_length": 100.75, + "entropy": 0.06818107352592051, + "epoch": 0.8085519922254616, + "frac_reward_zero_std": 0.33203125, + "grad_norm": 0.20962977409362793, + "learning_rate": 5e-05, + "loss": 0.0018, + "num_tokens": 66158990.0, + "reward": 10.353489756584167, + "reward_std": 0.9985193219035864, + "rewards/bm25_retrieval_reward_fn/mean": 0.8550090603530407, + "rewards/bm25_retrieval_reward_fn/std": 0.3100271187722683, + "rewards/event_reward_fn/mean": 8.62890625, + "rewards/event_reward_fn/std": 4.978297606110573, + "rewards/format_reward_fn/mean": 0.8695743456482887, + "rewards/format_reward_fn/std": 0.31247875466942787, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.130859375, + "completions/max_length": 255.625, + "completions/max_terminated_length": 245.0625, + "completions/mean_length": 183.2822265625, + "completions/mean_terminated_length": 172.68112754821777, + "completions/min_length": 108.0625, + "completions/min_terminated_length": 108.0625, + "entropy": 0.07087993528693914, + "epoch": 0.8241010689990281, + "frac_reward_zero_std": 0.328125, + "grad_norm": 0.17762945592403412, + "learning_rate": 5e-05, + "loss": -0.0049, + "num_tokens": 67501687.0, + "reward": 10.45748645067215, + "reward_std": 0.9843454174697399, + "rewards/bm25_retrieval_reward_fn/mean": 0.8369721993803978, + "rewards/bm25_retrieval_reward_fn/std": 0.33558181021362543, + "rewards/event_reward_fn/mean": 8.7734375, + "rewards/event_reward_fn/std": 5.215954706072807, + "rewards/format_reward_fn/mean": 0.8470768220722675, + "rewards/format_reward_fn/std": 0.33892421517521143, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.169921875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 250.8125, + "completions/mean_length": 204.1787109375, + "completions/mean_terminated_length": 193.6075620651245, + "completions/min_length": 138.125, + "completions/min_terminated_length": 138.125, + "entropy": 0.07261033216491342, + "epoch": 0.8396501457725948, + "frac_reward_zero_std": 0.3203125, + "grad_norm": 0.18910834193229675, + "learning_rate": 5e-05, + "loss": 0.0076, + "num_tokens": 68769010.0, + "reward": 10.237849026918411, + "reward_std": 0.9591232761740685, + "rewards/bm25_retrieval_reward_fn/mean": 0.7683875225484371, + "rewards/bm25_retrieval_reward_fn/std": 0.39661576971411705, + "rewards/event_reward_fn/mean": 8.6796875, + "rewards/event_reward_fn/std": 5.074413627386093, + "rewards/format_reward_fn/mean": 0.7897739969193935, + "rewards/format_reward_fn/std": 0.4046425260603428, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.10546875, + "completions/max_length": 255.75, + "completions/max_terminated_length": 243.25, + "completions/mean_length": 185.24609375, + "completions/mean_terminated_length": 176.98966312408447, + "completions/min_length": 115.1875, + "completions/min_terminated_length": 115.1875, + "entropy": 0.07335718860849738, + "epoch": 0.8551992225461613, + "frac_reward_zero_std": 0.30859375, + "grad_norm": 0.19497302174568176, + "learning_rate": 5e-05, + "loss": -0.0045, + "num_tokens": 70072946.0, + "reward": 9.902502715587616, + "reward_std": 1.0277547165751457, + "rewards/bm25_retrieval_reward_fn/mean": 0.8267017714679241, + "rewards/bm25_retrieval_reward_fn/std": 0.33686008118093014, + "rewards/event_reward_fn/mean": 8.2275390625, + "rewards/event_reward_fn/std": 4.601325109601021, + "rewards/format_reward_fn/mean": 0.848261721432209, + "rewards/format_reward_fn/std": 0.33925584983080626, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0771484375, + "completions/max_length": 253.25, + "completions/max_terminated_length": 235.625, + "completions/mean_length": 168.2685546875, + "completions/mean_terminated_length": 160.67493724822998, + "completions/min_length": 91.75, + "completions/min_terminated_length": 91.75, + "entropy": 0.07022251281887293, + "epoch": 0.8707482993197279, + "frac_reward_zero_std": 0.31640625, + "grad_norm": 0.22896689176559448, + "learning_rate": 5e-05, + "loss": -0.0009, + "num_tokens": 71408625.0, + "reward": 10.46012270450592, + "reward_std": 0.9469396620988846, + "rewards/bm25_retrieval_reward_fn/mean": 0.8612882420420647, + "rewards/bm25_retrieval_reward_fn/std": 0.27462146105244756, + "rewards/event_reward_fn/mean": 8.716796875, + "rewards/event_reward_fn/std": 5.030902713537216, + "rewards/format_reward_fn/mean": 0.8820377588272095, + "rewards/format_reward_fn/std": 0.2647479181177914, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05078125, + "completions/max_length": 242.9375, + "completions/max_terminated_length": 230.4375, + "completions/mean_length": 145.1064453125, + "completions/mean_terminated_length": 139.08607959747314, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.06589728174731135, + "epoch": 0.8862973760932945, + "frac_reward_zero_std": 0.3671875, + "grad_norm": 0.3958277702331543, + "learning_rate": 5e-05, + "loss": -0.0044, + "num_tokens": 72662642.0, + "reward": 10.200001657009125, + "reward_std": 0.9421045333147049, + "rewards/bm25_retrieval_reward_fn/mean": 0.9011733010411263, + "rewards/bm25_retrieval_reward_fn/std": 0.2283891054103151, + "rewards/event_reward_fn/mean": 8.373046875, + "rewards/event_reward_fn/std": 4.713611409068108, + "rewards/format_reward_fn/mean": 0.92578125, + "rewards/format_reward_fn/std": 0.21754403738304973, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0283203125, + "completions/max_length": 238.5, + "completions/max_terminated_length": 225.9375, + "completions/mean_length": 132.8330078125, + "completions/mean_terminated_length": 129.05783081054688, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.060941929230466485, + "epoch": 0.901846452866861, + "frac_reward_zero_std": 0.4296875, + "grad_norm": 0.17806340754032135, + "learning_rate": 5e-05, + "loss": -0.0042, + "num_tokens": 73875967.0, + "reward": 9.45022863149643, + "reward_std": 0.7097359485924244, + "rewards/bm25_retrieval_reward_fn/mean": 0.935970850288868, + "rewards/bm25_retrieval_reward_fn/std": 0.16215045971330255, + "rewards/event_reward_fn/mean": 7.5576171875, + "rewards/event_reward_fn/std": 4.745793879032135, + "rewards/format_reward_fn/mean": 0.9566406235098839, + "rewards/format_reward_fn/std": 0.13776301313191652, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 251.5, + "completions/max_terminated_length": 235.5, + "completions/mean_length": 162.619140625, + "completions/mean_terminated_length": 155.03940200805664, + "completions/min_length": 95.9375, + "completions/min_terminated_length": 95.9375, + "entropy": 0.06535043194890022, + "epoch": 0.9173955296404276, + "frac_reward_zero_std": 0.3671875, + "grad_norm": 0.3161742687225342, + "learning_rate": 5e-05, + "loss": -0.0004, + "num_tokens": 75159981.0, + "reward": 10.09964656829834, + "reward_std": 0.951118241995573, + "rewards/bm25_retrieval_reward_fn/mean": 0.8730840981006622, + "rewards/bm25_retrieval_reward_fn/std": 0.2608068126719445, + "rewards/event_reward_fn/mean": 8.326171875, + "rewards/event_reward_fn/std": 5.628681242465973, + "rewards/format_reward_fn/mean": 0.900390625, + "rewards/format_reward_fn/std": 0.25107863638550043, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1083984375, + "completions/max_length": 252.875, + "completions/max_terminated_length": 241.9375, + "completions/mean_length": 188.71875, + "completions/mean_terminated_length": 181.2633514404297, + "completions/min_length": 117.625, + "completions/min_terminated_length": 117.625, + "entropy": 0.07137463777326047, + "epoch": 0.9329446064139941, + "frac_reward_zero_std": 0.36328125, + "grad_norm": 0.2805193066596985, + "learning_rate": 5e-05, + "loss": -0.0023, + "num_tokens": 76415209.0, + "reward": 10.079432845115662, + "reward_std": 0.7802535220980644, + "rewards/bm25_retrieval_reward_fn/mean": 0.8216202445328236, + "rewards/bm25_retrieval_reward_fn/std": 0.3154827356338501, + "rewards/event_reward_fn/mean": 8.40625, + "rewards/event_reward_fn/std": 5.284300252795219, + "rewards/format_reward_fn/mean": 0.8515625, + "rewards/format_reward_fn/std": 0.3159356191754341, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.162109375, + "completions/max_length": 256.0, + "completions/max_terminated_length": 246.875, + "completions/mean_length": 205.09375, + "completions/mean_terminated_length": 195.3371343612671, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.06873999303206801, + "epoch": 0.9484936831875608, + "frac_reward_zero_std": 0.328125, + "grad_norm": 0.10008546710014343, + "learning_rate": 5e-05, + "loss": 0.0041, + "num_tokens": 77770037.0, + "reward": 10.352019369602203, + "reward_std": 0.7665594182908535, + "rewards/bm25_retrieval_reward_fn/mean": 0.7921560294926167, + "rewards/bm25_retrieval_reward_fn/std": 0.3632864858955145, + "rewards/event_reward_fn/mean": 8.732421875, + "rewards/event_reward_fn/std": 5.339399605989456, + "rewards/format_reward_fn/mean": 0.8274414055049419, + "rewards/format_reward_fn/std": 0.37096375692635775, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0654296875, + "completions/max_length": 254.0, + "completions/max_terminated_length": 241.5, + "completions/mean_length": 184.2421875, + "completions/mean_terminated_length": 179.25956344604492, + "completions/min_length": 118.875, + "completions/min_terminated_length": 118.875, + "entropy": 0.07302290247753263, + "epoch": 0.9640427599611273, + "frac_reward_zero_std": 0.32421875, + "grad_norm": 0.11569506675004959, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 79058385.0, + "reward": 10.270190715789795, + "reward_std": 0.9035040959715843, + "rewards/bm25_retrieval_reward_fn/mean": 0.8707767426967621, + "rewards/bm25_retrieval_reward_fn/std": 0.2817615191452205, + "rewards/event_reward_fn/mean": 8.4990234375, + "rewards/event_reward_fn/std": 5.17444010078907, + "rewards/format_reward_fn/mean": 0.900390625, + "rewards/format_reward_fn/std": 0.27954914048314095, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.083984375, + "completions/max_length": 256.0, + "completions/max_terminated_length": 241.625, + "completions/mean_length": 177.517578125, + "completions/mean_terminated_length": 170.22113609313965, + "completions/min_length": 109.5625, + "completions/min_terminated_length": 109.5625, + "entropy": 0.07096637412905693, + "epoch": 0.9795918367346939, + "frac_reward_zero_std": 0.328125, + "grad_norm": 0.24779611825942993, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_tokens": 80333979.0, + "reward": 10.588190495967865, + "reward_std": 0.8662494085729122, + "rewards/bm25_retrieval_reward_fn/mean": 0.8662735223770142, + "rewards/bm25_retrieval_reward_fn/std": 0.29072041157633066, + "rewards/event_reward_fn/mean": 8.833984375, + "rewards/event_reward_fn/std": 5.076077088713646, + "rewards/format_reward_fn/mean": 0.8879324793815613, + "rewards/format_reward_fn/std": 0.28930215165019035, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0673828125, + "completions/max_length": 252.1875, + "completions/max_terminated_length": 235.3125, + "completions/mean_length": 171.123046875, + "completions/mean_terminated_length": 165.06201934814453, + "completions/min_length": 99.5625, + "completions/min_terminated_length": 99.5625, + "entropy": 0.06698882719501853, + "epoch": 0.9951409135082604, + "frac_reward_zero_std": 0.35546875, + "grad_norm": 0.19038081169128418, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_tokens": 81595449.0, + "reward": 10.268950939178467, + "reward_std": 0.8857842069119215, + "rewards/bm25_retrieval_reward_fn/mean": 0.8906435556709766, + "rewards/bm25_retrieval_reward_fn/std": 0.25779614597558975, + "rewards/event_reward_fn/mean": 8.462890625, + "rewards/event_reward_fn/std": 5.074081584811211, + "rewards/format_reward_fn/mean": 0.915416669100523, + "rewards/format_reward_fn/std": 0.25232047867029905, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08806818181818182, + "completions/max_length": 254.45454545454547, + "completions/max_terminated_length": 244.9090909090909, + "completions/mean_length": 178.65482954545453, + "completions/mean_terminated_length": 171.0937597101385, + "completions/min_length": 112.27272727272727, + "completions/min_terminated_length": 112.27272727272727, + "entropy": 0.07033889300443909, + "epoch": 1.010689990281827, + "frac_reward_zero_std": 0.3465909090909091, + "grad_norm": 0.13537470996379852, + "learning_rate": 5e-05, + "loss": -0.0016, + "num_tokens": 82875743.0, + "reward": 10.102936571294611, + "reward_std": 0.9548169591210105, + "rewards/bm25_retrieval_reward_fn/mean": 0.8734858144413341, + "rewards/bm25_retrieval_reward_fn/std": 0.2784238914874467, + "rewards/event_reward_fn/mean": 8.342329545454545, + "rewards/event_reward_fn/std": 4.7735207947817715, + "rewards/format_reward_fn/mean": 0.8871212113987316, + "rewards/format_reward_fn/std": 0.27951826561581006, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.123046875, + "completions/max_length": 255.5625, + "completions/max_terminated_length": 248.1875, + "completions/mean_length": 187.521484375, + "completions/mean_terminated_length": 177.78209781646729, + "completions/min_length": 120.125, + "completions/min_terminated_length": 120.125, + "entropy": 0.06871294020675123, + "epoch": 1.0262390670553936, + "frac_reward_zero_std": 0.38671875, + "grad_norm": 0.14529068768024445, + "learning_rate": 5e-05, + "loss": -0.0064, + "num_tokens": 84193629.0, + "reward": 10.711235225200653, + "reward_std": 0.9264990799129009, + "rewards/bm25_retrieval_reward_fn/mean": 0.8358808867633343, + "rewards/bm25_retrieval_reward_fn/std": 0.3423158023506403, + "rewards/event_reward_fn/mean": 9.029296875, + "rewards/event_reward_fn/std": 5.407557427883148, + "rewards/format_reward_fn/mean": 0.8460574820637703, + "rewards/format_reward_fn/std": 0.34603168070316315, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.095703125, + "completions/max_length": 256.0, + "completions/max_terminated_length": 245.625, + "completions/mean_length": 188.896484375, + "completions/mean_terminated_length": 181.97695064544678, + "completions/min_length": 120.8125, + "completions/min_terminated_length": 120.8125, + "entropy": 0.06455810344778001, + "epoch": 1.0417881438289602, + "frac_reward_zero_std": 0.34765625, + "grad_norm": 0.17880740761756897, + "learning_rate": 5e-05, + "loss": -0.0014, + "num_tokens": 85537059.0, + "reward": 10.491520524024963, + "reward_std": 0.9325292967259884, + "rewards/bm25_retrieval_reward_fn/mean": 0.841754749417305, + "rewards/bm25_retrieval_reward_fn/std": 0.3239448321983218, + "rewards/event_reward_fn/mean": 8.7900390625, + "rewards/event_reward_fn/std": 5.293820217251778, + "rewards/format_reward_fn/mean": 0.8597265593707561, + "rewards/format_reward_fn/std": 0.3177201831713319, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 255.5, + "completions/max_terminated_length": 244.5, + "completions/mean_length": 200.1123046875, + "completions/mean_terminated_length": 190.06258392333984, + "completions/min_length": 137.1875, + "completions/min_terminated_length": 137.1875, + "entropy": 0.06795511720702052, + "epoch": 1.0573372206025267, + "frac_reward_zero_std": 0.29296875, + "grad_norm": 0.08574163913726807, + "learning_rate": 5e-05, + "loss": -0.0069, + "num_tokens": 86860678.0, + "reward": 10.812386631965637, + "reward_std": 1.0998864620923996, + "rewards/bm25_retrieval_reward_fn/mean": 0.7638319730758667, + "rewards/bm25_retrieval_reward_fn/std": 0.3960200799629092, + "rewards/event_reward_fn/mean": 9.2724609375, + "rewards/event_reward_fn/std": 6.0772674679756165, + "rewards/format_reward_fn/mean": 0.7760937549173832, + "rewards/format_reward_fn/std": 0.4006781214848161, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2978515625, + "completions/max_length": 256.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 220.5654296875, + "completions/mean_terminated_length": 206.34245109558105, + "completions/min_length": 159.6875, + "completions/min_terminated_length": 159.6875, + "entropy": 0.07582874409854412, + "epoch": 1.0728862973760933, + "frac_reward_zero_std": 0.234375, + "grad_norm": 0.20225036144256592, + "learning_rate": 5e-05, + "loss": 0.0128, + "num_tokens": 88190949.0, + "reward": 10.548758864402771, + "reward_std": 0.9557082541286945, + "rewards/bm25_retrieval_reward_fn/mean": 0.6565842125564814, + "rewards/bm25_retrieval_reward_fn/std": 0.42724930588155985, + "rewards/event_reward_fn/mean": 9.2119140625, + "rewards/event_reward_fn/std": 5.52141310274601, + "rewards/format_reward_fn/mean": 0.6802604161202908, + "rewards/format_reward_fn/std": 0.4410219779238105, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.068359375, + "completions/max_length": 254.9375, + "completions/max_terminated_length": 243.1875, + "completions/mean_length": 181.650390625, + "completions/mean_terminated_length": 176.39045429229736, + "completions/min_length": 119.75, + "completions/min_terminated_length": 119.75, + "entropy": 0.07010088441893458, + "epoch": 1.08843537414966, + "frac_reward_zero_std": 0.30859375, + "grad_norm": 0.1840677410364151, + "learning_rate": 5e-05, + "loss": 0.002, + "num_tokens": 89433463.0, + "reward": 10.5318962931633, + "reward_std": 1.0016295239329338, + "rewards/bm25_retrieval_reward_fn/mean": 0.8906202651560307, + "rewards/bm25_retrieval_reward_fn/std": 0.2701933770440519, + "rewards/event_reward_fn/mean": 8.732421875, + "rewards/event_reward_fn/std": 5.466498285531998, + "rewards/format_reward_fn/mean": 0.9088541679084301, + "rewards/format_reward_fn/std": 0.26082120556384325, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.056640625, + "completions/max_length": 252.0, + "completions/max_terminated_length": 241.0625, + "completions/mean_length": 172.640625, + "completions/mean_terminated_length": 167.69061183929443, + "completions/min_length": 109.125, + "completions/min_terminated_length": 109.125, + "entropy": 0.06799150491133332, + "epoch": 1.1039844509232264, + "frac_reward_zero_std": 0.4140625, + "grad_norm": 0.13713772594928741, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_tokens": 90709603.0, + "reward": 10.70878279209137, + "reward_std": 0.8687677383422852, + "rewards/bm25_retrieval_reward_fn/mean": 0.901556234806776, + "rewards/bm25_retrieval_reward_fn/std": 0.25905836455058306, + "rewards/event_reward_fn/mean": 8.890625, + "rewards/event_reward_fn/std": 5.81499570608139, + "rewards/format_reward_fn/mean": 0.9166015610098839, + "rewards/format_reward_fn/std": 0.2588641280308366, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0537109375, + "completions/max_length": 245.375, + "completions/max_terminated_length": 232.6875, + "completions/mean_length": 173.802734375, + "completions/mean_terminated_length": 169.22315788269043, + "completions/min_length": 108.6875, + "completions/min_terminated_length": 108.6875, + "entropy": 0.06564864912070334, + "epoch": 1.119533527696793, + "frac_reward_zero_std": 0.390625, + "grad_norm": 0.12585203349590302, + "learning_rate": 5e-05, + "loss": -0.0042, + "num_tokens": 91936041.0, + "reward": 10.107302486896515, + "reward_std": 0.8662888705730438, + "rewards/bm25_retrieval_reward_fn/mean": 0.8905055709183216, + "rewards/bm25_retrieval_reward_fn/std": 0.27319654333405197, + "rewards/event_reward_fn/mean": 8.3056640625, + "rewards/event_reward_fn/std": 5.399076372385025, + "rewards/format_reward_fn/mean": 0.9111328125, + "rewards/format_reward_fn/std": 0.2696619238704443, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.07421875, + "completions/max_length": 251.875, + "completions/max_terminated_length": 239.5, + "completions/mean_length": 181.62109375, + "completions/mean_terminated_length": 175.5115909576416, + "completions/min_length": 116.1875, + "completions/min_terminated_length": 116.1875, + "entropy": 0.06781496806070209, + "epoch": 1.1350826044703597, + "frac_reward_zero_std": 0.34765625, + "grad_norm": 0.20404520630836487, + "learning_rate": 5e-05, + "loss": -0.0056, + "num_tokens": 93201105.0, + "reward": 10.648929178714752, + "reward_std": 0.9610726498067379, + "rewards/bm25_retrieval_reward_fn/mean": 0.8819760829210281, + "rewards/bm25_retrieval_reward_fn/std": 0.27036565099842846, + "rewards/event_reward_fn/mean": 8.8681640625, + "rewards/event_reward_fn/std": 5.41285502910614, + "rewards/format_reward_fn/mean": 0.8987890630960464, + "rewards/format_reward_fn/std": 0.2619143519550562, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1376953125, + "completions/max_length": 254.375, + "completions/max_terminated_length": 246.9375, + "completions/mean_length": 201.556640625, + "completions/mean_terminated_length": 193.48375415802002, + "completions/min_length": 136.3125, + "completions/min_terminated_length": 136.3125, + "entropy": 0.06725385342724621, + "epoch": 1.1506316812439261, + "frac_reward_zero_std": 0.30078125, + "grad_norm": 0.1181834414601326, + "learning_rate": 5e-05, + "loss": -0.0024, + "num_tokens": 94576891.0, + "reward": 11.315544486045837, + "reward_std": 0.8351979665458202, + "rewards/bm25_retrieval_reward_fn/mean": 0.8150821626186371, + "rewards/bm25_retrieval_reward_fn/std": 0.34588195278774947, + "rewards/event_reward_fn/mean": 9.6708984375, + "rewards/event_reward_fn/std": 5.589598774909973, + "rewards/format_reward_fn/mean": 0.829563807696104, + "rewards/format_reward_fn/std": 0.34839474968612194, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.154296875, + "completions/max_length": 253.375, + "completions/max_terminated_length": 247.1875, + "completions/mean_length": 203.18359375, + "completions/mean_terminated_length": 194.42267608642578, + "completions/min_length": 138.5625, + "completions/min_terminated_length": 138.5625, + "entropy": 0.07394770160317421, + "epoch": 1.1661807580174928, + "frac_reward_zero_std": 0.26171875, + "grad_norm": 0.18629314005374908, + "learning_rate": 5e-05, + "loss": 0.0049, + "num_tokens": 95881507.0, + "reward": 10.813474893569946, + "reward_std": 1.0365745667368174, + "rewards/bm25_retrieval_reward_fn/mean": 0.8142888676375151, + "rewards/bm25_retrieval_reward_fn/std": 0.3391092037782073, + "rewards/event_reward_fn/mean": 9.1708984375, + "rewards/event_reward_fn/std": 5.419242635369301, + "rewards/format_reward_fn/mean": 0.8282877653837204, + "rewards/format_reward_fn/std": 0.34210248570889235, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0810546875, + "completions/max_length": 251.3125, + "completions/max_terminated_length": 242.875, + "completions/mean_length": 186.673828125, + "completions/mean_terminated_length": 180.55548667907715, + "completions/min_length": 126.1875, + "completions/min_terminated_length": 126.1875, + "entropy": 0.06798666249960661, + "epoch": 1.1817298347910592, + "frac_reward_zero_std": 0.40234375, + "grad_norm": 0.17748276889324188, + "learning_rate": 5e-05, + "loss": -0.0026, + "num_tokens": 97176789.0, + "reward": 10.461668372154236, + "reward_std": 0.7730772253125906, + "rewards/bm25_retrieval_reward_fn/mean": 0.8607958517968655, + "rewards/bm25_retrieval_reward_fn/std": 0.31750916969031096, + "rewards/event_reward_fn/mean": 8.73046875, + "rewards/event_reward_fn/std": 4.8714660704135895, + "rewards/format_reward_fn/mean": 0.8704036474227905, + "rewards/format_reward_fn/std": 0.3206114452332258, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1318359375, + "completions/max_length": 255.125, + "completions/max_terminated_length": 246.6875, + "completions/mean_length": 200.603515625, + "completions/mean_terminated_length": 192.25616931915283, + "completions/min_length": 141.375, + "completions/min_terminated_length": 141.375, + "entropy": 0.07248709676787257, + "epoch": 1.1972789115646258, + "frac_reward_zero_std": 0.3046875, + "grad_norm": 0.15709719061851501, + "learning_rate": 5e-05, + "loss": -0.002, + "num_tokens": 98527511.0, + "reward": 10.778121054172516, + "reward_std": 0.9276621714234352, + "rewards/bm25_retrieval_reward_fn/mean": 0.848999809473753, + "rewards/bm25_retrieval_reward_fn/std": 0.31297336355783045, + "rewards/event_reward_fn/mean": 9.0654296875, + "rewards/event_reward_fn/std": 5.5686564445495605, + "rewards/format_reward_fn/mean": 0.863691408187151, + "rewards/format_reward_fn/std": 0.31423071026802063, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.12109375, + "completions/max_length": 256.0, + "completions/max_terminated_length": 246.25, + "completions/mean_length": 201.720703125, + "completions/mean_terminated_length": 194.1233615875244, + "completions/min_length": 144.375, + "completions/min_terminated_length": 144.375, + "entropy": 0.07165544992312789, + "epoch": 1.2128279883381925, + "frac_reward_zero_std": 0.26953125, + "grad_norm": 0.14202959835529327, + "learning_rate": 5e-05, + "loss": 0.001, + "num_tokens": 99860557.0, + "reward": 11.258443832397461, + "reward_std": 0.9464571885764599, + "rewards/bm25_retrieval_reward_fn/mean": 0.8287562467157841, + "rewards/bm25_retrieval_reward_fn/std": 0.3439189847558737, + "rewards/event_reward_fn/mean": 9.5859375, + "rewards/event_reward_fn/std": 5.814349502325058, + "rewards/format_reward_fn/mean": 0.84375, + "rewards/format_reward_fn/std": 0.3489691922441125, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.091796875, + "completions/max_length": 254.5, + "completions/max_terminated_length": 245.8125, + "completions/mean_length": 195.880859375, + "completions/mean_terminated_length": 189.886492729187, + "completions/min_length": 131.9375, + "completions/min_terminated_length": 131.9375, + "entropy": 0.07229456026107073, + "epoch": 1.228377065111759, + "frac_reward_zero_std": 0.2890625, + "grad_norm": 0.2180081307888031, + "learning_rate": 5e-05, + "loss": -0.0006, + "num_tokens": 101156375.0, + "reward": 10.139498263597488, + "reward_std": 0.8279522079974413, + "rewards/bm25_retrieval_reward_fn/mean": 0.8485933281481266, + "rewards/bm25_retrieval_reward_fn/std": 0.3134065044578165, + "rewards/event_reward_fn/mean": 8.4267578125, + "rewards/event_reward_fn/std": 5.1162159740924835, + "rewards/format_reward_fn/mean": 0.864147137850523, + "rewards/format_reward_fn/std": 0.31587369833141565, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0693359375, + "completions/max_length": 249.0, + "completions/max_terminated_length": 244.3125, + "completions/mean_length": 191.0654296875, + "completions/mean_terminated_length": 186.35216617584229, + "completions/min_length": 133.375, + "completions/min_terminated_length": 133.375, + "entropy": 0.06208949023857713, + "epoch": 1.2439261418853256, + "frac_reward_zero_std": 0.32421875, + "grad_norm": 0.11896482855081558, + "learning_rate": 5e-05, + "loss": -0.0008, + "num_tokens": 102475714.0, + "reward": 11.101193368434906, + "reward_std": 0.891064302995801, + "rewards/bm25_retrieval_reward_fn/mean": 0.9093973524868488, + "rewards/bm25_retrieval_reward_fn/std": 0.2311963284155354, + "rewards/event_reward_fn/mean": 9.275390625, + "rewards/event_reward_fn/std": 5.045834094285965, + "rewards/format_reward_fn/mean": 0.9164053164422512, + "rewards/format_reward_fn/std": 0.2260741894133389, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.126953125, + "completions/max_length": 256.0, + "completions/max_terminated_length": 248.5625, + "completions/mean_length": 203.279296875, + "completions/mean_terminated_length": 195.46306896209717, + "completions/min_length": 141.875, + "completions/min_terminated_length": 141.875, + "entropy": 0.06049947580322623, + "epoch": 1.259475218658892, + "frac_reward_zero_std": 0.3984375, + "grad_norm": 0.32402676343917847, + "learning_rate": 5e-05, + "loss": 0.004, + "num_tokens": 103787612.0, + "reward": 10.865350365638733, + "reward_std": 0.8294984549283981, + "rewards/bm25_retrieval_reward_fn/mean": 0.8310730122029781, + "rewards/bm25_retrieval_reward_fn/std": 0.3465144941583276, + "rewards/event_reward_fn/mean": 9.1904296875, + "rewards/event_reward_fn/std": 5.264712706208229, + "rewards/format_reward_fn/mean": 0.8438476547598839, + "rewards/format_reward_fn/std": 0.35203980933874846, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.158203125, + "completions/max_length": 255.9375, + "completions/max_terminated_length": 248.4375, + "completions/mean_length": 210.01953125, + "completions/mean_terminated_length": 201.3692398071289, + "completions/min_length": 153.25, + "completions/min_terminated_length": 153.25, + "entropy": 0.06503300159238279, + "epoch": 1.2750242954324587, + "frac_reward_zero_std": 0.3203125, + "grad_norm": 0.12493407726287842, + "learning_rate": 5e-05, + "loss": -0.0002, + "num_tokens": 105090944.0, + "reward": 10.944722652435303, + "reward_std": 0.8488267995417118, + "rewards/bm25_retrieval_reward_fn/mean": 0.8110702559351921, + "rewards/bm25_retrieval_reward_fn/std": 0.3519176107365638, + "rewards/event_reward_fn/mean": 9.3095703125, + "rewards/event_reward_fn/std": 5.613954737782478, + "rewards/format_reward_fn/mean": 0.8240820355713367, + "rewards/format_reward_fn/std": 0.3540602792054415, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1943359375, + "completions/max_length": 256.0, + "completions/max_terminated_length": 252.375, + "completions/mean_length": 220.37890625, + "completions/mean_terminated_length": 212.05939102172852, + "completions/min_length": 168.75, + "completions/min_terminated_length": 168.75, + "entropy": 0.07254446996375918, + "epoch": 1.2905733722060253, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.20231589674949646, + "learning_rate": 5e-05, + "loss": 0.001, + "num_tokens": 106417228.0, + "reward": 10.507185876369476, + "reward_std": 0.8794627524912357, + "rewards/bm25_retrieval_reward_fn/mean": 0.7671468704938889, + "rewards/bm25_retrieval_reward_fn/std": 0.3928522327914834, + "rewards/event_reward_fn/mean": 8.9560546875, + "rewards/event_reward_fn/std": 4.9830086678266525, + "rewards/format_reward_fn/mean": 0.7839843779802322, + "rewards/format_reward_fn/std": 0.3996342560276389, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.162109375, + "completions/max_length": 256.0, + "completions/max_terminated_length": 250.5, + "completions/mean_length": 212.796875, + "completions/mean_terminated_length": 205.06715965270996, + "completions/min_length": 154.0625, + "completions/min_terminated_length": 154.0625, + "entropy": 0.07244179910048842, + "epoch": 1.306122448979592, + "frac_reward_zero_std": 0.29296875, + "grad_norm": 0.11867273598909378, + "learning_rate": 5e-05, + "loss": 0.0027, + "num_tokens": 107788032.0, + "reward": 9.871440827846527, + "reward_std": 0.9372463561594486, + "rewards/bm25_retrieval_reward_fn/mean": 0.7559298947453499, + "rewards/bm25_retrieval_reward_fn/std": 0.39855979569256306, + "rewards/event_reward_fn/mean": 8.3486328125, + "rewards/event_reward_fn/std": 5.14014707505703, + "rewards/format_reward_fn/mean": 0.766878254711628, + "rewards/format_reward_fn/std": 0.40443217288702726, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.060546875, + "completions/max_length": 250.5, + "completions/max_terminated_length": 242.5625, + "completions/mean_length": 190.1435546875, + "completions/mean_terminated_length": 186.02939891815186, + "completions/min_length": 129.1875, + "completions/min_terminated_length": 129.1875, + "entropy": 0.07182836486026645, + "epoch": 1.3216715257531584, + "frac_reward_zero_std": 0.3359375, + "grad_norm": 0.2237968146800995, + "learning_rate": 5e-05, + "loss": 0.0051, + "num_tokens": 109081171.0, + "reward": 11.108476847410202, + "reward_std": 0.8281007707118988, + "rewards/bm25_retrieval_reward_fn/mean": 0.8979316018521786, + "rewards/bm25_retrieval_reward_fn/std": 0.2320653998758644, + "rewards/event_reward_fn/mean": 9.30078125, + "rewards/event_reward_fn/std": 5.342449679970741, + "rewards/format_reward_fn/mean": 0.90976407751441, + "rewards/format_reward_fn/std": 0.2275423549581319, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0478515625, + "completions/max_length": 249.8125, + "completions/max_terminated_length": 240.5625, + "completions/mean_length": 181.1884765625, + "completions/mean_terminated_length": 177.4005880355835, + "completions/min_length": 118.5, + "completions/min_terminated_length": 118.5, + "entropy": 0.06897289073094726, + "epoch": 1.337220602526725, + "frac_reward_zero_std": 0.3203125, + "grad_norm": 0.3482232987880707, + "learning_rate": 5e-05, + "loss": -0.0049, + "num_tokens": 110353716.0, + "reward": 10.944713652133942, + "reward_std": 0.9444422572851181, + "rewards/bm25_retrieval_reward_fn/mean": 0.8998373299837112, + "rewards/bm25_retrieval_reward_fn/std": 0.23540139599936083, + "rewards/event_reward_fn/mean": 9.1416015625, + "rewards/event_reward_fn/std": 5.0793561935424805, + "rewards/format_reward_fn/mean": 0.9032747447490692, + "rewards/format_reward_fn/std": 0.249714526347816, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.083984375, + "completions/max_length": 251.3125, + "completions/max_terminated_length": 241.8125, + "completions/mean_length": 191.3935546875, + "completions/mean_terminated_length": 185.62936782836914, + "completions/min_length": 134.875, + "completions/min_terminated_length": 134.875, + "entropy": 0.07020568964071572, + "epoch": 1.3527696793002915, + "frac_reward_zero_std": 0.36328125, + "grad_norm": 0.26569458842277527, + "learning_rate": 5e-05, + "loss": -0.0078, + "num_tokens": 111677811.0, + "reward": 11.232036709785461, + "reward_std": 0.8982522189617157, + "rewards/bm25_retrieval_reward_fn/mean": 0.8747124671936035, + "rewards/bm25_retrieval_reward_fn/std": 0.28446152550168335, + "rewards/event_reward_fn/mean": 9.4736328125, + "rewards/event_reward_fn/std": 5.696656331419945, + "rewards/format_reward_fn/mean": 0.8836914077401161, + "rewards/format_reward_fn/std": 0.2844822397455573, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1943359375, + "completions/max_length": 256.0, + "completions/max_terminated_length": 251.3125, + "completions/mean_length": 217.27734375, + "completions/mean_terminated_length": 207.9519443511963, + "completions/min_length": 165.875, + "completions/min_terminated_length": 165.875, + "entropy": 0.07557977363467216, + "epoch": 1.3683187560738581, + "frac_reward_zero_std": 0.3515625, + "grad_norm": 0.19800527393817902, + "learning_rate": 5e-05, + "loss": 0.0051, + "num_tokens": 112955859.0, + "reward": 10.414989709854126, + "reward_std": 0.8083504606038332, + "rewards/bm25_retrieval_reward_fn/mean": 0.7803216241300106, + "rewards/bm25_retrieval_reward_fn/std": 0.3945994917303324, + "rewards/event_reward_fn/mean": 8.841796875, + "rewards/event_reward_fn/std": 4.6407610476017, + "rewards/format_reward_fn/mean": 0.7928710989654064, + "rewards/format_reward_fn/std": 0.39927749149501324, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1201171875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 205.212890625, + "completions/mean_terminated_length": 198.47776794433594, + "completions/min_length": 152.8125, + "completions/min_terminated_length": 152.8125, + "entropy": 0.07301379647105932, + "epoch": 1.3838678328474248, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.13404177129268646, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_tokens": 114336889.0, + "reward": 11.331741988658905, + "reward_std": 0.8269859068095684, + "rewards/bm25_retrieval_reward_fn/mean": 0.8490662761032581, + "rewards/bm25_retrieval_reward_fn/std": 0.32220354955643415, + "rewards/event_reward_fn/mean": 9.6259765625, + "rewards/event_reward_fn/std": 5.535077631473541, + "rewards/format_reward_fn/mean": 0.8566992208361626, + "rewards/format_reward_fn/std": 0.3254187796264887, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0771484375, + "completions/max_length": 254.125, + "completions/max_terminated_length": 244.625, + "completions/mean_length": 195.3828125, + "completions/mean_terminated_length": 190.21417903900146, + "completions/min_length": 143.1875, + "completions/min_terminated_length": 143.1875, + "entropy": 0.07323169219307601, + "epoch": 1.3994169096209912, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.5236871242523193, + "learning_rate": 5e-05, + "loss": -0.0001, + "num_tokens": 115649713.0, + "reward": 11.00212150812149, + "reward_std": 0.9517039023339748, + "rewards/bm25_retrieval_reward_fn/mean": 0.8683973699808121, + "rewards/bm25_retrieval_reward_fn/std": 0.3078096741810441, + "rewards/event_reward_fn/mean": 9.25390625, + "rewards/event_reward_fn/std": 5.412711590528488, + "rewards/format_reward_fn/mean": 0.8798177093267441, + "rewards/format_reward_fn/std": 0.3116344837471843, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05859375, + "completions/max_length": 255.0, + "completions/max_terminated_length": 247.0625, + "completions/mean_length": 194.5361328125, + "completions/mean_terminated_length": 190.66623401641846, + "completions/min_length": 140.625, + "completions/min_terminated_length": 140.625, + "entropy": 0.0805603014305234, + "epoch": 1.4149659863945578, + "frac_reward_zero_std": 0.3359375, + "grad_norm": 0.24460569024085999, + "learning_rate": 5e-05, + "loss": -0.0001, + "num_tokens": 116942470.0, + "reward": 10.231546640396118, + "reward_std": 0.7946171164512634, + "rewards/bm25_retrieval_reward_fn/mean": 0.9014858566224575, + "rewards/bm25_retrieval_reward_fn/std": 0.2548077297396958, + "rewards/event_reward_fn/mean": 8.4208984375, + "rewards/event_reward_fn/std": 4.865608409047127, + "rewards/format_reward_fn/mean": 0.9091623313724995, + "rewards/format_reward_fn/std": 0.2567377556115389, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05078125, + "completions/max_length": 253.9375, + "completions/max_terminated_length": 245.1875, + "completions/mean_length": 196.1181640625, + "completions/mean_terminated_length": 193.0701208114624, + "completions/min_length": 151.625, + "completions/min_terminated_length": 151.625, + "entropy": 0.08425948722288013, + "epoch": 1.4305150631681243, + "frac_reward_zero_std": 0.27734375, + "grad_norm": 0.207608163356781, + "learning_rate": 5e-05, + "loss": 0.002, + "num_tokens": 118257695.0, + "reward": 10.509873569011688, + "reward_std": 0.8822544571012259, + "rewards/bm25_retrieval_reward_fn/mean": 0.9213385097682476, + "rewards/bm25_retrieval_reward_fn/std": 0.20133669557981193, + "rewards/event_reward_fn/mean": 8.650390625, + "rewards/event_reward_fn/std": 5.170787841081619, + "rewards/format_reward_fn/mean": 0.9381445348262787, + "rewards/format_reward_fn/std": 0.1984235211275518, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.060546875, + "completions/max_length": 249.875, + "completions/max_terminated_length": 240.25, + "completions/mean_length": 186.7001953125, + "completions/mean_terminated_length": 182.21374893188477, + "completions/min_length": 132.8125, + "completions/min_terminated_length": 132.8125, + "entropy": 0.0810700710862875, + "epoch": 1.446064139941691, + "frac_reward_zero_std": 0.3046875, + "grad_norm": 0.2522001564502716, + "learning_rate": 5e-05, + "loss": -0.0016, + "num_tokens": 119571004.0, + "reward": 11.150494575500488, + "reward_std": 0.9185313917696476, + "rewards/bm25_retrieval_reward_fn/mean": 0.9072700254619122, + "rewards/bm25_retrieval_reward_fn/std": 0.238927063299343, + "rewards/event_reward_fn/mean": 9.32421875, + "rewards/event_reward_fn/std": 5.231076046824455, + "rewards/format_reward_fn/mean": 0.9190057702362537, + "rewards/format_reward_fn/std": 0.23810118879191577, + "step": 1488 + } + ], + "logging_steps": 16, + "max_steps": 10290, + "num_input_tokens_seen": 120551388, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}