{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.344995140913508, "eval_steps": 500, "global_step": 5500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0322265625, "completions/max_length": 244.3125, "completions/max_terminated_length": 204.375, "completions/mean_length": 64.1787109375, "completions/mean_terminated_length": 57.63671565055847, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.15918307239189744, "epoch": 0.015549076773566569, "frac_reward_zero_std": 0.453125, "grad_norm": 0.7093124389648438, "learning_rate": 5e-05, "loss": -0.4337, "num_tokens": 1189183.0, "reward": 3.017539083957672, "reward_std": 1.1567719243466854, "rewards/bm25_retrieval_reward_fn/mean": 0.3280859384685755, "rewards/bm25_retrieval_reward_fn/std": 0.38037889264523983, "rewards/event_reward_fn/mean": 2.2734375, "rewards/event_reward_fn/std": 3.2615081816911697, "rewards/format_reward_fn/mean": 0.4160156287252903, "rewards/format_reward_fn/std": 0.33771974220871925, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0478515625, "completions/max_length": 240.6875, "completions/max_terminated_length": 213.0, "completions/mean_length": 96.623046875, "completions/mean_terminated_length": 88.71417284011841, "completions/min_length": 7.3125, "completions/min_terminated_length": 7.3125, "entropy": 0.0870705652050674, "epoch": 0.031098153547133137, "frac_reward_zero_std": 0.34375, "grad_norm": 0.11729823052883148, "learning_rate": 5e-05, "loss": -0.1999, "num_tokens": 2439389.0, "reward": 6.020513415336609, "reward_std": 1.318240948021412, "rewards/bm25_retrieval_reward_fn/mean": 0.636932659894228, "rewards/bm25_retrieval_reward_fn/std": 0.4330139197409153, "rewards/event_reward_fn/mean": 4.67578125, "rewards/event_reward_fn/std": 4.073968470096588, "rewards/format_reward_fn/mean": 0.7077994756400585, "rewards/format_reward_fn/std": 0.37246643379330635, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0498046875, "completions/max_length": 249.75, "completions/max_terminated_length": 224.0, "completions/mean_length": 108.2373046875, "completions/mean_terminated_length": 100.33520174026489, "completions/min_length": 21.9375, "completions/min_terminated_length": 21.9375, "entropy": 0.06846319953911006, "epoch": 0.04664723032069971, "frac_reward_zero_std": 0.39453125, "grad_norm": 0.11189325153827667, "learning_rate": 5e-05, "loss": -0.0794, "num_tokens": 3697760.0, "reward": 6.618032068014145, "reward_std": 1.1964115016162395, "rewards/bm25_retrieval_reward_fn/mean": 0.8193489573895931, "rewards/bm25_retrieval_reward_fn/std": 0.3472439646720886, "rewards/event_reward_fn/mean": 4.94921875, "rewards/event_reward_fn/std": 4.1042004227638245, "rewards/format_reward_fn/mean": 0.8494642823934555, "rewards/format_reward_fn/std": 0.30338616110384464, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.080078125, "completions/max_length": 251.625, "completions/max_terminated_length": 232.75, "completions/mean_length": 114.8154296875, "completions/mean_terminated_length": 102.74591159820557, "completions/min_length": 36.3125, "completions/min_terminated_length": 36.3125, "entropy": 0.0698625217191875, "epoch": 0.062196307094266275, "frac_reward_zero_std": 0.4140625, "grad_norm": 0.1286889612674713, "learning_rate": 5e-05, "loss": -0.0521, "num_tokens": 4908167.0, "reward": 7.294231742620468, "reward_std": 1.1466168127954006, "rewards/bm25_retrieval_reward_fn/mean": 0.833867184817791, "rewards/bm25_retrieval_reward_fn/std": 0.3500053770840168, "rewards/event_reward_fn/mean": 5.6142578125, "rewards/event_reward_fn/std": 4.4990804344415665, "rewards/format_reward_fn/mean": 0.8461067788302898, "rewards/format_reward_fn/std": 0.3374804314225912, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0712890625, "completions/max_length": 246.625, "completions/max_terminated_length": 219.25, "completions/mean_length": 113.826171875, "completions/mean_terminated_length": 103.01660490036011, "completions/min_length": 38.6875, "completions/min_terminated_length": 38.6875, "entropy": 0.0676732650026679, "epoch": 0.07774538386783285, "frac_reward_zero_std": 0.39453125, "grad_norm": 0.08458422869443893, "learning_rate": 5e-05, "loss": -0.0159, "num_tokens": 6128157.0, "reward": 7.6867459416389465, "reward_std": 0.9968220815062523, "rewards/bm25_retrieval_reward_fn/mean": 0.8819040954113007, "rewards/bm25_retrieval_reward_fn/std": 0.29652632866054773, "rewards/event_reward_fn/mean": 5.904296875, "rewards/event_reward_fn/std": 4.028907224535942, "rewards/format_reward_fn/mean": 0.9005450159311295, "rewards/format_reward_fn/std": 0.2789665600284934, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0361328125, "completions/max_length": 247.125, "completions/max_terminated_length": 236.1875, "completions/mean_length": 112.671875, "completions/mean_terminated_length": 107.51665830612183, "completions/min_length": 37.375, "completions/min_terminated_length": 37.375, "entropy": 0.06292958417907357, "epoch": 0.09329446064139942, "frac_reward_zero_std": 0.453125, "grad_norm": 0.09709884226322174, "learning_rate": 5e-05, "loss": -0.0353, "num_tokens": 7275181.0, "reward": 7.98236358165741, "reward_std": 1.0672973282635212, "rewards/bm25_retrieval_reward_fn/mean": 0.9419283382594585, "rewards/bm25_retrieval_reward_fn/std": 0.18250679067568853, "rewards/event_reward_fn/mean": 6.091796875, "rewards/event_reward_fn/std": 4.740213438868523, "rewards/format_reward_fn/mean": 0.9486383907496929, "rewards/format_reward_fn/std": 0.16436113324016333, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1083984375, "completions/max_length": 255.1875, "completions/max_terminated_length": 238.3125, "completions/mean_length": 129.126953125, "completions/mean_terminated_length": 113.86429929733276, "completions/min_length": 41.4375, "completions/min_terminated_length": 41.4375, "entropy": 0.06285874638706446, "epoch": 0.10884353741496598, "frac_reward_zero_std": 0.46484375, "grad_norm": 0.17054593563079834, "learning_rate": 5e-05, "loss": -0.0124, "num_tokens": 8523631.0, "reward": 7.8958849012851715, "reward_std": 1.0757801569998264, "rewards/bm25_retrieval_reward_fn/mean": 0.8616694211959839, "rewards/bm25_retrieval_reward_fn/std": 0.31423775386065245, "rewards/event_reward_fn/mean": 6.1513671875, "rewards/event_reward_fn/std": 4.978878691792488, "rewards/format_reward_fn/mean": 0.8828483074903488, "rewards/format_reward_fn/std": 0.30156402476131916, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0849609375, "completions/max_length": 251.375, "completions/max_terminated_length": 228.4375, "completions/mean_length": 127.2109375, "completions/mean_terminated_length": 115.21368026733398, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.0637968888040632, "epoch": 0.12439261418853255, "frac_reward_zero_std": 0.390625, "grad_norm": 0.1357981413602829, "learning_rate": 5e-05, "loss": -0.0186, "num_tokens": 9707755.0, "reward": 8.13655748963356, "reward_std": 1.0848342552781105, "rewards/bm25_retrieval_reward_fn/mean": 0.8800471648573875, "rewards/bm25_retrieval_reward_fn/std": 0.29066222277469933, "rewards/event_reward_fn/mean": 6.36328125, "rewards/event_reward_fn/std": 4.869352951645851, "rewards/format_reward_fn/mean": 0.8932291641831398, "rewards/format_reward_fn/std": 0.28600863087922335, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1123046875, "completions/max_length": 253.1875, "completions/max_terminated_length": 237.5, "completions/mean_length": 140.646484375, "completions/mean_terminated_length": 126.51056718826294, "completions/min_length": 45.5, "completions/min_terminated_length": 45.5, "entropy": 0.06207763450220227, "epoch": 0.13994169096209913, "frac_reward_zero_std": 0.43359375, "grad_norm": 0.18493860960006714, "learning_rate": 5e-05, "loss": -0.0196, "num_tokens": 10967133.0, "reward": 8.799611210823059, "reward_std": 0.9760072641074657, "rewards/bm25_retrieval_reward_fn/mean": 0.8509132824838161, "rewards/bm25_retrieval_reward_fn/std": 0.3210932519286871, "rewards/event_reward_fn/mean": 7.080078125, "rewards/event_reward_fn/std": 4.870284929871559, "rewards/format_reward_fn/mean": 0.868619792163372, "rewards/format_reward_fn/std": 0.3133174767717719, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.158203125, "completions/max_length": 256.0, "completions/max_terminated_length": 243.6875, "completions/mean_length": 147.4453125, "completions/mean_terminated_length": 126.75604343414307, "completions/min_length": 45.1875, "completions/min_terminated_length": 45.1875, "entropy": 0.07122921152040362, "epoch": 0.1554907677356657, "frac_reward_zero_std": 0.34375, "grad_norm": 0.2770453989505768, "learning_rate": 5e-05, "loss": -0.0029, "num_tokens": 12272289.0, "reward": 8.378659665584564, "reward_std": 1.1576978042721748, "rewards/bm25_retrieval_reward_fn/mean": 0.814004722982645, "rewards/bm25_retrieval_reward_fn/std": 0.3608710467815399, "rewards/event_reward_fn/mean": 6.7333984375, "rewards/event_reward_fn/std": 4.9148435443639755, "rewards/format_reward_fn/mean": 0.8312565125524998, "rewards/format_reward_fn/std": 0.3619283623993397, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.166015625, "completions/max_length": 254.8125, "completions/max_terminated_length": 245.5625, "completions/mean_length": 147.28125, "completions/mean_terminated_length": 126.25686740875244, "completions/min_length": 47.875, "completions/min_terminated_length": 47.875, "entropy": 0.07063461863435805, "epoch": 0.17103984450923226, "frac_reward_zero_std": 0.359375, "grad_norm": 0.20308320224285126, "learning_rate": 5e-05, "loss": -0.0226, "num_tokens": 13517901.0, "reward": 8.586736917495728, "reward_std": 1.180733297020197, "rewards/bm25_retrieval_reward_fn/mean": 0.7950377985835075, "rewards/bm25_retrieval_reward_fn/std": 0.35726089123636484, "rewards/event_reward_fn/mean": 6.9697265625, "rewards/event_reward_fn/std": 4.731213182210922, "rewards/format_reward_fn/mean": 0.8219726607203484, "rewards/format_reward_fn/std": 0.35164750274270773, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1572265625, "completions/max_length": 256.0, "completions/max_terminated_length": 238.375, "completions/mean_length": 139.818359375, "completions/mean_terminated_length": 118.0954179763794, "completions/min_length": 40.8125, "completions/min_terminated_length": 40.8125, "entropy": 0.06718740961514413, "epoch": 0.18658892128279883, "frac_reward_zero_std": 0.41015625, "grad_norm": 0.10902810841798782, "learning_rate": 5e-05, "loss": -0.014, "num_tokens": 14756091.0, "reward": 8.501474261283875, "reward_std": 1.0462469272315502, "rewards/bm25_retrieval_reward_fn/mean": 0.7894043922424316, "rewards/bm25_retrieval_reward_fn/std": 0.36320002656430006, "rewards/event_reward_fn/mean": 6.900390625, "rewards/event_reward_fn/std": 4.841355547308922, "rewards/format_reward_fn/mean": 0.8116793744266033, "rewards/format_reward_fn/std": 0.36351621337234974, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1650390625, "completions/max_length": 256.0, "completions/max_terminated_length": 240.75, "completions/mean_length": 150.31640625, "completions/mean_terminated_length": 129.93299293518066, "completions/min_length": 47.3125, "completions/min_terminated_length": 47.3125, "entropy": 0.06464909669011831, "epoch": 0.2021379980563654, "frac_reward_zero_std": 0.37109375, "grad_norm": 0.14535187184810638, "learning_rate": 5e-05, "loss": -0.0053, "num_tokens": 16038167.0, "reward": 9.072274684906006, "reward_std": 1.216166764497757, "rewards/bm25_retrieval_reward_fn/mean": 0.8119766861200333, "rewards/bm25_retrieval_reward_fn/std": 0.3669638652354479, "rewards/event_reward_fn/mean": 7.439453125, "rewards/event_reward_fn/std": 5.443397417664528, "rewards/format_reward_fn/mean": 0.8208449557423592, "rewards/format_reward_fn/std": 0.3688422851264477, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.123046875, "completions/max_length": 253.125, "completions/max_terminated_length": 236.25, "completions/mean_length": 134.2294921875, "completions/mean_terminated_length": 117.79497241973877, "completions/min_length": 37.875, "completions/min_terminated_length": 37.875, "entropy": 0.06634449982084334, "epoch": 0.21768707482993196, "frac_reward_zero_std": 0.35546875, "grad_norm": 0.0848744735121727, "learning_rate": 5e-05, "loss": -0.0096, "num_tokens": 17312310.0, "reward": 8.67900961637497, "reward_std": 1.2484335452318192, "rewards/bm25_retrieval_reward_fn/mean": 0.8359074406325817, "rewards/bm25_retrieval_reward_fn/std": 0.33306772634387016, "rewards/event_reward_fn/mean": 6.998046875, "rewards/event_reward_fn/std": 5.2749055325984955, "rewards/format_reward_fn/mean": 0.8450553454458714, "rewards/format_reward_fn/std": 0.3290289109572768, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1416015625, "completions/max_length": 251.875, "completions/max_terminated_length": 233.375, "completions/mean_length": 138.087890625, "completions/mean_terminated_length": 119.15170526504517, "completions/min_length": 43.5, "completions/min_terminated_length": 43.5, "entropy": 0.06127542559988797, "epoch": 0.23323615160349853, "frac_reward_zero_std": 0.390625, "grad_norm": 0.11555243283510208, "learning_rate": 5e-05, "loss": -0.0131, "num_tokens": 18542780.0, "reward": 8.50430566072464, "reward_std": 0.9973306134343147, "rewards/bm25_retrieval_reward_fn/mean": 0.8396431356668472, "rewards/bm25_retrieval_reward_fn/std": 0.3250976144336164, "rewards/event_reward_fn/mean": 6.8232421875, "rewards/event_reward_fn/std": 4.391354620456696, "rewards/format_reward_fn/mean": 0.8414202034473419, "rewards/format_reward_fn/std": 0.32731985161080956, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.150390625, "completions/max_length": 256.0, "completions/max_terminated_length": 238.3125, "completions/mean_length": 145.9013671875, "completions/mean_terminated_length": 126.70299863815308, "completions/min_length": 42.625, "completions/min_terminated_length": 42.625, "entropy": 0.06281583779491484, "epoch": 0.2487852283770651, "frac_reward_zero_std": 0.375, "grad_norm": 0.18120716512203217, "learning_rate": 5e-05, "loss": -0.0064, "num_tokens": 19826255.0, "reward": 8.296767592430115, "reward_std": 1.2354702651500702, "rewards/bm25_retrieval_reward_fn/mean": 0.8322994858026505, "rewards/bm25_retrieval_reward_fn/std": 0.347878853790462, "rewards/event_reward_fn/mean": 6.6259765625, "rewards/event_reward_fn/std": 4.630821079015732, "rewards/format_reward_fn/mean": 0.8384914398193359, "rewards/format_reward_fn/std": 0.3501485912129283, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.193359375, "completions/max_length": 255.875, "completions/max_terminated_length": 238.25, "completions/mean_length": 149.7919921875, "completions/mean_terminated_length": 124.16296577453613, "completions/min_length": 45.875, "completions/min_terminated_length": 45.875, "entropy": 0.06578910606913269, "epoch": 0.26433430515063167, "frac_reward_zero_std": 0.359375, "grad_norm": 0.1267288774251938, "learning_rate": 5e-05, "loss": -0.0136, "num_tokens": 21125750.0, "reward": 8.781003445386887, "reward_std": 1.1383938118815422, "rewards/bm25_retrieval_reward_fn/mean": 0.7845699526369572, "rewards/bm25_retrieval_reward_fn/std": 0.37547132885083556, "rewards/event_reward_fn/mean": 7.2001953125, "rewards/event_reward_fn/std": 5.128496631979942, "rewards/format_reward_fn/mean": 0.7962380684912205, "rewards/format_reward_fn/std": 0.3762207794934511, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1591796875, "completions/max_length": 256.0, "completions/max_terminated_length": 240.5625, "completions/mean_length": 153.1923828125, "completions/mean_terminated_length": 134.3278865814209, "completions/min_length": 47.875, "completions/min_terminated_length": 47.875, "entropy": 0.06297733471728861, "epoch": 0.27988338192419826, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1375938057899475, "learning_rate": 5e-05, "loss": -0.0002, "num_tokens": 22340679.0, "reward": 8.529892146587372, "reward_std": 0.9637267738580704, "rewards/bm25_retrieval_reward_fn/mean": 0.8117021955549717, "rewards/bm25_retrieval_reward_fn/std": 0.36230491753667593, "rewards/event_reward_fn/mean": 6.8916015625, "rewards/event_reward_fn/std": 4.614271923899651, "rewards/format_reward_fn/mean": 0.8265885375440121, "rewards/format_reward_fn/std": 0.3658856125548482, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15234375, "completions/max_length": 256.0, "completions/max_terminated_length": 241.3125, "completions/mean_length": 154.25, "completions/mean_terminated_length": 135.76823568344116, "completions/min_length": 48.5625, "completions/min_terminated_length": 48.5625, "entropy": 0.06781743955798447, "epoch": 0.2954324586977648, "frac_reward_zero_std": 0.37109375, "grad_norm": 0.1462751030921936, "learning_rate": 5e-05, "loss": -0.0151, "num_tokens": 23587243.0, "reward": 8.23677259683609, "reward_std": 1.0035298839211464, "rewards/bm25_retrieval_reward_fn/mean": 0.8233126699924469, "rewards/bm25_retrieval_reward_fn/std": 0.3517280900850892, "rewards/event_reward_fn/mean": 6.5712890625, "rewards/event_reward_fn/std": 4.795703008770943, "rewards/format_reward_fn/mean": 0.8421707637608051, "rewards/format_reward_fn/std": 0.35116075072437525, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1806640625, "completions/max_length": 256.0, "completions/max_terminated_length": 249.75, "completions/mean_length": 166.966796875, "completions/mean_terminated_length": 147.62712383270264, "completions/min_length": 61.375, "completions/min_terminated_length": 61.375, "entropy": 0.06373983481898904, "epoch": 0.3109815354713314, "frac_reward_zero_std": 0.3984375, "grad_norm": 0.11124490946531296, "learning_rate": 5e-05, "loss": 0.003, "num_tokens": 24903841.0, "reward": 9.053372412919998, "reward_std": 0.9619283508509398, "rewards/bm25_retrieval_reward_fn/mean": 0.7753774374723434, "rewards/bm25_retrieval_reward_fn/std": 0.390325166285038, "rewards/event_reward_fn/mean": 7.4970703125, "rewards/event_reward_fn/std": 4.754469409584999, "rewards/format_reward_fn/mean": 0.7809244766831398, "rewards/format_reward_fn/std": 0.39305115677416325, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2158203125, "completions/max_length": 256.0, "completions/max_terminated_length": 242.0625, "completions/mean_length": 168.8544921875, "completions/mean_terminated_length": 144.7502179145813, "completions/min_length": 54.75, "completions/min_terminated_length": 54.75, "entropy": 0.06674353126436472, "epoch": 0.32653061224489793, "frac_reward_zero_std": 0.3046875, "grad_norm": 0.11352943629026413, "learning_rate": 5e-05, "loss": 0.0032, "num_tokens": 26176048.0, "reward": 8.903641551733017, "reward_std": 1.0895367171615362, "rewards/bm25_retrieval_reward_fn/mean": 0.7484599277377129, "rewards/bm25_retrieval_reward_fn/std": 0.39963601250201464, "rewards/event_reward_fn/mean": 7.390625, "rewards/event_reward_fn/std": 5.010941222310066, "rewards/format_reward_fn/mean": 0.7645566947758198, "rewards/format_reward_fn/std": 0.40272433403879404, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.201171875, "completions/max_length": 256.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 169.0126953125, "completions/mean_terminated_length": 147.5876121520996, "completions/min_length": 54.375, "completions/min_terminated_length": 54.375, "entropy": 0.06429841788485646, "epoch": 0.34207968901846453, "frac_reward_zero_std": 0.3828125, "grad_norm": 0.15351000428199768, "learning_rate": 5e-05, "loss": -0.0046, "num_tokens": 27486521.0, "reward": 9.137612909078598, "reward_std": 1.0497351847589016, "rewards/bm25_retrieval_reward_fn/mean": 0.7691521309316158, "rewards/bm25_retrieval_reward_fn/std": 0.3831571042537689, "rewards/event_reward_fn/mean": 7.5888671875, "rewards/event_reward_fn/std": 5.132935270667076, "rewards/format_reward_fn/mean": 0.7795935608446598, "rewards/format_reward_fn/std": 0.3872489295899868, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.18359375, "completions/max_length": 256.0, "completions/max_terminated_length": 237.75, "completions/mean_length": 163.7431640625, "completions/mean_terminated_length": 143.10296440124512, "completions/min_length": 51.8125, "completions/min_terminated_length": 51.8125, "entropy": 0.06813837168738246, "epoch": 0.3576287657920311, "frac_reward_zero_std": 0.3359375, "grad_norm": 0.17290791869163513, "learning_rate": 5e-05, "loss": 0.0092, "num_tokens": 28746262.0, "reward": 9.347127586603165, "reward_std": 1.1120197921991348, "rewards/bm25_retrieval_reward_fn/mean": 0.7918755821883678, "rewards/bm25_retrieval_reward_fn/std": 0.3813342722132802, "rewards/event_reward_fn/mean": 7.75390625, "rewards/event_reward_fn/std": 5.131016373634338, "rewards/format_reward_fn/mean": 0.8013457953929901, "rewards/format_reward_fn/std": 0.3841324523091316, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.169921875, "completions/max_length": 256.0, "completions/max_terminated_length": 244.625, "completions/mean_length": 163.630859375, "completions/mean_terminated_length": 145.52464532852173, "completions/min_length": 53.8125, "completions/min_terminated_length": 53.8125, "entropy": 0.06338186049833894, "epoch": 0.37317784256559766, "frac_reward_zero_std": 0.42578125, "grad_norm": 0.15385830402374268, "learning_rate": 5e-05, "loss": 0.0024, "num_tokens": 29971032.0, "reward": 8.972355782985687, "reward_std": 0.9110043197870255, "rewards/bm25_retrieval_reward_fn/mean": 0.8088140487670898, "rewards/bm25_retrieval_reward_fn/std": 0.35705708153545856, "rewards/event_reward_fn/mean": 7.345703125, "rewards/event_reward_fn/std": 4.8114437609910965, "rewards/format_reward_fn/mean": 0.817838542163372, "rewards/format_reward_fn/std": 0.35746027156710625, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1533203125, "completions/max_length": 256.0, "completions/max_terminated_length": 239.5, "completions/mean_length": 163.6767578125, "completions/mean_terminated_length": 147.22436618804932, "completions/min_length": 49.375, "completions/min_terminated_length": 49.375, "entropy": 0.06737338448874652, "epoch": 0.38872691933916426, "frac_reward_zero_std": 0.3671875, "grad_norm": 0.11516160517930984, "learning_rate": 5e-05, "loss": -0.0018, "num_tokens": 31274657.0, "reward": 9.454054236412048, "reward_std": 1.1389728896319866, "rewards/bm25_retrieval_reward_fn/mean": 0.8227716907858849, "rewards/bm25_retrieval_reward_fn/std": 0.35767858382314444, "rewards/event_reward_fn/mean": 7.80078125, "rewards/event_reward_fn/std": 4.638232260942459, "rewards/format_reward_fn/mean": 0.8305013030767441, "rewards/format_reward_fn/std": 0.3585221981629729, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.212890625, "completions/max_length": 255.9375, "completions/max_terminated_length": 249.6875, "completions/mean_length": 176.8935546875, "completions/mean_terminated_length": 156.1867184638977, "completions/min_length": 67.1875, "completions/min_terminated_length": 67.1875, "entropy": 0.07263953145593405, "epoch": 0.4042759961127308, "frac_reward_zero_std": 0.32421875, "grad_norm": 0.12655992805957794, "learning_rate": 5e-05, "loss": 0.0015, "num_tokens": 32597096.0, "reward": 9.407644420862198, "reward_std": 1.1765358839184046, "rewards/bm25_retrieval_reward_fn/mean": 0.7457954213023186, "rewards/bm25_retrieval_reward_fn/std": 0.4088666429743171, "rewards/event_reward_fn/mean": 7.8974609375, "rewards/event_reward_fn/std": 5.147656410932541, "rewards/format_reward_fn/mean": 0.764388021081686, "rewards/format_reward_fn/std": 0.39705412182956934, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2021484375, "completions/max_length": 256.0, "completions/max_terminated_length": 245.125, "completions/mean_length": 175.6865234375, "completions/mean_terminated_length": 155.94458389282227, "completions/min_length": 69.0625, "completions/min_terminated_length": 69.0625, "entropy": 0.0728312199935317, "epoch": 0.4198250728862974, "frac_reward_zero_std": 0.3203125, "grad_norm": 0.14607630670070648, "learning_rate": 5e-05, "loss": 0.0018, "num_tokens": 33869635.0, "reward": 8.863501250743866, "reward_std": 1.1473261304199696, "rewards/bm25_retrieval_reward_fn/mean": 0.7608970887959003, "rewards/bm25_retrieval_reward_fn/std": 0.39875176921486855, "rewards/event_reward_fn/mean": 7.32421875, "rewards/event_reward_fn/std": 4.745256543159485, "rewards/format_reward_fn/mean": 0.7783854156732559, "rewards/format_reward_fn/std": 0.4040640462189913, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2216796875, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 184.185546875, "completions/mean_terminated_length": 164.68915843963623, "completions/min_length": 79.1875, "completions/min_terminated_length": 79.1875, "entropy": 0.07330505712889135, "epoch": 0.43537414965986393, "frac_reward_zero_std": 0.3125, "grad_norm": 0.11223267763853073, "learning_rate": 5e-05, "loss": 0.0046, "num_tokens": 35198117.0, "reward": 9.225192874670029, "reward_std": 1.0972841531038284, "rewards/bm25_retrieval_reward_fn/mean": 0.756638091057539, "rewards/bm25_retrieval_reward_fn/std": 0.39518506824970245, "rewards/event_reward_fn/mean": 7.6982421875, "rewards/event_reward_fn/std": 4.850593596696854, "rewards/format_reward_fn/mean": 0.7703125029802322, "rewards/format_reward_fn/std": 0.39637050684541464, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.181640625, "completions/max_length": 256.0, "completions/max_terminated_length": 243.4375, "completions/mean_length": 167.8017578125, "completions/mean_terminated_length": 149.08932733535767, "completions/min_length": 67.0625, "completions/min_terminated_length": 67.0625, "entropy": 0.06937285792082548, "epoch": 0.4509232264334305, "frac_reward_zero_std": 0.34765625, "grad_norm": 0.277972549200058, "learning_rate": 5e-05, "loss": 0.005, "num_tokens": 36479498.0, "reward": 9.970476865768433, "reward_std": 1.2180952616035938, "rewards/bm25_retrieval_reward_fn/mean": 0.7747513987123966, "rewards/bm25_retrieval_reward_fn/std": 0.3933687787503004, "rewards/event_reward_fn/mean": 8.412109375, "rewards/event_reward_fn/std": 5.536467835307121, "rewards/format_reward_fn/mean": 0.7836160659790039, "rewards/format_reward_fn/std": 0.3935097064822912, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1650390625, "completions/max_length": 256.0, "completions/max_terminated_length": 241.75, "completions/mean_length": 165.2724609375, "completions/mean_terminated_length": 147.48859310150146, "completions/min_length": 61.3125, "completions/min_terminated_length": 61.3125, "entropy": 0.062284021405503154, "epoch": 0.46647230320699706, "frac_reward_zero_std": 0.3203125, "grad_norm": 0.19548486173152924, "learning_rate": 5e-05, "loss": -0.0031, "num_tokens": 37738681.0, "reward": 9.709998965263367, "reward_std": 1.080780379474163, "rewards/bm25_retrieval_reward_fn/mean": 0.7859884761273861, "rewards/bm25_retrieval_reward_fn/std": 0.3875921927392483, "rewards/event_reward_fn/mean": 8.123046875, "rewards/event_reward_fn/std": 5.239727973937988, "rewards/format_reward_fn/mean": 0.8009635433554649, "rewards/format_reward_fn/std": 0.3870681691914797, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.177734375, "completions/max_length": 256.0, "completions/max_terminated_length": 243.4375, "completions/mean_length": 175.16015625, "completions/mean_terminated_length": 158.66053676605225, "completions/min_length": 70.4375, "completions/min_terminated_length": 70.4375, "entropy": 0.06954935006797314, "epoch": 0.48202137998056366, "frac_reward_zero_std": 0.3515625, "grad_norm": 0.20231647789478302, "learning_rate": 5e-05, "loss": -0.0035, "num_tokens": 39016317.0, "reward": 9.523818492889404, "reward_std": 1.1278588809072971, "rewards/bm25_retrieval_reward_fn/mean": 0.7954656668007374, "rewards/bm25_retrieval_reward_fn/std": 0.380647461861372, "rewards/event_reward_fn/mean": 7.9248046875, "rewards/event_reward_fn/std": 5.266398847103119, "rewards/format_reward_fn/mean": 0.8035481758415699, "rewards/format_reward_fn/std": 0.3839748175814748, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2158203125, "completions/max_length": 255.375, "completions/max_terminated_length": 247.625, "completions/mean_length": 189.3232421875, "completions/mean_terminated_length": 170.41810989379883, "completions/min_length": 91.3125, "completions/min_terminated_length": 91.3125, "entropy": 0.07108506350778043, "epoch": 0.4975704567541302, "frac_reward_zero_std": 0.3046875, "grad_norm": 0.05343855917453766, "learning_rate": 5e-05, "loss": 0.0085, "num_tokens": 40322552.0, "reward": 9.111008793115616, "reward_std": 1.051011398434639, "rewards/bm25_retrieval_reward_fn/mean": 0.7504944987595081, "rewards/bm25_retrieval_reward_fn/std": 0.3889648839831352, "rewards/event_reward_fn/mean": 7.58984375, "rewards/event_reward_fn/std": 4.890510141849518, "rewards/format_reward_fn/mean": 0.7706705778837204, "rewards/format_reward_fn/std": 0.39065420906990767, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.185546875, "completions/max_length": 256.0, "completions/max_terminated_length": 244.25, "completions/mean_length": 181.51953125, "completions/mean_terminated_length": 164.5301055908203, "completions/min_length": 85.625, "completions/min_terminated_length": 85.625, "entropy": 0.07393265794962645, "epoch": 0.5131195335276968, "frac_reward_zero_std": 0.3125, "grad_norm": 0.13203799724578857, "learning_rate": 5e-05, "loss": 0.0024, "num_tokens": 41624024.0, "reward": 9.416305720806122, "reward_std": 1.0736836642026901, "rewards/bm25_retrieval_reward_fn/mean": 0.749378640204668, "rewards/bm25_retrieval_reward_fn/std": 0.39308065082877874, "rewards/event_reward_fn/mean": 7.89453125, "rewards/event_reward_fn/std": 5.149897053837776, "rewards/format_reward_fn/mean": 0.7723958268761635, "rewards/format_reward_fn/std": 0.3941022912040353, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1455078125, "completions/max_length": 256.0, "completions/max_terminated_length": 248.3125, "completions/mean_length": 183.716796875, "completions/mean_terminated_length": 171.55659580230713, "completions/min_length": 93.8125, "completions/min_terminated_length": 93.8125, "entropy": 0.07057009753771126, "epoch": 0.5286686103012633, "frac_reward_zero_std": 0.30859375, "grad_norm": 0.14103946089744568, "learning_rate": 5e-05, "loss": 0.0125, "num_tokens": 42874182.0, "reward": 9.582858800888062, "reward_std": 0.9491388313472271, "rewards/bm25_retrieval_reward_fn/mean": 0.8273874409496784, "rewards/bm25_retrieval_reward_fn/std": 0.33826882019639015, "rewards/event_reward_fn/mean": 7.9111328125, "rewards/event_reward_fn/std": 4.9791994243860245, "rewards/format_reward_fn/mean": 0.844338733702898, "rewards/format_reward_fn/std": 0.3412060188129544, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1806640625, "completions/max_length": 256.0, "completions/max_terminated_length": 241.75, "completions/mean_length": 182.1201171875, "completions/mean_terminated_length": 165.9890251159668, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.07039901474490762, "epoch": 0.54421768707483, "frac_reward_zero_std": 0.3125, "grad_norm": 0.1732136756181717, "learning_rate": 5e-05, "loss": -0.0085, "num_tokens": 44221753.0, "reward": 9.929603159427643, "reward_std": 1.1261513829231262, "rewards/bm25_retrieval_reward_fn/mean": 0.7931484319269657, "rewards/bm25_retrieval_reward_fn/std": 0.3795010205358267, "rewards/event_reward_fn/mean": 8.328125, "rewards/event_reward_fn/std": 4.941879317164421, "rewards/format_reward_fn/mean": 0.8083296120166779, "rewards/format_reward_fn/std": 0.3848690167069435, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.130859375, "completions/max_length": 255.75, "completions/max_terminated_length": 248.5625, "completions/mean_length": 183.498046875, "completions/mean_terminated_length": 172.7189416885376, "completions/min_length": 89.5, "completions/min_terminated_length": 89.5, "entropy": 0.06706819240935147, "epoch": 0.5597667638483965, "frac_reward_zero_std": 0.359375, "grad_norm": 0.18046796321868896, "learning_rate": 5e-05, "loss": 0.0046, "num_tokens": 45493775.0, "reward": 9.964149117469788, "reward_std": 1.0226014591753483, "rewards/bm25_retrieval_reward_fn/mean": 0.8349823988974094, "rewards/bm25_retrieval_reward_fn/std": 0.3266658801585436, "rewards/event_reward_fn/mean": 8.2734375, "rewards/event_reward_fn/std": 5.3706135004758835, "rewards/format_reward_fn/mean": 0.855729166418314, "rewards/format_reward_fn/std": 0.33050147350877523, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.123046875, "completions/max_length": 254.125, "completions/max_terminated_length": 240.3125, "completions/mean_length": 157.6474609375, "completions/mean_terminated_length": 144.27598762512207, "completions/min_length": 70.4375, "completions/min_terminated_length": 70.4375, "entropy": 0.06692534498870373, "epoch": 0.5753158406219631, "frac_reward_zero_std": 0.41015625, "grad_norm": 0.18845033645629883, "learning_rate": 5e-05, "loss": -0.0004, "num_tokens": 46845374.0, "reward": 9.60788244009018, "reward_std": 1.077907931059599, "rewards/bm25_retrieval_reward_fn/mean": 0.8490281663835049, "rewards/bm25_retrieval_reward_fn/std": 0.3180042654275894, "rewards/event_reward_fn/mean": 7.8984375, "rewards/event_reward_fn/std": 5.3387322425842285, "rewards/format_reward_fn/mean": 0.8604166693985462, "rewards/format_reward_fn/std": 0.3179410183802247, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 253.375, "completions/max_terminated_length": 237.9375, "completions/mean_length": 141.9638671875, "completions/mean_terminated_length": 132.3677864074707, "completions/min_length": 62.875, "completions/min_terminated_length": 62.875, "entropy": 0.06903915433213115, "epoch": 0.5908649173955296, "frac_reward_zero_std": 0.3984375, "grad_norm": 0.5333549380302429, "learning_rate": 5e-05, "loss": -0.0115, "num_tokens": 48101641.0, "reward": 9.540560752153397, "reward_std": 0.925620548427105, "rewards/bm25_retrieval_reward_fn/mean": 0.8527389727532864, "rewards/bm25_retrieval_reward_fn/std": 0.3208633568137884, "rewards/event_reward_fn/mean": 7.8310546875, "rewards/event_reward_fn/std": 4.8098659962415695, "rewards/format_reward_fn/mean": 0.8567671179771423, "rewards/format_reward_fn/std": 0.3000659542158246, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 255.0625, "completions/max_terminated_length": 246.0, "completions/mean_length": 166.4150390625, "completions/mean_terminated_length": 156.72006034851074, "completions/min_length": 85.125, "completions/min_terminated_length": 85.125, "entropy": 0.0729338163509965, "epoch": 0.6064139941690962, "frac_reward_zero_std": 0.3359375, "grad_norm": 0.24336110055446625, "learning_rate": 5e-05, "loss": -0.0083, "num_tokens": 49369398.0, "reward": 8.873536258935928, "reward_std": 1.0537522435188293, "rewards/bm25_retrieval_reward_fn/mean": 0.5898543912917376, "rewards/bm25_retrieval_reward_fn/std": 0.43156279996037483, "rewards/event_reward_fn/mean": 7.6513671875, "rewards/event_reward_fn/std": 4.6099734753370285, "rewards/format_reward_fn/mean": 0.6323146112263203, "rewards/format_reward_fn/std": 0.3545870538800955, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 255.875, "completions/max_terminated_length": 247.125, "completions/mean_length": 177.6005859375, "completions/mean_terminated_length": 167.23325157165527, "completions/min_length": 87.3125, "completions/min_terminated_length": 87.3125, "entropy": 0.07631410518661141, "epoch": 0.6219630709426628, "frac_reward_zero_std": 0.29296875, "grad_norm": 0.20491930842399597, "learning_rate": 5e-05, "loss": -0.0025, "num_tokens": 50684041.0, "reward": 10.097908169031143, "reward_std": 1.099338386207819, "rewards/bm25_retrieval_reward_fn/mean": 0.8222074285149574, "rewards/bm25_retrieval_reward_fn/std": 0.3467689296230674, "rewards/event_reward_fn/mean": 8.4619140625, "rewards/event_reward_fn/std": 5.3020381182432175, "rewards/format_reward_fn/mean": 0.8137868903577328, "rewards/format_reward_fn/std": 0.34406947437673807, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 254.125, "completions/max_terminated_length": 246.4375, "completions/mean_length": 175.4794921875, "completions/mean_terminated_length": 168.33812427520752, "completions/min_length": 90.8125, "completions/min_terminated_length": 90.8125, "entropy": 0.0735539214219898, "epoch": 0.6375121477162293, "frac_reward_zero_std": 0.328125, "grad_norm": 0.09572162479162216, "learning_rate": 5e-05, "loss": -0.0013, "num_tokens": 52012688.0, "reward": 10.082605361938477, "reward_std": 0.8642270974814892, "rewards/bm25_retrieval_reward_fn/mean": 0.8720975369215012, "rewards/bm25_retrieval_reward_fn/std": 0.265175896929577, "rewards/event_reward_fn/mean": 8.3173828125, "rewards/event_reward_fn/std": 5.005625352263451, "rewards/format_reward_fn/mean": 0.8931249976158142, "rewards/format_reward_fn/std": 0.26579738268628716, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1337890625, "completions/max_length": 254.5, "completions/max_terminated_length": 248.5, "completions/mean_length": 188.3662109375, "completions/mean_terminated_length": 178.33765697479248, "completions/min_length": 97.0625, "completions/min_terminated_length": 97.0625, "entropy": 0.07762870891019702, "epoch": 0.6530612244897959, "frac_reward_zero_std": 0.30078125, "grad_norm": 0.12062438577413559, "learning_rate": 5e-05, "loss": -0.0038, "num_tokens": 53302267.0, "reward": 10.330396890640259, "reward_std": 0.9276157356798649, "rewards/bm25_retrieval_reward_fn/mean": 0.8132092356681824, "rewards/bm25_retrieval_reward_fn/std": 0.33317599166184664, "rewards/event_reward_fn/mean": 8.6669921875, "rewards/event_reward_fn/std": 5.405571684241295, "rewards/format_reward_fn/mean": 0.8501953110098839, "rewards/format_reward_fn/std": 0.3391735916957259, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1806640625, "completions/max_length": 255.5, "completions/max_terminated_length": 252.0625, "completions/mean_length": 195.8662109375, "completions/mean_terminated_length": 182.70546627044678, "completions/min_length": 118.8125, "completions/min_terminated_length": 118.8125, "entropy": 0.07736558141186833, "epoch": 0.6686103012633625, "frac_reward_zero_std": 0.29296875, "grad_norm": 0.1318187564611435, "learning_rate": 5e-05, "loss": 0.0018, "num_tokens": 54548578.0, "reward": 10.327336311340332, "reward_std": 1.0552778337150812, "rewards/bm25_retrieval_reward_fn/mean": 0.7777924984693527, "rewards/bm25_retrieval_reward_fn/std": 0.36456546862609684, "rewards/event_reward_fn/mean": 8.7490234375, "rewards/event_reward_fn/std": 5.162845477461815, "rewards/format_reward_fn/mean": 0.8005203679203987, "rewards/format_reward_fn/std": 0.36596682760864496, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.169921875, "completions/max_length": 254.9375, "completions/max_terminated_length": 246.0, "completions/mean_length": 192.677734375, "completions/mean_terminated_length": 180.396879196167, "completions/min_length": 110.875, "completions/min_terminated_length": 110.875, "entropy": 0.07653255970217288, "epoch": 0.6841593780369291, "frac_reward_zero_std": 0.29296875, "grad_norm": 0.11105561256408691, "learning_rate": 5e-05, "loss": 0.0111, "num_tokens": 55838164.0, "reward": 10.28305697441101, "reward_std": 0.9923089742660522, "rewards/bm25_retrieval_reward_fn/mean": 0.8106463178992271, "rewards/bm25_retrieval_reward_fn/std": 0.33590539428405464, "rewards/event_reward_fn/mean": 8.654296875, "rewards/event_reward_fn/std": 5.304804667830467, "rewards/format_reward_fn/mean": 0.8181138336658478, "rewards/format_reward_fn/std": 0.33803721610456705, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.091796875, "completions/max_length": 250.6875, "completions/max_terminated_length": 234.9375, "completions/mean_length": 174.64453125, "completions/mean_terminated_length": 166.46386337280273, "completions/min_length": 96.4375, "completions/min_terminated_length": 96.4375, "entropy": 0.0745530491694808, "epoch": 0.6997084548104956, "frac_reward_zero_std": 0.33984375, "grad_norm": 0.23312747478485107, "learning_rate": 5e-05, "loss": -0.0036, "num_tokens": 57159160.0, "reward": 10.450001329183578, "reward_std": 0.9695746805518866, "rewards/bm25_retrieval_reward_fn/mean": 0.8385304771363735, "rewards/bm25_retrieval_reward_fn/std": 0.32711231615394354, "rewards/event_reward_fn/mean": 8.76953125, "rewards/event_reward_fn/std": 5.160630002617836, "rewards/format_reward_fn/mean": 0.841939639300108, "rewards/format_reward_fn/std": 0.3311331504955888, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1201171875, "completions/max_length": 252.625, "completions/max_terminated_length": 244.75, "completions/mean_length": 186.2763671875, "completions/mean_terminated_length": 177.73645687103271, "completions/min_length": 108.3125, "completions/min_terminated_length": 108.3125, "entropy": 0.06967416848056018, "epoch": 0.7152575315840622, "frac_reward_zero_std": 0.34765625, "grad_norm": 0.18518145382404327, "learning_rate": 5e-05, "loss": 0.0029, "num_tokens": 58498911.0, "reward": 10.422120094299316, "reward_std": 0.8677894007414579, "rewards/bm25_retrieval_reward_fn/mean": 0.8313977345824242, "rewards/bm25_retrieval_reward_fn/std": 0.3058948842808604, "rewards/event_reward_fn/mean": 8.7490234375, "rewards/event_reward_fn/std": 5.314541980624199, "rewards/format_reward_fn/mean": 0.8416987583041191, "rewards/format_reward_fn/std": 0.30255721998400986, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0556640625, "completions/max_length": 250.3125, "completions/max_terminated_length": 239.4375, "completions/mean_length": 165.4521484375, "completions/mean_terminated_length": 160.36469841003418, "completions/min_length": 98.875, "completions/min_terminated_length": 98.875, "entropy": 0.07365260319784284, "epoch": 0.7308066083576288, "frac_reward_zero_std": 0.3515625, "grad_norm": 0.1640687733888626, "learning_rate": 5e-05, "loss": -0.0024, "num_tokens": 59787238.0, "reward": 9.981308668851852, "reward_std": 0.8546336572617292, "rewards/bm25_retrieval_reward_fn/mean": 0.8880095556378365, "rewards/bm25_retrieval_reward_fn/std": 0.26380802411586046, "rewards/event_reward_fn/mean": 8.1982421875, "rewards/event_reward_fn/std": 5.018857464194298, "rewards/format_reward_fn/mean": 0.8950570411980152, "rewards/format_reward_fn/std": 0.2686548628844321, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 237.9375, "completions/max_terminated_length": 218.5, "completions/mean_length": 147.5947265625, "completions/mean_terminated_length": 141.85034561157227, "completions/min_length": 84.9375, "completions/min_terminated_length": 84.9375, "entropy": 0.0634845974855125, "epoch": 0.7463556851311953, "frac_reward_zero_std": 0.37109375, "grad_norm": 0.20938096940517426, "learning_rate": 5e-05, "loss": -0.0135, "num_tokens": 61026539.0, "reward": 10.265896439552307, "reward_std": 0.9515191409736872, "rewards/bm25_retrieval_reward_fn/mean": 0.9135676696896553, "rewards/bm25_retrieval_reward_fn/std": 0.21515763795468956, "rewards/event_reward_fn/mean": 8.4326171875, "rewards/event_reward_fn/std": 5.086499974131584, "rewards/format_reward_fn/mean": 0.9197116829454899, "rewards/format_reward_fn/std": 0.20483782514929771, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1162109375, "completions/max_length": 253.375, "completions/max_terminated_length": 245.1875, "completions/mean_length": 187.802734375, "completions/mean_terminated_length": 179.58960628509521, "completions/min_length": 114.1875, "completions/min_terminated_length": 114.1875, "entropy": 0.0702137725893408, "epoch": 0.7619047619047619, "frac_reward_zero_std": 0.3125, "grad_norm": 0.18974582850933075, "learning_rate": 5e-05, "loss": 0.0031, "num_tokens": 62313941.0, "reward": 10.483202993869781, "reward_std": 0.943267323076725, "rewards/bm25_retrieval_reward_fn/mean": 0.84468699619174, "rewards/bm25_retrieval_reward_fn/std": 0.3197319367900491, "rewards/event_reward_fn/mean": 8.7783203125, "rewards/event_reward_fn/std": 5.225345551967621, "rewards/format_reward_fn/mean": 0.8601957745850086, "rewards/format_reward_fn/std": 0.32167423889040947, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1279296875, "completions/max_length": 252.5625, "completions/max_terminated_length": 243.5625, "completions/mean_length": 186.220703125, "completions/mean_terminated_length": 176.8216428756714, "completions/min_length": 120.125, "completions/min_terminated_length": 120.125, "entropy": 0.07592986570671201, "epoch": 0.7774538386783285, "frac_reward_zero_std": 0.33203125, "grad_norm": 0.1492370367050171, "learning_rate": 5e-05, "loss": 0.0013, "num_tokens": 63617551.0, "reward": 9.786191403865814, "reward_std": 0.9563806988298893, "rewards/bm25_retrieval_reward_fn/mean": 0.8338383696973324, "rewards/bm25_retrieval_reward_fn/std": 0.31574585498310626, "rewards/event_reward_fn/mean": 8.1015625, "rewards/event_reward_fn/std": 4.919276848435402, "rewards/format_reward_fn/mean": 0.850790549069643, "rewards/format_reward_fn/std": 0.31896755122579634, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0654296875, "completions/max_length": 249.75, "completions/max_terminated_length": 233.875, "completions/mean_length": 160.158203125, "completions/mean_terminated_length": 153.3260145187378, "completions/min_length": 89.625, "completions/min_terminated_length": 89.625, "entropy": 0.06539753102697432, "epoch": 0.793002915451895, "frac_reward_zero_std": 0.33984375, "grad_norm": 0.20876899361610413, "learning_rate": 5e-05, "loss": 0.0054, "num_tokens": 64856573.0, "reward": 9.658368825912476, "reward_std": 0.9620554894208908, "rewards/bm25_retrieval_reward_fn/mean": 0.895107377320528, "rewards/bm25_retrieval_reward_fn/std": 0.25602476752828807, "rewards/event_reward_fn/mean": 7.857421875, "rewards/event_reward_fn/std": 4.493844509124756, "rewards/format_reward_fn/mean": 0.9058398455381393, "rewards/format_reward_fn/std": 0.2541873576119542, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0712890625, "completions/max_length": 249.5, "completions/max_terminated_length": 236.25, "completions/mean_length": 169.5673828125, "completions/mean_terminated_length": 163.07803535461426, "completions/min_length": 100.75, "completions/min_terminated_length": 100.75, "entropy": 0.06818107352592051, "epoch": 0.8085519922254616, "frac_reward_zero_std": 0.33203125, "grad_norm": 0.20962977409362793, "learning_rate": 5e-05, "loss": 0.0018, "num_tokens": 66158990.0, "reward": 10.353489756584167, "reward_std": 0.9985193219035864, "rewards/bm25_retrieval_reward_fn/mean": 0.8550090603530407, "rewards/bm25_retrieval_reward_fn/std": 0.3100271187722683, "rewards/event_reward_fn/mean": 8.62890625, "rewards/event_reward_fn/std": 4.978297606110573, "rewards/format_reward_fn/mean": 0.8695743456482887, "rewards/format_reward_fn/std": 0.31247875466942787, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.130859375, "completions/max_length": 255.625, "completions/max_terminated_length": 245.0625, "completions/mean_length": 183.2822265625, "completions/mean_terminated_length": 172.68112754821777, "completions/min_length": 108.0625, "completions/min_terminated_length": 108.0625, "entropy": 0.07087993528693914, "epoch": 0.8241010689990281, "frac_reward_zero_std": 0.328125, "grad_norm": 0.17762945592403412, "learning_rate": 5e-05, "loss": -0.0049, "num_tokens": 67501687.0, "reward": 10.45748645067215, "reward_std": 0.9843454174697399, "rewards/bm25_retrieval_reward_fn/mean": 0.8369721993803978, "rewards/bm25_retrieval_reward_fn/std": 0.33558181021362543, "rewards/event_reward_fn/mean": 8.7734375, "rewards/event_reward_fn/std": 5.215954706072807, "rewards/format_reward_fn/mean": 0.8470768220722675, "rewards/format_reward_fn/std": 0.33892421517521143, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.169921875, "completions/max_length": 256.0, "completions/max_terminated_length": 250.8125, "completions/mean_length": 204.1787109375, "completions/mean_terminated_length": 193.6075620651245, "completions/min_length": 138.125, "completions/min_terminated_length": 138.125, "entropy": 0.07261033216491342, "epoch": 0.8396501457725948, "frac_reward_zero_std": 0.3203125, "grad_norm": 0.18910834193229675, "learning_rate": 5e-05, "loss": 0.0076, "num_tokens": 68769010.0, "reward": 10.237849026918411, "reward_std": 0.9591232761740685, "rewards/bm25_retrieval_reward_fn/mean": 0.7683875225484371, "rewards/bm25_retrieval_reward_fn/std": 0.39661576971411705, "rewards/event_reward_fn/mean": 8.6796875, "rewards/event_reward_fn/std": 5.074413627386093, "rewards/format_reward_fn/mean": 0.7897739969193935, "rewards/format_reward_fn/std": 0.4046425260603428, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 255.75, "completions/max_terminated_length": 243.25, "completions/mean_length": 185.24609375, "completions/mean_terminated_length": 176.98966312408447, "completions/min_length": 115.1875, "completions/min_terminated_length": 115.1875, "entropy": 0.07335718860849738, "epoch": 0.8551992225461613, "frac_reward_zero_std": 0.30859375, "grad_norm": 0.19497302174568176, "learning_rate": 5e-05, "loss": -0.0045, "num_tokens": 70072946.0, "reward": 9.902502715587616, "reward_std": 1.0277547165751457, "rewards/bm25_retrieval_reward_fn/mean": 0.8267017714679241, "rewards/bm25_retrieval_reward_fn/std": 0.33686008118093014, "rewards/event_reward_fn/mean": 8.2275390625, "rewards/event_reward_fn/std": 4.601325109601021, "rewards/format_reward_fn/mean": 0.848261721432209, "rewards/format_reward_fn/std": 0.33925584983080626, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0771484375, "completions/max_length": 253.25, "completions/max_terminated_length": 235.625, "completions/mean_length": 168.2685546875, "completions/mean_terminated_length": 160.67493724822998, "completions/min_length": 91.75, "completions/min_terminated_length": 91.75, "entropy": 0.07022251281887293, "epoch": 0.8707482993197279, "frac_reward_zero_std": 0.31640625, "grad_norm": 0.22896689176559448, "learning_rate": 5e-05, "loss": -0.0009, "num_tokens": 71408625.0, "reward": 10.46012270450592, "reward_std": 0.9469396620988846, "rewards/bm25_retrieval_reward_fn/mean": 0.8612882420420647, "rewards/bm25_retrieval_reward_fn/std": 0.27462146105244756, "rewards/event_reward_fn/mean": 8.716796875, "rewards/event_reward_fn/std": 5.030902713537216, "rewards/format_reward_fn/mean": 0.8820377588272095, "rewards/format_reward_fn/std": 0.2647479181177914, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 242.9375, "completions/max_terminated_length": 230.4375, "completions/mean_length": 145.1064453125, "completions/mean_terminated_length": 139.08607959747314, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.06589728174731135, "epoch": 0.8862973760932945, "frac_reward_zero_std": 0.3671875, "grad_norm": 0.3958277702331543, "learning_rate": 5e-05, "loss": -0.0044, "num_tokens": 72662642.0, "reward": 10.200001657009125, "reward_std": 0.9421045333147049, "rewards/bm25_retrieval_reward_fn/mean": 0.9011733010411263, "rewards/bm25_retrieval_reward_fn/std": 0.2283891054103151, "rewards/event_reward_fn/mean": 8.373046875, "rewards/event_reward_fn/std": 4.713611409068108, "rewards/format_reward_fn/mean": 0.92578125, "rewards/format_reward_fn/std": 0.21754403738304973, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0283203125, "completions/max_length": 238.5, "completions/max_terminated_length": 225.9375, "completions/mean_length": 132.8330078125, "completions/mean_terminated_length": 129.05783081054688, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.060941929230466485, "epoch": 0.901846452866861, "frac_reward_zero_std": 0.4296875, "grad_norm": 0.17806340754032135, "learning_rate": 5e-05, "loss": -0.0042, "num_tokens": 73875967.0, "reward": 9.45022863149643, "reward_std": 0.7097359485924244, "rewards/bm25_retrieval_reward_fn/mean": 0.935970850288868, "rewards/bm25_retrieval_reward_fn/std": 0.16215045971330255, "rewards/event_reward_fn/mean": 7.5576171875, "rewards/event_reward_fn/std": 4.745793879032135, "rewards/format_reward_fn/mean": 0.9566406235098839, "rewards/format_reward_fn/std": 0.13776301313191652, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 251.5, "completions/max_terminated_length": 235.5, "completions/mean_length": 162.619140625, "completions/mean_terminated_length": 155.03940200805664, "completions/min_length": 95.9375, "completions/min_terminated_length": 95.9375, "entropy": 0.06535043194890022, "epoch": 0.9173955296404276, "frac_reward_zero_std": 0.3671875, "grad_norm": 0.3161742687225342, "learning_rate": 5e-05, "loss": -0.0004, "num_tokens": 75159981.0, "reward": 10.09964656829834, "reward_std": 0.951118241995573, "rewards/bm25_retrieval_reward_fn/mean": 0.8730840981006622, "rewards/bm25_retrieval_reward_fn/std": 0.2608068126719445, "rewards/event_reward_fn/mean": 8.326171875, "rewards/event_reward_fn/std": 5.628681242465973, "rewards/format_reward_fn/mean": 0.900390625, "rewards/format_reward_fn/std": 0.25107863638550043, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1083984375, "completions/max_length": 252.875, "completions/max_terminated_length": 241.9375, "completions/mean_length": 188.71875, "completions/mean_terminated_length": 181.2633514404297, "completions/min_length": 117.625, "completions/min_terminated_length": 117.625, "entropy": 0.07137463777326047, "epoch": 0.9329446064139941, "frac_reward_zero_std": 0.36328125, "grad_norm": 0.2805193066596985, "learning_rate": 5e-05, "loss": -0.0023, "num_tokens": 76415209.0, "reward": 10.079432845115662, "reward_std": 0.7802535220980644, "rewards/bm25_retrieval_reward_fn/mean": 0.8216202445328236, "rewards/bm25_retrieval_reward_fn/std": 0.3154827356338501, "rewards/event_reward_fn/mean": 8.40625, "rewards/event_reward_fn/std": 5.284300252795219, "rewards/format_reward_fn/mean": 0.8515625, "rewards/format_reward_fn/std": 0.3159356191754341, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.162109375, "completions/max_length": 256.0, "completions/max_terminated_length": 246.875, "completions/mean_length": 205.09375, "completions/mean_terminated_length": 195.3371343612671, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.06873999303206801, "epoch": 0.9484936831875608, "frac_reward_zero_std": 0.328125, "grad_norm": 0.10008546710014343, "learning_rate": 5e-05, "loss": 0.0041, "num_tokens": 77770037.0, "reward": 10.352019369602203, "reward_std": 0.7665594182908535, "rewards/bm25_retrieval_reward_fn/mean": 0.7921560294926167, "rewards/bm25_retrieval_reward_fn/std": 0.3632864858955145, "rewards/event_reward_fn/mean": 8.732421875, "rewards/event_reward_fn/std": 5.339399605989456, "rewards/format_reward_fn/mean": 0.8274414055049419, "rewards/format_reward_fn/std": 0.37096375692635775, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0654296875, "completions/max_length": 254.0, "completions/max_terminated_length": 241.5, "completions/mean_length": 184.2421875, "completions/mean_terminated_length": 179.25956344604492, "completions/min_length": 118.875, "completions/min_terminated_length": 118.875, "entropy": 0.07302290247753263, "epoch": 0.9640427599611273, "frac_reward_zero_std": 0.32421875, "grad_norm": 0.11569506675004959, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 79058385.0, "reward": 10.270190715789795, "reward_std": 0.9035040959715843, "rewards/bm25_retrieval_reward_fn/mean": 0.8707767426967621, "rewards/bm25_retrieval_reward_fn/std": 0.2817615191452205, "rewards/event_reward_fn/mean": 8.4990234375, "rewards/event_reward_fn/std": 5.17444010078907, "rewards/format_reward_fn/mean": 0.900390625, "rewards/format_reward_fn/std": 0.27954914048314095, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 256.0, "completions/max_terminated_length": 241.625, "completions/mean_length": 177.517578125, "completions/mean_terminated_length": 170.22113609313965, "completions/min_length": 109.5625, "completions/min_terminated_length": 109.5625, "entropy": 0.07096637412905693, "epoch": 0.9795918367346939, "frac_reward_zero_std": 0.328125, "grad_norm": 0.24779611825942993, "learning_rate": 5e-05, "loss": 0.0015, "num_tokens": 80333979.0, "reward": 10.588190495967865, "reward_std": 0.8662494085729122, "rewards/bm25_retrieval_reward_fn/mean": 0.8662735223770142, "rewards/bm25_retrieval_reward_fn/std": 0.29072041157633066, "rewards/event_reward_fn/mean": 8.833984375, "rewards/event_reward_fn/std": 5.076077088713646, "rewards/format_reward_fn/mean": 0.8879324793815613, "rewards/format_reward_fn/std": 0.28930215165019035, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0673828125, "completions/max_length": 252.1875, "completions/max_terminated_length": 235.3125, "completions/mean_length": 171.123046875, "completions/mean_terminated_length": 165.06201934814453, "completions/min_length": 99.5625, "completions/min_terminated_length": 99.5625, "entropy": 0.06698882719501853, "epoch": 0.9951409135082604, "frac_reward_zero_std": 0.35546875, "grad_norm": 0.19038081169128418, "learning_rate": 5e-05, "loss": 0.0003, "num_tokens": 81595449.0, "reward": 10.268950939178467, "reward_std": 0.8857842069119215, "rewards/bm25_retrieval_reward_fn/mean": 0.8906435556709766, "rewards/bm25_retrieval_reward_fn/std": 0.25779614597558975, "rewards/event_reward_fn/mean": 8.462890625, "rewards/event_reward_fn/std": 5.074081584811211, "rewards/format_reward_fn/mean": 0.915416669100523, "rewards/format_reward_fn/std": 0.25232047867029905, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08806818181818182, "completions/max_length": 254.45454545454547, "completions/max_terminated_length": 244.9090909090909, "completions/mean_length": 178.65482954545453, "completions/mean_terminated_length": 171.0937597101385, "completions/min_length": 112.27272727272727, "completions/min_terminated_length": 112.27272727272727, "entropy": 0.07033889300443909, "epoch": 1.010689990281827, "frac_reward_zero_std": 0.3465909090909091, "grad_norm": 0.13537470996379852, "learning_rate": 5e-05, "loss": -0.0016, "num_tokens": 82875743.0, "reward": 10.102936571294611, "reward_std": 0.9548169591210105, "rewards/bm25_retrieval_reward_fn/mean": 0.8734858144413341, "rewards/bm25_retrieval_reward_fn/std": 0.2784238914874467, "rewards/event_reward_fn/mean": 8.342329545454545, "rewards/event_reward_fn/std": 4.7735207947817715, "rewards/format_reward_fn/mean": 0.8871212113987316, "rewards/format_reward_fn/std": 0.27951826561581006, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.123046875, "completions/max_length": 255.5625, "completions/max_terminated_length": 248.1875, "completions/mean_length": 187.521484375, "completions/mean_terminated_length": 177.78209781646729, "completions/min_length": 120.125, "completions/min_terminated_length": 120.125, "entropy": 0.06871294020675123, "epoch": 1.0262390670553936, "frac_reward_zero_std": 0.38671875, "grad_norm": 0.14529068768024445, "learning_rate": 5e-05, "loss": -0.0064, "num_tokens": 84193629.0, "reward": 10.711235225200653, "reward_std": 0.9264990799129009, "rewards/bm25_retrieval_reward_fn/mean": 0.8358808867633343, "rewards/bm25_retrieval_reward_fn/std": 0.3423158023506403, "rewards/event_reward_fn/mean": 9.029296875, "rewards/event_reward_fn/std": 5.407557427883148, "rewards/format_reward_fn/mean": 0.8460574820637703, "rewards/format_reward_fn/std": 0.34603168070316315, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.095703125, "completions/max_length": 256.0, "completions/max_terminated_length": 245.625, "completions/mean_length": 188.896484375, "completions/mean_terminated_length": 181.97695064544678, "completions/min_length": 120.8125, "completions/min_terminated_length": 120.8125, "entropy": 0.06455810344778001, "epoch": 1.0417881438289602, "frac_reward_zero_std": 0.34765625, "grad_norm": 0.17880740761756897, "learning_rate": 5e-05, "loss": -0.0014, "num_tokens": 85537059.0, "reward": 10.491520524024963, "reward_std": 0.9325292967259884, "rewards/bm25_retrieval_reward_fn/mean": 0.841754749417305, "rewards/bm25_retrieval_reward_fn/std": 0.3239448321983218, "rewards/event_reward_fn/mean": 8.7900390625, "rewards/event_reward_fn/std": 5.293820217251778, "rewards/format_reward_fn/mean": 0.8597265593707561, "rewards/format_reward_fn/std": 0.3177201831713319, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 255.5, "completions/max_terminated_length": 244.5, "completions/mean_length": 200.1123046875, "completions/mean_terminated_length": 190.06258392333984, "completions/min_length": 137.1875, "completions/min_terminated_length": 137.1875, "entropy": 0.06795511720702052, "epoch": 1.0573372206025267, "frac_reward_zero_std": 0.29296875, "grad_norm": 0.08574163913726807, "learning_rate": 5e-05, "loss": -0.0069, "num_tokens": 86860678.0, "reward": 10.812386631965637, "reward_std": 1.0998864620923996, "rewards/bm25_retrieval_reward_fn/mean": 0.7638319730758667, "rewards/bm25_retrieval_reward_fn/std": 0.3960200799629092, "rewards/event_reward_fn/mean": 9.2724609375, "rewards/event_reward_fn/std": 6.0772674679756165, "rewards/format_reward_fn/mean": 0.7760937549173832, "rewards/format_reward_fn/std": 0.4006781214848161, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2978515625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 220.5654296875, "completions/mean_terminated_length": 206.34245109558105, "completions/min_length": 159.6875, "completions/min_terminated_length": 159.6875, "entropy": 0.07582874409854412, "epoch": 1.0728862973760933, "frac_reward_zero_std": 0.234375, "grad_norm": 0.20225036144256592, "learning_rate": 5e-05, "loss": 0.0128, "num_tokens": 88190949.0, "reward": 10.548758864402771, "reward_std": 0.9557082541286945, "rewards/bm25_retrieval_reward_fn/mean": 0.6565842125564814, "rewards/bm25_retrieval_reward_fn/std": 0.42724930588155985, "rewards/event_reward_fn/mean": 9.2119140625, "rewards/event_reward_fn/std": 5.52141310274601, "rewards/format_reward_fn/mean": 0.6802604161202908, "rewards/format_reward_fn/std": 0.4410219779238105, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.068359375, "completions/max_length": 254.9375, "completions/max_terminated_length": 243.1875, "completions/mean_length": 181.650390625, "completions/mean_terminated_length": 176.39045429229736, "completions/min_length": 119.75, "completions/min_terminated_length": 119.75, "entropy": 0.07010088441893458, "epoch": 1.08843537414966, "frac_reward_zero_std": 0.30859375, "grad_norm": 0.1840677410364151, "learning_rate": 5e-05, "loss": 0.002, "num_tokens": 89433463.0, "reward": 10.5318962931633, "reward_std": 1.0016295239329338, "rewards/bm25_retrieval_reward_fn/mean": 0.8906202651560307, "rewards/bm25_retrieval_reward_fn/std": 0.2701933770440519, "rewards/event_reward_fn/mean": 8.732421875, "rewards/event_reward_fn/std": 5.466498285531998, "rewards/format_reward_fn/mean": 0.9088541679084301, "rewards/format_reward_fn/std": 0.26082120556384325, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.056640625, "completions/max_length": 252.0, "completions/max_terminated_length": 241.0625, "completions/mean_length": 172.640625, "completions/mean_terminated_length": 167.69061183929443, "completions/min_length": 109.125, "completions/min_terminated_length": 109.125, "entropy": 0.06799150491133332, "epoch": 1.1039844509232264, "frac_reward_zero_std": 0.4140625, "grad_norm": 0.13713772594928741, "learning_rate": 5e-05, "loss": 0.0008, "num_tokens": 90709603.0, "reward": 10.70878279209137, "reward_std": 0.8687677383422852, "rewards/bm25_retrieval_reward_fn/mean": 0.901556234806776, "rewards/bm25_retrieval_reward_fn/std": 0.25905836455058306, "rewards/event_reward_fn/mean": 8.890625, "rewards/event_reward_fn/std": 5.81499570608139, "rewards/format_reward_fn/mean": 0.9166015610098839, "rewards/format_reward_fn/std": 0.2588641280308366, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0537109375, "completions/max_length": 245.375, "completions/max_terminated_length": 232.6875, "completions/mean_length": 173.802734375, "completions/mean_terminated_length": 169.22315788269043, "completions/min_length": 108.6875, "completions/min_terminated_length": 108.6875, "entropy": 0.06564864912070334, "epoch": 1.119533527696793, "frac_reward_zero_std": 0.390625, "grad_norm": 0.12585203349590302, "learning_rate": 5e-05, "loss": -0.0042, "num_tokens": 91936041.0, "reward": 10.107302486896515, "reward_std": 0.8662888705730438, "rewards/bm25_retrieval_reward_fn/mean": 0.8905055709183216, "rewards/bm25_retrieval_reward_fn/std": 0.27319654333405197, "rewards/event_reward_fn/mean": 8.3056640625, "rewards/event_reward_fn/std": 5.399076372385025, "rewards/format_reward_fn/mean": 0.9111328125, "rewards/format_reward_fn/std": 0.2696619238704443, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 251.875, "completions/max_terminated_length": 239.5, "completions/mean_length": 181.62109375, "completions/mean_terminated_length": 175.5115909576416, "completions/min_length": 116.1875, "completions/min_terminated_length": 116.1875, "entropy": 0.06781496806070209, "epoch": 1.1350826044703597, "frac_reward_zero_std": 0.34765625, "grad_norm": 0.20404520630836487, "learning_rate": 5e-05, "loss": -0.0056, "num_tokens": 93201105.0, "reward": 10.648929178714752, "reward_std": 0.9610726498067379, "rewards/bm25_retrieval_reward_fn/mean": 0.8819760829210281, "rewards/bm25_retrieval_reward_fn/std": 0.27036565099842846, "rewards/event_reward_fn/mean": 8.8681640625, "rewards/event_reward_fn/std": 5.41285502910614, "rewards/format_reward_fn/mean": 0.8987890630960464, "rewards/format_reward_fn/std": 0.2619143519550562, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1376953125, "completions/max_length": 254.375, "completions/max_terminated_length": 246.9375, "completions/mean_length": 201.556640625, "completions/mean_terminated_length": 193.48375415802002, "completions/min_length": 136.3125, "completions/min_terminated_length": 136.3125, "entropy": 0.06725385342724621, "epoch": 1.1506316812439261, "frac_reward_zero_std": 0.30078125, "grad_norm": 0.1181834414601326, "learning_rate": 5e-05, "loss": -0.0024, "num_tokens": 94576891.0, "reward": 11.315544486045837, "reward_std": 0.8351979665458202, "rewards/bm25_retrieval_reward_fn/mean": 0.8150821626186371, "rewards/bm25_retrieval_reward_fn/std": 0.34588195278774947, "rewards/event_reward_fn/mean": 9.6708984375, "rewards/event_reward_fn/std": 5.589598774909973, "rewards/format_reward_fn/mean": 0.829563807696104, "rewards/format_reward_fn/std": 0.34839474968612194, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.154296875, "completions/max_length": 253.375, "completions/max_terminated_length": 247.1875, "completions/mean_length": 203.18359375, "completions/mean_terminated_length": 194.42267608642578, "completions/min_length": 138.5625, "completions/min_terminated_length": 138.5625, "entropy": 0.07394770160317421, "epoch": 1.1661807580174928, "frac_reward_zero_std": 0.26171875, "grad_norm": 0.18629314005374908, "learning_rate": 5e-05, "loss": 0.0049, "num_tokens": 95881507.0, "reward": 10.813474893569946, "reward_std": 1.0365745667368174, "rewards/bm25_retrieval_reward_fn/mean": 0.8142888676375151, "rewards/bm25_retrieval_reward_fn/std": 0.3391092037782073, "rewards/event_reward_fn/mean": 9.1708984375, "rewards/event_reward_fn/std": 5.419242635369301, "rewards/format_reward_fn/mean": 0.8282877653837204, "rewards/format_reward_fn/std": 0.34210248570889235, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0810546875, "completions/max_length": 251.3125, "completions/max_terminated_length": 242.875, "completions/mean_length": 186.673828125, "completions/mean_terminated_length": 180.55548667907715, "completions/min_length": 126.1875, "completions/min_terminated_length": 126.1875, "entropy": 0.06798666249960661, "epoch": 1.1817298347910592, "frac_reward_zero_std": 0.40234375, "grad_norm": 0.17748276889324188, "learning_rate": 5e-05, "loss": -0.0026, "num_tokens": 97176789.0, "reward": 10.461668372154236, "reward_std": 0.7730772253125906, "rewards/bm25_retrieval_reward_fn/mean": 0.8607958517968655, "rewards/bm25_retrieval_reward_fn/std": 0.31750916969031096, "rewards/event_reward_fn/mean": 8.73046875, "rewards/event_reward_fn/std": 4.8714660704135895, "rewards/format_reward_fn/mean": 0.8704036474227905, "rewards/format_reward_fn/std": 0.3206114452332258, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1318359375, "completions/max_length": 255.125, "completions/max_terminated_length": 246.6875, "completions/mean_length": 200.603515625, "completions/mean_terminated_length": 192.25616931915283, "completions/min_length": 141.375, "completions/min_terminated_length": 141.375, "entropy": 0.07248709676787257, "epoch": 1.1972789115646258, "frac_reward_zero_std": 0.3046875, "grad_norm": 0.15709719061851501, "learning_rate": 5e-05, "loss": -0.002, "num_tokens": 98527511.0, "reward": 10.778121054172516, "reward_std": 0.9276621714234352, "rewards/bm25_retrieval_reward_fn/mean": 0.848999809473753, "rewards/bm25_retrieval_reward_fn/std": 0.31297336355783045, "rewards/event_reward_fn/mean": 9.0654296875, "rewards/event_reward_fn/std": 5.5686564445495605, "rewards/format_reward_fn/mean": 0.863691408187151, "rewards/format_reward_fn/std": 0.31423071026802063, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12109375, "completions/max_length": 256.0, "completions/max_terminated_length": 246.25, "completions/mean_length": 201.720703125, "completions/mean_terminated_length": 194.1233615875244, "completions/min_length": 144.375, "completions/min_terminated_length": 144.375, "entropy": 0.07165544992312789, "epoch": 1.2128279883381925, "frac_reward_zero_std": 0.26953125, "grad_norm": 0.14202959835529327, "learning_rate": 5e-05, "loss": 0.001, "num_tokens": 99860557.0, "reward": 11.258443832397461, "reward_std": 0.9464571885764599, "rewards/bm25_retrieval_reward_fn/mean": 0.8287562467157841, "rewards/bm25_retrieval_reward_fn/std": 0.3439189847558737, "rewards/event_reward_fn/mean": 9.5859375, "rewards/event_reward_fn/std": 5.814349502325058, "rewards/format_reward_fn/mean": 0.84375, "rewards/format_reward_fn/std": 0.3489691922441125, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.091796875, "completions/max_length": 254.5, "completions/max_terminated_length": 245.8125, "completions/mean_length": 195.880859375, "completions/mean_terminated_length": 189.886492729187, "completions/min_length": 131.9375, "completions/min_terminated_length": 131.9375, "entropy": 0.07229456026107073, "epoch": 1.228377065111759, "frac_reward_zero_std": 0.2890625, "grad_norm": 0.2180081307888031, "learning_rate": 5e-05, "loss": -0.0006, "num_tokens": 101156375.0, "reward": 10.139498263597488, "reward_std": 0.8279522079974413, "rewards/bm25_retrieval_reward_fn/mean": 0.8485933281481266, "rewards/bm25_retrieval_reward_fn/std": 0.3134065044578165, "rewards/event_reward_fn/mean": 8.4267578125, "rewards/event_reward_fn/std": 5.1162159740924835, "rewards/format_reward_fn/mean": 0.864147137850523, "rewards/format_reward_fn/std": 0.31587369833141565, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0693359375, "completions/max_length": 249.0, "completions/max_terminated_length": 244.3125, "completions/mean_length": 191.0654296875, "completions/mean_terminated_length": 186.35216617584229, "completions/min_length": 133.375, "completions/min_terminated_length": 133.375, "entropy": 0.06208949023857713, "epoch": 1.2439261418853256, "frac_reward_zero_std": 0.32421875, "grad_norm": 0.11896482855081558, "learning_rate": 5e-05, "loss": -0.0008, "num_tokens": 102475714.0, "reward": 11.101193368434906, "reward_std": 0.891064302995801, "rewards/bm25_retrieval_reward_fn/mean": 0.9093973524868488, "rewards/bm25_retrieval_reward_fn/std": 0.2311963284155354, "rewards/event_reward_fn/mean": 9.275390625, "rewards/event_reward_fn/std": 5.045834094285965, "rewards/format_reward_fn/mean": 0.9164053164422512, "rewards/format_reward_fn/std": 0.2260741894133389, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.126953125, "completions/max_length": 256.0, "completions/max_terminated_length": 248.5625, "completions/mean_length": 203.279296875, "completions/mean_terminated_length": 195.46306896209717, "completions/min_length": 141.875, "completions/min_terminated_length": 141.875, "entropy": 0.06049947580322623, "epoch": 1.259475218658892, "frac_reward_zero_std": 0.3984375, "grad_norm": 0.32402676343917847, "learning_rate": 5e-05, "loss": 0.004, "num_tokens": 103787612.0, "reward": 10.865350365638733, "reward_std": 0.8294984549283981, "rewards/bm25_retrieval_reward_fn/mean": 0.8310730122029781, "rewards/bm25_retrieval_reward_fn/std": 0.3465144941583276, "rewards/event_reward_fn/mean": 9.1904296875, "rewards/event_reward_fn/std": 5.264712706208229, "rewards/format_reward_fn/mean": 0.8438476547598839, "rewards/format_reward_fn/std": 0.35203980933874846, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.158203125, "completions/max_length": 255.9375, "completions/max_terminated_length": 248.4375, "completions/mean_length": 210.01953125, "completions/mean_terminated_length": 201.3692398071289, "completions/min_length": 153.25, "completions/min_terminated_length": 153.25, "entropy": 0.06503300159238279, "epoch": 1.2750242954324587, "frac_reward_zero_std": 0.3203125, "grad_norm": 0.12493407726287842, "learning_rate": 5e-05, "loss": -0.0002, "num_tokens": 105090944.0, "reward": 10.944722652435303, "reward_std": 0.8488267995417118, "rewards/bm25_retrieval_reward_fn/mean": 0.8110702559351921, "rewards/bm25_retrieval_reward_fn/std": 0.3519176107365638, "rewards/event_reward_fn/mean": 9.3095703125, "rewards/event_reward_fn/std": 5.613954737782478, "rewards/format_reward_fn/mean": 0.8240820355713367, "rewards/format_reward_fn/std": 0.3540602792054415, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1943359375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.375, "completions/mean_length": 220.37890625, "completions/mean_terminated_length": 212.05939102172852, "completions/min_length": 168.75, "completions/min_terminated_length": 168.75, "entropy": 0.07254446996375918, "epoch": 1.2905733722060253, "frac_reward_zero_std": 0.28125, "grad_norm": 0.20231589674949646, "learning_rate": 5e-05, "loss": 0.001, "num_tokens": 106417228.0, "reward": 10.507185876369476, "reward_std": 0.8794627524912357, "rewards/bm25_retrieval_reward_fn/mean": 0.7671468704938889, "rewards/bm25_retrieval_reward_fn/std": 0.3928522327914834, "rewards/event_reward_fn/mean": 8.9560546875, "rewards/event_reward_fn/std": 4.9830086678266525, "rewards/format_reward_fn/mean": 0.7839843779802322, "rewards/format_reward_fn/std": 0.3996342560276389, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.162109375, "completions/max_length": 256.0, "completions/max_terminated_length": 250.5, "completions/mean_length": 212.796875, "completions/mean_terminated_length": 205.06715965270996, "completions/min_length": 154.0625, "completions/min_terminated_length": 154.0625, "entropy": 0.07244179910048842, "epoch": 1.306122448979592, "frac_reward_zero_std": 0.29296875, "grad_norm": 0.11867273598909378, "learning_rate": 5e-05, "loss": 0.0027, "num_tokens": 107788032.0, "reward": 9.871440827846527, "reward_std": 0.9372463561594486, "rewards/bm25_retrieval_reward_fn/mean": 0.7559298947453499, "rewards/bm25_retrieval_reward_fn/std": 0.39855979569256306, "rewards/event_reward_fn/mean": 8.3486328125, "rewards/event_reward_fn/std": 5.14014707505703, "rewards/format_reward_fn/mean": 0.766878254711628, "rewards/format_reward_fn/std": 0.40443217288702726, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.060546875, "completions/max_length": 250.5, "completions/max_terminated_length": 242.5625, "completions/mean_length": 190.1435546875, "completions/mean_terminated_length": 186.02939891815186, "completions/min_length": 129.1875, "completions/min_terminated_length": 129.1875, "entropy": 0.07182836486026645, "epoch": 1.3216715257531584, "frac_reward_zero_std": 0.3359375, "grad_norm": 0.2237968146800995, "learning_rate": 5e-05, "loss": 0.0051, "num_tokens": 109081171.0, "reward": 11.108476847410202, "reward_std": 0.8281007707118988, "rewards/bm25_retrieval_reward_fn/mean": 0.8979316018521786, "rewards/bm25_retrieval_reward_fn/std": 0.2320653998758644, "rewards/event_reward_fn/mean": 9.30078125, "rewards/event_reward_fn/std": 5.342449679970741, "rewards/format_reward_fn/mean": 0.90976407751441, "rewards/format_reward_fn/std": 0.2275423549581319, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0478515625, "completions/max_length": 249.8125, "completions/max_terminated_length": 240.5625, "completions/mean_length": 181.1884765625, "completions/mean_terminated_length": 177.4005880355835, "completions/min_length": 118.5, "completions/min_terminated_length": 118.5, "entropy": 0.06897289073094726, "epoch": 1.337220602526725, "frac_reward_zero_std": 0.3203125, "grad_norm": 0.3482232987880707, "learning_rate": 5e-05, "loss": -0.0049, "num_tokens": 110353716.0, "reward": 10.944713652133942, "reward_std": 0.9444422572851181, "rewards/bm25_retrieval_reward_fn/mean": 0.8998373299837112, "rewards/bm25_retrieval_reward_fn/std": 0.23540139599936083, "rewards/event_reward_fn/mean": 9.1416015625, "rewards/event_reward_fn/std": 5.0793561935424805, "rewards/format_reward_fn/mean": 0.9032747447490692, "rewards/format_reward_fn/std": 0.249714526347816, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 251.3125, "completions/max_terminated_length": 241.8125, "completions/mean_length": 191.3935546875, "completions/mean_terminated_length": 185.62936782836914, "completions/min_length": 134.875, "completions/min_terminated_length": 134.875, "entropy": 0.07020568964071572, "epoch": 1.3527696793002915, "frac_reward_zero_std": 0.36328125, "grad_norm": 0.26569458842277527, "learning_rate": 5e-05, "loss": -0.0078, "num_tokens": 111677811.0, "reward": 11.232036709785461, "reward_std": 0.8982522189617157, "rewards/bm25_retrieval_reward_fn/mean": 0.8747124671936035, "rewards/bm25_retrieval_reward_fn/std": 0.28446152550168335, "rewards/event_reward_fn/mean": 9.4736328125, "rewards/event_reward_fn/std": 5.696656331419945, "rewards/format_reward_fn/mean": 0.8836914077401161, "rewards/format_reward_fn/std": 0.2844822397455573, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1943359375, "completions/max_length": 256.0, "completions/max_terminated_length": 251.3125, "completions/mean_length": 217.27734375, "completions/mean_terminated_length": 207.9519443511963, "completions/min_length": 165.875, "completions/min_terminated_length": 165.875, "entropy": 0.07557977363467216, "epoch": 1.3683187560738581, "frac_reward_zero_std": 0.3515625, "grad_norm": 0.19800527393817902, "learning_rate": 5e-05, "loss": 0.0051, "num_tokens": 112955859.0, "reward": 10.414989709854126, "reward_std": 0.8083504606038332, "rewards/bm25_retrieval_reward_fn/mean": 0.7803216241300106, "rewards/bm25_retrieval_reward_fn/std": 0.3945994917303324, "rewards/event_reward_fn/mean": 8.841796875, "rewards/event_reward_fn/std": 4.6407610476017, "rewards/format_reward_fn/mean": 0.7928710989654064, "rewards/format_reward_fn/std": 0.39927749149501324, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1201171875, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 205.212890625, "completions/mean_terminated_length": 198.47776794433594, "completions/min_length": 152.8125, "completions/min_terminated_length": 152.8125, "entropy": 0.07301379647105932, "epoch": 1.3838678328474248, "frac_reward_zero_std": 0.375, "grad_norm": 0.13404177129268646, "learning_rate": 5e-05, "loss": 0.0008, "num_tokens": 114336889.0, "reward": 11.331741988658905, "reward_std": 0.8269859068095684, "rewards/bm25_retrieval_reward_fn/mean": 0.8490662761032581, "rewards/bm25_retrieval_reward_fn/std": 0.32220354955643415, "rewards/event_reward_fn/mean": 9.6259765625, "rewards/event_reward_fn/std": 5.535077631473541, "rewards/format_reward_fn/mean": 0.8566992208361626, "rewards/format_reward_fn/std": 0.3254187796264887, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0771484375, "completions/max_length": 254.125, "completions/max_terminated_length": 244.625, "completions/mean_length": 195.3828125, "completions/mean_terminated_length": 190.21417903900146, "completions/min_length": 143.1875, "completions/min_terminated_length": 143.1875, "entropy": 0.07323169219307601, "epoch": 1.3994169096209912, "frac_reward_zero_std": 0.34375, "grad_norm": 0.5236871242523193, "learning_rate": 5e-05, "loss": -0.0001, "num_tokens": 115649713.0, "reward": 11.00212150812149, "reward_std": 0.9517039023339748, "rewards/bm25_retrieval_reward_fn/mean": 0.8683973699808121, "rewards/bm25_retrieval_reward_fn/std": 0.3078096741810441, "rewards/event_reward_fn/mean": 9.25390625, "rewards/event_reward_fn/std": 5.412711590528488, "rewards/format_reward_fn/mean": 0.8798177093267441, "rewards/format_reward_fn/std": 0.3116344837471843, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 255.0, "completions/max_terminated_length": 247.0625, "completions/mean_length": 194.5361328125, "completions/mean_terminated_length": 190.66623401641846, "completions/min_length": 140.625, "completions/min_terminated_length": 140.625, "entropy": 0.0805603014305234, "epoch": 1.4149659863945578, "frac_reward_zero_std": 0.3359375, "grad_norm": 0.24460569024085999, "learning_rate": 5e-05, "loss": -0.0001, "num_tokens": 116942470.0, "reward": 10.231546640396118, "reward_std": 0.7946171164512634, "rewards/bm25_retrieval_reward_fn/mean": 0.9014858566224575, "rewards/bm25_retrieval_reward_fn/std": 0.2548077297396958, "rewards/event_reward_fn/mean": 8.4208984375, "rewards/event_reward_fn/std": 4.865608409047127, "rewards/format_reward_fn/mean": 0.9091623313724995, "rewards/format_reward_fn/std": 0.2567377556115389, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 253.9375, "completions/max_terminated_length": 245.1875, "completions/mean_length": 196.1181640625, "completions/mean_terminated_length": 193.0701208114624, "completions/min_length": 151.625, "completions/min_terminated_length": 151.625, "entropy": 0.08425948722288013, "epoch": 1.4305150631681243, "frac_reward_zero_std": 0.27734375, "grad_norm": 0.207608163356781, "learning_rate": 5e-05, "loss": 0.002, "num_tokens": 118257695.0, "reward": 10.509873569011688, "reward_std": 0.8822544571012259, "rewards/bm25_retrieval_reward_fn/mean": 0.9213385097682476, "rewards/bm25_retrieval_reward_fn/std": 0.20133669557981193, "rewards/event_reward_fn/mean": 8.650390625, "rewards/event_reward_fn/std": 5.170787841081619, "rewards/format_reward_fn/mean": 0.9381445348262787, "rewards/format_reward_fn/std": 0.1984235211275518, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.060546875, "completions/max_length": 249.875, "completions/max_terminated_length": 240.25, "completions/mean_length": 186.7001953125, "completions/mean_terminated_length": 182.21374893188477, "completions/min_length": 132.8125, "completions/min_terminated_length": 132.8125, "entropy": 0.0810700710862875, "epoch": 1.446064139941691, "frac_reward_zero_std": 0.3046875, "grad_norm": 0.2522001564502716, "learning_rate": 5e-05, "loss": -0.0016, "num_tokens": 119571004.0, "reward": 11.150494575500488, "reward_std": 0.9185313917696476, "rewards/bm25_retrieval_reward_fn/mean": 0.9072700254619122, "rewards/bm25_retrieval_reward_fn/std": 0.238927063299343, "rewards/event_reward_fn/mean": 9.32421875, "rewards/event_reward_fn/std": 5.231076046824455, "rewards/format_reward_fn/mean": 0.9190057702362537, "rewards/format_reward_fn/std": 0.23810118879191577, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 253.5, "completions/max_terminated_length": 244.625, "completions/mean_length": 190.771484375, "completions/mean_terminated_length": 185.8874397277832, "completions/min_length": 140.1875, "completions/min_terminated_length": 140.1875, "entropy": 0.0769493873231113, "epoch": 1.4616132167152576, "frac_reward_zero_std": 0.27734375, "grad_norm": 0.13221606612205505, "learning_rate": 5e-05, "loss": -0.0052, "num_tokens": 120878670.0, "reward": 11.72208970785141, "reward_std": 0.9891778491437435, "rewards/bm25_retrieval_reward_fn/mean": 0.865969829261303, "rewards/bm25_retrieval_reward_fn/std": 0.3025930265430361, "rewards/event_reward_fn/mean": 9.978515625, "rewards/event_reward_fn/std": 6.088510304689407, "rewards/format_reward_fn/mean": 0.8776041679084301, "rewards/format_reward_fn/std": 0.30370487459003925, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 250.875, "completions/mean_length": 210.828125, "completions/mean_terminated_length": 206.00187873840332, "completions/min_length": 159.9375, "completions/min_terminated_length": 159.9375, "entropy": 0.09037951100617647, "epoch": 1.4771622934888242, "frac_reward_zero_std": 0.31640625, "grad_norm": 0.303564190864563, "learning_rate": 5e-05, "loss": -0.0015, "num_tokens": 122164070.0, "reward": 11.119612038135529, "reward_std": 0.99767005443573, "rewards/bm25_retrieval_reward_fn/mean": 0.8756668232381344, "rewards/bm25_retrieval_reward_fn/std": 0.29590372927486897, "rewards/event_reward_fn/mean": 9.3515625, "rewards/event_reward_fn/std": 5.329805389046669, "rewards/format_reward_fn/mean": 0.8923828117549419, "rewards/format_reward_fn/std": 0.30003819055855274, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1669921875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 215.0888671875, "completions/mean_terminated_length": 206.6857042312622, "completions/min_length": 164.4375, "completions/min_terminated_length": 164.4375, "entropy": 0.09090339438989758, "epoch": 1.4927113702623906, "frac_reward_zero_std": 0.3046875, "grad_norm": 0.16249267756938934, "learning_rate": 5e-05, "loss": 0.0016, "num_tokens": 123527081.0, "reward": 10.766064465045929, "reward_std": 0.8386576101183891, "rewards/bm25_retrieval_reward_fn/mean": 0.7968913167715073, "rewards/bm25_retrieval_reward_fn/std": 0.3705411199480295, "rewards/event_reward_fn/mean": 9.1552734375, "rewards/event_reward_fn/std": 5.637863516807556, "rewards/format_reward_fn/mean": 0.8138997405767441, "rewards/format_reward_fn/std": 0.3759169615805149, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1435546875, "completions/max_length": 255.375, "completions/max_terminated_length": 250.375, "completions/mean_length": 212.8251953125, "completions/mean_terminated_length": 205.47227001190186, "completions/min_length": 157.5625, "completions/min_terminated_length": 157.5625, "entropy": 0.10008962173014879, "epoch": 1.508260447035957, "frac_reward_zero_std": 0.265625, "grad_norm": 0.23113620281219482, "learning_rate": 5e-05, "loss": 0.004, "num_tokens": 124865830.0, "reward": 10.332128584384918, "reward_std": 1.082621719688177, "rewards/bm25_retrieval_reward_fn/mean": 0.8261714465916157, "rewards/bm25_retrieval_reward_fn/std": 0.3391446927562356, "rewards/event_reward_fn/mean": 8.6630859375, "rewards/event_reward_fn/std": 5.3445031344890594, "rewards/format_reward_fn/mean": 0.8428710959851742, "rewards/format_reward_fn/std": 0.34356776159256697, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1103515625, "completions/max_length": 255.125, "completions/max_terminated_length": 247.875, "completions/mean_length": 204.4716796875, "completions/mean_terminated_length": 198.4902868270874, "completions/min_length": 149.9375, "completions/min_terminated_length": 149.9375, "entropy": 0.09716548025608063, "epoch": 1.5238095238095237, "frac_reward_zero_std": 0.30859375, "grad_norm": 0.13532325625419617, "learning_rate": 5e-05, "loss": 0.0014, "num_tokens": 126156049.0, "reward": 9.934103816747665, "reward_std": 0.9690110310912132, "rewards/bm25_retrieval_reward_fn/mean": 0.8615453615784645, "rewards/bm25_retrieval_reward_fn/std": 0.2830730821006, "rewards/event_reward_fn/mean": 8.1953125, "rewards/event_reward_fn/std": 4.997192412614822, "rewards/format_reward_fn/mean": 0.8772460930049419, "rewards/format_reward_fn/std": 0.27953232545405626, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0634765625, "completions/max_length": 253.625, "completions/max_terminated_length": 241.8125, "completions/mean_length": 191.8193359375, "completions/mean_terminated_length": 187.4044713973999, "completions/min_length": 134.4375, "completions/min_terminated_length": 134.4375, "entropy": 0.08724062331020832, "epoch": 1.5393586005830904, "frac_reward_zero_std": 0.30078125, "grad_norm": 0.13813965022563934, "learning_rate": 5e-05, "loss": -0.0036, "num_tokens": 127483244.0, "reward": 11.109964549541473, "reward_std": 0.9232164584100246, "rewards/bm25_retrieval_reward_fn/mean": 0.9082068763673306, "rewards/bm25_retrieval_reward_fn/std": 0.25597723573446274, "rewards/event_reward_fn/mean": 9.279296875, "rewards/event_reward_fn/std": 5.3837059289216995, "rewards/format_reward_fn/mean": 0.9224609360098839, "rewards/format_reward_fn/std": 0.2576202508062124, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0458984375, "completions/max_length": 251.0, "completions/max_terminated_length": 242.375, "completions/mean_length": 193.9111328125, "completions/mean_terminated_length": 190.90945529937744, "completions/min_length": 148.625, "completions/min_terminated_length": 148.625, "entropy": 0.08152232086285949, "epoch": 1.554907677356657, "frac_reward_zero_std": 0.34375, "grad_norm": 0.35102641582489014, "learning_rate": 5e-05, "loss": -0.0013, "num_tokens": 128764293.0, "reward": 11.371211469173431, "reward_std": 0.8595849685370922, "rewards/bm25_retrieval_reward_fn/mean": 0.9204303659498692, "rewards/bm25_retrieval_reward_fn/std": 0.21888624806888402, "rewards/event_reward_fn/mean": 9.513671875, "rewards/event_reward_fn/std": 5.4597727209329605, "rewards/format_reward_fn/mean": 0.9371093735098839, "rewards/format_reward_fn/std": 0.2127007795497775, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.091796875, "completions/max_length": 252.375, "completions/max_terminated_length": 245.375, "completions/mean_length": 200.150390625, "completions/mean_terminated_length": 194.58474922180176, "completions/min_length": 145.125, "completions/min_terminated_length": 145.125, "entropy": 0.08945442596450448, "epoch": 1.5704567541302237, "frac_reward_zero_std": 0.28125, "grad_norm": 0.11586015671491623, "learning_rate": 5e-05, "loss": -0.0005, "num_tokens": 130147811.0, "reward": 10.688360095024109, "reward_std": 0.8784848563373089, "rewards/bm25_retrieval_reward_fn/mean": 0.8703912869095802, "rewards/bm25_retrieval_reward_fn/std": 0.26367771509103477, "rewards/event_reward_fn/mean": 8.9267578125, "rewards/event_reward_fn/std": 5.635714888572693, "rewards/format_reward_fn/mean": 0.8912109360098839, "rewards/format_reward_fn/std": 0.2533010635524988, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 254.3125, "completions/max_terminated_length": 239.75, "completions/mean_length": 195.1494140625, "completions/mean_terminated_length": 190.32571697235107, "completions/min_length": 143.625, "completions/min_terminated_length": 143.625, "entropy": 0.08721820963546634, "epoch": 1.58600583090379, "frac_reward_zero_std": 0.328125, "grad_norm": 0.1575620472431183, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 131479512.0, "reward": 10.922975957393646, "reward_std": 0.7370323836803436, "rewards/bm25_retrieval_reward_fn/mean": 0.8828910291194916, "rewards/bm25_retrieval_reward_fn/std": 0.2897054869681597, "rewards/event_reward_fn/mean": 9.146484375, "rewards/event_reward_fn/std": 5.057717680931091, "rewards/format_reward_fn/mean": 0.8936002627015114, "rewards/format_reward_fn/std": 0.28906678687781096, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 254.0, "completions/max_terminated_length": 241.6875, "completions/mean_length": 198.5185546875, "completions/mean_terminated_length": 194.11609935760498, "completions/min_length": 149.1875, "completions/min_terminated_length": 149.1875, "entropy": 0.08794478559866548, "epoch": 1.6015549076773565, "frac_reward_zero_std": 0.34375, "grad_norm": 0.16397124528884888, "learning_rate": 5e-05, "loss": 0.001, "num_tokens": 132797395.0, "reward": 10.608273446559906, "reward_std": 0.8345479369163513, "rewards/bm25_retrieval_reward_fn/mean": 0.8834850341081619, "rewards/bm25_retrieval_reward_fn/std": 0.27482672582846135, "rewards/event_reward_fn/mean": 8.8251953125, "rewards/event_reward_fn/std": 5.233703002333641, "rewards/format_reward_fn/mean": 0.8995930962264538, "rewards/format_reward_fn/std": 0.27503635361790657, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.064453125, "completions/max_length": 254.6875, "completions/max_terminated_length": 241.25, "completions/mean_length": 192.1123046875, "completions/mean_terminated_length": 187.6938066482544, "completions/min_length": 137.3125, "completions/min_terminated_length": 137.3125, "entropy": 0.08001765748485923, "epoch": 1.6171039844509232, "frac_reward_zero_std": 0.33203125, "grad_norm": 0.16833443939685822, "learning_rate": 5e-05, "loss": 0.0008, "num_tokens": 134069966.0, "reward": 11.113677322864532, "reward_std": 0.9350622501224279, "rewards/bm25_retrieval_reward_fn/mean": 0.9033686555922031, "rewards/bm25_retrieval_reward_fn/std": 0.2602922786027193, "rewards/event_reward_fn/mean": 9.2939453125, "rewards/event_reward_fn/std": 5.6752976179122925, "rewards/format_reward_fn/mean": 0.9163634702563286, "rewards/format_reward_fn/std": 0.2618194241076708, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 249.3125, "completions/max_terminated_length": 241.75, "completions/mean_length": 196.7158203125, "completions/mean_terminated_length": 192.56449699401855, "completions/min_length": 129.6875, "completions/min_terminated_length": 129.6875, "entropy": 0.08379031391814351, "epoch": 1.6326530612244898, "frac_reward_zero_std": 0.3125, "grad_norm": 0.14574581384658813, "learning_rate": 5e-05, "loss": -0.0047, "num_tokens": 135435439.0, "reward": 11.539310336112976, "reward_std": 0.9443789459764957, "rewards/bm25_retrieval_reward_fn/mean": 0.8967322260141373, "rewards/bm25_retrieval_reward_fn/std": 0.22321847162675112, "rewards/event_reward_fn/mean": 9.7314453125, "rewards/event_reward_fn/std": 5.278485506772995, "rewards/format_reward_fn/mean": 0.9111328125, "rewards/format_reward_fn/std": 0.21248832251876593, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0791015625, "completions/max_length": 254.8125, "completions/max_terminated_length": 244.8125, "completions/mean_length": 202.83203125, "completions/mean_terminated_length": 198.2507667541504, "completions/min_length": 150.625, "completions/min_terminated_length": 150.625, "entropy": 0.08855495927855372, "epoch": 1.6482021379980565, "frac_reward_zero_std": 0.3046875, "grad_norm": 0.17940281331539154, "learning_rate": 5e-05, "loss": -0.0056, "num_tokens": 136778675.0, "reward": 11.134308993816376, "reward_std": 0.9293302595615387, "rewards/bm25_retrieval_reward_fn/mean": 0.8743740394711494, "rewards/bm25_retrieval_reward_fn/std": 0.26636734034400433, "rewards/event_reward_fn/mean": 9.3662109375, "rewards/event_reward_fn/std": 5.85838320851326, "rewards/format_reward_fn/mean": 0.8937239646911621, "rewards/format_reward_fn/std": 0.2652863524854183, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.138671875, "completions/max_length": 256.0, "completions/max_terminated_length": 251.6875, "completions/mean_length": 213.9638671875, "completions/mean_terminated_length": 207.2107219696045, "completions/min_length": 162.375, "completions/min_terminated_length": 162.375, "entropy": 0.09064092021435499, "epoch": 1.663751214771623, "frac_reward_zero_std": 0.3046875, "grad_norm": 0.15384909510612488, "learning_rate": 5e-05, "loss": 0.0029, "num_tokens": 138044578.0, "reward": 11.337530732154846, "reward_std": 0.9400022551417351, "rewards/bm25_retrieval_reward_fn/mean": 0.824542474001646, "rewards/bm25_retrieval_reward_fn/std": 0.33465168718248606, "rewards/event_reward_fn/mean": 9.6669921875, "rewards/event_reward_fn/std": 5.503222852945328, "rewards/format_reward_fn/mean": 0.8459960930049419, "rewards/format_reward_fn/std": 0.3366972776129842, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 251.5625, "completions/max_terminated_length": 245.5625, "completions/mean_length": 203.677734375, "completions/mean_terminated_length": 201.15838241577148, "completions/min_length": 159.1875, "completions/min_terminated_length": 159.1875, "entropy": 0.08698790520429611, "epoch": 1.6793002915451893, "frac_reward_zero_std": 0.296875, "grad_norm": 0.11867301166057587, "learning_rate": 5e-05, "loss": 0.0003, "num_tokens": 139288124.0, "reward": 11.192306399345398, "reward_std": 0.9463471882045269, "rewards/bm25_retrieval_reward_fn/mean": 0.9286414235830307, "rewards/bm25_retrieval_reward_fn/std": 0.20035810582339764, "rewards/event_reward_fn/mean": 9.3193359375, "rewards/event_reward_fn/std": 5.200570702552795, "rewards/format_reward_fn/mean": 0.9443289637565613, "rewards/format_reward_fn/std": 0.19202105328440666, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0556640625, "completions/max_length": 252.125, "completions/max_terminated_length": 244.875, "completions/mean_length": 193.6708984375, "completions/mean_terminated_length": 189.9529905319214, "completions/min_length": 136.625, "completions/min_terminated_length": 136.625, "entropy": 0.08362232241779566, "epoch": 1.694849368318756, "frac_reward_zero_std": 0.33203125, "grad_norm": 0.11613152176141739, "learning_rate": 5e-05, "loss": -0.0019, "num_tokens": 140615163.0, "reward": 11.211718916893005, "reward_std": 0.8285622540861368, "rewards/bm25_retrieval_reward_fn/mean": 0.9016408734023571, "rewards/bm25_retrieval_reward_fn/std": 0.25260637141764164, "rewards/event_reward_fn/mean": 9.390625, "rewards/event_reward_fn/std": 5.310590535402298, "rewards/format_reward_fn/mean": 0.9194531291723251, "rewards/format_reward_fn/std": 0.251515906304121, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0751953125, "completions/max_length": 248.6875, "completions/max_terminated_length": 240.3125, "completions/mean_length": 194.3291015625, "completions/mean_terminated_length": 189.4942626953125, "completions/min_length": 139.25, "completions/min_terminated_length": 139.25, "entropy": 0.08920921664685011, "epoch": 1.7103984450923226, "frac_reward_zero_std": 0.265625, "grad_norm": 0.1495039016008377, "learning_rate": 5e-05, "loss": -0.0026, "num_tokens": 141995908.0, "reward": 11.331986844539642, "reward_std": 0.9946209099143744, "rewards/bm25_retrieval_reward_fn/mean": 0.8805676624178886, "rewards/bm25_retrieval_reward_fn/std": 0.2605485112289898, "rewards/event_reward_fn/mean": 9.5615234375, "rewards/event_reward_fn/std": 5.626507669687271, "rewards/format_reward_fn/mean": 0.8898958377540112, "rewards/format_reward_fn/std": 0.25742682348936796, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0556640625, "completions/max_length": 250.625, "completions/max_terminated_length": 242.25, "completions/mean_length": 194.5751953125, "completions/mean_terminated_length": 190.86159992218018, "completions/min_length": 141.5625, "completions/min_terminated_length": 141.5625, "entropy": 0.09618484182283282, "epoch": 1.7259475218658893, "frac_reward_zero_std": 0.26953125, "grad_norm": 0.20417290925979614, "learning_rate": 5e-05, "loss": -0.0055, "num_tokens": 143301673.0, "reward": 10.538148939609528, "reward_std": 0.9361699968576431, "rewards/bm25_retrieval_reward_fn/mean": 0.9052551127970219, "rewards/bm25_retrieval_reward_fn/std": 0.2257093784864992, "rewards/event_reward_fn/mean": 8.7158203125, "rewards/event_reward_fn/std": 4.607826009392738, "rewards/format_reward_fn/mean": 0.9170735664665699, "rewards/format_reward_fn/std": 0.22840105323120952, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0576171875, "completions/max_length": 252.8125, "completions/max_terminated_length": 244.4375, "completions/mean_length": 200.8662109375, "completions/mean_terminated_length": 197.39703178405762, "completions/min_length": 149.9375, "completions/min_terminated_length": 149.9375, "entropy": 0.08653424866497517, "epoch": 1.741496598639456, "frac_reward_zero_std": 0.3359375, "grad_norm": 0.14243784546852112, "learning_rate": 5e-05, "loss": 0.0011, "num_tokens": 144603412.0, "reward": 11.493825078010559, "reward_std": 0.8755283299833536, "rewards/bm25_retrieval_reward_fn/mean": 0.9135190099477768, "rewards/bm25_retrieval_reward_fn/std": 0.24690337451465894, "rewards/event_reward_fn/mean": 9.658203125, "rewards/event_reward_fn/std": 5.445283606648445, "rewards/format_reward_fn/mean": 0.922102864831686, "rewards/format_reward_fn/std": 0.2481938637793064, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0556640625, "completions/max_length": 252.0625, "completions/max_terminated_length": 246.75, "completions/mean_length": 196.5146484375, "completions/mean_terminated_length": 193.12859344482422, "completions/min_length": 140.8125, "completions/min_terminated_length": 140.8125, "entropy": 0.08316960139200091, "epoch": 1.7570456754130224, "frac_reward_zero_std": 0.3125, "grad_norm": 0.109793521463871, "learning_rate": 5e-05, "loss": -0.0022, "num_tokens": 145919099.0, "reward": 11.740033328533173, "reward_std": 0.9224549978971481, "rewards/bm25_retrieval_reward_fn/mean": 0.9150333367288113, "rewards/bm25_retrieval_reward_fn/std": 0.24498367216438055, "rewards/event_reward_fn/mean": 9.90234375, "rewards/event_reward_fn/std": 5.425331294536591, "rewards/format_reward_fn/mean": 0.9226562492549419, "rewards/format_reward_fn/std": 0.24409929476678371, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.072265625, "completions/max_length": 254.75, "completions/max_terminated_length": 249.3125, "completions/mean_length": 208.4755859375, "completions/mean_terminated_length": 204.96116065979004, "completions/min_length": 160.5625, "completions/min_terminated_length": 160.5625, "entropy": 0.08932856796309352, "epoch": 1.7725947521865888, "frac_reward_zero_std": 0.3359375, "grad_norm": 0.24029314517974854, "learning_rate": 5e-05, "loss": 0.0045, "num_tokens": 147275350.0, "reward": 11.133660674095154, "reward_std": 1.0420608818531036, "rewards/bm25_retrieval_reward_fn/mean": 0.8887974470853806, "rewards/bm25_retrieval_reward_fn/std": 0.2559172356268391, "rewards/event_reward_fn/mean": 9.34375, "rewards/event_reward_fn/std": 5.51551166176796, "rewards/format_reward_fn/mean": 0.9011132828891277, "rewards/format_reward_fn/std": 0.2507179146632552, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 252.375, "completions/max_terminated_length": 240.875, "completions/mean_length": 198.576171875, "completions/mean_terminated_length": 195.7430601119995, "completions/min_length": 151.25, "completions/min_terminated_length": 151.25, "entropy": 0.08435806119814515, "epoch": 1.7881438289601554, "frac_reward_zero_std": 0.36328125, "grad_norm": 0.13869501650333405, "learning_rate": 5e-05, "loss": -0.0026, "num_tokens": 148539184.0, "reward": 11.031599402427673, "reward_std": 0.7965468689799309, "rewards/bm25_retrieval_reward_fn/mean": 0.9208246804773808, "rewards/bm25_retrieval_reward_fn/std": 0.2361440734239295, "rewards/event_reward_fn/mean": 9.1796875, "rewards/event_reward_fn/std": 4.907300844788551, "rewards/format_reward_fn/mean": 0.9310872405767441, "rewards/format_reward_fn/std": 0.2394925099797547, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0380859375, "completions/max_length": 252.5, "completions/max_terminated_length": 250.125, "completions/mean_length": 202.90234375, "completions/mean_terminated_length": 200.88229370117188, "completions/min_length": 155.875, "completions/min_terminated_length": 155.875, "entropy": 0.08052209811285138, "epoch": 1.803692905733722, "frac_reward_zero_std": 0.35546875, "grad_norm": 0.1878909021615982, "learning_rate": 5e-05, "loss": 0.001, "num_tokens": 149840940.0, "reward": 10.959127485752106, "reward_std": 0.9578492008149624, "rewards/bm25_retrieval_reward_fn/mean": 0.9300258904695511, "rewards/bm25_retrieval_reward_fn/std": 0.18775073438882828, "rewards/event_reward_fn/mean": 9.0859375, "rewards/event_reward_fn/std": 5.149698540568352, "rewards/format_reward_fn/mean": 0.9431640617549419, "rewards/format_reward_fn/std": 0.17401384096592665, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 255.9375, "completions/max_terminated_length": 248.875, "completions/mean_length": 211.8564453125, "completions/mean_terminated_length": 207.1767454147339, "completions/min_length": 164.5625, "completions/min_terminated_length": 164.5625, "entropy": 0.08094025542959571, "epoch": 1.8192419825072887, "frac_reward_zero_std": 0.34765625, "grad_norm": 0.14807139337062836, "learning_rate": 5e-05, "loss": 0.0023, "num_tokens": 151221749.0, "reward": 11.752990126609802, "reward_std": 0.9537594802677631, "rewards/bm25_retrieval_reward_fn/mean": 0.8724758252501488, "rewards/bm25_retrieval_reward_fn/std": 0.29270493309013546, "rewards/event_reward_fn/mean": 9.9970703125, "rewards/event_reward_fn/std": 5.857491314411163, "rewards/format_reward_fn/mean": 0.8834440112113953, "rewards/format_reward_fn/std": 0.29244135320186615, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 254.3125, "completions/max_terminated_length": 250.25, "completions/mean_length": 208.7001953125, "completions/mean_terminated_length": 206.05935287475586, "completions/min_length": 159.125, "completions/min_terminated_length": 159.125, "entropy": 0.08766834484413266, "epoch": 1.8347910592808552, "frac_reward_zero_std": 0.34375, "grad_norm": 0.17317424714565277, "learning_rate": 5e-05, "loss": -0.0004, "num_tokens": 152521294.0, "reward": 11.364756107330322, "reward_std": 0.9098326228559017, "rewards/bm25_retrieval_reward_fn/mean": 0.8889735676348209, "rewards/bm25_retrieval_reward_fn/std": 0.2667266938369721, "rewards/event_reward_fn/mean": 9.568359375, "rewards/event_reward_fn/std": 5.424193903803825, "rewards/format_reward_fn/mean": 0.9074231162667274, "rewards/format_reward_fn/std": 0.2601332040503621, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0478515625, "completions/max_length": 253.1875, "completions/max_terminated_length": 248.625, "completions/mean_length": 205.2333984375, "completions/mean_terminated_length": 202.61692428588867, "completions/min_length": 158.0625, "completions/min_terminated_length": 158.0625, "entropy": 0.0873062857426703, "epoch": 1.8503401360544216, "frac_reward_zero_std": 0.3359375, "grad_norm": 0.16510100662708282, "learning_rate": 5e-05, "loss": 0.0008, "num_tokens": 153814765.0, "reward": 10.840591430664062, "reward_std": 0.806601133197546, "rewards/bm25_retrieval_reward_fn/mean": 0.8800444230437279, "rewards/bm25_retrieval_reward_fn/std": 0.2592724412679672, "rewards/event_reward_fn/mean": 9.05078125, "rewards/event_reward_fn/std": 5.017846331000328, "rewards/format_reward_fn/mean": 0.9097656235098839, "rewards/format_reward_fn/std": 0.2350642140954733, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0478515625, "completions/max_length": 250.6875, "completions/max_terminated_length": 246.625, "completions/mean_length": 199.6025390625, "completions/mean_terminated_length": 196.8941469192505, "completions/min_length": 146.75, "completions/min_terminated_length": 146.75, "entropy": 0.08616631478071213, "epoch": 1.8658892128279883, "frac_reward_zero_std": 0.328125, "grad_norm": 0.10711020976305008, "learning_rate": 5e-05, "loss": 0.003, "num_tokens": 155159530.0, "reward": 11.430678129196167, "reward_std": 0.7845460455864668, "rewards/bm25_retrieval_reward_fn/mean": 0.9038552716374397, "rewards/bm25_retrieval_reward_fn/std": 0.21465440141037107, "rewards/event_reward_fn/mean": 9.5986328125, "rewards/event_reward_fn/std": 5.15682627260685, "rewards/format_reward_fn/mean": 0.928190104663372, "rewards/format_reward_fn/std": 0.19767758785746992, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 253.1875, "completions/max_terminated_length": 246.6875, "completions/mean_length": 200.734375, "completions/mean_terminated_length": 197.25225925445557, "completions/min_length": 150.1875, "completions/min_terminated_length": 150.1875, "entropy": 0.08771243086084723, "epoch": 1.881438289601555, "frac_reward_zero_std": 0.27734375, "grad_norm": 0.119595006108284, "learning_rate": 5e-05, "loss": -0.001, "num_tokens": 156525614.0, "reward": 11.453014373779297, "reward_std": 1.111331295222044, "rewards/bm25_retrieval_reward_fn/mean": 0.8604980707168579, "rewards/bm25_retrieval_reward_fn/std": 0.2871107269311324, "rewards/event_reward_fn/mean": 9.708984375, "rewards/event_reward_fn/std": 5.215842500329018, "rewards/format_reward_fn/mean": 0.8835319019854069, "rewards/format_reward_fn/std": 0.2829501121304929, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 250.25, "completions/max_terminated_length": 245.8125, "completions/mean_length": 194.017578125, "completions/mean_terminated_length": 192.65657711029053, "completions/min_length": 140.6875, "completions/min_terminated_length": 140.6875, "entropy": 0.08764936728402972, "epoch": 1.8969873663751216, "frac_reward_zero_std": 0.28515625, "grad_norm": 0.1511303335428238, "learning_rate": 5e-05, "loss": 0.0032, "num_tokens": 157786400.0, "reward": 10.632731199264526, "reward_std": 0.9243863355368376, "rewards/bm25_retrieval_reward_fn/mean": 0.9324383623898029, "rewards/bm25_retrieval_reward_fn/std": 0.18536719167605042, "rewards/event_reward_fn/mean": 8.7451171875, "rewards/event_reward_fn/std": 5.235057607293129, "rewards/format_reward_fn/mean": 0.9551757834851742, "rewards/format_reward_fn/std": 0.16272677155211568, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 254.125, "completions/max_terminated_length": 250.6875, "completions/mean_length": 200.95703125, "completions/mean_terminated_length": 198.30670166015625, "completions/min_length": 154.6875, "completions/min_terminated_length": 154.6875, "entropy": 0.08874167408794165, "epoch": 1.9125364431486882, "frac_reward_zero_std": 0.33984375, "grad_norm": 0.17749741673469543, "learning_rate": 5e-05, "loss": 0.0037, "num_tokens": 159093888.0, "reward": 11.275705397129059, "reward_std": 0.856599148362875, "rewards/bm25_retrieval_reward_fn/mean": 0.9131891131401062, "rewards/bm25_retrieval_reward_fn/std": 0.2150915495294612, "rewards/event_reward_fn/mean": 9.4345703125, "rewards/event_reward_fn/std": 5.729633465409279, "rewards/format_reward_fn/mean": 0.927945964038372, "rewards/format_reward_fn/std": 0.2096583191305399, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.041015625, "completions/max_length": 253.875, "completions/max_terminated_length": 246.5625, "completions/mean_length": 200.9677734375, "completions/mean_terminated_length": 198.57855701446533, "completions/min_length": 142.8125, "completions/min_terminated_length": 142.8125, "entropy": 0.0850910097360611, "epoch": 1.9280855199222546, "frac_reward_zero_std": 0.375, "grad_norm": 0.12046821415424347, "learning_rate": 5e-05, "loss": 0.0007, "num_tokens": 160340479.0, "reward": 10.719317555427551, "reward_std": 0.8128865994513035, "rewards/bm25_retrieval_reward_fn/mean": 0.9061989188194275, "rewards/bm25_retrieval_reward_fn/std": 0.24043723253998905, "rewards/event_reward_fn/mean": 8.892578125, "rewards/event_reward_fn/std": 5.485840782523155, "rewards/format_reward_fn/mean": 0.9205403625965118, "rewards/format_reward_fn/std": 0.2409290496725589, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12109375, "completions/max_length": 256.0, "completions/max_terminated_length": 251.125, "completions/mean_length": 213.712890625, "completions/mean_terminated_length": 207.85150337219238, "completions/min_length": 160.9375, "completions/min_terminated_length": 160.9375, "entropy": 0.08267078269273043, "epoch": 1.943634596695821, "frac_reward_zero_std": 0.32421875, "grad_norm": 0.09311431646347046, "learning_rate": 5e-05, "loss": 0.0044, "num_tokens": 161744217.0, "reward": 10.99679410457611, "reward_std": 0.9773008767515421, "rewards/bm25_retrieval_reward_fn/mean": 0.856908455491066, "rewards/bm25_retrieval_reward_fn/std": 0.3204036271199584, "rewards/event_reward_fn/mean": 9.2744140625, "rewards/event_reward_fn/std": 5.77374792098999, "rewards/format_reward_fn/mean": 0.8654715418815613, "rewards/format_reward_fn/std": 0.3263047467917204, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0732421875, "completions/max_length": 255.6875, "completions/max_terminated_length": 246.0625, "completions/mean_length": 203.2724609375, "completions/mean_terminated_length": 199.26644325256348, "completions/min_length": 149.0625, "completions/min_terminated_length": 149.0625, "entropy": 0.08016827004030347, "epoch": 1.9591836734693877, "frac_reward_zero_std": 0.36328125, "grad_norm": 0.28214797377586365, "learning_rate": 5e-05, "loss": 0.0028, "num_tokens": 163027312.0, "reward": 10.820812225341797, "reward_std": 0.9312072917819023, "rewards/bm25_retrieval_reward_fn/mean": 0.8814437240362167, "rewards/bm25_retrieval_reward_fn/std": 0.2926495522260666, "rewards/event_reward_fn/mean": 9.048828125, "rewards/event_reward_fn/std": 5.542583703994751, "rewards/format_reward_fn/mean": 0.890540361404419, "rewards/format_reward_fn/std": 0.2912682769820094, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.080078125, "completions/max_length": 255.8125, "completions/max_terminated_length": 251.4375, "completions/mean_length": 210.033203125, "completions/mean_terminated_length": 206.1150426864624, "completions/min_length": 157.75, "completions/min_terminated_length": 157.75, "entropy": 0.07839876413345337, "epoch": 1.9747327502429544, "frac_reward_zero_std": 0.34765625, "grad_norm": 0.1044137179851532, "learning_rate": 5e-05, "loss": -0.0002, "num_tokens": 164352274.0, "reward": 11.44101220369339, "reward_std": 0.9166238941252232, "rewards/bm25_retrieval_reward_fn/mean": 0.8769194670021534, "rewards/bm25_retrieval_reward_fn/std": 0.2891712624114007, "rewards/event_reward_fn/mean": 9.6767578125, "rewards/event_reward_fn/std": 5.10790191590786, "rewards/format_reward_fn/mean": 0.8873349130153656, "rewards/format_reward_fn/std": 0.2897007022984326, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0849609375, "completions/max_length": 255.125, "completions/max_terminated_length": 249.125, "completions/mean_length": 207.380859375, "completions/mean_terminated_length": 202.79765510559082, "completions/min_length": 162.625, "completions/min_terminated_length": 162.625, "entropy": 0.07377080479636788, "epoch": 1.990281827016521, "frac_reward_zero_std": 0.37890625, "grad_norm": 0.16017131507396698, "learning_rate": 5e-05, "loss": 0.0026, "num_tokens": 165649512.0, "reward": 11.277841091156006, "reward_std": 0.9176982510834932, "rewards/bm25_retrieval_reward_fn/mean": 0.8867846131324768, "rewards/bm25_retrieval_reward_fn/std": 0.275547455297783, "rewards/event_reward_fn/mean": 9.4931640625, "rewards/event_reward_fn/std": 5.252700716257095, "rewards/format_reward_fn/mean": 0.8978923298418522, "rewards/format_reward_fn/std": 0.27646369859576225, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0537109375, "completions/max_length": 255.8125, "completions/max_terminated_length": 249.6875, "completions/mean_length": 204.119140625, "completions/mean_terminated_length": 201.15938472747803, "completions/min_length": 153.9375, "completions/min_terminated_length": 153.9375, "entropy": 0.08090341417118907, "epoch": 2.0058309037900877, "frac_reward_zero_std": 0.3203125, "grad_norm": 0.14778926968574524, "learning_rate": 5e-05, "loss": -0.0041, "num_tokens": 166928214.0, "reward": 10.784139513969421, "reward_std": 0.845683254301548, "rewards/bm25_retrieval_reward_fn/mean": 0.9085673242807388, "rewards/bm25_retrieval_reward_fn/std": 0.22585340135265142, "rewards/event_reward_fn/mean": 8.958984375, "rewards/event_reward_fn/std": 5.853749170899391, "rewards/format_reward_fn/mean": 0.9165879562497139, "rewards/format_reward_fn/std": 0.2265966208651662, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.107421875, "completions/max_length": 255.9375, "completions/max_terminated_length": 252.5625, "completions/mean_length": 217.998046875, "completions/mean_terminated_length": 213.3584222793579, "completions/min_length": 168.625, "completions/min_terminated_length": 168.625, "entropy": 0.07874821173027158, "epoch": 2.021379980563654, "frac_reward_zero_std": 0.3359375, "grad_norm": 0.2152583748102188, "learning_rate": 5e-05, "loss": 0.004, "num_tokens": 168321184.0, "reward": 11.825278103351593, "reward_std": 0.8600351363420486, "rewards/bm25_retrieval_reward_fn/mean": 0.8512045294046402, "rewards/bm25_retrieval_reward_fn/std": 0.3247975427657366, "rewards/event_reward_fn/mean": 10.119140625, "rewards/event_reward_fn/std": 5.912083759903908, "rewards/format_reward_fn/mean": 0.8549330346286297, "rewards/format_reward_fn/std": 0.32650260720402, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.103515625, "completions/max_length": 255.6875, "completions/max_terminated_length": 251.375, "completions/mean_length": 215.453125, "completions/mean_terminated_length": 210.9893503189087, "completions/min_length": 169.125, "completions/min_terminated_length": 169.125, "entropy": 0.07949487678706646, "epoch": 2.0369290573372205, "frac_reward_zero_std": 0.26953125, "grad_norm": 0.17531728744506836, "learning_rate": 5e-05, "loss": 0.0025, "num_tokens": 169684588.0, "reward": 11.59043002128601, "reward_std": 0.8503808788955212, "rewards/bm25_retrieval_reward_fn/mean": 0.8546094782650471, "rewards/bm25_retrieval_reward_fn/std": 0.29849119763821363, "rewards/event_reward_fn/mean": 9.8740234375, "rewards/event_reward_fn/std": 5.960100635886192, "rewards/format_reward_fn/mean": 0.8617968708276749, "rewards/format_reward_fn/std": 0.30179503839462996, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.048828125, "completions/max_length": 254.9375, "completions/max_terminated_length": 249.9375, "completions/mean_length": 208.2587890625, "completions/mean_terminated_length": 205.81217098236084, "completions/min_length": 162.8125, "completions/min_terminated_length": 162.8125, "entropy": 0.08167764730751514, "epoch": 2.052478134110787, "frac_reward_zero_std": 0.30078125, "grad_norm": 0.17342492938041687, "learning_rate": 5e-05, "loss": -0.0046, "num_tokens": 171007325.0, "reward": 11.406825065612793, "reward_std": 0.9372135195881128, "rewards/bm25_retrieval_reward_fn/mean": 0.8985451720654964, "rewards/bm25_retrieval_reward_fn/std": 0.2477037919452414, "rewards/event_reward_fn/mean": 9.6015625, "rewards/event_reward_fn/std": 5.504318922758102, "rewards/format_reward_fn/mean": 0.9067176692187786, "rewards/format_reward_fn/std": 0.2522282497957349, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0595703125, "completions/max_length": 255.5625, "completions/max_terminated_length": 251.4375, "completions/mean_length": 209.2939453125, "completions/mean_terminated_length": 206.3635711669922, "completions/min_length": 164.3125, "completions/min_terminated_length": 164.3125, "entropy": 0.08719462575390935, "epoch": 2.068027210884354, "frac_reward_zero_std": 0.26171875, "grad_norm": 0.21987557411193848, "learning_rate": 5e-05, "loss": -0.0007, "num_tokens": 172342766.0, "reward": 11.441301941871643, "reward_std": 1.013813877478242, "rewards/bm25_retrieval_reward_fn/mean": 0.8949129357933998, "rewards/bm25_retrieval_reward_fn/std": 0.23817135416902602, "rewards/event_reward_fn/mean": 9.6416015625, "rewards/event_reward_fn/std": 5.478666722774506, "rewards/format_reward_fn/mean": 0.904787328094244, "rewards/format_reward_fn/std": 0.24113686219789088, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0751953125, "completions/max_length": 254.6875, "completions/max_terminated_length": 251.3125, "completions/mean_length": 214.8193359375, "completions/mean_terminated_length": 211.62517070770264, "completions/min_length": 171.75, "completions/min_terminated_length": 171.75, "entropy": 0.08591812197118998, "epoch": 2.0835762876579205, "frac_reward_zero_std": 0.25, "grad_norm": 0.12275879085063934, "learning_rate": 5e-05, "loss": 0.0019, "num_tokens": 173671761.0, "reward": 11.561739206314087, "reward_std": 0.9223730489611626, "rewards/bm25_retrieval_reward_fn/mean": 0.8886811174452305, "rewards/bm25_retrieval_reward_fn/std": 0.24229650711640716, "rewards/event_reward_fn/mean": 9.76953125, "rewards/event_reward_fn/std": 5.791984856128693, "rewards/format_reward_fn/mean": 0.9035267867147923, "rewards/format_reward_fn/std": 0.23866780381649733, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0439453125, "completions/max_length": 253.125, "completions/max_terminated_length": 248.3125, "completions/mean_length": 208.9638671875, "completions/mean_terminated_length": 206.84466552734375, "completions/min_length": 168.875, "completions/min_terminated_length": 168.875, "entropy": 0.08218491962179542, "epoch": 2.0991253644314867, "frac_reward_zero_std": 0.3203125, "grad_norm": 0.2145640254020691, "learning_rate": 5e-05, "loss": 0.0021, "num_tokens": 175000300.0, "reward": 11.832590639591217, "reward_std": 0.8098492994904518, "rewards/bm25_retrieval_reward_fn/mean": 0.9014225117862225, "rewards/bm25_retrieval_reward_fn/std": 0.24191926792263985, "rewards/event_reward_fn/mean": 10.02734375, "rewards/event_reward_fn/std": 5.564895883202553, "rewards/format_reward_fn/mean": 0.9038244113326073, "rewards/format_reward_fn/std": 0.2553019989281893, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.060546875, "completions/max_length": 255.0625, "completions/max_terminated_length": 251.375, "completions/mean_length": 212.275390625, "completions/mean_terminated_length": 209.4294490814209, "completions/min_length": 172.875, "completions/min_terminated_length": 172.875, "entropy": 0.08632302051410079, "epoch": 2.1146744412050533, "frac_reward_zero_std": 0.27734375, "grad_norm": 0.12819725275039673, "learning_rate": 5e-05, "loss": -0.0027, "num_tokens": 176397358.0, "reward": 11.750000655651093, "reward_std": 1.0350622907280922, "rewards/bm25_retrieval_reward_fn/mean": 0.8829724602401257, "rewards/bm25_retrieval_reward_fn/std": 0.26738651166670024, "rewards/event_reward_fn/mean": 9.9794921875, "rewards/event_reward_fn/std": 5.499541476368904, "rewards/format_reward_fn/mean": 0.8875359706580639, "rewards/format_reward_fn/std": 0.27499296236783266, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0771484375, "completions/max_length": 253.9375, "completions/max_terminated_length": 248.5, "completions/mean_length": 211.8388671875, "completions/mean_terminated_length": 208.1127519607544, "completions/min_length": 169.125, "completions/min_terminated_length": 169.125, "entropy": 0.09303951309993863, "epoch": 2.13022351797862, "frac_reward_zero_std": 0.28515625, "grad_norm": 0.22441552579402924, "learning_rate": 5e-05, "loss": -0.0018, "num_tokens": 177793437.0, "reward": 11.069317996501923, "reward_std": 1.0597262904047966, "rewards/bm25_retrieval_reward_fn/mean": 0.8652172312140465, "rewards/bm25_retrieval_reward_fn/std": 0.28404899896122515, "rewards/event_reward_fn/mean": 9.337890625, "rewards/event_reward_fn/std": 5.563613697886467, "rewards/format_reward_fn/mean": 0.8662101663649082, "rewards/format_reward_fn/std": 0.2897696476429701, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1318359375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.8125, "completions/mean_length": 223.1318359375, "completions/mean_terminated_length": 218.348069190979, "completions/min_length": 178.5625, "completions/min_terminated_length": 178.5625, "entropy": 0.08407490560784936, "epoch": 2.1457725947521866, "frac_reward_zero_std": 0.2734375, "grad_norm": 0.20042170584201813, "learning_rate": 5e-05, "loss": 0.0042, "num_tokens": 179158840.0, "reward": 11.508206486701965, "reward_std": 0.8616610933095217, "rewards/bm25_retrieval_reward_fn/mean": 0.8278331160545349, "rewards/bm25_retrieval_reward_fn/std": 0.32595117576420307, "rewards/event_reward_fn/mean": 9.8291015625, "rewards/event_reward_fn/std": 5.2151205241680145, "rewards/format_reward_fn/mean": 0.8512718565762043, "rewards/format_reward_fn/std": 0.33205954916775227, "step": 2208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 254.6875, "completions/max_terminated_length": 247.75, "completions/mean_length": 205.880859375, "completions/mean_terminated_length": 203.19688606262207, "completions/min_length": 166.625, "completions/min_terminated_length": 166.625, "entropy": 0.07565503777004778, "epoch": 2.1613216715257533, "frac_reward_zero_std": 0.31640625, "grad_norm": 0.15642929077148438, "learning_rate": 5e-05, "loss": -0.0029, "num_tokens": 180532374.0, "reward": 11.076000154018402, "reward_std": 0.8950711917132139, "rewards/bm25_retrieval_reward_fn/mean": 0.9075712524354458, "rewards/bm25_retrieval_reward_fn/std": 0.24491250491701066, "rewards/event_reward_fn/mean": 9.248046875, "rewards/event_reward_fn/std": 5.206828847527504, "rewards/format_reward_fn/mean": 0.9203820116817951, "rewards/format_reward_fn/std": 0.24691881332546473, "step": 2224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0498046875, "completions/max_length": 253.6875, "completions/max_terminated_length": 248.5625, "completions/mean_length": 204.2744140625, "completions/mean_terminated_length": 201.6161012649536, "completions/min_length": 162.0625, "completions/min_terminated_length": 162.0625, "entropy": 0.07292898930609226, "epoch": 2.17687074829932, "frac_reward_zero_std": 0.35546875, "grad_norm": 0.20497596263885498, "learning_rate": 5e-05, "loss": -0.0018, "num_tokens": 181834967.0, "reward": 11.745607078075409, "reward_std": 0.847535029053688, "rewards/bm25_retrieval_reward_fn/mean": 0.8826499357819557, "rewards/bm25_retrieval_reward_fn/std": 0.2826285846531391, "rewards/event_reward_fn/mean": 9.96875, "rewards/event_reward_fn/std": 5.558514207601547, "rewards/format_reward_fn/mean": 0.8942071311175823, "rewards/format_reward_fn/std": 0.2861539525911212, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0615234375, "completions/max_length": 255.1875, "completions/max_terminated_length": 250.8125, "completions/mean_length": 212.1318359375, "completions/mean_terminated_length": 209.29450035095215, "completions/min_length": 174.1875, "completions/min_terminated_length": 174.1875, "entropy": 0.07543019764125347, "epoch": 2.192419825072886, "frac_reward_zero_std": 0.32421875, "grad_norm": 0.10211238265037537, "learning_rate": 5e-05, "loss": -0.0013, "num_tokens": 183156678.0, "reward": 11.885134816169739, "reward_std": 0.7728028316050768, "rewards/bm25_retrieval_reward_fn/mean": 0.9035747610032558, "rewards/bm25_retrieval_reward_fn/std": 0.24522930546663702, "rewards/event_reward_fn/mean": 10.0634765625, "rewards/event_reward_fn/std": 5.761056482791901, "rewards/format_reward_fn/mean": 0.9180834665894508, "rewards/format_reward_fn/std": 0.24741819500923157, "step": 2256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 255.5, "completions/max_terminated_length": 252.125, "completions/mean_length": 217.482421875, "completions/mean_terminated_length": 212.08176708221436, "completions/min_length": 176.875, "completions/min_terminated_length": 176.875, "entropy": 0.07963799126446247, "epoch": 2.207968901846453, "frac_reward_zero_std": 0.30078125, "grad_norm": 0.2214186191558838, "learning_rate": 5e-05, "loss": 0.0004, "num_tokens": 184464124.0, "reward": 11.618954241275787, "reward_std": 0.9292504880577326, "rewards/bm25_retrieval_reward_fn/mean": 0.8386580236256123, "rewards/bm25_retrieval_reward_fn/std": 0.32745907083153725, "rewards/event_reward_fn/mean": 9.9248046875, "rewards/event_reward_fn/std": 6.44027054309845, "rewards/format_reward_fn/mean": 0.8554915376007557, "rewards/format_reward_fn/std": 0.3294975752942264, "step": 2272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0498046875, "completions/max_length": 254.4375, "completions/max_terminated_length": 249.375, "completions/mean_length": 206.373046875, "completions/mean_terminated_length": 203.74801063537598, "completions/min_length": 163.625, "completions/min_terminated_length": 163.625, "entropy": 0.0740656116977334, "epoch": 2.2235179786200194, "frac_reward_zero_std": 0.39453125, "grad_norm": 0.15276597440242767, "learning_rate": 5e-05, "loss": -0.0032, "num_tokens": 185812542.0, "reward": 11.359130620956421, "reward_std": 0.6935872584581375, "rewards/bm25_retrieval_reward_fn/mean": 0.9103271588683128, "rewards/bm25_retrieval_reward_fn/std": 0.23343394591938704, "rewards/event_reward_fn/mean": 9.5244140625, "rewards/event_reward_fn/std": 5.075364321470261, "rewards/format_reward_fn/mean": 0.9243892580270767, "rewards/format_reward_fn/std": 0.2387167038396001, "step": 2288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 254.5, "completions/max_terminated_length": 250.9375, "completions/mean_length": 210.4521484375, "completions/mean_terminated_length": 207.25661849975586, "completions/min_length": 169.8125, "completions/min_terminated_length": 169.8125, "entropy": 0.07526633841916919, "epoch": 2.239067055393586, "frac_reward_zero_std": 0.390625, "grad_norm": 0.17438335716724396, "learning_rate": 5e-05, "loss": 0.0007, "num_tokens": 187174565.0, "reward": 11.606376469135284, "reward_std": 0.8281035982072353, "rewards/bm25_retrieval_reward_fn/mean": 0.8901838399469852, "rewards/bm25_retrieval_reward_fn/std": 0.2384987068362534, "rewards/event_reward_fn/mean": 9.806640625, "rewards/event_reward_fn/std": 5.699323073029518, "rewards/format_reward_fn/mean": 0.9095519706606865, "rewards/format_reward_fn/std": 0.2393078247550875, "step": 2304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0322265625, "completions/max_length": 249.0, "completions/max_terminated_length": 243.6875, "completions/mean_length": 203.9052734375, "completions/mean_terminated_length": 202.31891632080078, "completions/min_length": 159.4375, "completions/min_terminated_length": 159.4375, "entropy": 0.07370157795958221, "epoch": 2.2546161321671527, "frac_reward_zero_std": 0.390625, "grad_norm": 0.12816278636455536, "learning_rate": 5e-05, "loss": -0.0022, "num_tokens": 188422072.0, "reward": 11.180573999881744, "reward_std": 0.7691474985331297, "rewards/bm25_retrieval_reward_fn/mean": 0.9313375540077686, "rewards/bm25_retrieval_reward_fn/std": 0.16660339455120265, "rewards/event_reward_fn/mean": 9.302734375, "rewards/event_reward_fn/std": 5.455415144562721, "rewards/format_reward_fn/mean": 0.946502048522234, "rewards/format_reward_fn/std": 0.16397117311134934, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0419921875, "completions/max_length": 250.875, "completions/max_terminated_length": 246.125, "completions/mean_length": 201.8203125, "completions/mean_terminated_length": 199.46971607208252, "completions/min_length": 160.6875, "completions/min_terminated_length": 160.6875, "entropy": 0.0713472084607929, "epoch": 2.2701652089407194, "frac_reward_zero_std": 0.45703125, "grad_norm": 0.15345498919487, "learning_rate": 5e-05, "loss": 0.0001, "num_tokens": 189749628.0, "reward": 11.676891207695007, "reward_std": 0.7641248423606157, "rewards/bm25_retrieval_reward_fn/mean": 0.9076020866632462, "rewards/bm25_retrieval_reward_fn/std": 0.25307453935965896, "rewards/event_reward_fn/mean": 9.8544921875, "rewards/event_reward_fn/std": 5.772381603717804, "rewards/format_reward_fn/mean": 0.9147970490157604, "rewards/format_reward_fn/std": 0.25177112873643637, "step": 2336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 253.0, "completions/max_terminated_length": 245.9375, "completions/mean_length": 206.1015625, "completions/mean_terminated_length": 203.27950382232666, "completions/min_length": 162.625, "completions/min_terminated_length": 162.625, "entropy": 0.06717114523053169, "epoch": 2.2857142857142856, "frac_reward_zero_std": 0.3984375, "grad_norm": 0.14800839126110077, "learning_rate": 5e-05, "loss": -0.0018, "num_tokens": 191059400.0, "reward": 11.288125574588776, "reward_std": 0.8216591961681843, "rewards/bm25_retrieval_reward_fn/mean": 0.9098481498658657, "rewards/bm25_retrieval_reward_fn/std": 0.2527642482891679, "rewards/event_reward_fn/mean": 9.45703125, "rewards/event_reward_fn/std": 6.013585805892944, "rewards/format_reward_fn/mean": 0.9212462790310383, "rewards/format_reward_fn/std": 0.255017863586545, "step": 2352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.119140625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.3125, "completions/mean_length": 222.3125, "completions/mean_terminated_length": 217.90027904510498, "completions/min_length": 175.625, "completions/min_terminated_length": 175.625, "entropy": 0.0761020069476217, "epoch": 2.3012633624878522, "frac_reward_zero_std": 0.39453125, "grad_norm": 0.24630595743656158, "learning_rate": 5e-05, "loss": 0.0025, "num_tokens": 192350156.0, "reward": 11.402837812900543, "reward_std": 0.8105970397591591, "rewards/bm25_retrieval_reward_fn/mean": 0.8388476483523846, "rewards/bm25_retrieval_reward_fn/std": 0.32804084848612547, "rewards/event_reward_fn/mean": 9.7060546875, "rewards/event_reward_fn/std": 5.547398820519447, "rewards/format_reward_fn/mean": 0.8579354099929333, "rewards/format_reward_fn/std": 0.33059023320674896, "step": 2368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0537109375, "completions/max_length": 253.0625, "completions/max_terminated_length": 248.125, "completions/mean_length": 211.1552734375, "completions/mean_terminated_length": 208.6152687072754, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.07612166181206703, "epoch": 2.316812439261419, "frac_reward_zero_std": 0.390625, "grad_norm": 0.08691530674695969, "learning_rate": 5e-05, "loss": 0.0024, "num_tokens": 193665307.0, "reward": 11.495616167783737, "reward_std": 0.7880423050373793, "rewards/bm25_retrieval_reward_fn/mean": 0.902044016867876, "rewards/bm25_retrieval_reward_fn/std": 0.23614092892967165, "rewards/event_reward_fn/mean": 9.6865234375, "rewards/event_reward_fn/std": 6.028887152671814, "rewards/format_reward_fn/mean": 0.9070489220321178, "rewards/format_reward_fn/std": 0.24180734669789672, "step": 2384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0185546875, "completions/max_length": 246.4375, "completions/max_terminated_length": 242.0625, "completions/mean_length": 193.6845703125, "completions/mean_terminated_length": 192.51368141174316, "completions/min_length": 148.6875, "completions/min_terminated_length": 148.6875, "entropy": 0.07832195260562003, "epoch": 2.3323615160349855, "frac_reward_zero_std": 0.40234375, "grad_norm": 0.1274639219045639, "learning_rate": 5e-05, "loss": -0.0048, "num_tokens": 194920676.0, "reward": 11.40235447883606, "reward_std": 0.7331925742328167, "rewards/bm25_retrieval_reward_fn/mean": 0.9401694796979427, "rewards/bm25_retrieval_reward_fn/std": 0.17933110590092838, "rewards/event_reward_fn/mean": 9.5068359375, "rewards/event_reward_fn/std": 5.196439817547798, "rewards/format_reward_fn/mean": 0.9553493969142437, "rewards/format_reward_fn/std": 0.172615127870813, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 246.9375, "completions/max_terminated_length": 243.6875, "completions/mean_length": 197.6689453125, "completions/mean_terminated_length": 196.28760814666748, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.07638159766793251, "epoch": 2.347910592808552, "frac_reward_zero_std": 0.3671875, "grad_norm": 0.18019680678844452, "learning_rate": 5e-05, "loss": -0.002, "num_tokens": 196239697.0, "reward": 11.403923392295837, "reward_std": 0.7415321134030819, "rewards/bm25_retrieval_reward_fn/mean": 0.9454397931694984, "rewards/bm25_retrieval_reward_fn/std": 0.15600819129031152, "rewards/event_reward_fn/mean": 9.4951171875, "rewards/event_reward_fn/std": 5.713589310646057, "rewards/format_reward_fn/mean": 0.9633664786815643, "rewards/format_reward_fn/std": 0.14195893332362175, "step": 2416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0927734375, "completions/max_length": 255.75, "completions/max_terminated_length": 252.25, "completions/mean_length": 216.240234375, "completions/mean_terminated_length": 212.49735260009766, "completions/min_length": 171.4375, "completions/min_terminated_length": 171.4375, "entropy": 0.07729306910187006, "epoch": 2.3634596695821184, "frac_reward_zero_std": 0.35546875, "grad_norm": 0.1351216435432434, "learning_rate": 5e-05, "loss": 0.0052, "num_tokens": 197585395.0, "reward": 11.078547358512878, "reward_std": 0.8885079212486744, "rewards/bm25_retrieval_reward_fn/mean": 0.8728507719933987, "rewards/bm25_retrieval_reward_fn/std": 0.2855207370594144, "rewards/event_reward_fn/mean": 9.3125, "rewards/event_reward_fn/std": 5.477887436747551, "rewards/format_reward_fn/mean": 0.8931966163218021, "rewards/format_reward_fn/std": 0.2850890662521124, "step": 2432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1064453125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.25, "completions/mean_length": 222.671875, "completions/mean_terminated_length": 218.8065061569214, "completions/min_length": 170.75, "completions/min_terminated_length": 170.75, "entropy": 0.07714059529826045, "epoch": 2.379008746355685, "frac_reward_zero_std": 0.359375, "grad_norm": 0.14632916450500488, "learning_rate": 5e-05, "loss": 0.0014, "num_tokens": 198898579.0, "reward": 10.960965871810913, "reward_std": 0.8031115736812353, "rewards/bm25_retrieval_reward_fn/mean": 0.8641464188694954, "rewards/bm25_retrieval_reward_fn/std": 0.3086322648450732, "rewards/event_reward_fn/mean": 9.2119140625, "rewards/event_reward_fn/std": 5.528163373470306, "rewards/format_reward_fn/mean": 0.8849051333963871, "rewards/format_reward_fn/std": 0.31144819781184196, "step": 2448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0419921875, "completions/max_length": 251.8125, "completions/max_terminated_length": 247.875, "completions/mean_length": 210.1025390625, "completions/mean_terminated_length": 208.211838722229, "completions/min_length": 161.5625, "completions/min_terminated_length": 161.5625, "entropy": 0.07773100049234927, "epoch": 2.3945578231292517, "frac_reward_zero_std": 0.37890625, "grad_norm": 0.16118626296520233, "learning_rate": 5e-05, "loss": -0.0013, "num_tokens": 200201952.0, "reward": 10.956557631492615, "reward_std": 0.7257527317851782, "rewards/bm25_retrieval_reward_fn/mean": 0.9221826978027821, "rewards/bm25_retrieval_reward_fn/std": 0.20850562094710767, "rewards/event_reward_fn/mean": 9.0927734375, "rewards/event_reward_fn/std": 5.140145808458328, "rewards/format_reward_fn/mean": 0.9416015632450581, "rewards/format_reward_fn/std": 0.1953780883923173, "step": 2464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0224609375, "completions/max_length": 245.9375, "completions/max_terminated_length": 240.75, "completions/mean_length": 202.029296875, "completions/mean_terminated_length": 200.80841064453125, "completions/min_length": 159.5, "completions/min_terminated_length": 159.5, "entropy": 0.0753114647231996, "epoch": 2.4101068999028183, "frac_reward_zero_std": 0.41015625, "grad_norm": 0.15847159922122955, "learning_rate": 5e-05, "loss": -0.0026, "num_tokens": 201522430.0, "reward": 11.323036313056946, "reward_std": 0.7887390460819006, "rewards/bm25_retrieval_reward_fn/mean": 0.9379655607044697, "rewards/bm25_retrieval_reward_fn/std": 0.17426727525889874, "rewards/event_reward_fn/mean": 9.4345703125, "rewards/event_reward_fn/std": 5.223158270120621, "rewards/format_reward_fn/mean": 0.9505006894469261, "rewards/format_reward_fn/std": 0.1740011121146381, "step": 2480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 250.125, "completions/max_terminated_length": 246.9375, "completions/mean_length": 205.435546875, "completions/mean_terminated_length": 204.02826595306396, "completions/min_length": 164.875, "completions/min_terminated_length": 164.875, "entropy": 0.07844416983425617, "epoch": 2.425655976676385, "frac_reward_zero_std": 0.34375, "grad_norm": 0.0907289907336235, "learning_rate": 5e-05, "loss": -0.0048, "num_tokens": 202805596.0, "reward": 11.417901694774628, "reward_std": 0.9192124493420124, "rewards/bm25_retrieval_reward_fn/mean": 0.9360267631709576, "rewards/bm25_retrieval_reward_fn/std": 0.18138159497175366, "rewards/event_reward_fn/mean": 9.52734375, "rewards/event_reward_fn/std": 5.153972968459129, "rewards/format_reward_fn/mean": 0.9545312523841858, "rewards/format_reward_fn/std": 0.17224382143467665, "step": 2496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029947916666666668, "completions/max_length": 253.5, "completions/max_terminated_length": 249.08333333333334, "completions/mean_length": 210.64192708333334, "completions/mean_terminated_length": 209.20124689737955, "completions/min_length": 172.41666666666666, "completions/min_terminated_length": 172.41666666666666, "entropy": 0.08086393773555756, "epoch": 2.441205053449951, "frac_reward_zero_std": 0.3802083333333333, "grad_norm": 0.10650806128978729, "learning_rate": 5e-05, "loss": 0.0013, "num_tokens": 204152564.0, "reward": 11.6739342212677, "reward_std": 0.807344543437163, "rewards/bm25_retrieval_reward_fn/mean": 0.9262488782405853, "rewards/bm25_retrieval_reward_fn/std": 0.19760222919285297, "rewards/event_reward_fn/mean": 9.798177083333334, "rewards/event_reward_fn/std": 5.788699746131897, "rewards/format_reward_fn/mean": 0.9495081007480621, "rewards/format_reward_fn/std": 0.18829844643672308, "step": 2512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0712890625, "completions/max_length": 254.8125, "completions/max_terminated_length": 251.625, "completions/mean_length": 213.2236328125, "completions/mean_terminated_length": 209.86785411834717, "completions/min_length": 171.125, "completions/min_terminated_length": 171.125, "entropy": 0.08136322861537337, "epoch": 2.456754130223518, "frac_reward_zero_std": 0.328125, "grad_norm": 0.2153819352388382, "learning_rate": 5e-05, "loss": 0.0006, "num_tokens": 205500469.0, "reward": 12.446550607681274, "reward_std": 1.034587848931551, "rewards/bm25_retrieval_reward_fn/mean": 0.8715435974299908, "rewards/bm25_retrieval_reward_fn/std": 0.28421050729230046, "rewards/event_reward_fn/mean": 10.6826171875, "rewards/event_reward_fn/std": 6.132740959525108, "rewards/format_reward_fn/mean": 0.892389789223671, "rewards/format_reward_fn/std": 0.2872252073138952, "step": 2528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 255.5, "completions/max_terminated_length": 251.9375, "completions/mean_length": 216.9306640625, "completions/mean_terminated_length": 214.75403022766113, "completions/min_length": 176.1875, "completions/min_terminated_length": 176.1875, "entropy": 0.08582799974828959, "epoch": 2.4723032069970845, "frac_reward_zero_std": 0.26953125, "grad_norm": 0.1978168785572052, "learning_rate": 5e-05, "loss": 0.0028, "num_tokens": 206796278.0, "reward": 11.944559633731842, "reward_std": 0.9803863354027271, "rewards/bm25_retrieval_reward_fn/mean": 0.9094629287719727, "rewards/bm25_retrieval_reward_fn/std": 0.2257095631211996, "rewards/event_reward_fn/mean": 10.103515625, "rewards/event_reward_fn/std": 5.498953863978386, "rewards/format_reward_fn/mean": 0.9315809458494186, "rewards/format_reward_fn/std": 0.228111170232296, "step": 2544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.099609375, "completions/max_length": 255.625, "completions/max_terminated_length": 250.5, "completions/mean_length": 217.259765625, "completions/mean_terminated_length": 213.05884075164795, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.08552914392203093, "epoch": 2.487852283770651, "frac_reward_zero_std": 0.28515625, "grad_norm": 0.16805820167064667, "learning_rate": 5e-05, "loss": 0.007, "num_tokens": 208168132.0, "reward": 11.79398000240326, "reward_std": 0.9481483921408653, "rewards/bm25_retrieval_reward_fn/mean": 0.868280190974474, "rewards/bm25_retrieval_reward_fn/std": 0.28656442323699594, "rewards/event_reward_fn/mean": 10.037109375, "rewards/event_reward_fn/std": 6.124383822083473, "rewards/format_reward_fn/mean": 0.8885904960334301, "rewards/format_reward_fn/std": 0.28199191950261593, "step": 2560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0458984375, "completions/max_length": 253.1875, "completions/max_terminated_length": 248.0, "completions/mean_length": 210.638671875, "completions/mean_terminated_length": 208.40313148498535, "completions/min_length": 173.625, "completions/min_terminated_length": 173.625, "entropy": 0.08260456612333655, "epoch": 2.503401360544218, "frac_reward_zero_std": 0.33203125, "grad_norm": 0.09216822683811188, "learning_rate": 5e-05, "loss": -0.001, "num_tokens": 209501810.0, "reward": 11.016095101833344, "reward_std": 0.860798167064786, "rewards/bm25_retrieval_reward_fn/mean": 0.9306312911212444, "rewards/bm25_retrieval_reward_fn/std": 0.19113765214569867, "rewards/event_reward_fn/mean": 9.1396484375, "rewards/event_reward_fn/std": 5.75250081717968, "rewards/format_reward_fn/mean": 0.9458155073225498, "rewards/format_reward_fn/std": 0.19293752522207797, "step": 2576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 247.375, "completions/max_terminated_length": 242.875, "completions/mean_length": 203.1708984375, "completions/mean_terminated_length": 202.13115978240967, "completions/min_length": 163.9375, "completions/min_terminated_length": 163.9375, "entropy": 0.08376244455575943, "epoch": 2.518950437317784, "frac_reward_zero_std": 0.35546875, "grad_norm": 0.21585437655448914, "learning_rate": 5e-05, "loss": -0.0033, "num_tokens": 210795597.0, "reward": 10.72483429312706, "reward_std": 0.7541004437953234, "rewards/bm25_retrieval_reward_fn/mean": 0.9423710107803345, "rewards/bm25_retrieval_reward_fn/std": 0.14322513493243605, "rewards/event_reward_fn/mean": 8.8203125, "rewards/event_reward_fn/std": 5.188908696174622, "rewards/format_reward_fn/mean": 0.9621507674455643, "rewards/format_reward_fn/std": 0.13946166937239468, "step": 2592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0595703125, "completions/max_length": 253.9375, "completions/max_terminated_length": 249.0, "completions/mean_length": 210.505859375, "completions/mean_terminated_length": 207.62176704406738, "completions/min_length": 173.25, "completions/min_terminated_length": 173.25, "entropy": 0.08947332156822085, "epoch": 2.534499514091351, "frac_reward_zero_std": 0.3359375, "grad_norm": 0.14945168793201447, "learning_rate": 5e-05, "loss": -0.0063, "num_tokens": 212112795.0, "reward": 11.506966352462769, "reward_std": 0.794132512062788, "rewards/bm25_retrieval_reward_fn/mean": 0.9178526736795902, "rewards/bm25_retrieval_reward_fn/std": 0.22195658483542502, "rewards/event_reward_fn/mean": 9.6572265625, "rewards/event_reward_fn/std": 5.744891852140427, "rewards/format_reward_fn/mean": 0.9318870939314365, "rewards/format_reward_fn/std": 0.2204800380859524, "step": 2608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0888671875, "completions/max_length": 256.0, "completions/max_terminated_length": 250.4375, "completions/mean_length": 216.943359375, "completions/mean_terminated_length": 213.17963314056396, "completions/min_length": 173.0625, "completions/min_terminated_length": 173.0625, "entropy": 0.08383294614031911, "epoch": 2.5500485908649173, "frac_reward_zero_std": 0.3203125, "grad_norm": 0.1446155160665512, "learning_rate": 5e-05, "loss": 0.0028, "num_tokens": 213441821.0, "reward": 11.958227455615997, "reward_std": 0.8823277465999126, "rewards/bm25_retrieval_reward_fn/mean": 0.8775606565177441, "rewards/bm25_retrieval_reward_fn/std": 0.2921582367271185, "rewards/event_reward_fn/mean": 10.185546875, "rewards/event_reward_fn/std": 5.809098601341248, "rewards/format_reward_fn/mean": 0.8951199762523174, "rewards/format_reward_fn/std": 0.29349780175834894, "step": 2624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 253.0, "completions/max_terminated_length": 251.0625, "completions/mean_length": 213.916015625, "completions/mean_terminated_length": 211.6506052017212, "completions/min_length": 178.5, "completions/min_terminated_length": 178.5, "entropy": 0.08359973039478064, "epoch": 2.565597667638484, "frac_reward_zero_std": 0.28125, "grad_norm": 0.16694338619709015, "learning_rate": 5e-05, "loss": -0.0005, "num_tokens": 214813067.0, "reward": 11.792637586593628, "reward_std": 0.8656186051666737, "rewards/bm25_retrieval_reward_fn/mean": 0.8819389827549458, "rewards/bm25_retrieval_reward_fn/std": 0.24387728050351143, "rewards/event_reward_fn/mean": 9.998046875, "rewards/event_reward_fn/std": 5.933807298541069, "rewards/format_reward_fn/mean": 0.9126519113779068, "rewards/format_reward_fn/std": 0.2305635418742895, "step": 2640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0771484375, "completions/max_length": 255.4375, "completions/max_terminated_length": 249.875, "completions/mean_length": 215.8505859375, "completions/mean_terminated_length": 212.57020092010498, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.08900781767442822, "epoch": 2.5811467444120506, "frac_reward_zero_std": 0.29296875, "grad_norm": 0.1385410875082016, "learning_rate": 5e-05, "loss": 0.0008, "num_tokens": 216172750.0, "reward": 11.35420310497284, "reward_std": 0.8416576944291592, "rewards/bm25_retrieval_reward_fn/mean": 0.8935875110328197, "rewards/bm25_retrieval_reward_fn/std": 0.26079373457469046, "rewards/event_reward_fn/mean": 9.5556640625, "rewards/event_reward_fn/std": 5.99031862616539, "rewards/format_reward_fn/mean": 0.904951486736536, "rewards/format_reward_fn/std": 0.2644943995401263, "step": 2656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0361328125, "completions/max_length": 254.625, "completions/max_terminated_length": 249.75, "completions/mean_length": 212.0947265625, "completions/mean_terminated_length": 210.445143699646, "completions/min_length": 177.1875, "completions/min_terminated_length": 177.1875, "entropy": 0.08960987254977226, "epoch": 2.5966958211856173, "frac_reward_zero_std": 0.35546875, "grad_norm": 0.15648344159126282, "learning_rate": 5e-05, "loss": 0.0023, "num_tokens": 217499543.0, "reward": 11.51008290052414, "reward_std": 0.7766602244228125, "rewards/bm25_retrieval_reward_fn/mean": 0.9093844145536423, "rewards/bm25_retrieval_reward_fn/std": 0.23443537193816155, "rewards/event_reward_fn/mean": 9.6806640625, "rewards/event_reward_fn/std": 5.529717803001404, "rewards/format_reward_fn/mean": 0.9200344160199165, "rewards/format_reward_fn/std": 0.23509666486643255, "step": 2672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 251.6875, "completions/max_terminated_length": 247.0, "completions/mean_length": 208.2880859375, "completions/mean_terminated_length": 206.4787950515747, "completions/min_length": 165.625, "completions/min_terminated_length": 165.625, "entropy": 0.09177634166553617, "epoch": 2.612244897959184, "frac_reward_zero_std": 0.30078125, "grad_norm": 0.14749974012374878, "learning_rate": 5e-05, "loss": 0.0004, "num_tokens": 218822370.0, "reward": 11.045877933502197, "reward_std": 0.9622980132699013, "rewards/bm25_retrieval_reward_fn/mean": 0.9265813454985619, "rewards/bm25_retrieval_reward_fn/std": 0.20256941742263734, "rewards/event_reward_fn/mean": 9.1845703125, "rewards/event_reward_fn/std": 5.212202668190002, "rewards/format_reward_fn/mean": 0.9347261041402817, "rewards/format_reward_fn/std": 0.2086858821567148, "step": 2688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 247.8125, "completions/max_terminated_length": 243.4375, "completions/mean_length": 200.87890625, "completions/mean_terminated_length": 199.8343276977539, "completions/min_length": 167.4375, "completions/min_terminated_length": 167.4375, "entropy": 0.09632771136239171, "epoch": 2.62779397473275, "frac_reward_zero_std": 0.28515625, "grad_norm": 0.1771780103445053, "learning_rate": 5e-05, "loss": -0.008, "num_tokens": 220135962.0, "reward": 12.231472432613373, "reward_std": 0.8915320560336113, "rewards/bm25_retrieval_reward_fn/mean": 0.9349792711436749, "rewards/bm25_retrieval_reward_fn/std": 0.16558197524864227, "rewards/event_reward_fn/mean": 10.35546875, "rewards/event_reward_fn/std": 5.747018381953239, "rewards/format_reward_fn/mean": 0.9410244673490524, "rewards/format_reward_fn/std": 0.17926215915940702, "step": 2704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 252.8125, "completions/max_terminated_length": 246.3125, "completions/mean_length": 209.3701171875, "completions/mean_terminated_length": 207.07564544677734, "completions/min_length": 170.625, "completions/min_terminated_length": 170.625, "entropy": 0.10564424749463797, "epoch": 2.6433430515063168, "frac_reward_zero_std": 0.22265625, "grad_norm": 0.10102769732475281, "learning_rate": 5e-05, "loss": -0.0013, "num_tokens": 221542285.0, "reward": 12.009974837303162, "reward_std": 0.9928734712302685, "rewards/bm25_retrieval_reward_fn/mean": 0.9175033271312714, "rewards/bm25_retrieval_reward_fn/std": 0.1976611790014431, "rewards/event_reward_fn/mean": 10.1767578125, "rewards/event_reward_fn/std": 6.012263968586922, "rewards/format_reward_fn/mean": 0.9157139807939529, "rewards/format_reward_fn/std": 0.21656434168107808, "step": 2720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 255.0625, "completions/max_terminated_length": 253.375, "completions/mean_length": 213.7919921875, "completions/mean_terminated_length": 212.31354141235352, "completions/min_length": 174.5, "completions/min_terminated_length": 174.5, "entropy": 0.10326679470017552, "epoch": 2.6588921282798834, "frac_reward_zero_std": 0.23828125, "grad_norm": 0.15221992135047913, "learning_rate": 5e-05, "loss": 0.0011, "num_tokens": 222797308.0, "reward": 11.387903690338135, "reward_std": 0.911373607814312, "rewards/bm25_retrieval_reward_fn/mean": 0.9273334704339504, "rewards/bm25_retrieval_reward_fn/std": 0.18763835495337844, "rewards/event_reward_fn/mean": 9.5263671875, "rewards/event_reward_fn/std": 5.706304341554642, "rewards/format_reward_fn/mean": 0.9342031031847, "rewards/format_reward_fn/std": 0.2074666447006166, "step": 2736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0224609375, "completions/max_length": 250.375, "completions/max_terminated_length": 247.4375, "completions/mean_length": 207.4228515625, "completions/mean_terminated_length": 206.33260917663574, "completions/min_length": 170.25, "completions/min_terminated_length": 170.25, "entropy": 0.09272929606959224, "epoch": 2.67444120505345, "frac_reward_zero_std": 0.30859375, "grad_norm": 0.21459202468395233, "learning_rate": 5e-05, "loss": -0.0035, "num_tokens": 224091517.0, "reward": 11.830156862735748, "reward_std": 0.8045283071696758, "rewards/bm25_retrieval_reward_fn/mean": 0.9445540346205235, "rewards/bm25_retrieval_reward_fn/std": 0.15531712002120912, "rewards/event_reward_fn/mean": 9.9248046875, "rewards/event_reward_fn/std": 5.416922226548195, "rewards/format_reward_fn/mean": 0.9607979953289032, "rewards/format_reward_fn/std": 0.1495908577926457, "step": 2752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 250.4375, "completions/max_terminated_length": 247.125, "completions/mean_length": 213.107421875, "completions/mean_terminated_length": 211.65931701660156, "completions/min_length": 173.0625, "completions/min_terminated_length": 173.0625, "entropy": 0.08341792924329638, "epoch": 2.6899902818270167, "frac_reward_zero_std": 0.3515625, "grad_norm": 0.06316018104553223, "learning_rate": 5e-05, "loss": -0.0016, "num_tokens": 225405591.0, "reward": 11.755984246730804, "reward_std": 0.8010260127484798, "rewards/bm25_retrieval_reward_fn/mean": 0.9238302148878574, "rewards/bm25_retrieval_reward_fn/std": 0.17838482139632106, "rewards/event_reward_fn/mean": 9.8876953125, "rewards/event_reward_fn/std": 5.315806642174721, "rewards/format_reward_fn/mean": 0.9444587081670761, "rewards/format_reward_fn/std": 0.16798695269972086, "step": 2768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0849609375, "completions/max_length": 255.375, "completions/max_terminated_length": 251.875, "completions/mean_length": 216.7353515625, "completions/mean_terminated_length": 213.09019565582275, "completions/min_length": 174.0625, "completions/min_terminated_length": 174.0625, "entropy": 0.08323041070252657, "epoch": 2.705539358600583, "frac_reward_zero_std": 0.3515625, "grad_norm": 0.2660459578037262, "learning_rate": 5e-05, "loss": 0.0082, "num_tokens": 226754744.0, "reward": 11.574803471565247, "reward_std": 0.8111933209002018, "rewards/bm25_retrieval_reward_fn/mean": 0.8713752776384354, "rewards/bm25_retrieval_reward_fn/std": 0.28300391032826155, "rewards/event_reward_fn/mean": 9.8173828125, "rewards/event_reward_fn/std": 5.8233465403318405, "rewards/format_reward_fn/mean": 0.8860453926026821, "rewards/format_reward_fn/std": 0.28504633717238903, "step": 2784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0537109375, "completions/max_length": 254.125, "completions/max_terminated_length": 250.625, "completions/mean_length": 212.8642578125, "completions/mean_terminated_length": 210.45547103881836, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.08199881995096803, "epoch": 2.7210884353741496, "frac_reward_zero_std": 0.296875, "grad_norm": 0.08463400602340698, "learning_rate": 5e-05, "loss": 0.0013, "num_tokens": 228098621.0, "reward": 11.454033315181732, "reward_std": 0.836145743727684, "rewards/bm25_retrieval_reward_fn/mean": 0.9158100821077824, "rewards/bm25_retrieval_reward_fn/std": 0.19612007169052958, "rewards/event_reward_fn/mean": 9.603515625, "rewards/event_reward_fn/std": 5.212661325931549, "rewards/format_reward_fn/mean": 0.9347075000405312, "rewards/format_reward_fn/std": 0.1920458609238267, "step": 2800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0380859375, "completions/max_length": 254.125, "completions/max_terminated_length": 249.0625, "completions/mean_length": 209.9765625, "completions/mean_terminated_length": 208.1564769744873, "completions/min_length": 168.875, "completions/min_terminated_length": 168.875, "entropy": 0.0759361800737679, "epoch": 2.7366375121477162, "frac_reward_zero_std": 0.328125, "grad_norm": 0.14759230613708496, "learning_rate": 5e-05, "loss": -0.0053, "num_tokens": 229429565.0, "reward": 12.068866312503815, "reward_std": 0.8678888715803623, "rewards/bm25_retrieval_reward_fn/mean": 0.9354286342859268, "rewards/bm25_retrieval_reward_fn/std": 0.19999602530151606, "rewards/event_reward_fn/mean": 10.185546875, "rewards/event_reward_fn/std": 6.09708933532238, "rewards/format_reward_fn/mean": 0.9478906244039536, "rewards/format_reward_fn/std": 0.2007538639008999, "step": 2816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 255.0, "completions/max_terminated_length": 251.4375, "completions/mean_length": 217.4892578125, "completions/mean_terminated_length": 214.5166711807251, "completions/min_length": 171.125, "completions/min_terminated_length": 171.125, "entropy": 0.07404683344066143, "epoch": 2.752186588921283, "frac_reward_zero_std": 0.33203125, "grad_norm": 0.18848936259746552, "learning_rate": 5e-05, "loss": 0.0016, "num_tokens": 230773106.0, "reward": 12.326741218566895, "reward_std": 0.9671976566314697, "rewards/bm25_retrieval_reward_fn/mean": 0.89621976390481, "rewards/bm25_retrieval_reward_fn/std": 0.23690359899774194, "rewards/event_reward_fn/mean": 10.515625, "rewards/event_reward_fn/std": 5.634042501449585, "rewards/format_reward_fn/mean": 0.9148964546620846, "rewards/format_reward_fn/std": 0.2338833932299167, "step": 2832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.060546875, "completions/max_length": 255.875, "completions/max_terminated_length": 252.625, "completions/mean_length": 217.236328125, "completions/mean_terminated_length": 214.71324062347412, "completions/min_length": 172.9375, "completions/min_terminated_length": 172.9375, "entropy": 0.08189457282423973, "epoch": 2.7677356656948495, "frac_reward_zero_std": 0.3046875, "grad_norm": 0.20657600462436676, "learning_rate": 5e-05, "loss": 0.0028, "num_tokens": 232070576.0, "reward": 11.5172398686409, "reward_std": 0.8970336727797985, "rewards/bm25_retrieval_reward_fn/mean": 0.9108828380703926, "rewards/bm25_retrieval_reward_fn/std": 0.22826198721304536, "rewards/event_reward_fn/mean": 9.673828125, "rewards/event_reward_fn/std": 5.733745768666267, "rewards/format_reward_fn/mean": 0.9325288347899914, "rewards/format_reward_fn/std": 0.226588967256248, "step": 2848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.056640625, "completions/max_length": 252.5625, "completions/max_terminated_length": 249.4375, "completions/mean_length": 213.166015625, "completions/mean_terminated_length": 210.57254600524902, "completions/min_length": 167.75, "completions/min_terminated_length": 167.75, "entropy": 0.08049681456759572, "epoch": 2.7832847424684157, "frac_reward_zero_std": 0.33203125, "grad_norm": 0.05886400490999222, "learning_rate": 5e-05, "loss": 0.0027, "num_tokens": 233415010.0, "reward": 11.385997593402863, "reward_std": 0.7553573679178953, "rewards/bm25_retrieval_reward_fn/mean": 0.9164049662649632, "rewards/bm25_retrieval_reward_fn/std": 0.2016591742867604, "rewards/event_reward_fn/mean": 9.53515625, "rewards/event_reward_fn/std": 5.440419033169746, "rewards/format_reward_fn/mean": 0.93443638458848, "rewards/format_reward_fn/std": 0.19143922347575426, "step": 2864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0498046875, "completions/max_length": 254.3125, "completions/max_terminated_length": 250.375, "completions/mean_length": 214.185546875, "completions/mean_terminated_length": 212.15652561187744, "completions/min_length": 172.1875, "completions/min_terminated_length": 172.1875, "entropy": 0.0816779644228518, "epoch": 2.7988338192419824, "frac_reward_zero_std": 0.35546875, "grad_norm": 0.0916222557425499, "learning_rate": 5e-05, "loss": -0.0032, "num_tokens": 234744436.0, "reward": 11.663362562656403, "reward_std": 0.9015852566808462, "rewards/bm25_retrieval_reward_fn/mean": 0.9179877303540707, "rewards/bm25_retrieval_reward_fn/std": 0.2022923786425963, "rewards/event_reward_fn/mean": 9.8125, "rewards/event_reward_fn/std": 5.319433629512787, "rewards/format_reward_fn/mean": 0.9328748136758804, "rewards/format_reward_fn/std": 0.20254582911729813, "step": 2880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0966796875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.5625, "completions/mean_length": 220.1044921875, "completions/mean_terminated_length": 216.32228183746338, "completions/min_length": 171.5, "completions/min_terminated_length": 171.5, "entropy": 0.08011228078976274, "epoch": 2.814382896015549, "frac_reward_zero_std": 0.3515625, "grad_norm": 0.08450505882501602, "learning_rate": 5e-05, "loss": 0.0032, "num_tokens": 236075035.0, "reward": 11.988969624042511, "reward_std": 0.7974276356399059, "rewards/bm25_retrieval_reward_fn/mean": 0.8782762736082077, "rewards/bm25_retrieval_reward_fn/std": 0.2924462389200926, "rewards/event_reward_fn/mean": 10.2158203125, "rewards/event_reward_fn/std": 5.5798052698373795, "rewards/format_reward_fn/mean": 0.8948730453848839, "rewards/format_reward_fn/std": 0.2953194109722972, "step": 2896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0849609375, "completions/max_length": 255.875, "completions/max_terminated_length": 254.3125, "completions/mean_length": 219.5263671875, "completions/mean_terminated_length": 216.11603832244873, "completions/min_length": 175.8125, "completions/min_terminated_length": 175.8125, "entropy": 0.08505099918693304, "epoch": 2.8299319727891157, "frac_reward_zero_std": 0.3359375, "grad_norm": 0.12416191399097443, "learning_rate": 5e-05, "loss": 0.0027, "num_tokens": 237421442.0, "reward": 11.693400919437408, "reward_std": 0.8284243606030941, "rewards/bm25_retrieval_reward_fn/mean": 0.8833912238478661, "rewards/bm25_retrieval_reward_fn/std": 0.267708154162392, "rewards/event_reward_fn/mean": 9.90625, "rewards/event_reward_fn/std": 5.605697572231293, "rewards/format_reward_fn/mean": 0.9037597663700581, "rewards/format_reward_fn/std": 0.2683409294113517, "step": 2912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 254.5625, "completions/max_terminated_length": 252.0, "completions/mean_length": 213.083984375, "completions/mean_terminated_length": 211.6382074356079, "completions/min_length": 169.9375, "completions/min_terminated_length": 169.9375, "entropy": 0.08473130548372865, "epoch": 2.8454810495626823, "frac_reward_zero_std": 0.34375, "grad_norm": 0.06745623797178268, "learning_rate": 5e-05, "loss": 0.0006, "num_tokens": 238715276.0, "reward": 11.77674776315689, "reward_std": 0.685878150165081, "rewards/bm25_retrieval_reward_fn/mean": 0.9190886318683624, "rewards/bm25_retrieval_reward_fn/std": 0.19958114624023438, "rewards/event_reward_fn/mean": 9.9150390625, "rewards/event_reward_fn/std": 5.207145616412163, "rewards/format_reward_fn/mean": 0.9426199793815613, "rewards/format_reward_fn/std": 0.1927571757696569, "step": 2928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 254.25, "completions/max_terminated_length": 250.6875, "completions/mean_length": 214.3232421875, "completions/mean_terminated_length": 212.1032657623291, "completions/min_length": 169.9375, "completions/min_terminated_length": 169.9375, "entropy": 0.08542184252291918, "epoch": 2.8610301263362485, "frac_reward_zero_std": 0.3203125, "grad_norm": 0.10482887178659439, "learning_rate": 5e-05, "loss": -0.0002, "num_tokens": 240057215.0, "reward": 11.639575242996216, "reward_std": 0.9158763885498047, "rewards/bm25_retrieval_reward_fn/mean": 0.8979970328509808, "rewards/bm25_retrieval_reward_fn/std": 0.23725404776632786, "rewards/event_reward_fn/mean": 9.826171875, "rewards/event_reward_fn/std": 5.315482467412949, "rewards/format_reward_fn/mean": 0.9154064357280731, "rewards/format_reward_fn/std": 0.2300750371068716, "step": 2944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.103515625, "completions/max_length": 255.5, "completions/max_terminated_length": 253.1875, "completions/mean_length": 220.142578125, "completions/mean_terminated_length": 216.11438083648682, "completions/min_length": 169.6875, "completions/min_terminated_length": 169.6875, "entropy": 0.0917358947917819, "epoch": 2.8765792031098156, "frac_reward_zero_std": 0.34375, "grad_norm": 0.29249680042266846, "learning_rate": 5e-05, "loss": -0.0023, "num_tokens": 241437397.0, "reward": 12.01733946800232, "reward_std": 0.8955757319927216, "rewards/bm25_retrieval_reward_fn/mean": 0.8503146581351757, "rewards/bm25_retrieval_reward_fn/std": 0.3139411583542824, "rewards/event_reward_fn/mean": 10.298828125, "rewards/event_reward_fn/std": 5.663209050893784, "rewards/format_reward_fn/mean": 0.868196614086628, "rewards/format_reward_fn/std": 0.318668226711452, "step": 2960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0791015625, "completions/max_length": 254.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 215.6435546875, "completions/mean_terminated_length": 212.3185043334961, "completions/min_length": 165.4375, "completions/min_terminated_length": 165.4375, "entropy": 0.09182127751410007, "epoch": 2.892128279883382, "frac_reward_zero_std": 0.3046875, "grad_norm": 0.17700594663619995, "learning_rate": 5e-05, "loss": -0.0004, "num_tokens": 242749992.0, "reward": 11.478153705596924, "reward_std": 0.8844601437449455, "rewards/bm25_retrieval_reward_fn/mean": 0.880519162863493, "rewards/bm25_retrieval_reward_fn/std": 0.25536160822957754, "rewards/event_reward_fn/mean": 9.697265625, "rewards/event_reward_fn/std": 5.846217334270477, "rewards/format_reward_fn/mean": 0.9003689214587212, "rewards/format_reward_fn/std": 0.24951867014169693, "step": 2976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0615234375, "completions/max_length": 255.5625, "completions/max_terminated_length": 251.8125, "completions/mean_length": 214.3974609375, "completions/mean_terminated_length": 211.83877277374268, "completions/min_length": 166.3125, "completions/min_terminated_length": 166.3125, "entropy": 0.0888472800143063, "epoch": 2.9076773566569485, "frac_reward_zero_std": 0.359375, "grad_norm": 0.18143412470817566, "learning_rate": 5e-05, "loss": -0.0024, "num_tokens": 244089207.0, "reward": 11.602717459201813, "reward_std": 0.7762532383203506, "rewards/bm25_retrieval_reward_fn/mean": 0.8992017544806004, "rewards/bm25_retrieval_reward_fn/std": 0.23547889525070786, "rewards/event_reward_fn/mean": 9.78125, "rewards/event_reward_fn/std": 5.857791095972061, "rewards/format_reward_fn/mean": 0.9222656264901161, "rewards/format_reward_fn/std": 0.2304223021492362, "step": 2992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 247.5, "completions/max_terminated_length": 245.1875, "completions/mean_length": 204.5185546875, "completions/mean_terminated_length": 203.2769651412964, "completions/min_length": 161.625, "completions/min_terminated_length": 161.625, "entropy": 0.08608831372112036, "epoch": 2.923226433430515, "frac_reward_zero_std": 0.32421875, "grad_norm": 0.18656234443187714, "learning_rate": 5e-05, "loss": -0.0002, "num_tokens": 245366298.0, "reward": 11.507224977016449, "reward_std": 0.8891939371824265, "rewards/bm25_retrieval_reward_fn/mean": 0.9319907054305077, "rewards/bm25_retrieval_reward_fn/std": 0.16323891701176763, "rewards/event_reward_fn/mean": 9.6142578125, "rewards/event_reward_fn/std": 6.0165297240018845, "rewards/format_reward_fn/mean": 0.9609765633940697, "rewards/format_reward_fn/std": 0.1338434610515833, "step": 3008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 243.125, "completions/max_terminated_length": 239.3125, "completions/mean_length": 190.7744140625, "completions/mean_terminated_length": 189.83202362060547, "completions/min_length": 149.75, "completions/min_terminated_length": 149.75, "entropy": 0.0839080074802041, "epoch": 2.938775510204082, "frac_reward_zero_std": 0.35546875, "grad_norm": 0.16527223587036133, "learning_rate": 5e-05, "loss": -0.0029, "num_tokens": 246703351.0, "reward": 11.010808229446411, "reward_std": 0.8288077171891928, "rewards/bm25_retrieval_reward_fn/mean": 0.9462215937674046, "rewards/bm25_retrieval_reward_fn/std": 0.14411098731216043, "rewards/event_reward_fn/mean": 9.1025390625, "rewards/event_reward_fn/std": 5.691614359617233, "rewards/format_reward_fn/mean": 0.9620475210249424, "rewards/format_reward_fn/std": 0.13395208539441228, "step": 3024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0146484375, "completions/max_length": 246.125, "completions/max_terminated_length": 240.25, "completions/mean_length": 197.7421875, "completions/mean_terminated_length": 196.85297679901123, "completions/min_length": 157.125, "completions/min_terminated_length": 157.125, "entropy": 0.0805180431343615, "epoch": 2.9543245869776484, "frac_reward_zero_std": 0.390625, "grad_norm": 0.16700057685375214, "learning_rate": 5e-05, "loss": -0.0039, "num_tokens": 247982459.0, "reward": 11.723504066467285, "reward_std": 0.775495782494545, "rewards/bm25_retrieval_reward_fn/mean": 0.9559190906584263, "rewards/bm25_retrieval_reward_fn/std": 0.11062386812409386, "rewards/event_reward_fn/mean": 9.7900390625, "rewards/event_reward_fn/std": 5.5720172971487045, "rewards/format_reward_fn/mean": 0.9775458797812462, "rewards/format_reward_fn/std": 0.08744857460260391, "step": 3040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 252.125, "completions/max_terminated_length": 244.875, "completions/mean_length": 207.955078125, "completions/mean_terminated_length": 206.359375, "completions/min_length": 170.625, "completions/min_terminated_length": 170.625, "entropy": 0.07879460602998734, "epoch": 2.9698736637512146, "frac_reward_zero_std": 0.30078125, "grad_norm": 0.1064968854188919, "learning_rate": 5e-05, "loss": 0.0005, "num_tokens": 249284553.0, "reward": 12.131292760372162, "reward_std": 0.7837562952190638, "rewards/bm25_retrieval_reward_fn/mean": 0.9279702864587307, "rewards/bm25_retrieval_reward_fn/std": 0.17746176407672465, "rewards/event_reward_fn/mean": 10.2548828125, "rewards/event_reward_fn/std": 5.574135601520538, "rewards/format_reward_fn/mean": 0.9484398253262043, "rewards/format_reward_fn/std": 0.1754506565630436, "step": 3056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0146484375, "completions/max_length": 240.0, "completions/max_terminated_length": 236.1875, "completions/mean_length": 195.7041015625, "completions/mean_terminated_length": 194.84180450439453, "completions/min_length": 159.0625, "completions/min_terminated_length": 159.0625, "entropy": 0.07347549963742495, "epoch": 2.9854227405247813, "frac_reward_zero_std": 0.359375, "grad_norm": 0.12872080504894257, "learning_rate": 5e-05, "loss": 0.0021, "num_tokens": 250584122.0, "reward": 11.6422598361969, "reward_std": 0.8738533556461334, "rewards/bm25_retrieval_reward_fn/mean": 0.9514190852642059, "rewards/bm25_retrieval_reward_fn/std": 0.12013476574793458, "rewards/event_reward_fn/mean": 9.7216796875, "rewards/event_reward_fn/std": 4.768230766057968, "rewards/format_reward_fn/mean": 0.9691610857844353, "rewards/format_reward_fn/std": 0.10613342700526118, "step": 3072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0107421875, "completions/max_length": 234.25, "completions/max_terminated_length": 229.25, "completions/mean_length": 173.66015625, "completions/mean_terminated_length": 172.75994396209717, "completions/min_length": 120.375, "completions/min_terminated_length": 120.375, "entropy": 0.06631791149266064, "epoch": 3.000971817298348, "frac_reward_zero_std": 0.37890625, "grad_norm": 0.268284410238266, "learning_rate": 5e-05, "loss": -0.0048, "num_tokens": 251857282.0, "reward": 11.125836312770844, "reward_std": 0.8690453059971333, "rewards/bm25_retrieval_reward_fn/mean": 0.9682918414473534, "rewards/bm25_retrieval_reward_fn/std": 0.09923727967543527, "rewards/event_reward_fn/mean": 9.181640625, "rewards/event_reward_fn/std": 5.686726421117783, "rewards/format_reward_fn/mean": 0.9759038686752319, "rewards/format_reward_fn/std": 0.10093728080391884, "step": 3088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 236.875, "completions/max_terminated_length": 231.75, "completions/mean_length": 185.73828125, "completions/mean_terminated_length": 184.20531558990479, "completions/min_length": 140.75, "completions/min_terminated_length": 140.75, "entropy": 0.07297877874225378, "epoch": 3.0165208940719146, "frac_reward_zero_std": 0.39453125, "grad_norm": 0.14597758650779724, "learning_rate": 5e-05, "loss": -0.0027, "num_tokens": 253203846.0, "reward": 11.508928120136261, "reward_std": 0.8718460761010647, "rewards/bm25_retrieval_reward_fn/mean": 0.9537964537739754, "rewards/bm25_retrieval_reward_fn/std": 0.133202571363654, "rewards/event_reward_fn/mean": 9.58984375, "rewards/event_reward_fn/std": 5.781486123800278, "rewards/format_reward_fn/mean": 0.9652878567576408, "rewards/format_reward_fn/std": 0.11640464654192328, "step": 3104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 246.5625, "completions/max_terminated_length": 239.75, "completions/mean_length": 192.57421875, "completions/mean_terminated_length": 191.06838130950928, "completions/min_length": 151.75, "completions/min_terminated_length": 151.75, "entropy": 0.07239698874764144, "epoch": 3.0320699708454812, "frac_reward_zero_std": 0.35546875, "grad_norm": 0.15297071635723114, "learning_rate": 5e-05, "loss": 0.0015, "num_tokens": 254532774.0, "reward": 12.089753448963165, "reward_std": 0.8900428749620914, "rewards/bm25_retrieval_reward_fn/mean": 0.9412974454462528, "rewards/bm25_retrieval_reward_fn/std": 0.17066996253561229, "rewards/event_reward_fn/mean": 10.1943359375, "rewards/event_reward_fn/std": 6.097415968775749, "rewards/format_reward_fn/mean": 0.9541201665997505, "rewards/format_reward_fn/std": 0.16532070748507977, "step": 3120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 236.9375, "completions/max_terminated_length": 232.375, "completions/mean_length": 185.283203125, "completions/mean_terminated_length": 183.95048999786377, "completions/min_length": 140.375, "completions/min_terminated_length": 140.375, "entropy": 0.06962199346162379, "epoch": 3.0476190476190474, "frac_reward_zero_std": 0.34765625, "grad_norm": 0.1586209386587143, "learning_rate": 5e-05, "loss": -0.0013, "num_tokens": 255823680.0, "reward": 11.430478930473328, "reward_std": 0.8875509612262249, "rewards/bm25_retrieval_reward_fn/mean": 0.9428003318607807, "rewards/bm25_retrieval_reward_fn/std": 0.14735072664916515, "rewards/event_reward_fn/mean": 9.525390625, "rewards/event_reward_fn/std": 5.551691547036171, "rewards/format_reward_fn/mean": 0.9622879475355148, "rewards/format_reward_fn/std": 0.13102243188768625, "step": 3136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0107421875, "completions/max_length": 241.5, "completions/max_terminated_length": 237.625, "completions/mean_length": 194.337890625, "completions/mean_terminated_length": 193.67024612426758, "completions/min_length": 160.5, "completions/min_terminated_length": 160.5, "entropy": 0.07421417301520705, "epoch": 3.063168124392614, "frac_reward_zero_std": 0.30859375, "grad_norm": 0.32556113600730896, "learning_rate": 5e-05, "loss": -0.0055, "num_tokens": 257162266.0, "reward": 11.765808463096619, "reward_std": 0.8776319213211536, "rewards/bm25_retrieval_reward_fn/mean": 0.9556044563651085, "rewards/bm25_retrieval_reward_fn/std": 0.12997814314439893, "rewards/event_reward_fn/mean": 9.837890625, "rewards/event_reward_fn/std": 5.4677809327840805, "rewards/format_reward_fn/mean": 0.972313366830349, "rewards/format_reward_fn/std": 0.11132679507136345, "step": 3152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 246.4375, "completions/max_terminated_length": 241.25, "completions/mean_length": 201.2490234375, "completions/mean_terminated_length": 200.61123752593994, "completions/min_length": 161.0625, "completions/min_terminated_length": 161.0625, "entropy": 0.08062579715624452, "epoch": 3.0787172011661808, "frac_reward_zero_std": 0.31640625, "grad_norm": 0.0818430706858635, "learning_rate": 5e-05, "loss": -0.0011, "num_tokens": 258469721.0, "reward": 12.113971889019012, "reward_std": 0.8606467135250568, "rewards/bm25_retrieval_reward_fn/mean": 0.9496653489768505, "rewards/bm25_retrieval_reward_fn/std": 0.13710944051854312, "rewards/event_reward_fn/mean": 10.19921875, "rewards/event_reward_fn/std": 5.852348044514656, "rewards/format_reward_fn/mean": 0.9650877378880978, "rewards/format_reward_fn/std": 0.12316453643143177, "step": 3168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 249.8125, "completions/max_terminated_length": 245.25, "completions/mean_length": 207.7333984375, "completions/mean_terminated_length": 206.9770908355713, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.07632905803620815, "epoch": 3.0942662779397474, "frac_reward_zero_std": 0.34375, "grad_norm": 0.20121721923351288, "learning_rate": 5e-05, "loss": 0.0009, "num_tokens": 259793840.0, "reward": 11.953573882579803, "reward_std": 0.7916774693876505, "rewards/bm25_retrieval_reward_fn/mean": 0.9334963597357273, "rewards/bm25_retrieval_reward_fn/std": 0.18081966822501272, "rewards/event_reward_fn/mean": 10.07421875, "rewards/event_reward_fn/std": 5.564135581254959, "rewards/format_reward_fn/mean": 0.9458589181303978, "rewards/format_reward_fn/std": 0.1874212771654129, "step": 3184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 254.1875, "completions/max_terminated_length": 251.4375, "completions/mean_length": 213.3369140625, "completions/mean_terminated_length": 211.478759765625, "completions/min_length": 170.6875, "completions/min_terminated_length": 170.6875, "entropy": 0.07402227586135268, "epoch": 3.109815354713314, "frac_reward_zero_std": 0.3203125, "grad_norm": 0.17332999408245087, "learning_rate": 5e-05, "loss": -0.0005, "num_tokens": 261173137.0, "reward": 12.101770102977753, "reward_std": 0.8903996516019106, "rewards/bm25_retrieval_reward_fn/mean": 0.9108825102448463, "rewards/bm25_retrieval_reward_fn/std": 0.22369781765155494, "rewards/event_reward_fn/mean": 10.2685546875, "rewards/event_reward_fn/std": 5.45231431722641, "rewards/format_reward_fn/mean": 0.9223329871892929, "rewards/format_reward_fn/std": 0.2215037615969777, "step": 3200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 253.875, "completions/max_terminated_length": 250.5625, "completions/mean_length": 212.9697265625, "completions/mean_terminated_length": 210.7997007369995, "completions/min_length": 174.8125, "completions/min_terminated_length": 174.8125, "entropy": 0.07718956796452403, "epoch": 3.1253644314868803, "frac_reward_zero_std": 0.29296875, "grad_norm": 0.1662418693304062, "learning_rate": 5e-05, "loss": 0.005, "num_tokens": 262593334.0, "reward": 11.946735978126526, "reward_std": 0.8465413227677345, "rewards/bm25_retrieval_reward_fn/mean": 0.8991723321378231, "rewards/bm25_retrieval_reward_fn/std": 0.24368242837954313, "rewards/event_reward_fn/mean": 10.126953125, "rewards/event_reward_fn/std": 5.943547070026398, "rewards/format_reward_fn/mean": 0.9206105917692184, "rewards/format_reward_fn/std": 0.24353812169283628, "step": 3216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0205078125, "completions/max_length": 250.5625, "completions/max_terminated_length": 248.25, "completions/mean_length": 205.919921875, "completions/mean_terminated_length": 204.90336322784424, "completions/min_length": 166.75, "completions/min_terminated_length": 166.75, "entropy": 0.074956723023206, "epoch": 3.140913508260447, "frac_reward_zero_std": 0.3828125, "grad_norm": 0.09807421267032623, "learning_rate": 5e-05, "loss": -0.0016, "num_tokens": 263927740.0, "reward": 11.93316513299942, "reward_std": 0.7299522124230862, "rewards/bm25_retrieval_reward_fn/mean": 0.9466019049286842, "rewards/bm25_retrieval_reward_fn/std": 0.13588694983627647, "rewards/event_reward_fn/mean": 10.0244140625, "rewards/event_reward_fn/std": 5.49932274222374, "rewards/format_reward_fn/mean": 0.9621492139995098, "rewards/format_reward_fn/std": 0.13393445825204253, "step": 3232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 240.125, "completions/max_terminated_length": 232.5, "completions/mean_length": 195.6865234375, "completions/mean_terminated_length": 194.702299118042, "completions/min_length": 154.9375, "completions/min_terminated_length": 154.9375, "entropy": 0.0733064110390842, "epoch": 3.1564625850340136, "frac_reward_zero_std": 0.3203125, "grad_norm": 0.09938943386077881, "learning_rate": 5e-05, "loss": -0.003, "num_tokens": 265309295.0, "reward": 11.604403555393219, "reward_std": 0.7237380500882864, "rewards/bm25_retrieval_reward_fn/mean": 0.9366314336657524, "rewards/bm25_retrieval_reward_fn/std": 0.15581788471899927, "rewards/event_reward_fn/mean": 9.7099609375, "rewards/event_reward_fn/std": 5.092981055378914, "rewards/format_reward_fn/mean": 0.9578111059963703, "rewards/format_reward_fn/std": 0.14878937718458474, "step": 3248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0126953125, "completions/max_length": 242.6875, "completions/max_terminated_length": 239.1875, "completions/mean_length": 197.634765625, "completions/mean_terminated_length": 196.94854736328125, "completions/min_length": 162.4375, "completions/min_terminated_length": 162.4375, "entropy": 0.0749533399939537, "epoch": 3.17201166180758, "frac_reward_zero_std": 0.30859375, "grad_norm": 0.12545543909072876, "learning_rate": 5e-05, "loss": -0.0078, "num_tokens": 266596509.0, "reward": 11.91082489490509, "reward_std": 0.8464185632765293, "rewards/bm25_retrieval_reward_fn/mean": 0.9437732025980949, "rewards/bm25_retrieval_reward_fn/std": 0.15822483785450459, "rewards/event_reward_fn/mean": 10.00390625, "rewards/event_reward_fn/std": 5.849025100469589, "rewards/format_reward_fn/mean": 0.9631454646587372, "rewards/format_reward_fn/std": 0.14366952097043395, "step": 3264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 247.4375, "completions/max_terminated_length": 245.125, "completions/mean_length": 199.3134765625, "completions/mean_terminated_length": 198.85503959655762, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.07353265816345811, "epoch": 3.187560738581147, "frac_reward_zero_std": 0.2890625, "grad_norm": 0.21840062737464905, "learning_rate": 5e-05, "loss": -0.0031, "num_tokens": 267914170.0, "reward": 12.273382246494293, "reward_std": 0.83649617806077, "rewards/bm25_retrieval_reward_fn/mean": 0.9368309266865253, "rewards/bm25_retrieval_reward_fn/std": 0.1605551114771515, "rewards/event_reward_fn/mean": 10.3828125, "rewards/event_reward_fn/std": 5.610780626535416, "rewards/format_reward_fn/mean": 0.9537388421595097, "rewards/format_reward_fn/std": 0.1550324517302215, "step": 3280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 244.125, "completions/max_terminated_length": 233.5625, "completions/mean_length": 188.4580078125, "completions/mean_terminated_length": 186.60568237304688, "completions/min_length": 136.875, "completions/min_terminated_length": 136.875, "entropy": 0.07190486835315824, "epoch": 3.2031098153547135, "frac_reward_zero_std": 0.40234375, "grad_norm": 0.10517474263906479, "learning_rate": 5e-05, "loss": 0.002, "num_tokens": 269206483.0, "reward": 11.828980565071106, "reward_std": 0.7958023902028799, "rewards/bm25_retrieval_reward_fn/mean": 0.9383596889674664, "rewards/bm25_retrieval_reward_fn/std": 0.17574476334266365, "rewards/event_reward_fn/mean": 9.9365234375, "rewards/event_reward_fn/std": 6.193027026951313, "rewards/format_reward_fn/mean": 0.9540975317358971, "rewards/format_reward_fn/std": 0.1640933039598167, "step": 3296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 246.125, "completions/max_terminated_length": 242.0, "completions/mean_length": 199.916015625, "completions/mean_terminated_length": 199.05778789520264, "completions/min_length": 159.6875, "completions/min_terminated_length": 159.6875, "entropy": 0.06998816644772887, "epoch": 3.2186588921282797, "frac_reward_zero_std": 0.3828125, "grad_norm": 0.16048327088356018, "learning_rate": 5e-05, "loss": -0.0047, "num_tokens": 270519973.0, "reward": 11.97882354259491, "reward_std": 0.769274152815342, "rewards/bm25_retrieval_reward_fn/mean": 0.9493311978876591, "rewards/bm25_retrieval_reward_fn/std": 0.13935352605767548, "rewards/event_reward_fn/mean": 10.0625, "rewards/event_reward_fn/std": 5.823389694094658, "rewards/format_reward_fn/mean": 0.9669921882450581, "rewards/format_reward_fn/std": 0.11799978371709585, "step": 3312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 250.375, "completions/max_terminated_length": 244.75, "completions/mean_length": 198.3857421875, "completions/mean_terminated_length": 197.3439645767212, "completions/min_length": 157.5625, "completions/min_terminated_length": 157.5625, "entropy": 0.06354829482734203, "epoch": 3.2342079689018464, "frac_reward_zero_std": 0.4609375, "grad_norm": 0.1142616868019104, "learning_rate": 5e-05, "loss": 0.0017, "num_tokens": 271864552.0, "reward": 11.812129974365234, "reward_std": 0.7051564212888479, "rewards/bm25_retrieval_reward_fn/mean": 0.9286896027624607, "rewards/bm25_retrieval_reward_fn/std": 0.19053616502787918, "rewards/event_reward_fn/mean": 9.935546875, "rewards/event_reward_fn/std": 5.488028556108475, "rewards/format_reward_fn/mean": 0.9478934183716774, "rewards/format_reward_fn/std": 0.1760867452248931, "step": 3328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 250.5, "completions/max_terminated_length": 242.75, "completions/mean_length": 200.3955078125, "completions/mean_terminated_length": 198.8016004562378, "completions/min_length": 156.375, "completions/min_terminated_length": 156.375, "entropy": 0.06551302410662174, "epoch": 3.249757045675413, "frac_reward_zero_std": 0.42578125, "grad_norm": 0.10454633086919785, "learning_rate": 5e-05, "loss": -0.0045, "num_tokens": 273133345.0, "reward": 11.325606524944305, "reward_std": 0.8327386099845171, "rewards/bm25_retrieval_reward_fn/mean": 0.9499945268034935, "rewards/bm25_retrieval_reward_fn/std": 0.13595928740687668, "rewards/event_reward_fn/mean": 9.4111328125, "rewards/event_reward_fn/std": 5.7270414382219315, "rewards/format_reward_fn/mean": 0.9644791670143604, "rewards/format_reward_fn/std": 0.12074299133382738, "step": 3344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.072265625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.4375, "completions/mean_length": 211.5947265625, "completions/mean_terminated_length": 208.13346004486084, "completions/min_length": 161.6875, "completions/min_terminated_length": 161.6875, "entropy": 0.06520150555297732, "epoch": 3.2653061224489797, "frac_reward_zero_std": 0.37109375, "grad_norm": 0.08934107422828674, "learning_rate": 5e-05, "loss": -0.0007, "num_tokens": 274438762.0, "reward": 11.95677363872528, "reward_std": 0.7490882519632578, "rewards/bm25_retrieval_reward_fn/mean": 0.8981688618659973, "rewards/bm25_retrieval_reward_fn/std": 0.2647989746183157, "rewards/event_reward_fn/mean": 10.14453125, "rewards/event_reward_fn/std": 6.044169589877129, "rewards/format_reward_fn/mean": 0.9140736609697342, "rewards/format_reward_fn/std": 0.2635462637990713, "step": 3360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0478515625, "completions/max_length": 253.5, "completions/max_terminated_length": 245.5, "completions/mean_length": 207.7646484375, "completions/mean_terminated_length": 205.32763290405273, "completions/min_length": 165.4375, "completions/min_terminated_length": 165.4375, "entropy": 0.06412141490727663, "epoch": 3.2808551992225463, "frac_reward_zero_std": 0.359375, "grad_norm": 0.19203978776931763, "learning_rate": 5e-05, "loss": 0.0009, "num_tokens": 275769713.0, "reward": 11.799296379089355, "reward_std": 0.7531254291534424, "rewards/bm25_retrieval_reward_fn/mean": 0.9147956632077694, "rewards/bm25_retrieval_reward_fn/std": 0.2275586040923372, "rewards/event_reward_fn/mean": 9.947265625, "rewards/event_reward_fn/std": 5.504115954041481, "rewards/format_reward_fn/mean": 0.9372349306941032, "rewards/format_reward_fn/std": 0.2251202268525958, "step": 3376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0380859375, "completions/max_length": 254.3125, "completions/max_terminated_length": 250.8125, "completions/mean_length": 213.9912109375, "completions/mean_terminated_length": 212.369891166687, "completions/min_length": 172.9375, "completions/min_terminated_length": 172.9375, "entropy": 0.06467607943341136, "epoch": 3.296404275996113, "frac_reward_zero_std": 0.43359375, "grad_norm": 0.10870110988616943, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 277049204.0, "reward": 11.820498406887054, "reward_std": 0.6512585924938321, "rewards/bm25_retrieval_reward_fn/mean": 0.9223602823913097, "rewards/bm25_retrieval_reward_fn/std": 0.18132009892724454, "rewards/event_reward_fn/mean": 9.9521484375, "rewards/event_reward_fn/std": 5.741435334086418, "rewards/format_reward_fn/mean": 0.9459895864129066, "rewards/format_reward_fn/std": 0.16436301171779633, "step": 3392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 255.0, "completions/max_terminated_length": 248.3125, "completions/mean_length": 216.11328125, "completions/mean_terminated_length": 212.78788471221924, "completions/min_length": 169.875, "completions/min_terminated_length": 169.875, "entropy": 0.06166102201677859, "epoch": 3.311953352769679, "frac_reward_zero_std": 0.453125, "grad_norm": 0.0720224604010582, "learning_rate": 5e-05, "loss": -0.0008, "num_tokens": 278391292.0, "reward": 11.994673013687134, "reward_std": 0.6659458577632904, "rewards/bm25_retrieval_reward_fn/mean": 0.8917759135365486, "rewards/bm25_retrieval_reward_fn/std": 0.25888109114021063, "rewards/event_reward_fn/mean": 10.1962890625, "rewards/event_reward_fn/std": 5.986569568514824, "rewards/format_reward_fn/mean": 0.9066080749034882, "rewards/format_reward_fn/std": 0.25973749114200473, "step": 3408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 248.5, "completions/max_terminated_length": 245.1875, "completions/mean_length": 204.8330078125, "completions/mean_terminated_length": 203.63013172149658, "completions/min_length": 162.875, "completions/min_terminated_length": 162.875, "entropy": 0.06246258458122611, "epoch": 3.327502429543246, "frac_reward_zero_std": 0.453125, "grad_norm": 0.10192258656024933, "learning_rate": 5e-05, "loss": -0.0042, "num_tokens": 279658405.0, "reward": 12.02261358499527, "reward_std": 0.7050989326089621, "rewards/bm25_retrieval_reward_fn/mean": 0.9559198245406151, "rewards/bm25_retrieval_reward_fn/std": 0.14632097427966073, "rewards/event_reward_fn/mean": 10.1015625, "rewards/event_reward_fn/std": 5.938043773174286, "rewards/format_reward_fn/mean": 0.9651312977075577, "rewards/format_reward_fn/std": 0.14133254252374172, "step": 3424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 249.8125, "completions/max_terminated_length": 246.5625, "completions/mean_length": 202.8173828125, "completions/mean_terminated_length": 201.44210147857666, "completions/min_length": 161.1875, "completions/min_terminated_length": 161.1875, "entropy": 0.05804568435996771, "epoch": 3.3430515063168125, "frac_reward_zero_std": 0.4375, "grad_norm": 0.09696446359157562, "learning_rate": 5e-05, "loss": -0.0006, "num_tokens": 280979450.0, "reward": 11.77318161725998, "reward_std": 0.6543006896972656, "rewards/bm25_retrieval_reward_fn/mean": 0.9384081587195396, "rewards/bm25_retrieval_reward_fn/std": 0.1720411020796746, "rewards/event_reward_fn/mean": 9.8798828125, "rewards/event_reward_fn/std": 5.3430622816085815, "rewards/format_reward_fn/mean": 0.954890564084053, "rewards/format_reward_fn/std": 0.1621245201677084, "step": 3440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0283203125, "completions/max_length": 250.875, "completions/max_terminated_length": 243.375, "completions/mean_length": 204.373046875, "completions/mean_terminated_length": 202.94876098632812, "completions/min_length": 164.75, "completions/min_terminated_length": 164.75, "entropy": 0.06122026569209993, "epoch": 3.358600583090379, "frac_reward_zero_std": 0.4453125, "grad_norm": 0.12297879159450531, "learning_rate": 5e-05, "loss": -0.0054, "num_tokens": 282332772.0, "reward": 11.589432656764984, "reward_std": 0.7322587119415402, "rewards/bm25_retrieval_reward_fn/mean": 0.9409170113503933, "rewards/bm25_retrieval_reward_fn/std": 0.17370267800288275, "rewards/event_reward_fn/mean": 9.6923828125, "rewards/event_reward_fn/std": 6.004166945815086, "rewards/format_reward_fn/mean": 0.9561328142881393, "rewards/format_reward_fn/std": 0.16955442121252418, "step": 3456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0400390625, "completions/max_length": 252.8125, "completions/max_terminated_length": 249.25, "completions/mean_length": 212.8505859375, "completions/mean_terminated_length": 211.24829959869385, "completions/min_length": 171.4375, "completions/min_terminated_length": 171.4375, "entropy": 0.06467843032442033, "epoch": 3.3741496598639458, "frac_reward_zero_std": 0.453125, "grad_norm": 0.11574175953865051, "learning_rate": 5e-05, "loss": 0.0041, "num_tokens": 283596291.0, "reward": 11.298483848571777, "reward_std": 0.699859144166112, "rewards/bm25_retrieval_reward_fn/mean": 0.9317664988338947, "rewards/bm25_retrieval_reward_fn/std": 0.1917368060676381, "rewards/event_reward_fn/mean": 9.4208984375, "rewards/event_reward_fn/std": 5.096125215291977, "rewards/format_reward_fn/mean": 0.9458189196884632, "rewards/format_reward_fn/std": 0.17844219831749797, "step": 3472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0302734375, "completions/max_length": 253.125, "completions/max_terminated_length": 248.8125, "completions/mean_length": 213.7158203125, "completions/mean_terminated_length": 212.43880653381348, "completions/min_length": 174.75, "completions/min_terminated_length": 174.75, "entropy": 0.07107805530540645, "epoch": 3.389698736637512, "frac_reward_zero_std": 0.4765625, "grad_norm": 0.14759734272956848, "learning_rate": 5e-05, "loss": 0.0001, "num_tokens": 284865764.0, "reward": 11.34376209974289, "reward_std": 0.6468667350709438, "rewards/bm25_retrieval_reward_fn/mean": 0.9472848623991013, "rewards/bm25_retrieval_reward_fn/std": 0.16370269027538598, "rewards/event_reward_fn/mean": 9.4375, "rewards/event_reward_fn/std": 5.265123501420021, "rewards/format_reward_fn/mean": 0.9589774012565613, "rewards/format_reward_fn/std": 0.15847991104237735, "step": 3488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0615234375, "completions/max_length": 255.875, "completions/max_terminated_length": 252.5625, "completions/mean_length": 216.7431640625, "completions/mean_terminated_length": 214.12075424194336, "completions/min_length": 169.1875, "completions/min_terminated_length": 169.1875, "entropy": 0.07305671996437013, "epoch": 3.4052478134110786, "frac_reward_zero_std": 0.390625, "grad_norm": 0.17206382751464844, "learning_rate": 5e-05, "loss": 0.003, "num_tokens": 286236689.0, "reward": 11.9978928565979, "reward_std": 0.8643622100353241, "rewards/bm25_retrieval_reward_fn/mean": 0.9041581116616726, "rewards/bm25_retrieval_reward_fn/std": 0.2597373647149652, "rewards/event_reward_fn/mean": 10.1796875, "rewards/event_reward_fn/std": 5.569000482559204, "rewards/format_reward_fn/mean": 0.9140473119914532, "rewards/format_reward_fn/std": 0.26027182303369045, "step": 3504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0458984375, "completions/max_length": 254.8125, "completions/max_terminated_length": 252.75, "completions/mean_length": 216.333984375, "completions/mean_terminated_length": 214.39577198028564, "completions/min_length": 169.125, "completions/min_terminated_length": 169.125, "entropy": 0.07661846978589892, "epoch": 3.4207968901846453, "frac_reward_zero_std": 0.375, "grad_norm": 0.12547151744365692, "learning_rate": 5e-05, "loss": -0.002, "num_tokens": 287559795.0, "reward": 11.89124745130539, "reward_std": 0.7206966131925583, "rewards/bm25_retrieval_reward_fn/mean": 0.931918803602457, "rewards/bm25_retrieval_reward_fn/std": 0.19364137423690408, "rewards/event_reward_fn/mean": 10.0146484375, "rewards/event_reward_fn/std": 5.905052408576012, "rewards/format_reward_fn/mean": 0.9446800611913204, "rewards/format_reward_fn/std": 0.1913862293586135, "step": 3520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0654296875, "completions/max_length": 254.75, "completions/max_terminated_length": 251.125, "completions/mean_length": 213.421875, "completions/mean_terminated_length": 210.42116260528564, "completions/min_length": 163.4375, "completions/min_terminated_length": 163.4375, "entropy": 0.07146193599328399, "epoch": 3.436345966958212, "frac_reward_zero_std": 0.37890625, "grad_norm": 0.17685569822788239, "learning_rate": 5e-05, "loss": -0.0017, "num_tokens": 288922347.0, "reward": 11.876615107059479, "reward_std": 0.8880495801568031, "rewards/bm25_retrieval_reward_fn/mean": 0.9142365269362926, "rewards/bm25_retrieval_reward_fn/std": 0.22835631167981774, "rewards/event_reward_fn/mean": 10.0341796875, "rewards/event_reward_fn/std": 5.525876700878143, "rewards/format_reward_fn/mean": 0.92819894105196, "rewards/format_reward_fn/std": 0.22682476695626974, "step": 3536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 253.8125, "completions/max_terminated_length": 248.5, "completions/mean_length": 208.0751953125, "completions/mean_terminated_length": 206.6888551712036, "completions/min_length": 161.5, "completions/min_terminated_length": 161.5, "entropy": 0.06790656200610101, "epoch": 3.4518950437317786, "frac_reward_zero_std": 0.421875, "grad_norm": 0.17039044201374054, "learning_rate": 5e-05, "loss": 0.0012, "num_tokens": 290290916.0, "reward": 12.047883689403534, "reward_std": 0.7042225562036037, "rewards/bm25_retrieval_reward_fn/mean": 0.948846910148859, "rewards/bm25_retrieval_reward_fn/std": 0.14023484371136874, "rewards/event_reward_fn/mean": 10.134765625, "rewards/event_reward_fn/std": 5.774273455142975, "rewards/format_reward_fn/mean": 0.9642711319029331, "rewards/format_reward_fn/std": 0.12487931735813618, "step": 3552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0498046875, "completions/max_length": 254.25, "completions/max_terminated_length": 253.3125, "completions/mean_length": 217.3359375, "completions/mean_terminated_length": 215.44786262512207, "completions/min_length": 175.3125, "completions/min_terminated_length": 175.3125, "entropy": 0.07336867321282625, "epoch": 3.467444120505345, "frac_reward_zero_std": 0.38671875, "grad_norm": 0.17650093138217926, "learning_rate": 5e-05, "loss": 0.0008, "num_tokens": 291484148.0, "reward": 11.402482271194458, "reward_std": 0.7676825225353241, "rewards/bm25_retrieval_reward_fn/mean": 0.9262127205729485, "rewards/bm25_retrieval_reward_fn/std": 0.19901330675929785, "rewards/event_reward_fn/mean": 9.5341796875, "rewards/event_reward_fn/std": 5.093527913093567, "rewards/format_reward_fn/mean": 0.9420898444950581, "rewards/format_reward_fn/std": 0.19261480076238513, "step": 3568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 255.125, "completions/max_terminated_length": 252.25, "completions/mean_length": 217.9296875, "completions/mean_terminated_length": 214.96538639068604, "completions/min_length": 174.375, "completions/min_terminated_length": 174.375, "entropy": 0.07610900048166513, "epoch": 3.4829931972789114, "frac_reward_zero_std": 0.3203125, "grad_norm": 0.2133682370185852, "learning_rate": 5e-05, "loss": 0.0011, "num_tokens": 292856568.0, "reward": 11.831640183925629, "reward_std": 0.8922509625554085, "rewards/bm25_retrieval_reward_fn/mean": 0.9065587967634201, "rewards/bm25_retrieval_reward_fn/std": 0.23603793187066913, "rewards/event_reward_fn/mean": 10.00390625, "rewards/event_reward_fn/std": 5.970632314682007, "rewards/format_reward_fn/mean": 0.9211751334369183, "rewards/format_reward_fn/std": 0.23383971489965916, "step": 3584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 254.0625, "completions/max_terminated_length": 251.4375, "completions/mean_length": 218.853515625, "completions/mean_terminated_length": 216.94453525543213, "completions/min_length": 179.1875, "completions/min_terminated_length": 179.1875, "entropy": 0.0776207884773612, "epoch": 3.498542274052478, "frac_reward_zero_std": 0.3515625, "grad_norm": 0.08279092609882355, "learning_rate": 5e-05, "loss": 0.0007, "num_tokens": 294202350.0, "reward": 12.301475286483765, "reward_std": 0.799699567258358, "rewards/bm25_retrieval_reward_fn/mean": 0.9285315871238708, "rewards/bm25_retrieval_reward_fn/std": 0.2019159458577633, "rewards/event_reward_fn/mean": 10.4345703125, "rewards/event_reward_fn/std": 5.9011257737874985, "rewards/format_reward_fn/mean": 0.9383733309805393, "rewards/format_reward_fn/std": 0.20106245297938585, "step": 3600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0087890625, "completions/max_length": 249.0625, "completions/max_terminated_length": 247.9375, "completions/mean_length": 211.4404296875, "completions/mean_terminated_length": 211.08014678955078, "completions/min_length": 170.5, "completions/min_terminated_length": 170.5, "entropy": 0.07809598417952657, "epoch": 3.5140913508260447, "frac_reward_zero_std": 0.29296875, "grad_norm": 0.15841352939605713, "learning_rate": 5e-05, "loss": -0.0026, "num_tokens": 295538057.0, "reward": 11.816706955432892, "reward_std": 0.8686831034719944, "rewards/bm25_retrieval_reward_fn/mean": 0.9696620255708694, "rewards/bm25_retrieval_reward_fn/std": 0.08471985626965761, "rewards/event_reward_fn/mean": 9.865234375, "rewards/event_reward_fn/std": 5.069239139556885, "rewards/format_reward_fn/mean": 0.9818103611469269, "rewards/format_reward_fn/std": 0.07833539508283138, "step": 3616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 248.5, "completions/max_terminated_length": 244.0625, "completions/mean_length": 207.5703125, "completions/mean_terminated_length": 206.40735816955566, "completions/min_length": 167.8125, "completions/min_terminated_length": 167.8125, "entropy": 0.07908624736592174, "epoch": 3.5296404275996114, "frac_reward_zero_std": 0.36328125, "grad_norm": 0.11043195426464081, "learning_rate": 5e-05, "loss": -0.0023, "num_tokens": 296834797.0, "reward": 11.86592173576355, "reward_std": 0.8323170337826014, "rewards/bm25_retrieval_reward_fn/mean": 0.9380478039383888, "rewards/bm25_retrieval_reward_fn/std": 0.17831697227666155, "rewards/event_reward_fn/mean": 9.974609375, "rewards/event_reward_fn/std": 5.845152243971825, "rewards/format_reward_fn/mean": 0.9532645083963871, "rewards/format_reward_fn/std": 0.17017430812120438, "step": 3632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 250.1875, "completions/max_terminated_length": 247.3125, "completions/mean_length": 207.181640625, "completions/mean_terminated_length": 206.51923084259033, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.08088134974241257, "epoch": 3.5451895043731776, "frac_reward_zero_std": 0.36328125, "grad_norm": 0.15753485262393951, "learning_rate": 5e-05, "loss": -0.0002, "num_tokens": 298106083.0, "reward": 11.870498239994049, "reward_std": 0.757409542798996, "rewards/bm25_retrieval_reward_fn/mean": 0.955335222184658, "rewards/bm25_retrieval_reward_fn/std": 0.13619694579392672, "rewards/event_reward_fn/mean": 9.94921875, "rewards/event_reward_fn/std": 5.25686414539814, "rewards/format_reward_fn/mean": 0.9659440144896507, "rewards/format_reward_fn/std": 0.13222627015784383, "step": 3648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0107421875, "completions/max_length": 244.625, "completions/max_terminated_length": 241.5, "completions/mean_length": 203.419921875, "completions/mean_terminated_length": 202.85576915740967, "completions/min_length": 161.8125, "completions/min_terminated_length": 161.8125, "entropy": 0.07925571827217937, "epoch": 3.5607385811467447, "frac_reward_zero_std": 0.3984375, "grad_norm": 0.06831870973110199, "learning_rate": 5e-05, "loss": -0.0004, "num_tokens": 299424197.0, "reward": 11.959875285625458, "reward_std": 0.8259032759815454, "rewards/bm25_retrieval_reward_fn/mean": 0.9445747584104538, "rewards/bm25_retrieval_reward_fn/std": 0.1655241074040532, "rewards/event_reward_fn/mean": 10.0576171875, "rewards/event_reward_fn/std": 5.623490899801254, "rewards/format_reward_fn/mean": 0.9576835297048092, "rewards/format_reward_fn/std": 0.15519985277205706, "step": 3664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 247.1875, "completions/max_terminated_length": 243.25, "completions/mean_length": 200.2255859375, "completions/mean_terminated_length": 199.5936861038208, "completions/min_length": 159.9375, "completions/min_terminated_length": 159.9375, "entropy": 0.07751462701708078, "epoch": 3.576287657920311, "frac_reward_zero_std": 0.37109375, "grad_norm": 0.1378462016582489, "learning_rate": 5e-05, "loss": -0.0039, "num_tokens": 300693292.0, "reward": 11.735808372497559, "reward_std": 0.7609035409986973, "rewards/bm25_retrieval_reward_fn/mean": 0.9578013271093369, "rewards/bm25_retrieval_reward_fn/std": 0.11039561772486195, "rewards/event_reward_fn/mean": 9.8076171875, "rewards/event_reward_fn/std": 5.369291961193085, "rewards/format_reward_fn/mean": 0.9703896977007389, "rewards/format_reward_fn/std": 0.10083504673093557, "step": 3680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 244.625, "completions/max_terminated_length": 240.25, "completions/mean_length": 204.474609375, "completions/mean_terminated_length": 203.49135780334473, "completions/min_length": 162.5625, "completions/min_terminated_length": 162.5625, "entropy": 0.07588420668616891, "epoch": 3.5918367346938775, "frac_reward_zero_std": 0.328125, "grad_norm": 0.15248270332813263, "learning_rate": 5e-05, "loss": -0.0039, "num_tokens": 302052422.0, "reward": 11.87961357831955, "reward_std": 0.9203954320400953, "rewards/bm25_retrieval_reward_fn/mean": 0.931057620793581, "rewards/bm25_retrieval_reward_fn/std": 0.16641236003488302, "rewards/event_reward_fn/mean": 9.994140625, "rewards/event_reward_fn/std": 5.567791044712067, "rewards/format_reward_fn/mean": 0.9544154554605484, "rewards/format_reward_fn/std": 0.14701158134266734, "step": 3696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0224609375, "completions/max_length": 250.5625, "completions/max_terminated_length": 246.6875, "completions/mean_length": 208.583984375, "completions/mean_terminated_length": 207.5050506591797, "completions/min_length": 171.0625, "completions/min_terminated_length": 171.0625, "entropy": 0.07612508768215775, "epoch": 3.607385811467444, "frac_reward_zero_std": 0.3046875, "grad_norm": 0.23306649923324585, "learning_rate": 5e-05, "loss": 0.0009, "num_tokens": 303372112.0, "reward": 11.453840970993042, "reward_std": 0.7904142383486032, "rewards/bm25_retrieval_reward_fn/mean": 0.9510433189570904, "rewards/bm25_retrieval_reward_fn/std": 0.14118389890063554, "rewards/event_reward_fn/mean": 9.5380859375, "rewards/event_reward_fn/std": 5.8418983072042465, "rewards/format_reward_fn/mean": 0.9647116847336292, "rewards/format_reward_fn/std": 0.1381231863051653, "step": 3712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 249.25, "completions/max_terminated_length": 248.0, "completions/mean_length": 209.95703125, "completions/mean_terminated_length": 209.5190019607544, "completions/min_length": 171.8125, "completions/min_terminated_length": 171.8125, "entropy": 0.0707268959376961, "epoch": 3.622934888241011, "frac_reward_zero_std": 0.375, "grad_norm": 0.07385765016078949, "learning_rate": 5e-05, "loss": -0.0036, "num_tokens": 304684204.0, "reward": 11.709131598472595, "reward_std": 0.8085791561752558, "rewards/bm25_retrieval_reward_fn/mean": 0.9635815359652042, "rewards/bm25_retrieval_reward_fn/std": 0.10360126395244151, "rewards/event_reward_fn/mean": 9.76953125, "rewards/event_reward_fn/std": 5.746441006660461, "rewards/format_reward_fn/mean": 0.9760188795626163, "rewards/format_reward_fn/std": 0.09353231685236096, "step": 3728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 252.5, "completions/max_terminated_length": 251.4375, "completions/mean_length": 213.353515625, "completions/mean_terminated_length": 212.29650974273682, "completions/min_length": 169.625, "completions/min_terminated_length": 169.625, "entropy": 0.0739557440392673, "epoch": 3.6384839650145775, "frac_reward_zero_std": 0.30078125, "grad_norm": 0.1414438933134079, "learning_rate": 5e-05, "loss": -0.0007, "num_tokens": 306009662.0, "reward": 11.917364478111267, "reward_std": 0.9005591496825218, "rewards/bm25_retrieval_reward_fn/mean": 0.9439749345183372, "rewards/bm25_retrieval_reward_fn/std": 0.1546652951510623, "rewards/event_reward_fn/mean": 10.013671875, "rewards/event_reward_fn/std": 6.16796900331974, "rewards/format_reward_fn/mean": 0.9597175717353821, "rewards/format_reward_fn/std": 0.14394072350114584, "step": 3744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0107421875, "completions/max_length": 251.375, "completions/max_terminated_length": 247.9375, "completions/mean_length": 210.458984375, "completions/mean_terminated_length": 209.95756244659424, "completions/min_length": 171.9375, "completions/min_terminated_length": 171.9375, "entropy": 0.07131304289214313, "epoch": 3.6540330417881437, "frac_reward_zero_std": 0.35546875, "grad_norm": 0.2263481467962265, "learning_rate": 5e-05, "loss": 0.0028, "num_tokens": 307342912.0, "reward": 11.548463881015778, "reward_std": 0.7522790785878897, "rewards/bm25_retrieval_reward_fn/mean": 0.93173423781991, "rewards/bm25_retrieval_reward_fn/std": 0.17529538087546825, "rewards/event_reward_fn/mean": 9.6767578125, "rewards/event_reward_fn/std": 6.1354245990514755, "rewards/format_reward_fn/mean": 0.9399717897176743, "rewards/format_reward_fn/std": 0.17958084493875504, "step": 3760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0009765625, "completions/max_length": 238.25, "completions/max_terminated_length": 238.0625, "completions/mean_length": 201.1005859375, "completions/mean_terminated_length": 201.05244064331055, "completions/min_length": 159.5, "completions/min_terminated_length": 159.5, "entropy": 0.06464898842386901, "epoch": 3.6695821185617103, "frac_reward_zero_std": 0.390625, "grad_norm": 0.09882862865924835, "learning_rate": 5e-05, "loss": -0.0046, "num_tokens": 308668915.0, "reward": 11.542174577713013, "reward_std": 0.6965284887701273, "rewards/bm25_retrieval_reward_fn/mean": 0.9554044380784035, "rewards/bm25_retrieval_reward_fn/std": 0.11500480188988149, "rewards/event_reward_fn/mean": 9.619140625, "rewards/event_reward_fn/std": 5.47371518611908, "rewards/format_reward_fn/mean": 0.9676294289529324, "rewards/format_reward_fn/std": 0.11533198039978743, "step": 3776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 241.8125, "completions/max_terminated_length": 241.0625, "completions/mean_length": 203.2158203125, "completions/mean_terminated_length": 202.8022975921631, "completions/min_length": 164.75, "completions/min_terminated_length": 164.75, "entropy": 0.07011180557310581, "epoch": 3.685131195335277, "frac_reward_zero_std": 0.34375, "grad_norm": 0.13302014768123627, "learning_rate": 5e-05, "loss": -0.0071, "num_tokens": 309952960.0, "reward": 12.114792346954346, "reward_std": 0.7427178621292114, "rewards/bm25_retrieval_reward_fn/mean": 0.9610139429569244, "rewards/bm25_retrieval_reward_fn/std": 0.09992504899855703, "rewards/event_reward_fn/mean": 10.1787109375, "rewards/event_reward_fn/std": 5.834049671888351, "rewards/format_reward_fn/mean": 0.9750674292445183, "rewards/format_reward_fn/std": 0.08985681226477027, "step": 3792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 248.75, "completions/max_terminated_length": 247.3125, "completions/mean_length": 212.1357421875, "completions/mean_terminated_length": 211.2209234237671, "completions/min_length": 172.25, "completions/min_terminated_length": 172.25, "entropy": 0.07245555147528648, "epoch": 3.7006802721088436, "frac_reward_zero_std": 0.3125, "grad_norm": 0.16854174435138702, "learning_rate": 5e-05, "loss": -0.0007, "num_tokens": 311247611.0, "reward": 11.610692858695984, "reward_std": 0.6659273523837328, "rewards/bm25_retrieval_reward_fn/mean": 0.9450487568974495, "rewards/bm25_retrieval_reward_fn/std": 0.11994595197029412, "rewards/event_reward_fn/mean": 9.697265625, "rewards/event_reward_fn/std": 5.562545984983444, "rewards/format_reward_fn/mean": 0.9683784395456314, "rewards/format_reward_fn/std": 0.11825172184035182, "step": 3808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 254.6875, "completions/max_terminated_length": 252.3125, "completions/mean_length": 217.888671875, "completions/mean_terminated_length": 216.13310527801514, "completions/min_length": 172.5, "completions/min_terminated_length": 172.5, "entropy": 0.0743629289790988, "epoch": 3.7162293488824103, "frac_reward_zero_std": 0.33203125, "grad_norm": 0.16505570709705353, "learning_rate": 5e-05, "loss": 0.0016, "num_tokens": 312542821.0, "reward": 11.156748652458191, "reward_std": 0.7675234004855156, "rewards/bm25_retrieval_reward_fn/mean": 0.9249761961400509, "rewards/bm25_retrieval_reward_fn/std": 0.20397475379286334, "rewards/event_reward_fn/mean": 9.2919921875, "rewards/event_reward_fn/std": 5.000220879912376, "rewards/format_reward_fn/mean": 0.939780205488205, "rewards/format_reward_fn/std": 0.20532220043241978, "step": 3824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 254.25, "completions/max_terminated_length": 251.5, "completions/mean_length": 212.4443359375, "completions/mean_terminated_length": 210.93013668060303, "completions/min_length": 162.625, "completions/min_terminated_length": 162.625, "entropy": 0.07023975322954357, "epoch": 3.7317784256559765, "frac_reward_zero_std": 0.3359375, "grad_norm": 0.060382600873708725, "learning_rate": 5e-05, "loss": -0.0003, "num_tokens": 313934308.0, "reward": 11.860353231430054, "reward_std": 0.8548767194151878, "rewards/bm25_retrieval_reward_fn/mean": 0.9291994869709015, "rewards/bm25_retrieval_reward_fn/std": 0.19955480488715693, "rewards/event_reward_fn/mean": 9.9921875, "rewards/event_reward_fn/std": 6.334605395793915, "rewards/format_reward_fn/mean": 0.9389663524925709, "rewards/format_reward_fn/std": 0.20007416978478432, "step": 3840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0185546875, "completions/max_length": 253.8125, "completions/max_terminated_length": 250.0625, "completions/mean_length": 210.4931640625, "completions/mean_terminated_length": 209.59508609771729, "completions/min_length": 168.1875, "completions/min_terminated_length": 168.1875, "entropy": 0.07439161464571953, "epoch": 3.747327502429543, "frac_reward_zero_std": 0.34375, "grad_norm": 0.2331985980272293, "learning_rate": 5e-05, "loss": 0.0003, "num_tokens": 315250945.0, "reward": 11.873708844184875, "reward_std": 0.8904512841254473, "rewards/bm25_retrieval_reward_fn/mean": 0.9391452148556709, "rewards/bm25_retrieval_reward_fn/std": 0.17918783274944872, "rewards/event_reward_fn/mean": 9.984375, "rewards/event_reward_fn/std": 5.551610827445984, "rewards/format_reward_fn/mean": 0.9501884989440441, "rewards/format_reward_fn/std": 0.1802559308707714, "step": 3856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 247.875, "completions/max_terminated_length": 246.25, "completions/mean_length": 208.615234375, "completions/mean_terminated_length": 208.4278688430786, "completions/min_length": 165.25, "completions/min_terminated_length": 165.25, "entropy": 0.06975571275688708, "epoch": 3.76287657920311, "frac_reward_zero_std": 0.3828125, "grad_norm": 0.12414126843214035, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 316553199.0, "reward": 12.228178679943085, "reward_std": 0.7352720461785793, "rewards/bm25_retrieval_reward_fn/mean": 0.961551733314991, "rewards/bm25_retrieval_reward_fn/std": 0.11315030924743041, "rewards/event_reward_fn/mean": 10.298828125, "rewards/event_reward_fn/std": 5.328288942575455, "rewards/format_reward_fn/mean": 0.9677988588809967, "rewards/format_reward_fn/std": 0.11893212096765637, "step": 3872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0068359375, "completions/max_length": 245.6875, "completions/max_terminated_length": 244.5, "completions/mean_length": 206.140625, "completions/mean_terminated_length": 205.78240394592285, "completions/min_length": 165.625, "completions/min_terminated_length": 165.625, "entropy": 0.07621000800281763, "epoch": 3.7784256559766765, "frac_reward_zero_std": 0.39453125, "grad_norm": 0.20296157896518707, "learning_rate": 5e-05, "loss": -0.0012, "num_tokens": 317903031.0, "reward": 11.587688386440277, "reward_std": 0.7579143699258566, "rewards/bm25_retrieval_reward_fn/mean": 0.9600600115954876, "rewards/bm25_retrieval_reward_fn/std": 0.12998962739948183, "rewards/event_reward_fn/mean": 9.66796875, "rewards/event_reward_fn/std": 5.291949540376663, "rewards/format_reward_fn/mean": 0.9596595913171768, "rewards/format_reward_fn/std": 0.14018871169537306, "step": 3888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 249.25, "completions/max_terminated_length": 244.5, "completions/mean_length": 201.33203125, "completions/mean_terminated_length": 200.6855239868164, "completions/min_length": 158.8125, "completions/min_terminated_length": 158.8125, "entropy": 0.07235691323876381, "epoch": 3.793974732750243, "frac_reward_zero_std": 0.39453125, "grad_norm": 0.10855089873075485, "learning_rate": 5e-05, "loss": -0.0024, "num_tokens": 319240939.0, "reward": 11.683230638504028, "reward_std": 0.7290781699120998, "rewards/bm25_retrieval_reward_fn/mean": 0.9578493759036064, "rewards/bm25_retrieval_reward_fn/std": 0.13556190475355834, "rewards/event_reward_fn/mean": 9.7548828125, "rewards/event_reward_fn/std": 5.5955929309129715, "rewards/format_reward_fn/mean": 0.970498513430357, "rewards/format_reward_fn/std": 0.12245456594973803, "step": 3904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 247.1875, "completions/max_terminated_length": 244.0625, "completions/mean_length": 205.2587890625, "completions/mean_terminated_length": 204.43410301208496, "completions/min_length": 162.8125, "completions/min_terminated_length": 162.8125, "entropy": 0.07462676661089063, "epoch": 3.8095238095238093, "frac_reward_zero_std": 0.38671875, "grad_norm": 0.23824094235897064, "learning_rate": 5e-05, "loss": -0.004, "num_tokens": 320572924.0, "reward": 12.206887483596802, "reward_std": 0.8420960828661919, "rewards/bm25_retrieval_reward_fn/mean": 0.9535179361701012, "rewards/bm25_retrieval_reward_fn/std": 0.13041677000001073, "rewards/event_reward_fn/mean": 10.291015625, "rewards/event_reward_fn/std": 5.89475154876709, "rewards/format_reward_fn/mean": 0.9623538255691528, "rewards/format_reward_fn/std": 0.1202199412509799, "step": 3920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 253.1875, "completions/max_terminated_length": 251.5, "completions/mean_length": 210.583984375, "completions/mean_terminated_length": 209.80176162719727, "completions/min_length": 169.9375, "completions/min_terminated_length": 169.9375, "entropy": 0.07527026068419218, "epoch": 3.825072886297376, "frac_reward_zero_std": 0.32421875, "grad_norm": 0.09440134465694427, "learning_rate": 5e-05, "loss": -0.0029, "num_tokens": 321928146.0, "reward": 12.062871038913727, "reward_std": 0.6898276209831238, "rewards/bm25_retrieval_reward_fn/mean": 0.9501573704183102, "rewards/bm25_retrieval_reward_fn/std": 0.13593709268025123, "rewards/event_reward_fn/mean": 10.1474609375, "rewards/event_reward_fn/std": 5.55875962972641, "rewards/format_reward_fn/mean": 0.9652528204023838, "rewards/format_reward_fn/std": 0.13466597883962095, "step": 3936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.126953125, "completions/max_length": 255.125, "completions/max_terminated_length": 253.9375, "completions/mean_length": 223.7958984375, "completions/mean_terminated_length": 219.62405586242676, "completions/min_length": 177.5, "completions/min_terminated_length": 177.5, "entropy": 0.07991416612640023, "epoch": 3.8406219630709426, "frac_reward_zero_std": 0.27734375, "grad_norm": 0.15865761041641235, "learning_rate": 5e-05, "loss": 0.0038, "num_tokens": 323291405.0, "reward": 12.076306104660034, "reward_std": 0.9530179928988218, "rewards/bm25_retrieval_reward_fn/mean": 0.854170698672533, "rewards/bm25_retrieval_reward_fn/std": 0.2860095868818462, "rewards/event_reward_fn/mean": 10.3525390625, "rewards/event_reward_fn/std": 5.96200692653656, "rewards/format_reward_fn/mean": 0.869596354663372, "rewards/format_reward_fn/std": 0.2936730571091175, "step": 3952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0986328125, "completions/max_length": 255.0, "completions/max_terminated_length": 252.75, "completions/mean_length": 222.8603515625, "completions/mean_terminated_length": 219.4017848968506, "completions/min_length": 178.6875, "completions/min_terminated_length": 178.6875, "entropy": 0.07927003409713507, "epoch": 3.8561710398445093, "frac_reward_zero_std": 0.30859375, "grad_norm": 0.12801237404346466, "learning_rate": 5e-05, "loss": 0.0036, "num_tokens": 324610378.0, "reward": 11.521483957767487, "reward_std": 0.8510149158537388, "rewards/bm25_retrieval_reward_fn/mean": 0.8705999292433262, "rewards/bm25_retrieval_reward_fn/std": 0.26414805941749364, "rewards/event_reward_fn/mean": 9.7568359375, "rewards/event_reward_fn/std": 5.6417785584926605, "rewards/format_reward_fn/mean": 0.8940479382872581, "rewards/format_reward_fn/std": 0.26540128607302904, "step": 3968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 253.25, "completions/max_terminated_length": 251.8125, "completions/mean_length": 213.95703125, "completions/mean_terminated_length": 211.99990463256836, "completions/min_length": 172.375, "completions/min_terminated_length": 172.375, "entropy": 0.07248568348586559, "epoch": 3.871720116618076, "frac_reward_zero_std": 0.31640625, "grad_norm": 0.14652633666992188, "learning_rate": 5e-05, "loss": 0.0007, "num_tokens": 325886034.0, "reward": 11.792625546455383, "reward_std": 0.7393063232302666, "rewards/bm25_retrieval_reward_fn/mean": 0.9195492528378963, "rewards/bm25_retrieval_reward_fn/std": 0.19524600135628134, "rewards/event_reward_fn/mean": 9.9345703125, "rewards/event_reward_fn/std": 5.343173682689667, "rewards/format_reward_fn/mean": 0.9385060183703899, "rewards/format_reward_fn/std": 0.1972238675225526, "step": 3984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0341796875, "completions/max_length": 251.8125, "completions/max_terminated_length": 249.875, "completions/mean_length": 212.359375, "completions/mean_terminated_length": 210.8219394683838, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.07265612436458468, "epoch": 3.887269193391642, "frac_reward_zero_std": 0.359375, "grad_norm": 0.16204002499580383, "learning_rate": 5e-05, "loss": 0.0017, "num_tokens": 327199118.0, "reward": 11.693296015262604, "reward_std": 0.753578519448638, "rewards/bm25_retrieval_reward_fn/mean": 0.9361162185668945, "rewards/bm25_retrieval_reward_fn/std": 0.16267945885192603, "rewards/event_reward_fn/mean": 9.806640625, "rewards/event_reward_fn/std": 5.245450034737587, "rewards/format_reward_fn/mean": 0.9505392760038376, "rewards/format_reward_fn/std": 0.15567059512250125, "step": 4000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 250.8125, "completions/max_terminated_length": 247.9375, "completions/mean_length": 209.841796875, "completions/mean_terminated_length": 209.13395309448242, "completions/min_length": 168.9375, "completions/min_terminated_length": 168.9375, "entropy": 0.07126375450752676, "epoch": 3.902818270165209, "frac_reward_zero_std": 0.3359375, "grad_norm": 0.2677111327648163, "learning_rate": 5e-05, "loss": -0.0018, "num_tokens": 328480424.0, "reward": 11.549213945865631, "reward_std": 0.8479338348843157, "rewards/bm25_retrieval_reward_fn/mean": 0.9514924548566341, "rewards/bm25_retrieval_reward_fn/std": 0.1373359472490847, "rewards/event_reward_fn/mean": 9.6357421875, "rewards/event_reward_fn/std": 5.592780202627182, "rewards/format_reward_fn/mean": 0.9619791656732559, "rewards/format_reward_fn/std": 0.13886410370469093, "step": 4016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 255.125, "completions/max_terminated_length": 252.75, "completions/mean_length": 217.701171875, "completions/mean_terminated_length": 215.90612506866455, "completions/min_length": 175.25, "completions/min_terminated_length": 175.25, "entropy": 0.0726012377999723, "epoch": 3.9183673469387754, "frac_reward_zero_std": 0.34375, "grad_norm": 0.1763237565755844, "learning_rate": 5e-05, "loss": 0.0015, "num_tokens": 329827018.0, "reward": 11.842731773853302, "reward_std": 0.6917910370975733, "rewards/bm25_retrieval_reward_fn/mean": 0.9177339598536491, "rewards/bm25_retrieval_reward_fn/std": 0.18734293011948466, "rewards/event_reward_fn/mean": 9.9873046875, "rewards/event_reward_fn/std": 6.566082060337067, "rewards/format_reward_fn/mean": 0.9376931414008141, "rewards/format_reward_fn/std": 0.17624951899051666, "step": 4032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 253.25, "completions/max_terminated_length": 250.625, "completions/mean_length": 218.74609375, "completions/mean_terminated_length": 217.62857151031494, "completions/min_length": 182.8125, "completions/min_terminated_length": 182.8125, "entropy": 0.07676544087007642, "epoch": 3.933916423712342, "frac_reward_zero_std": 0.3359375, "grad_norm": 0.2421189248561859, "learning_rate": 5e-05, "loss": 0.003, "num_tokens": 331133410.0, "reward": 11.190333425998688, "reward_std": 0.7282675616443157, "rewards/bm25_retrieval_reward_fn/mean": 0.9304395839571953, "rewards/bm25_retrieval_reward_fn/std": 0.1906340589048341, "rewards/event_reward_fn/mean": 9.3154296875, "rewards/event_reward_fn/std": 5.540377572178841, "rewards/format_reward_fn/mean": 0.9444642849266529, "rewards/format_reward_fn/std": 0.1848340081050992, "step": 4048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0556640625, "completions/max_length": 255.1875, "completions/max_terminated_length": 253.125, "completions/mean_length": 217.5888671875, "completions/mean_terminated_length": 215.29860401153564, "completions/min_length": 177.25, "completions/min_terminated_length": 177.25, "entropy": 0.07212572102434933, "epoch": 3.9494655004859087, "frac_reward_zero_std": 0.3125, "grad_norm": 0.10932140052318573, "learning_rate": 5e-05, "loss": 0.0004, "num_tokens": 332477061.0, "reward": 12.174343943595886, "reward_std": 0.8341186344623566, "rewards/bm25_retrieval_reward_fn/mean": 0.9223817475140095, "rewards/bm25_retrieval_reward_fn/std": 0.2118896566098556, "rewards/event_reward_fn/mean": 10.31640625, "rewards/event_reward_fn/std": 5.680862247943878, "rewards/format_reward_fn/mean": 0.9355558678507805, "rewards/format_reward_fn/std": 0.21579903922975063, "step": 4064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 254.6875, "completions/max_terminated_length": 251.5, "completions/mean_length": 215.8310546875, "completions/mean_terminated_length": 214.54155158996582, "completions/min_length": 171.75, "completions/min_terminated_length": 171.75, "entropy": 0.07576306629925966, "epoch": 3.9650145772594754, "frac_reward_zero_std": 0.3359375, "grad_norm": 0.1289004385471344, "learning_rate": 5e-05, "loss": 0.0046, "num_tokens": 333788636.0, "reward": 12.123865008354187, "reward_std": 0.7025708928704262, "rewards/bm25_retrieval_reward_fn/mean": 0.9466467574238777, "rewards/bm25_retrieval_reward_fn/std": 0.1455282896058634, "rewards/event_reward_fn/mean": 10.212890625, "rewards/event_reward_fn/std": 5.390189632773399, "rewards/format_reward_fn/mean": 0.9643276371061802, "rewards/format_reward_fn/std": 0.14344790298491716, "step": 4080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 253.5, "completions/max_terminated_length": 249.625, "completions/mean_length": 215.7744140625, "completions/mean_terminated_length": 214.6886749267578, "completions/min_length": 172.5, "completions/min_terminated_length": 172.5, "entropy": 0.08155694557353854, "epoch": 3.980563654033042, "frac_reward_zero_std": 0.33984375, "grad_norm": 0.10887642949819565, "learning_rate": 5e-05, "loss": 0.0004, "num_tokens": 335104729.0, "reward": 12.040693879127502, "reward_std": 0.7290069870650768, "rewards/bm25_retrieval_reward_fn/mean": 0.9345810934901237, "rewards/bm25_retrieval_reward_fn/std": 0.1722267406876199, "rewards/event_reward_fn/mean": 10.150390625, "rewards/event_reward_fn/std": 5.202703006565571, "rewards/format_reward_fn/mean": 0.9557221904397011, "rewards/format_reward_fn/std": 0.16959328716620803, "step": 4096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0458984375, "completions/max_length": 254.375, "completions/max_terminated_length": 251.0, "completions/mean_length": 214.6845703125, "completions/mean_terminated_length": 212.69677925109863, "completions/min_length": 165.9375, "completions/min_terminated_length": 165.9375, "entropy": 0.08474897220730782, "epoch": 3.9961127308066082, "frac_reward_zero_std": 0.2421875, "grad_norm": 0.11697645485401154, "learning_rate": 5e-05, "loss": 0.0037, "num_tokens": 336411186.0, "reward": 11.956246078014374, "reward_std": 0.9525406192988157, "rewards/bm25_retrieval_reward_fn/mean": 0.9217490442097187, "rewards/bm25_retrieval_reward_fn/std": 0.17466088302899152, "rewards/event_reward_fn/mean": 10.0849609375, "rewards/event_reward_fn/std": 5.622701555490494, "rewards/format_reward_fn/mean": 0.9495361521840096, "rewards/format_reward_fn/std": 0.1639328496530652, "step": 4112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 247.9375, "completions/max_terminated_length": 247.0625, "completions/mean_length": 208.591796875, "completions/mean_terminated_length": 207.1873264312744, "completions/min_length": 165.8125, "completions/min_terminated_length": 165.8125, "entropy": 0.08839736273512244, "epoch": 4.011661807580175, "frac_reward_zero_std": 0.31640625, "grad_norm": 0.11042279005050659, "learning_rate": 5e-05, "loss": -0.0009, "num_tokens": 337786032.0, "reward": 11.83129894733429, "reward_std": 0.8202849626541138, "rewards/bm25_retrieval_reward_fn/mean": 0.9457570128142834, "rewards/bm25_retrieval_reward_fn/std": 0.1432389054680243, "rewards/event_reward_fn/mean": 9.9228515625, "rewards/event_reward_fn/std": 6.496607750654221, "rewards/format_reward_fn/mean": 0.9626903533935547, "rewards/format_reward_fn/std": 0.1279599037952721, "step": 4128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 249.125, "completions/max_terminated_length": 246.25, "completions/mean_length": 205.5087890625, "completions/mean_terminated_length": 204.59263134002686, "completions/min_length": 160.125, "completions/min_terminated_length": 160.125, "entropy": 0.08152704173699021, "epoch": 4.0272108843537415, "frac_reward_zero_std": 0.375, "grad_norm": 0.23320095241069794, "learning_rate": 5e-05, "loss": -0.0061, "num_tokens": 339108101.0, "reward": 11.968081533908844, "reward_std": 0.7537729293107986, "rewards/bm25_retrieval_reward_fn/mean": 0.9513398185372353, "rewards/bm25_retrieval_reward_fn/std": 0.1318189318990335, "rewards/event_reward_fn/mean": 10.0517578125, "rewards/event_reward_fn/std": 5.673033118247986, "rewards/format_reward_fn/mean": 0.9649838842451572, "rewards/format_reward_fn/std": 0.12552043702453375, "step": 4144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.048828125, "completions/max_length": 253.625, "completions/max_terminated_length": 249.9375, "completions/mean_length": 212.2314453125, "completions/mean_terminated_length": 210.16049671173096, "completions/min_length": 164.8125, "completions/min_terminated_length": 164.8125, "entropy": 0.08093675132840872, "epoch": 4.042759961127308, "frac_reward_zero_std": 0.265625, "grad_norm": 0.126747727394104, "learning_rate": 5e-05, "loss": -0.0013, "num_tokens": 340463642.0, "reward": 11.930916666984558, "reward_std": 1.0069974604994059, "rewards/bm25_retrieval_reward_fn/mean": 0.9255944900214672, "rewards/bm25_retrieval_reward_fn/std": 0.19054480863269418, "rewards/event_reward_fn/mean": 10.0576171875, "rewards/event_reward_fn/std": 5.881117805838585, "rewards/format_reward_fn/mean": 0.9477050788700581, "rewards/format_reward_fn/std": 0.1838087635114789, "step": 4160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.625, "completions/mean_length": 223.8154296875, "completions/mean_terminated_length": 217.5712661743164, "completions/min_length": 164.0625, "completions/min_terminated_length": 164.0625, "entropy": 0.07918633660301566, "epoch": 4.058309037900875, "frac_reward_zero_std": 0.24609375, "grad_norm": 0.3148137629032135, "learning_rate": 5e-05, "loss": 0.0083, "num_tokens": 341802333.0, "reward": 12.038546562194824, "reward_std": 0.9452639296650887, "rewards/bm25_retrieval_reward_fn/mean": 0.8190805651247501, "rewards/bm25_retrieval_reward_fn/std": 0.34027846716344357, "rewards/event_reward_fn/mean": 10.375, "rewards/event_reward_fn/std": 5.990983292460442, "rewards/format_reward_fn/mean": 0.844466146081686, "rewards/format_reward_fn/std": 0.3460330106317997, "step": 4176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 253.1875, "completions/max_terminated_length": 249.6875, "completions/mean_length": 212.3359375, "completions/mean_terminated_length": 210.6906509399414, "completions/min_length": 165.5625, "completions/min_terminated_length": 165.5625, "entropy": 0.07842250727117062, "epoch": 4.073858114674441, "frac_reward_zero_std": 0.32421875, "grad_norm": 0.1384952962398529, "learning_rate": 5e-05, "loss": 0.0005, "num_tokens": 343114541.0, "reward": 12.094359815120697, "reward_std": 0.7967111878097057, "rewards/bm25_retrieval_reward_fn/mean": 0.9312739335000515, "rewards/bm25_retrieval_reward_fn/std": 0.17515901010483503, "rewards/event_reward_fn/mean": 10.20703125, "rewards/event_reward_fn/std": 6.081012561917305, "rewards/format_reward_fn/mean": 0.9560546875, "rewards/format_reward_fn/std": 0.16031964495778084, "step": 4192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0361328125, "completions/max_length": 254.1875, "completions/max_terminated_length": 249.9375, "completions/mean_length": 213.181640625, "completions/mean_terminated_length": 211.6773853302002, "completions/min_length": 171.3125, "completions/min_terminated_length": 171.3125, "entropy": 0.08099523605778813, "epoch": 4.089407191448008, "frac_reward_zero_std": 0.26953125, "grad_norm": 0.09165678173303604, "learning_rate": 5e-05, "loss": -0.0015, "num_tokens": 344439075.0, "reward": 12.245625615119934, "reward_std": 0.8452774472534657, "rewards/bm25_retrieval_reward_fn/mean": 0.9267520122230053, "rewards/bm25_retrieval_reward_fn/std": 0.18223469553049654, "rewards/event_reward_fn/mean": 10.365234375, "rewards/event_reward_fn/std": 6.175159931182861, "rewards/format_reward_fn/mean": 0.9536393284797668, "rewards/format_reward_fn/std": 0.17545001558028162, "step": 4208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0263671875, "completions/max_length": 252.5, "completions/max_terminated_length": 248.75, "completions/mean_length": 211.978515625, "completions/mean_terminated_length": 210.79150772094727, "completions/min_length": 172.3125, "completions/min_terminated_length": 172.3125, "entropy": 0.07734930235892534, "epoch": 4.104956268221574, "frac_reward_zero_std": 0.34375, "grad_norm": 0.1310277283191681, "learning_rate": 5e-05, "loss": -0.0012, "num_tokens": 345764597.0, "reward": 12.470105409622192, "reward_std": 0.7958364505320787, "rewards/bm25_retrieval_reward_fn/mean": 0.9391484148800373, "rewards/bm25_retrieval_reward_fn/std": 0.15891925140749663, "rewards/event_reward_fn/mean": 10.5673828125, "rewards/event_reward_fn/std": 5.292239099740982, "rewards/format_reward_fn/mean": 0.9635742194950581, "rewards/format_reward_fn/std": 0.13998021464794874, "step": 4224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 255.4375, "completions/max_terminated_length": 252.25, "completions/mean_length": 215.884765625, "completions/mean_terminated_length": 213.35174751281738, "completions/min_length": 174.375, "completions/min_terminated_length": 174.375, "entropy": 0.08016611728817225, "epoch": 4.1205053449951405, "frac_reward_zero_std": 0.328125, "grad_norm": 0.16507475078105927, "learning_rate": 5e-05, "loss": 0.0012, "num_tokens": 347241199.0, "reward": 12.658133804798126, "reward_std": 0.771758034825325, "rewards/bm25_retrieval_reward_fn/mean": 0.9066168181598186, "rewards/bm25_retrieval_reward_fn/std": 0.22669256868539378, "rewards/event_reward_fn/mean": 10.82421875, "rewards/event_reward_fn/std": 6.56681552529335, "rewards/format_reward_fn/mean": 0.9272981770336628, "rewards/format_reward_fn/std": 0.2251733886078, "step": 4240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 255.0, "completions/max_terminated_length": 251.8125, "completions/mean_length": 213.9658203125, "completions/mean_terminated_length": 212.64927101135254, "completions/min_length": 169.25, "completions/min_terminated_length": 169.25, "entropy": 0.07921183155849576, "epoch": 4.136054421768708, "frac_reward_zero_std": 0.3828125, "grad_norm": 0.10566289722919464, "learning_rate": 5e-05, "loss": -0.0023, "num_tokens": 348577168.0, "reward": 12.272885859012604, "reward_std": 0.759581271559, "rewards/bm25_retrieval_reward_fn/mean": 0.9340510219335556, "rewards/bm25_retrieval_reward_fn/std": 0.18297056294977665, "rewards/event_reward_fn/mean": 10.3828125, "rewards/event_reward_fn/std": 6.081824213266373, "rewards/format_reward_fn/mean": 0.956022135913372, "rewards/format_reward_fn/std": 0.1737882625311613, "step": 4256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0361328125, "completions/max_length": 254.375, "completions/max_terminated_length": 250.1875, "completions/mean_length": 212.3984375, "completions/mean_terminated_length": 210.8444414138794, "completions/min_length": 169.5, "completions/min_terminated_length": 169.5, "entropy": 0.08075638441368937, "epoch": 4.151603498542274, "frac_reward_zero_std": 0.35546875, "grad_norm": 0.23219875991344452, "learning_rate": 5e-05, "loss": -0.0002, "num_tokens": 349949360.0, "reward": 11.566748321056366, "reward_std": 0.6999896839261055, "rewards/bm25_retrieval_reward_fn/mean": 0.9331243596971035, "rewards/bm25_retrieval_reward_fn/std": 0.1664692930644378, "rewards/event_reward_fn/mean": 9.6806640625, "rewards/event_reward_fn/std": 4.993004456162453, "rewards/format_reward_fn/mean": 0.952959917485714, "rewards/format_reward_fn/std": 0.15185340540483594, "step": 4272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 253.375, "completions/max_terminated_length": 251.0, "completions/mean_length": 209.7763671875, "completions/mean_terminated_length": 208.1946315765381, "completions/min_length": 167.75, "completions/min_terminated_length": 167.75, "entropy": 0.07447958691045642, "epoch": 4.167152575315841, "frac_reward_zero_std": 0.36328125, "grad_norm": 0.11696634441614151, "learning_rate": 5e-05, "loss": -0.0037, "num_tokens": 351294483.0, "reward": 12.276101768016815, "reward_std": 0.7893455550074577, "rewards/bm25_retrieval_reward_fn/mean": 0.9404859393835068, "rewards/bm25_retrieval_reward_fn/std": 0.1471004862105474, "rewards/event_reward_fn/mean": 10.375, "rewards/event_reward_fn/std": 5.744745135307312, "rewards/format_reward_fn/mean": 0.9606158547103405, "rewards/format_reward_fn/std": 0.12628577766008675, "step": 4288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 253.1875, "completions/max_terminated_length": 252.3125, "completions/mean_length": 213.9658203125, "completions/mean_terminated_length": 212.74925136566162, "completions/min_length": 172.25, "completions/min_terminated_length": 172.25, "entropy": 0.07558452151715755, "epoch": 4.182701652089407, "frac_reward_zero_std": 0.421875, "grad_norm": 0.09304752945899963, "learning_rate": 5e-05, "loss": 0.0007, "num_tokens": 352616868.0, "reward": 12.646113216876984, "reward_std": 0.6985902674496174, "rewards/bm25_retrieval_reward_fn/mean": 0.9403096325695515, "rewards/bm25_retrieval_reward_fn/std": 0.16551246301969513, "rewards/event_reward_fn/mean": 10.7451171875, "rewards/event_reward_fn/std": 6.137658327817917, "rewards/format_reward_fn/mean": 0.9606863856315613, "rewards/format_reward_fn/std": 0.15825087064877152, "step": 4304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0400390625, "completions/max_length": 254.1875, "completions/max_terminated_length": 252.5625, "completions/mean_length": 215.220703125, "completions/mean_terminated_length": 213.58215522766113, "completions/min_length": 175.3125, "completions/min_terminated_length": 175.3125, "entropy": 0.08136412966996431, "epoch": 4.198250728862973, "frac_reward_zero_std": 0.37109375, "grad_norm": 0.11478164047002792, "learning_rate": 5e-05, "loss": 0.0012, "num_tokens": 353947786.0, "reward": 12.772146999835968, "reward_std": 0.8985444996505976, "rewards/bm25_retrieval_reward_fn/mean": 0.9437018856406212, "rewards/bm25_retrieval_reward_fn/std": 0.17041826335480437, "rewards/event_reward_fn/mean": 10.8681640625, "rewards/event_reward_fn/std": 6.595528960227966, "rewards/format_reward_fn/mean": 0.9602811932563782, "rewards/format_reward_fn/std": 0.16472775302827358, "step": 4320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0673828125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 220.6962890625, "completions/mean_terminated_length": 218.14343070983887, "completions/min_length": 178.3125, "completions/min_terminated_length": 178.3125, "entropy": 0.08518477249890566, "epoch": 4.21379980563654, "frac_reward_zero_std": 0.32421875, "grad_norm": 0.39661943912506104, "learning_rate": 5e-05, "loss": 0.0026, "num_tokens": 355262159.0, "reward": 12.166186690330505, "reward_std": 0.8557635508477688, "rewards/bm25_retrieval_reward_fn/mean": 0.9099063575267792, "rewards/bm25_retrieval_reward_fn/std": 0.22038055513985455, "rewards/event_reward_fn/mean": 10.322265625, "rewards/event_reward_fn/std": 6.33206932246685, "rewards/format_reward_fn/mean": 0.9340147599577904, "rewards/format_reward_fn/std": 0.2138909688219428, "step": 4336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0322265625, "completions/max_length": 254.125, "completions/max_terminated_length": 252.5, "completions/mean_length": 215.99609375, "completions/mean_terminated_length": 214.6772975921631, "completions/min_length": 171.6875, "completions/min_terminated_length": 171.6875, "entropy": 0.08633322129026055, "epoch": 4.229348882410107, "frac_reward_zero_std": 0.34375, "grad_norm": 0.08727186918258667, "learning_rate": 5e-05, "loss": 0.0004, "num_tokens": 356636515.0, "reward": 12.367621660232544, "reward_std": 0.7627793811261654, "rewards/bm25_retrieval_reward_fn/mean": 0.9262477792799473, "rewards/bm25_retrieval_reward_fn/std": 0.19048181863036007, "rewards/event_reward_fn/mean": 10.494140625, "rewards/event_reward_fn/std": 5.73208275437355, "rewards/format_reward_fn/mean": 0.9472332261502743, "rewards/format_reward_fn/std": 0.18207533098757267, "step": 4352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 252.6875, "completions/max_terminated_length": 251.125, "completions/mean_length": 214.400390625, "completions/mean_terminated_length": 213.53761100769043, "completions/min_length": 171.1875, "completions/min_terminated_length": 171.1875, "entropy": 0.09359000297263265, "epoch": 4.244897959183674, "frac_reward_zero_std": 0.359375, "grad_norm": 0.207139253616333, "learning_rate": 5e-05, "loss": -0.0012, "num_tokens": 357958357.0, "reward": 11.592573344707489, "reward_std": 0.7030898407101631, "rewards/bm25_retrieval_reward_fn/mean": 0.9569450728595257, "rewards/bm25_retrieval_reward_fn/std": 0.1302037090063095, "rewards/event_reward_fn/mean": 9.658203125, "rewards/event_reward_fn/std": 5.755192518234253, "rewards/format_reward_fn/mean": 0.9774251356720924, "rewards/format_reward_fn/std": 0.11306497757323086, "step": 4368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0478515625, "completions/max_length": 254.25, "completions/max_terminated_length": 252.875, "completions/mean_length": 218.54296875, "completions/mean_terminated_length": 216.6855926513672, "completions/min_length": 176.625, "completions/min_terminated_length": 176.625, "entropy": 0.09238096605986357, "epoch": 4.26044703595724, "frac_reward_zero_std": 0.33203125, "grad_norm": 0.09042836725711823, "learning_rate": 5e-05, "loss": -0.005, "num_tokens": 359265841.0, "reward": 12.531767010688782, "reward_std": 0.789877756498754, "rewards/bm25_retrieval_reward_fn/mean": 0.9228904247283936, "rewards/bm25_retrieval_reward_fn/std": 0.18061268841847777, "rewards/event_reward_fn/mean": 10.66015625, "rewards/event_reward_fn/std": 5.896639049053192, "rewards/format_reward_fn/mean": 0.9487202428281307, "rewards/format_reward_fn/std": 0.16975674428977072, "step": 4384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0810546875, "completions/max_length": 254.75, "completions/max_terminated_length": 253.5, "completions/mean_length": 223.12109375, "completions/mean_terminated_length": 220.2361536026001, "completions/min_length": 183.375, "completions/min_terminated_length": 183.375, "entropy": 0.09400972304865718, "epoch": 4.275996112730807, "frac_reward_zero_std": 0.26171875, "grad_norm": 0.13734078407287598, "learning_rate": 5e-05, "loss": 0.0066, "num_tokens": 360610689.0, "reward": 12.017096877098083, "reward_std": 0.8595972079783678, "rewards/bm25_retrieval_reward_fn/mean": 0.8854949772357941, "rewards/bm25_retrieval_reward_fn/std": 0.252409529639408, "rewards/event_reward_fn/mean": 10.2197265625, "rewards/event_reward_fn/std": 5.637563467025757, "rewards/format_reward_fn/mean": 0.9118753150105476, "rewards/format_reward_fn/std": 0.2438918575644493, "step": 4400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0068359375, "completions/max_length": 249.25, "completions/max_terminated_length": 247.8125, "completions/mean_length": 207.8759765625, "completions/mean_terminated_length": 207.5603437423706, "completions/min_length": 174.75, "completions/min_terminated_length": 174.75, "entropy": 0.08496905583888292, "epoch": 4.291545189504373, "frac_reward_zero_std": 0.32421875, "grad_norm": 0.09454280883073807, "learning_rate": 5e-05, "loss": -0.003, "num_tokens": 361880382.0, "reward": 11.80184918642044, "reward_std": 0.6977823339402676, "rewards/bm25_retrieval_reward_fn/mean": 0.9602529257535934, "rewards/bm25_retrieval_reward_fn/std": 0.10920671455096453, "rewards/event_reward_fn/mean": 9.8564453125, "rewards/event_reward_fn/std": 5.88802507519722, "rewards/format_reward_fn/mean": 0.9851508289575577, "rewards/format_reward_fn/std": 0.08428931841626763, "step": 4416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 248.125, "completions/max_terminated_length": 246.375, "completions/mean_length": 209.8857421875, "completions/mean_terminated_length": 209.5314416885376, "completions/min_length": 176.8125, "completions/min_terminated_length": 176.8125, "entropy": 0.07903782045468688, "epoch": 4.3070942662779395, "frac_reward_zero_std": 0.35546875, "grad_norm": 0.2290063053369522, "learning_rate": 5e-05, "loss": -0.0031, "num_tokens": 363192181.0, "reward": 12.047638177871704, "reward_std": 0.7197975367307663, "rewards/bm25_retrieval_reward_fn/mean": 0.9533804319798946, "rewards/bm25_retrieval_reward_fn/std": 0.13947686227038503, "rewards/event_reward_fn/mean": 10.1259765625, "rewards/event_reward_fn/std": 5.645702123641968, "rewards/format_reward_fn/mean": 0.9682812504470348, "rewards/format_reward_fn/std": 0.13028152799233794, "step": 4432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0126953125, "completions/max_length": 250.625, "completions/max_terminated_length": 248.0625, "completions/mean_length": 211.4912109375, "completions/mean_terminated_length": 210.90715408325195, "completions/min_length": 173.6875, "completions/min_terminated_length": 173.6875, "entropy": 0.08106682682409883, "epoch": 4.3226433430515065, "frac_reward_zero_std": 0.3828125, "grad_norm": 0.12199271470308304, "learning_rate": 5e-05, "loss": -0.0031, "num_tokens": 364506372.0, "reward": 12.547775268554688, "reward_std": 0.7325320076197386, "rewards/bm25_retrieval_reward_fn/mean": 0.9543510787189007, "rewards/bm25_retrieval_reward_fn/std": 0.1384572609094903, "rewards/event_reward_fn/mean": 10.6259765625, "rewards/event_reward_fn/std": 6.172823116183281, "rewards/format_reward_fn/mean": 0.9674477651715279, "rewards/format_reward_fn/std": 0.13982101762667298, "step": 4448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 250.75, "completions/max_terminated_length": 248.75, "completions/mean_length": 214.36328125, "completions/mean_terminated_length": 213.90539455413818, "completions/min_length": 178.3125, "completions/min_terminated_length": 178.3125, "entropy": 0.08011418208479881, "epoch": 4.338192419825073, "frac_reward_zero_std": 0.34375, "grad_norm": 0.10289853811264038, "learning_rate": 5e-05, "loss": -0.0001, "num_tokens": 365784232.0, "reward": 12.01547396183014, "reward_std": 0.627776425331831, "rewards/bm25_retrieval_reward_fn/mean": 0.949002493172884, "rewards/bm25_retrieval_reward_fn/std": 0.12272156146354973, "rewards/event_reward_fn/mean": 10.0888671875, "rewards/event_reward_fn/std": 5.482552140951157, "rewards/format_reward_fn/mean": 0.9776041693985462, "rewards/format_reward_fn/std": 0.09575722855515778, "step": 4464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0146484375, "completions/max_length": 250.0625, "completions/max_terminated_length": 247.25, "completions/mean_length": 214.0712890625, "completions/mean_terminated_length": 213.4475040435791, "completions/min_length": 177.6875, "completions/min_terminated_length": 177.6875, "entropy": 0.08435638947412372, "epoch": 4.35374149659864, "frac_reward_zero_std": 0.34765625, "grad_norm": 0.08181194961071014, "learning_rate": 5e-05, "loss": -0.0001, "num_tokens": 367170541.0, "reward": 11.442171514034271, "reward_std": 0.7030144482851028, "rewards/bm25_retrieval_reward_fn/mean": 0.9583823382854462, "rewards/bm25_retrieval_reward_fn/std": 0.106479546520859, "rewards/event_reward_fn/mean": 9.4990234375, "rewards/event_reward_fn/std": 5.919656038284302, "rewards/format_reward_fn/mean": 0.9847656264901161, "rewards/format_reward_fn/std": 0.07690410828217864, "step": 4480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 252.4375, "completions/max_terminated_length": 250.9375, "completions/mean_length": 215.8115234375, "completions/mean_terminated_length": 214.75897407531738, "completions/min_length": 180.4375, "completions/min_terminated_length": 180.4375, "entropy": 0.0830869055353105, "epoch": 4.369290573372206, "frac_reward_zero_std": 0.375, "grad_norm": 0.14057432115077972, "learning_rate": 5e-05, "loss": 0.0004, "num_tokens": 368451128.0, "reward": 12.22743684053421, "reward_std": 0.7562771774828434, "rewards/bm25_retrieval_reward_fn/mean": 0.9470355100929737, "rewards/bm25_retrieval_reward_fn/std": 0.1319723033811897, "rewards/event_reward_fn/mean": 10.3154296875, "rewards/event_reward_fn/std": 5.559557408094406, "rewards/format_reward_fn/mean": 0.9649716354906559, "rewards/format_reward_fn/std": 0.11026093480177224, "step": 4496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 254.3125, "completions/max_terminated_length": 252.1875, "completions/mean_length": 219.9169921875, "completions/mean_terminated_length": 218.84753227233887, "completions/min_length": 177.5, "completions/min_terminated_length": 177.5, "entropy": 0.0869236602447927, "epoch": 4.384839650145772, "frac_reward_zero_std": 0.3515625, "grad_norm": 0.12595002353191376, "learning_rate": 5e-05, "loss": 0.0008, "num_tokens": 369721867.0, "reward": 11.16374546289444, "reward_std": 0.7131532970815897, "rewards/bm25_retrieval_reward_fn/mean": 0.9536585137248039, "rewards/bm25_retrieval_reward_fn/std": 0.13799344294238836, "rewards/event_reward_fn/mean": 9.2431640625, "rewards/event_reward_fn/std": 4.887677401304245, "rewards/format_reward_fn/mean": 0.9669227488338947, "rewards/format_reward_fn/std": 0.1246087858453393, "step": 4512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1357421875, "completions/max_length": 255.5, "completions/max_terminated_length": 253.875, "completions/mean_length": 225.7529296875, "completions/mean_terminated_length": 221.23834419250488, "completions/min_length": 174.875, "completions/min_terminated_length": 174.875, "entropy": 0.08455759705975652, "epoch": 4.400388726919339, "frac_reward_zero_std": 0.33203125, "grad_norm": 0.0808541551232338, "learning_rate": 5e-05, "loss": 0.0018, "num_tokens": 371102210.0, "reward": 11.924792170524597, "reward_std": 0.7625311650335789, "rewards/bm25_retrieval_reward_fn/mean": 0.858902994543314, "rewards/bm25_retrieval_reward_fn/std": 0.3007725060451776, "rewards/event_reward_fn/mean": 10.1962890625, "rewards/event_reward_fn/std": 5.602404475212097, "rewards/format_reward_fn/mean": 0.869599923491478, "rewards/format_reward_fn/std": 0.3008687235414982, "step": 4528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.048828125, "completions/max_length": 253.75, "completions/max_terminated_length": 252.0625, "completions/mean_length": 220.9267578125, "completions/mean_terminated_length": 219.16146183013916, "completions/min_length": 178.9375, "completions/min_terminated_length": 178.9375, "entropy": 0.08046228950843215, "epoch": 4.415937803692906, "frac_reward_zero_std": 0.39453125, "grad_norm": 0.09979816526174545, "learning_rate": 5e-05, "loss": 0.0016, "num_tokens": 372432039.0, "reward": 12.132369935512543, "reward_std": 0.7911638263612986, "rewards/bm25_retrieval_reward_fn/mean": 0.9381347000598907, "rewards/bm25_retrieval_reward_fn/std": 0.184728623367846, "rewards/event_reward_fn/mean": 10.2431640625, "rewards/event_reward_fn/std": 6.270698189735413, "rewards/format_reward_fn/mean": 0.9510712176561356, "rewards/format_reward_fn/std": 0.17310465592890978, "step": 4544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 250.5, "completions/max_terminated_length": 249.5, "completions/mean_length": 212.8916015625, "completions/mean_terminated_length": 212.41744136810303, "completions/min_length": 170.1875, "completions/min_terminated_length": 170.1875, "entropy": 0.07715298281982541, "epoch": 4.431486880466473, "frac_reward_zero_std": 0.43359375, "grad_norm": 0.12553343176841736, "learning_rate": 5e-05, "loss": -0.0034, "num_tokens": 373759536.0, "reward": 11.972736299037933, "reward_std": 0.7828308828175068, "rewards/bm25_retrieval_reward_fn/mean": 0.9649507515132427, "rewards/bm25_retrieval_reward_fn/std": 0.10414338018745184, "rewards/event_reward_fn/mean": 10.0283203125, "rewards/event_reward_fn/std": 5.264572739601135, "rewards/format_reward_fn/mean": 0.9794652201235294, "rewards/format_reward_fn/std": 0.09091756423003972, "step": 4560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 252.5, "completions/max_terminated_length": 246.875, "completions/mean_length": 211.771484375, "completions/mean_terminated_length": 210.6889190673828, "completions/min_length": 175.1875, "completions/min_terminated_length": 175.1875, "entropy": 0.0748588687274605, "epoch": 4.447035957240039, "frac_reward_zero_std": 0.38671875, "grad_norm": 0.14217767119407654, "learning_rate": 5e-05, "loss": 0.0013, "num_tokens": 375162258.0, "reward": 12.27553415298462, "reward_std": 0.8959056548774242, "rewards/bm25_retrieval_reward_fn/mean": 0.9571230597794056, "rewards/bm25_retrieval_reward_fn/std": 0.13766769837820902, "rewards/event_reward_fn/mean": 10.3515625, "rewards/event_reward_fn/std": 5.9426403641700745, "rewards/format_reward_fn/mean": 0.966848649084568, "rewards/format_reward_fn/std": 0.14077154966071248, "step": 4576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0146484375, "completions/max_length": 251.75, "completions/max_terminated_length": 250.0, "completions/mean_length": 214.3134765625, "completions/mean_terminated_length": 213.71609592437744, "completions/min_length": 172.4375, "completions/min_terminated_length": 172.4375, "entropy": 0.08040324132889509, "epoch": 4.462585034013605, "frac_reward_zero_std": 0.37890625, "grad_norm": 0.10610129684209824, "learning_rate": 5e-05, "loss": -0.0025, "num_tokens": 376386051.0, "reward": 11.649299383163452, "reward_std": 0.7246943525969982, "rewards/bm25_retrieval_reward_fn/mean": 0.9564645774662495, "rewards/bm25_retrieval_reward_fn/std": 0.128838874574285, "rewards/event_reward_fn/mean": 9.7177734375, "rewards/event_reward_fn/std": 4.996019497513771, "rewards/format_reward_fn/mean": 0.9750613868236542, "rewards/format_reward_fn/std": 0.10886701662093401, "step": 4592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 253.75, "completions/max_terminated_length": 252.25, "completions/mean_length": 217.634765625, "completions/mean_terminated_length": 216.70848751068115, "completions/min_length": 180.125, "completions/min_terminated_length": 180.125, "entropy": 0.0827216855250299, "epoch": 4.478134110787172, "frac_reward_zero_std": 0.36328125, "grad_norm": 0.06507635116577148, "learning_rate": 5e-05, "loss": -0.0014, "num_tokens": 377674649.0, "reward": 12.131418943405151, "reward_std": 0.6769323218613863, "rewards/bm25_retrieval_reward_fn/mean": 0.9498664774000645, "rewards/bm25_retrieval_reward_fn/std": 0.13098848913796246, "rewards/event_reward_fn/mean": 10.2109375, "rewards/event_reward_fn/std": 5.762604504823685, "rewards/format_reward_fn/mean": 0.9706148952245712, "rewards/format_reward_fn/std": 0.10980169754475355, "step": 4608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0263671875, "completions/max_length": 254.3125, "completions/max_terminated_length": 250.375, "completions/mean_length": 216.69921875, "completions/mean_terminated_length": 215.64843940734863, "completions/min_length": 181.5625, "completions/min_terminated_length": 181.5625, "entropy": 0.08147751213982701, "epoch": 4.493683187560738, "frac_reward_zero_std": 0.27734375, "grad_norm": 0.09443672001361847, "learning_rate": 5e-05, "loss": 0.0015, "num_tokens": 379054909.0, "reward": 12.458342015743256, "reward_std": 0.8503458648920059, "rewards/bm25_retrieval_reward_fn/mean": 0.9344224706292152, "rewards/bm25_retrieval_reward_fn/std": 0.16761525836773217, "rewards/event_reward_fn/mean": 10.5703125, "rewards/event_reward_fn/std": 6.101438790559769, "rewards/format_reward_fn/mean": 0.9536067768931389, "rewards/format_reward_fn/std": 0.15860154060646892, "step": 4624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 247.625, "completions/max_terminated_length": 245.9375, "completions/mean_length": 211.0009765625, "completions/mean_terminated_length": 210.65614318847656, "completions/min_length": 179.6875, "completions/min_terminated_length": 179.6875, "entropy": 0.08105296548455954, "epoch": 4.5092322643343055, "frac_reward_zero_std": 0.2734375, "grad_norm": 0.13886581361293793, "learning_rate": 5e-05, "loss": -0.003, "num_tokens": 380383922.0, "reward": 11.8076793551445, "reward_std": 0.8404425587505102, "rewards/bm25_retrieval_reward_fn/mean": 0.9563166238367558, "rewards/bm25_retrieval_reward_fn/std": 0.10713632625993341, "rewards/event_reward_fn/mean": 9.876953125, "rewards/event_reward_fn/std": 5.32866133749485, "rewards/format_reward_fn/mean": 0.974409569054842, "rewards/format_reward_fn/std": 0.09941133600659668, "step": 4640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 244.9375, "completions/max_terminated_length": 243.75, "completions/mean_length": 210.9580078125, "completions/mean_terminated_length": 210.7022762298584, "completions/min_length": 176.9375, "completions/min_terminated_length": 176.9375, "entropy": 0.07536840462125838, "epoch": 4.524781341107872, "frac_reward_zero_std": 0.34765625, "grad_norm": 0.12522609531879425, "learning_rate": 5e-05, "loss": -0.0024, "num_tokens": 381693591.0, "reward": 11.885925889015198, "reward_std": 0.6851696334779263, "rewards/bm25_retrieval_reward_fn/mean": 0.9520582780241966, "rewards/bm25_retrieval_reward_fn/std": 0.12435426318552345, "rewards/event_reward_fn/mean": 9.9599609375, "rewards/event_reward_fn/std": 5.595930874347687, "rewards/format_reward_fn/mean": 0.9739067144691944, "rewards/format_reward_fn/std": 0.11203544959425926, "step": 4656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 253.3125, "completions/max_terminated_length": 249.125, "completions/mean_length": 215.0244140625, "completions/mean_terminated_length": 213.8462839126587, "completions/min_length": 169.6875, "completions/min_terminated_length": 169.6875, "entropy": 0.0726128879468888, "epoch": 4.540330417881439, "frac_reward_zero_std": 0.4140625, "grad_norm": 0.08706779778003693, "learning_rate": 5e-05, "loss": 0.0018, "num_tokens": 382994364.0, "reward": 12.469999372959137, "reward_std": 0.6653014570474625, "rewards/bm25_retrieval_reward_fn/mean": 0.9453453533351421, "rewards/bm25_retrieval_reward_fn/std": 0.14407718984875828, "rewards/event_reward_fn/mean": 10.560546875, "rewards/event_reward_fn/std": 5.933085352182388, "rewards/format_reward_fn/mean": 0.9641071446239948, "rewards/format_reward_fn/std": 0.13372006034478545, "step": 4672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 255.625, "completions/max_terminated_length": 252.3125, "completions/mean_length": 220.556640625, "completions/mean_terminated_length": 219.14645195007324, "completions/min_length": 177.5625, "completions/min_terminated_length": 177.5625, "entropy": 0.07643542671576142, "epoch": 4.555879494655005, "frac_reward_zero_std": 0.37109375, "grad_norm": 0.11401298642158508, "learning_rate": 5e-05, "loss": 0.0008, "num_tokens": 384316582.0, "reward": 11.956826388835907, "reward_std": 0.752917755395174, "rewards/bm25_retrieval_reward_fn/mean": 0.9352249316871166, "rewards/bm25_retrieval_reward_fn/std": 0.17955804523080587, "rewards/event_reward_fn/mean": 10.072265625, "rewards/event_reward_fn/std": 5.650606602430344, "rewards/format_reward_fn/mean": 0.949335940182209, "rewards/format_reward_fn/std": 0.18370232917368412, "step": 4688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0419921875, "completions/max_length": 255.125, "completions/max_terminated_length": 252.5625, "completions/mean_length": 220.98828125, "completions/mean_terminated_length": 219.46258068084717, "completions/min_length": 176.75, "completions/min_terminated_length": 176.75, "entropy": 0.08043610630556941, "epoch": 4.571428571428571, "frac_reward_zero_std": 0.32421875, "grad_norm": 0.1541932076215744, "learning_rate": 5e-05, "loss": 0.0023, "num_tokens": 385619530.0, "reward": 12.387038767337799, "reward_std": 0.9704460687935352, "rewards/bm25_retrieval_reward_fn/mean": 0.9340765401721001, "rewards/bm25_retrieval_reward_fn/std": 0.17475787783041596, "rewards/event_reward_fn/mean": 10.5029296875, "rewards/event_reward_fn/std": 5.490474209189415, "rewards/format_reward_fn/mean": 0.9500325620174408, "rewards/format_reward_fn/std": 0.17480701440945268, "step": 4704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0244140625, "completions/max_length": 254.5625, "completions/max_terminated_length": 253.25, "completions/mean_length": 219.203125, "completions/mean_terminated_length": 218.32938385009766, "completions/min_length": 173.25, "completions/min_terminated_length": 173.25, "entropy": 0.07852176204323769, "epoch": 4.586977648202138, "frac_reward_zero_std": 0.30859375, "grad_norm": 0.154206320643425, "learning_rate": 5e-05, "loss": -0.0036, "num_tokens": 386950750.0, "reward": 12.324360966682434, "reward_std": 0.8198084980249405, "rewards/bm25_retrieval_reward_fn/mean": 0.9449009336531162, "rewards/bm25_retrieval_reward_fn/std": 0.12958236644044518, "rewards/event_reward_fn/mean": 10.4111328125, "rewards/event_reward_fn/std": 5.862970620393753, "rewards/format_reward_fn/mean": 0.9683272950351238, "rewards/format_reward_fn/std": 0.1171579877845943, "step": 4720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1611328125, "completions/max_length": 255.875, "completions/max_terminated_length": 254.8125, "completions/mean_length": 229.4404296875, "completions/mean_terminated_length": 224.22948551177979, "completions/min_length": 182.875, "completions/min_terminated_length": 182.875, "entropy": 0.08241429831832647, "epoch": 4.6025267249757045, "frac_reward_zero_std": 0.27734375, "grad_norm": 0.07539704442024231, "learning_rate": 5e-05, "loss": 0.0041, "num_tokens": 388299461.0, "reward": 11.465377151966095, "reward_std": 0.8274618536233902, "rewards/bm25_retrieval_reward_fn/mean": 0.827308963984251, "rewards/bm25_retrieval_reward_fn/std": 0.3203687067143619, "rewards/event_reward_fn/mean": 9.7841796875, "rewards/event_reward_fn/std": 5.850812315940857, "rewards/format_reward_fn/mean": 0.8538884222507477, "rewards/format_reward_fn/std": 0.3226360958069563, "step": 4736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 254.125, "completions/max_terminated_length": 251.5625, "completions/mean_length": 218.27734375, "completions/mean_terminated_length": 216.53139400482178, "completions/min_length": 176.125, "completions/min_terminated_length": 176.125, "entropy": 0.08156100008636713, "epoch": 4.618075801749271, "frac_reward_zero_std": 0.34375, "grad_norm": 0.16608324646949768, "learning_rate": 5e-05, "loss": 0.0028, "num_tokens": 389634989.0, "reward": 11.895731985569, "reward_std": 0.7938553486019373, "rewards/bm25_retrieval_reward_fn/mean": 0.9349915757775307, "rewards/bm25_retrieval_reward_fn/std": 0.1642955782590434, "rewards/event_reward_fn/mean": 10.0078125, "rewards/event_reward_fn/std": 5.811924383044243, "rewards/format_reward_fn/mean": 0.952927827835083, "rewards/format_reward_fn/std": 0.16472134506329894, "step": 4752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0029296875, "completions/max_length": 242.6875, "completions/max_terminated_length": 240.75, "completions/mean_length": 204.7666015625, "completions/mean_terminated_length": 204.61684799194336, "completions/min_length": 162.8125, "completions/min_terminated_length": 162.8125, "entropy": 0.07700010249391198, "epoch": 4.633624878522838, "frac_reward_zero_std": 0.33984375, "grad_norm": 0.09481830894947052, "learning_rate": 5e-05, "loss": -0.004, "num_tokens": 390946810.0, "reward": 11.631008863449097, "reward_std": 0.7731746193021536, "rewards/bm25_retrieval_reward_fn/mean": 0.9748496301472187, "rewards/bm25_retrieval_reward_fn/std": 0.060335727874189615, "rewards/event_reward_fn/mean": 9.6630859375, "rewards/event_reward_fn/std": 5.950529634952545, "rewards/format_reward_fn/mean": 0.9930733852088451, "rewards/format_reward_fn/std": 0.03480626177042723, "step": 4768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 240.5, "completions/max_terminated_length": 240.25, "completions/mean_length": 204.19921875, "completions/mean_terminated_length": 204.1085557937622, "completions/min_length": 163.8125, "completions/min_terminated_length": 163.8125, "entropy": 0.0720194885507226, "epoch": 4.649173955296404, "frac_reward_zero_std": 0.42578125, "grad_norm": 0.24066881835460663, "learning_rate": 5e-05, "loss": -0.0053, "num_tokens": 392285786.0, "reward": 11.837441265583038, "reward_std": 0.6970504522323608, "rewards/bm25_retrieval_reward_fn/mean": 0.9701077155768871, "rewards/bm25_retrieval_reward_fn/std": 0.06552095845108852, "rewards/event_reward_fn/mean": 9.8798828125, "rewards/event_reward_fn/std": 6.1304861307144165, "rewards/format_reward_fn/mean": 0.987450860440731, "rewards/format_reward_fn/std": 0.050714970799162984, "step": 4784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0068359375, "completions/max_length": 247.6875, "completions/max_terminated_length": 245.4375, "completions/mean_length": 207.455078125, "completions/mean_terminated_length": 207.11551570892334, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.06734136585146189, "epoch": 4.664723032069971, "frac_reward_zero_std": 0.39453125, "grad_norm": 0.24333445727825165, "learning_rate": 5e-05, "loss": -0.0005, "num_tokens": 393620732.0, "reward": 11.832262814044952, "reward_std": 0.7198988972231746, "rewards/bm25_retrieval_reward_fn/mean": 0.9663410037755966, "rewards/bm25_retrieval_reward_fn/std": 0.10767719999421388, "rewards/event_reward_fn/mean": 9.888671875, "rewards/event_reward_fn/std": 6.22281976044178, "rewards/format_reward_fn/mean": 0.9772499725222588, "rewards/format_reward_fn/std": 0.09165127645246685, "step": 4800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 244.5625, "completions/max_terminated_length": 241.875, "completions/mean_length": 206.2265625, "completions/mean_terminated_length": 205.5375509262085, "completions/min_length": 168.4375, "completions/min_terminated_length": 168.4375, "entropy": 0.068746835924685, "epoch": 4.680272108843537, "frac_reward_zero_std": 0.39453125, "grad_norm": 0.13866840302944183, "learning_rate": 5e-05, "loss": -0.0036, "num_tokens": 394912384.0, "reward": 12.363645255565643, "reward_std": 0.6785086588934064, "rewards/bm25_retrieval_reward_fn/mean": 0.9618904925882816, "rewards/bm25_retrieval_reward_fn/std": 0.111503601889126, "rewards/event_reward_fn/mean": 10.42578125, "rewards/event_reward_fn/std": 5.934814959764481, "rewards/format_reward_fn/mean": 0.9759734608232975, "rewards/format_reward_fn/std": 0.08922615088522434, "step": 4816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0107421875, "completions/max_length": 247.75, "completions/max_terminated_length": 243.8125, "completions/mean_length": 207.9228515625, "completions/mean_terminated_length": 207.40814781188965, "completions/min_length": 171.5, "completions/min_terminated_length": 171.5, "entropy": 0.0720445194747299, "epoch": 4.695821185617104, "frac_reward_zero_std": 0.40234375, "grad_norm": 0.15189078450202942, "learning_rate": 5e-05, "loss": 0.001, "num_tokens": 396201629.0, "reward": 12.15940910577774, "reward_std": 0.7553090676665306, "rewards/bm25_retrieval_reward_fn/mean": 0.9640238769352436, "rewards/bm25_retrieval_reward_fn/std": 0.10036803607363254, "rewards/event_reward_fn/mean": 10.22265625, "rewards/event_reward_fn/std": 5.967520788311958, "rewards/format_reward_fn/mean": 0.9727289602160454, "rewards/format_reward_fn/std": 0.09985992661677301, "step": 4832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 247.25, "completions/max_terminated_length": 246.3125, "completions/mean_length": 208.2490234375, "completions/mean_terminated_length": 207.7828426361084, "completions/min_length": 171.5, "completions/min_terminated_length": 171.5, "entropy": 0.0726703389082104, "epoch": 4.711370262390671, "frac_reward_zero_std": 0.4140625, "grad_norm": 0.12212812900543213, "learning_rate": 5e-05, "loss": 0.0003, "num_tokens": 397504832.0, "reward": 11.704569518566132, "reward_std": 0.8146627731621265, "rewards/bm25_retrieval_reward_fn/mean": 0.9750679209828377, "rewards/bm25_retrieval_reward_fn/std": 0.06717803853098303, "rewards/event_reward_fn/mean": 9.7451171875, "rewards/event_reward_fn/std": 5.367679685354233, "rewards/format_reward_fn/mean": 0.9843843020498753, "rewards/format_reward_fn/std": 0.061483025085181, "step": 4848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 250.0, "completions/max_terminated_length": 246.6875, "completions/mean_length": 208.2158203125, "completions/mean_terminated_length": 207.4158697128296, "completions/min_length": 169.8125, "completions/min_terminated_length": 169.8125, "entropy": 0.07577541843056679, "epoch": 4.726919339164237, "frac_reward_zero_std": 0.40234375, "grad_norm": 0.13794957101345062, "learning_rate": 5e-05, "loss": -0.0009, "num_tokens": 398863225.0, "reward": 11.808978796005249, "reward_std": 0.94271419942379, "rewards/bm25_retrieval_reward_fn/mean": 0.9500653333961964, "rewards/bm25_retrieval_reward_fn/std": 0.1544017958221957, "rewards/event_reward_fn/mean": 9.9072265625, "rewards/event_reward_fn/std": 5.786400109529495, "rewards/format_reward_fn/mean": 0.9516868181526661, "rewards/format_reward_fn/std": 0.16639976995065808, "step": 4864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 248.3125, "completions/max_terminated_length": 245.3125, "completions/mean_length": 208.3583984375, "completions/mean_terminated_length": 207.6431589126587, "completions/min_length": 172.25, "completions/min_terminated_length": 172.25, "entropy": 0.07559068105183542, "epoch": 4.742468415937804, "frac_reward_zero_std": 0.4375, "grad_norm": 0.16644462943077087, "learning_rate": 5e-05, "loss": -0.0009, "num_tokens": 400194388.0, "reward": 12.06817501783371, "reward_std": 0.8118875231593847, "rewards/bm25_retrieval_reward_fn/mean": 0.9551199078559875, "rewards/bm25_retrieval_reward_fn/std": 0.1355801696772687, "rewards/event_reward_fn/mean": 10.15234375, "rewards/event_reward_fn/std": 6.247093990445137, "rewards/format_reward_fn/mean": 0.9607114940881729, "rewards/format_reward_fn/std": 0.13486498198471963, "step": 4880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 249.8125, "completions/max_terminated_length": 245.25, "completions/mean_length": 208.1708984375, "completions/mean_terminated_length": 207.60796546936035, "completions/min_length": 172.5, "completions/min_terminated_length": 172.5, "entropy": 0.07368506025522947, "epoch": 4.75801749271137, "frac_reward_zero_std": 0.42578125, "grad_norm": 0.09005624800920486, "learning_rate": 5e-05, "loss": -0.0009, "num_tokens": 401506939.0, "reward": 12.930509805679321, "reward_std": 0.7797500379383564, "rewards/bm25_retrieval_reward_fn/mean": 0.9727158024907112, "rewards/bm25_retrieval_reward_fn/std": 0.09574340214021504, "rewards/event_reward_fn/mean": 10.9794921875, "rewards/event_reward_fn/std": 6.308150812983513, "rewards/format_reward_fn/mean": 0.9783018380403519, "rewards/format_reward_fn/std": 0.09503353573381901, "step": 4896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 248.125, "completions/max_terminated_length": 246.9375, "completions/mean_length": 212.078125, "completions/mean_terminated_length": 211.53092288970947, "completions/min_length": 175.0625, "completions/min_terminated_length": 175.0625, "entropy": 0.08496078243479133, "epoch": 4.773566569484937, "frac_reward_zero_std": 0.35546875, "grad_norm": 0.1415478140115738, "learning_rate": 5e-05, "loss": -0.0029, "num_tokens": 402830559.0, "reward": 11.695642411708832, "reward_std": 0.9189990721642971, "rewards/bm25_retrieval_reward_fn/mean": 0.9574025720357895, "rewards/bm25_retrieval_reward_fn/std": 0.1232752677751705, "rewards/event_reward_fn/mean": 9.7783203125, "rewards/event_reward_fn/std": 5.1400560438632965, "rewards/format_reward_fn/mean": 0.9599194973707199, "rewards/format_reward_fn/std": 0.13297926378436387, "step": 4912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0380859375, "completions/max_length": 253.5, "completions/max_terminated_length": 251.5625, "completions/mean_length": 216.451171875, "completions/mean_terminated_length": 214.94143295288086, "completions/min_length": 179.0625, "completions/min_terminated_length": 179.0625, "entropy": 0.08813147945329547, "epoch": 4.789115646258503, "frac_reward_zero_std": 0.3203125, "grad_norm": 0.07135059684515, "learning_rate": 5e-05, "loss": -0.0027, "num_tokens": 404200177.0, "reward": 12.266283452510834, "reward_std": 0.8791834656149149, "rewards/bm25_retrieval_reward_fn/mean": 0.9312233664095402, "rewards/bm25_retrieval_reward_fn/std": 0.2005148883908987, "rewards/event_reward_fn/mean": 10.400390625, "rewards/event_reward_fn/std": 6.1433481723070145, "rewards/format_reward_fn/mean": 0.9346696473658085, "rewards/format_reward_fn/std": 0.20454937545582652, "step": 4928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 251.75, "completions/max_terminated_length": 249.375, "completions/mean_length": 214.8701171875, "completions/mean_terminated_length": 213.78954410552979, "completions/min_length": 175.6875, "completions/min_terminated_length": 175.6875, "entropy": 0.08725593006238341, "epoch": 4.80466472303207, "frac_reward_zero_std": 0.3671875, "grad_norm": 0.14082126319408417, "learning_rate": 5e-05, "loss": -0.0043, "num_tokens": 405533868.0, "reward": 12.019694983959198, "reward_std": 0.8072663694620132, "rewards/bm25_retrieval_reward_fn/mean": 0.9528482407331467, "rewards/bm25_retrieval_reward_fn/std": 0.1505628222366795, "rewards/event_reward_fn/mean": 10.111328125, "rewards/event_reward_fn/std": 5.826146110892296, "rewards/format_reward_fn/mean": 0.9555186629295349, "rewards/format_reward_fn/std": 0.16073728911578655, "step": 4944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 253.5625, "completions/max_terminated_length": 249.375, "completions/mean_length": 216.126953125, "completions/mean_terminated_length": 214.97348499298096, "completions/min_length": 179.25, "completions/min_terminated_length": 179.25, "entropy": 0.07998230727389455, "epoch": 4.820213799805637, "frac_reward_zero_std": 0.38671875, "grad_norm": 0.13172361254692078, "learning_rate": 5e-05, "loss": -0.0005, "num_tokens": 406892362.0, "reward": 12.584176301956177, "reward_std": 0.8060983493924141, "rewards/bm25_retrieval_reward_fn/mean": 0.9421436227858067, "rewards/bm25_retrieval_reward_fn/std": 0.17349553992971778, "rewards/event_reward_fn/mean": 10.697265625, "rewards/event_reward_fn/std": 6.188347667455673, "rewards/format_reward_fn/mean": 0.944767028093338, "rewards/format_reward_fn/std": 0.17709117522463202, "step": 4960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0146484375, "completions/max_length": 249.0625, "completions/max_terminated_length": 248.5, "completions/mean_length": 211.833984375, "completions/mean_terminated_length": 211.20284271240234, "completions/min_length": 171.625, "completions/min_terminated_length": 171.625, "entropy": 0.07674705190584064, "epoch": 4.835762876579203, "frac_reward_zero_std": 0.36328125, "grad_norm": 0.27421054244041443, "learning_rate": 5e-05, "loss": -0.0007, "num_tokens": 408234032.0, "reward": 12.206626057624817, "reward_std": 0.839366152882576, "rewards/bm25_retrieval_reward_fn/mean": 0.9551889784634113, "rewards/bm25_retrieval_reward_fn/std": 0.12793191766832024, "rewards/event_reward_fn/mean": 10.2958984375, "rewards/event_reward_fn/std": 5.720379784703255, "rewards/format_reward_fn/mean": 0.955538809299469, "rewards/format_reward_fn/std": 0.13686896581202745, "step": 4976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0439453125, "completions/max_length": 253.875, "completions/max_terminated_length": 251.25, "completions/mean_length": 219.1953125, "completions/mean_terminated_length": 217.51319122314453, "completions/min_length": 177.875, "completions/min_terminated_length": 177.875, "entropy": 0.0816301996819675, "epoch": 4.85131195335277, "frac_reward_zero_std": 0.41015625, "grad_norm": 0.18152131140232086, "learning_rate": 5e-05, "loss": -0.0008, "num_tokens": 409579660.0, "reward": 11.989133656024933, "reward_std": 0.8817657474428415, "rewards/bm25_retrieval_reward_fn/mean": 0.932769563049078, "rewards/bm25_retrieval_reward_fn/std": 0.18802405067253858, "rewards/event_reward_fn/mean": 10.119140625, "rewards/event_reward_fn/std": 5.9855871349573135, "rewards/format_reward_fn/mean": 0.9372236467897892, "rewards/format_reward_fn/std": 0.1983571257442236, "step": 4992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 254.4375, "completions/max_terminated_length": 252.1875, "completions/mean_length": 217.8271484375, "completions/mean_terminated_length": 216.6529426574707, "completions/min_length": 175.625, "completions/min_terminated_length": 175.625, "entropy": 0.07996702333912253, "epoch": 4.866861030126336, "frac_reward_zero_std": 0.41015625, "grad_norm": 0.11488137394189835, "learning_rate": 5e-05, "loss": 0.0006, "num_tokens": 410859759.0, "reward": 11.951142311096191, "reward_std": 0.7472913395613432, "rewards/bm25_retrieval_reward_fn/mean": 0.9457512833178043, "rewards/bm25_retrieval_reward_fn/std": 0.17608064785599709, "rewards/event_reward_fn/mean": 10.0546875, "rewards/event_reward_fn/std": 5.59067901968956, "rewards/format_reward_fn/mean": 0.950703427195549, "rewards/format_reward_fn/std": 0.17153813573531806, "step": 5008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0244140625, "completions/max_length": 253.0, "completions/max_terminated_length": 250.0625, "completions/mean_length": 213.92578125, "completions/mean_terminated_length": 212.86985301971436, "completions/min_length": 169.25, "completions/min_terminated_length": 169.25, "entropy": 0.07613265956752002, "epoch": 4.882410106899902, "frac_reward_zero_std": 0.46484375, "grad_norm": 0.15658670663833618, "learning_rate": 5e-05, "loss": -0.0075, "num_tokens": 412146683.0, "reward": 11.895162045955658, "reward_std": 0.7129523921757936, "rewards/bm25_retrieval_reward_fn/mean": 0.9551772475242615, "rewards/bm25_retrieval_reward_fn/std": 0.1298921147827059, "rewards/event_reward_fn/mean": 9.9853515625, "rewards/event_reward_fn/std": 5.476163282990456, "rewards/format_reward_fn/mean": 0.9546333625912666, "rewards/format_reward_fn/std": 0.14684197306632996, "step": 5024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0205078125, "completions/max_length": 252.0625, "completions/max_terminated_length": 250.0625, "completions/mean_length": 210.5185546875, "completions/mean_terminated_length": 209.6259593963623, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.07199173117987812, "epoch": 4.8979591836734695, "frac_reward_zero_std": 0.4453125, "grad_norm": 0.21756106615066528, "learning_rate": 5e-05, "loss": -0.0021, "num_tokens": 413506590.0, "reward": 12.365760207176208, "reward_std": 0.745238907635212, "rewards/bm25_retrieval_reward_fn/mean": 0.9555656909942627, "rewards/bm25_retrieval_reward_fn/std": 0.13530326791806147, "rewards/event_reward_fn/mean": 10.4541015625, "rewards/event_reward_fn/std": 5.998309597373009, "rewards/format_reward_fn/mean": 0.9560929797589779, "rewards/format_reward_fn/std": 0.14361765328794718, "step": 5040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0166015625, "completions/max_length": 251.75, "completions/max_terminated_length": 249.6875, "completions/mean_length": 211.0107421875, "completions/mean_terminated_length": 210.25583267211914, "completions/min_length": 172.8125, "completions/min_terminated_length": 172.8125, "entropy": 0.07869777269661427, "epoch": 4.913508260447036, "frac_reward_zero_std": 0.4296875, "grad_norm": 0.07694108039140701, "learning_rate": 5e-05, "loss": -0.0025, "num_tokens": 414872965.0, "reward": 12.788713455200195, "reward_std": 0.7702588140964508, "rewards/bm25_retrieval_reward_fn/mean": 0.9623404443264008, "rewards/bm25_retrieval_reward_fn/std": 0.11270316521404311, "rewards/event_reward_fn/mean": 10.876953125, "rewards/event_reward_fn/std": 6.525661692023277, "rewards/format_reward_fn/mean": 0.9494199566543102, "rewards/format_reward_fn/std": 0.1416560309007764, "step": 5056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0146484375, "completions/max_length": 252.75, "completions/max_terminated_length": 251.125, "completions/mean_length": 214.677734375, "completions/mean_terminated_length": 214.11969184875488, "completions/min_length": 172.3125, "completions/min_terminated_length": 172.3125, "entropy": 0.08010096289217472, "epoch": 4.929057337220603, "frac_reward_zero_std": 0.3359375, "grad_norm": 0.19344469904899597, "learning_rate": 5e-05, "loss": -0.0007, "num_tokens": 416184203.0, "reward": 12.021986961364746, "reward_std": 0.8370283525437117, "rewards/bm25_retrieval_reward_fn/mean": 0.9379756189882755, "rewards/bm25_retrieval_reward_fn/std": 0.18868807784747332, "rewards/event_reward_fn/mean": 10.1474609375, "rewards/event_reward_fn/std": 5.739748045802116, "rewards/format_reward_fn/mean": 0.9365505129098892, "rewards/format_reward_fn/std": 0.18898644018918276, "step": 5072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0791015625, "completions/max_length": 255.625, "completions/max_terminated_length": 254.1875, "completions/mean_length": 223.2958984375, "completions/mean_terminated_length": 220.47602653503418, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.09088941430673003, "epoch": 4.944606413994169, "frac_reward_zero_std": 0.296875, "grad_norm": 0.20225514471530914, "learning_rate": 5e-05, "loss": 0.0018, "num_tokens": 417535758.0, "reward": 12.14924430847168, "reward_std": 0.9182783588767052, "rewards/bm25_retrieval_reward_fn/mean": 0.9020416662096977, "rewards/bm25_retrieval_reward_fn/std": 0.25824297685176134, "rewards/event_reward_fn/mean": 10.345703125, "rewards/event_reward_fn/std": 5.885191112756729, "rewards/format_reward_fn/mean": 0.9014995731413364, "rewards/format_reward_fn/std": 0.2582928016781807, "step": 5088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.4375, "completions/mean_length": 226.154296875, "completions/mean_terminated_length": 222.74933338165283, "completions/min_length": 181.875, "completions/min_terminated_length": 181.875, "entropy": 0.09840598748996854, "epoch": 4.960155490767736, "frac_reward_zero_std": 0.34765625, "grad_norm": 0.13706494867801666, "learning_rate": 5e-05, "loss": 0.0047, "num_tokens": 418836740.0, "reward": 11.611397385597229, "reward_std": 0.8141429983079433, "rewards/bm25_retrieval_reward_fn/mean": 0.8677695393562317, "rewards/bm25_retrieval_reward_fn/std": 0.3009802335873246, "rewards/event_reward_fn/mean": 9.8740234375, "rewards/event_reward_fn/std": 5.783898174762726, "rewards/format_reward_fn/mean": 0.8696044124662876, "rewards/format_reward_fn/std": 0.3018876016139984, "step": 5104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0400390625, "completions/max_length": 252.625, "completions/max_terminated_length": 250.375, "completions/mean_length": 216.8056640625, "completions/mean_terminated_length": 215.23225784301758, "completions/min_length": 174.125, "completions/min_terminated_length": 174.125, "entropy": 0.09257404552772641, "epoch": 4.975704567541302, "frac_reward_zero_std": 0.37890625, "grad_norm": 0.1244494840502739, "learning_rate": 5e-05, "loss": 0.0001, "num_tokens": 420147069.0, "reward": 12.01289427280426, "reward_std": 0.9822186566889286, "rewards/bm25_retrieval_reward_fn/mean": 0.9450008794665337, "rewards/bm25_retrieval_reward_fn/std": 0.16165780992014334, "rewards/event_reward_fn/mean": 10.1240234375, "rewards/event_reward_fn/std": 5.432392194867134, "rewards/format_reward_fn/mean": 0.9438699819147587, "rewards/format_reward_fn/std": 0.1795343121048063, "step": 5120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0283203125, "completions/max_length": 251.0, "completions/max_terminated_length": 247.75, "completions/mean_length": 212.3759765625, "completions/mean_terminated_length": 211.170654296875, "completions/min_length": 175.5, "completions/min_terminated_length": 175.5, "entropy": 0.08855262584984303, "epoch": 4.9912536443148685, "frac_reward_zero_std": 0.421875, "grad_norm": 0.08971451967954636, "learning_rate": 5e-05, "loss": -0.0012, "num_tokens": 421466326.0, "reward": 12.228147089481354, "reward_std": 0.8180289585143328, "rewards/bm25_retrieval_reward_fn/mean": 0.9339725822210312, "rewards/bm25_retrieval_reward_fn/std": 0.20187974988948554, "rewards/event_reward_fn/mean": 10.3623046875, "rewards/event_reward_fn/std": 6.2895103842020035, "rewards/format_reward_fn/mean": 0.9318698942661285, "rewards/format_reward_fn/std": 0.2051788135431707, "step": 5136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 250.6875, "completions/max_terminated_length": 248.625, "completions/mean_length": 210.5322265625, "completions/mean_terminated_length": 210.076735496521, "completions/min_length": 171.8125, "completions/min_terminated_length": 171.8125, "entropy": 0.09253401588648558, "epoch": 5.006802721088436, "frac_reward_zero_std": 0.38671875, "grad_norm": 0.12024762481451035, "learning_rate": 5e-05, "loss": -0.0011, "num_tokens": 422781699.0, "reward": 12.157626032829285, "reward_std": 0.7537666261196136, "rewards/bm25_retrieval_reward_fn/mean": 0.9695763923227787, "rewards/bm25_retrieval_reward_fn/std": 0.10307722567813471, "rewards/event_reward_fn/mean": 10.2197265625, "rewards/event_reward_fn/std": 5.788311317563057, "rewards/format_reward_fn/mean": 0.9683229438960552, "rewards/format_reward_fn/std": 0.1254920600913465, "step": 5152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0087890625, "completions/max_length": 247.625, "completions/max_terminated_length": 246.0, "completions/mean_length": 210.9833984375, "completions/mean_terminated_length": 210.60074615478516, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.09116627182811499, "epoch": 5.022351797862002, "frac_reward_zero_std": 0.39453125, "grad_norm": 0.12762567400932312, "learning_rate": 5e-05, "loss": -0.0022, "num_tokens": 423997238.0, "reward": 11.60620766878128, "reward_std": 0.7599927112460136, "rewards/bm25_retrieval_reward_fn/mean": 0.9486931003630161, "rewards/bm25_retrieval_reward_fn/std": 0.15455256192944944, "rewards/event_reward_fn/mean": 9.7001953125, "rewards/event_reward_fn/std": 5.030976966023445, "rewards/format_reward_fn/mean": 0.9573194123804569, "rewards/format_reward_fn/std": 0.15234834514558315, "step": 5168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 251.125, "completions/max_terminated_length": 246.875, "completions/mean_length": 213.2099609375, "completions/mean_terminated_length": 212.3174467086792, "completions/min_length": 177.75, "completions/min_terminated_length": 177.75, "entropy": 0.0908237830735743, "epoch": 5.037900874635569, "frac_reward_zero_std": 0.38671875, "grad_norm": 0.09851375967264175, "learning_rate": 5e-05, "loss": -0.0016, "num_tokens": 425258569.0, "reward": 12.098788142204285, "reward_std": 0.7658343818038702, "rewards/bm25_retrieval_reward_fn/mean": 0.9538118988275528, "rewards/bm25_retrieval_reward_fn/std": 0.14205206802580506, "rewards/event_reward_fn/mean": 10.189453125, "rewards/event_reward_fn/std": 5.354374468326569, "rewards/format_reward_fn/mean": 0.9555229768157005, "rewards/format_reward_fn/std": 0.16036075167357922, "step": 5184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 255.0625, "completions/max_terminated_length": 251.875, "completions/mean_length": 219.5107421875, "completions/mean_terminated_length": 217.43249702453613, "completions/min_length": 176.375, "completions/min_terminated_length": 176.375, "entropy": 0.08694863086566329, "epoch": 5.053449951409135, "frac_reward_zero_std": 0.40625, "grad_norm": 0.15762481093406677, "learning_rate": 5e-05, "loss": 0.0041, "num_tokens": 426630832.0, "reward": 12.561151146888733, "reward_std": 0.8509459039196372, "rewards/bm25_retrieval_reward_fn/mean": 0.9281382337212563, "rewards/bm25_retrieval_reward_fn/std": 0.21045659307856113, "rewards/event_reward_fn/mean": 10.69921875, "rewards/event_reward_fn/std": 6.01863569021225, "rewards/format_reward_fn/mean": 0.9337940216064453, "rewards/format_reward_fn/std": 0.2126003741286695, "step": 5200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 254.6875, "completions/max_terminated_length": 252.0, "completions/mean_length": 218.7431640625, "completions/mean_terminated_length": 217.25553607940674, "completions/min_length": 182.0625, "completions/min_terminated_length": 182.0625, "entropy": 0.08299205871298909, "epoch": 5.068999028182701, "frac_reward_zero_std": 0.39453125, "grad_norm": 0.15465058386325836, "learning_rate": 5e-05, "loss": -0.0061, "num_tokens": 427929997.0, "reward": 12.534621059894562, "reward_std": 0.8048240784555674, "rewards/bm25_retrieval_reward_fn/mean": 0.9337992817163467, "rewards/bm25_retrieval_reward_fn/std": 0.1961612788727507, "rewards/event_reward_fn/mean": 10.6591796875, "rewards/event_reward_fn/std": 5.7010853588581085, "rewards/format_reward_fn/mean": 0.9416420236229897, "rewards/format_reward_fn/std": 0.19764397107064724, "step": 5216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 252.0625, "completions/max_terminated_length": 250.6875, "completions/mean_length": 213.8251953125, "completions/mean_terminated_length": 213.00183773040771, "completions/min_length": 173.75, "completions/min_terminated_length": 173.75, "entropy": 0.07737769559025764, "epoch": 5.084548104956268, "frac_reward_zero_std": 0.4765625, "grad_norm": 0.1074177548289299, "learning_rate": 5e-05, "loss": -0.0011, "num_tokens": 429246174.0, "reward": 12.525750398635864, "reward_std": 0.6775472220033407, "rewards/bm25_retrieval_reward_fn/mean": 0.9568941742181778, "rewards/bm25_retrieval_reward_fn/std": 0.14373183471616358, "rewards/event_reward_fn/mean": 10.611328125, "rewards/event_reward_fn/std": 5.806536749005318, "rewards/format_reward_fn/mean": 0.9575280509889126, "rewards/format_reward_fn/std": 0.15517638879828155, "step": 5232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 253.25, "completions/max_terminated_length": 249.6875, "completions/mean_length": 217.044921875, "completions/mean_terminated_length": 215.57467937469482, "completions/min_length": 174.4375, "completions/min_terminated_length": 174.4375, "entropy": 0.07887168414890766, "epoch": 5.100097181729835, "frac_reward_zero_std": 0.40234375, "grad_norm": 0.100493885576725, "learning_rate": 5e-05, "loss": 0.0005, "num_tokens": 430597164.0, "reward": 12.71329003572464, "reward_std": 0.7885365970432758, "rewards/bm25_retrieval_reward_fn/mean": 0.9540953673422337, "rewards/bm25_retrieval_reward_fn/std": 0.14646161976270378, "rewards/event_reward_fn/mean": 10.8056640625, "rewards/event_reward_fn/std": 6.382747828960419, "rewards/format_reward_fn/mean": 0.9535306617617607, "rewards/format_reward_fn/std": 0.15756959468126297, "step": 5248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0517578125, "completions/max_length": 255.6875, "completions/max_terminated_length": 249.375, "completions/mean_length": 215.978515625, "completions/mean_terminated_length": 213.8056936264038, "completions/min_length": 172.3125, "completions/min_terminated_length": 172.3125, "entropy": 0.07278733002021909, "epoch": 5.115646258503402, "frac_reward_zero_std": 0.41015625, "grad_norm": 0.10533929616212845, "learning_rate": 5e-05, "loss": 0.0052, "num_tokens": 431979870.0, "reward": 12.67308360338211, "reward_std": 0.8340425789356232, "rewards/bm25_retrieval_reward_fn/mean": 0.9266472458839417, "rewards/bm25_retrieval_reward_fn/std": 0.2227602507919073, "rewards/event_reward_fn/mean": 10.8173828125, "rewards/event_reward_fn/std": 6.615099906921387, "rewards/format_reward_fn/mean": 0.9290535151958466, "rewards/format_reward_fn/std": 0.22935187257826328, "step": 5264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0166015625, "completions/max_length": 254.375, "completions/max_terminated_length": 251.125, "completions/mean_length": 215.640625, "completions/mean_terminated_length": 214.95352268218994, "completions/min_length": 173.5, "completions/min_terminated_length": 173.5, "entropy": 0.07842768542468548, "epoch": 5.131195335276968, "frac_reward_zero_std": 0.38671875, "grad_norm": 0.0805554911494255, "learning_rate": 5e-05, "loss": -0.0007, "num_tokens": 433291126.0, "reward": 12.185501873493195, "reward_std": 0.7526118885725737, "rewards/bm25_retrieval_reward_fn/mean": 0.9601360224187374, "rewards/bm25_retrieval_reward_fn/std": 0.13111005796235986, "rewards/event_reward_fn/mean": 10.2646484375, "rewards/event_reward_fn/std": 6.194005638360977, "rewards/format_reward_fn/mean": 0.9607173651456833, "rewards/format_reward_fn/std": 0.14473758242093027, "step": 5280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 253.0625, "completions/max_terminated_length": 252.0, "completions/mean_length": 216.375, "completions/mean_terminated_length": 215.49572944641113, "completions/min_length": 170.0625, "completions/min_terminated_length": 170.0625, "entropy": 0.08066530339419842, "epoch": 5.146744412050534, "frac_reward_zero_std": 0.36328125, "grad_norm": 0.15137051045894623, "learning_rate": 5e-05, "loss": 0.0003, "num_tokens": 434651818.0, "reward": 12.391696512699127, "reward_std": 0.8405030891299248, "rewards/bm25_retrieval_reward_fn/mean": 0.9705284647643566, "rewards/bm25_retrieval_reward_fn/std": 0.10062799096340314, "rewards/event_reward_fn/mean": 10.4541015625, "rewards/event_reward_fn/std": 5.5794040858745575, "rewards/format_reward_fn/mean": 0.9670664444565773, "rewards/format_reward_fn/std": 0.12125032884068787, "step": 5296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0185546875, "completions/max_length": 253.0625, "completions/max_terminated_length": 250.75, "completions/mean_length": 214.7646484375, "completions/mean_terminated_length": 214.00485610961914, "completions/min_length": 173.375, "completions/min_terminated_length": 173.375, "entropy": 0.08317997679114342, "epoch": 5.162293488824101, "frac_reward_zero_std": 0.3828125, "grad_norm": 0.1106635183095932, "learning_rate": 5e-05, "loss": -0.0003, "num_tokens": 436045961.0, "reward": 12.810622453689575, "reward_std": 0.8223424591124058, "rewards/bm25_retrieval_reward_fn/mean": 0.960607685148716, "rewards/bm25_retrieval_reward_fn/std": 0.13043461105553433, "rewards/event_reward_fn/mean": 10.8955078125, "rewards/event_reward_fn/std": 6.705768942832947, "rewards/format_reward_fn/mean": 0.9545068889856339, "rewards/format_reward_fn/std": 0.16276416694745421, "step": 5312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0244140625, "completions/max_length": 253.5625, "completions/max_terminated_length": 250.0, "completions/mean_length": 212.7001953125, "completions/mean_terminated_length": 211.5588846206665, "completions/min_length": 165.875, "completions/min_terminated_length": 165.875, "entropy": 0.07951848162338138, "epoch": 5.177842565597667, "frac_reward_zero_std": 0.40625, "grad_norm": 0.08208124339580536, "learning_rate": 5e-05, "loss": -0.0005, "num_tokens": 437368618.0, "reward": 12.59171849489212, "reward_std": 0.7564017958939075, "rewards/bm25_retrieval_reward_fn/mean": 0.952126257121563, "rewards/bm25_retrieval_reward_fn/std": 0.14843681757338345, "rewards/event_reward_fn/mean": 10.6962890625, "rewards/event_reward_fn/std": 5.812787741422653, "rewards/format_reward_fn/mean": 0.9433031603693962, "rewards/format_reward_fn/std": 0.16410461044870317, "step": 5328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 253.375, "completions/max_terminated_length": 251.25, "completions/mean_length": 216.083984375, "completions/mean_terminated_length": 215.36172103881836, "completions/min_length": 177.1875, "completions/min_terminated_length": 177.1875, "entropy": 0.08092350885272026, "epoch": 5.1933916423712345, "frac_reward_zero_std": 0.3125, "grad_norm": 0.18105757236480713, "learning_rate": 5e-05, "loss": -0.001, "num_tokens": 438724608.0, "reward": 12.999197125434875, "reward_std": 0.9012727215886116, "rewards/bm25_retrieval_reward_fn/mean": 0.9417021572589874, "rewards/bm25_retrieval_reward_fn/std": 0.17351033969316632, "rewards/event_reward_fn/mean": 11.1259765625, "rewards/event_reward_fn/std": 5.892582669854164, "rewards/format_reward_fn/mean": 0.9315183274447918, "rewards/format_reward_fn/std": 0.18927003536373377, "step": 5344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0341796875, "completions/max_length": 254.625, "completions/max_terminated_length": 251.5, "completions/mean_length": 215.775390625, "completions/mean_terminated_length": 214.33139896392822, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.08449570368975401, "epoch": 5.208940719144801, "frac_reward_zero_std": 0.39453125, "grad_norm": 0.07373011112213135, "learning_rate": 5e-05, "loss": 0.0017, "num_tokens": 439989246.0, "reward": 11.719346940517426, "reward_std": 0.6102266386151314, "rewards/bm25_retrieval_reward_fn/mean": 0.9474032297730446, "rewards/bm25_retrieval_reward_fn/std": 0.1586525976890698, "rewards/event_reward_fn/mean": 9.826171875, "rewards/event_reward_fn/std": 5.788570657372475, "rewards/format_reward_fn/mean": 0.9457719549536705, "rewards/format_reward_fn/std": 0.176345658255741, "step": 5360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 250.3125, "completions/max_terminated_length": 249.1875, "completions/mean_length": 213.5673828125, "completions/mean_terminated_length": 213.04458808898926, "completions/min_length": 174.0625, "completions/min_terminated_length": 174.0625, "entropy": 0.08259732741862535, "epoch": 5.224489795918367, "frac_reward_zero_std": 0.4140625, "grad_norm": 0.09539102017879486, "learning_rate": 5e-05, "loss": -0.0017, "num_tokens": 441313295.0, "reward": 12.156057178974152, "reward_std": 0.6409936174750328, "rewards/bm25_retrieval_reward_fn/mean": 0.9664278998970985, "rewards/bm25_retrieval_reward_fn/std": 0.0996909779496491, "rewards/event_reward_fn/mean": 10.23046875, "rewards/event_reward_fn/std": 6.2610200345516205, "rewards/format_reward_fn/mean": 0.9591603875160217, "rewards/format_reward_fn/std": 0.11145789409056306, "step": 5376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 253.125, "completions/max_terminated_length": 250.1875, "completions/mean_length": 214.4453125, "completions/mean_terminated_length": 213.2188024520874, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.0805953610688448, "epoch": 5.240038872691934, "frac_reward_zero_std": 0.37890625, "grad_norm": 0.1731644719839096, "learning_rate": 5e-05, "loss": -0.0044, "num_tokens": 442696435.0, "reward": 12.83944684267044, "reward_std": 0.8571697734296322, "rewards/bm25_retrieval_reward_fn/mean": 0.9432271271944046, "rewards/bm25_retrieval_reward_fn/std": 0.17556072783190757, "rewards/event_reward_fn/mean": 10.9716796875, "rewards/event_reward_fn/std": 6.772303909063339, "rewards/format_reward_fn/mean": 0.9245400987565517, "rewards/format_reward_fn/std": 0.20365634886547923, "step": 5392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0263671875, "completions/max_length": 254.75, "completions/max_terminated_length": 252.3125, "completions/mean_length": 219.8271484375, "completions/mean_terminated_length": 218.86177444458008, "completions/min_length": 183.75, "completions/min_terminated_length": 183.75, "entropy": 0.08317722985520959, "epoch": 5.2555879494655, "frac_reward_zero_std": 0.41796875, "grad_norm": 0.12763230502605438, "learning_rate": 5e-05, "loss": 0.0011, "num_tokens": 443952362.0, "reward": 11.955138087272644, "reward_std": 0.7103091701865196, "rewards/bm25_retrieval_reward_fn/mean": 0.946000337600708, "rewards/bm25_retrieval_reward_fn/std": 0.1564433122985065, "rewards/event_reward_fn/mean": 10.068359375, "rewards/event_reward_fn/std": 6.221550449728966, "rewards/format_reward_fn/mean": 0.9407782070338726, "rewards/format_reward_fn/std": 0.15689158393070102, "step": 5408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 254.875, "completions/max_terminated_length": 251.6875, "completions/mean_length": 220.3798828125, "completions/mean_terminated_length": 218.413254737854, "completions/min_length": 178.125, "completions/min_terminated_length": 178.125, "entropy": 0.0799885387532413, "epoch": 5.271137026239067, "frac_reward_zero_std": 0.37890625, "grad_norm": 0.19113846123218536, "learning_rate": 5e-05, "loss": 0.0014, "num_tokens": 445303815.0, "reward": 12.331937193870544, "reward_std": 0.80683533847332, "rewards/bm25_retrieval_reward_fn/mean": 0.9195377230644226, "rewards/bm25_retrieval_reward_fn/std": 0.22412633727071807, "rewards/event_reward_fn/mean": 10.5029296875, "rewards/event_reward_fn/std": 6.035468250513077, "rewards/format_reward_fn/mean": 0.9094697572290897, "rewards/format_reward_fn/std": 0.23325852677226067, "step": 5424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 254.875, "completions/max_terminated_length": 253.25, "completions/mean_length": 219.4560546875, "completions/mean_terminated_length": 218.1432819366455, "completions/min_length": 179.625, "completions/min_terminated_length": 179.625, "entropy": 0.08487229980528355, "epoch": 5.2866861030126335, "frac_reward_zero_std": 0.3828125, "grad_norm": 0.15567469596862793, "learning_rate": 5e-05, "loss": 0.002, "num_tokens": 446613174.0, "reward": 12.17100590467453, "reward_std": 0.8579323226585984, "rewards/bm25_retrieval_reward_fn/mean": 0.9449139796197414, "rewards/bm25_retrieval_reward_fn/std": 0.1787831949768588, "rewards/event_reward_fn/mean": 10.2880859375, "rewards/event_reward_fn/std": 5.768570572137833, "rewards/format_reward_fn/mean": 0.9380059503018856, "rewards/format_reward_fn/std": 0.20308683020994067, "step": 5440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0341796875, "completions/max_length": 252.875, "completions/max_terminated_length": 251.25, "completions/mean_length": 218.060546875, "completions/mean_terminated_length": 216.607084274292, "completions/min_length": 173.25, "completions/min_terminated_length": 173.25, "entropy": 0.08181583508849144, "epoch": 5.3022351797862, "frac_reward_zero_std": 0.3984375, "grad_norm": 0.0759691596031189, "learning_rate": 5e-05, "loss": 0.0023, "num_tokens": 447935032.0, "reward": 12.78515636920929, "reward_std": 0.7136668600142002, "rewards/bm25_retrieval_reward_fn/mean": 0.9424515776336193, "rewards/bm25_retrieval_reward_fn/std": 0.16937226109439507, "rewards/event_reward_fn/mean": 10.8974609375, "rewards/event_reward_fn/std": 6.170664951205254, "rewards/format_reward_fn/mean": 0.9452439360320568, "rewards/format_reward_fn/std": 0.17747941031120718, "step": 5456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0244140625, "completions/max_length": 252.5625, "completions/max_terminated_length": 250.125, "completions/mean_length": 216.1611328125, "completions/mean_terminated_length": 215.20103359222412, "completions/min_length": 176.5, "completions/min_terminated_length": 176.5, "entropy": 0.0815726825967431, "epoch": 5.317784256559767, "frac_reward_zero_std": 0.33203125, "grad_norm": 0.08463463187217712, "learning_rate": 5e-05, "loss": 0.001, "num_tokens": 449236385.0, "reward": 12.28379362821579, "reward_std": 0.7982866708189249, "rewards/bm25_retrieval_reward_fn/mean": 0.9552294872701168, "rewards/bm25_retrieval_reward_fn/std": 0.13586847530677915, "rewards/event_reward_fn/mean": 10.3662109375, "rewards/event_reward_fn/std": 5.656336680054665, "rewards/format_reward_fn/mean": 0.962353054434061, "rewards/format_reward_fn/std": 0.13789360341615975, "step": 5472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0263671875, "completions/max_length": 253.3125, "completions/max_terminated_length": 251.5, "completions/mean_length": 217.390625, "completions/mean_terminated_length": 216.35210609436035, "completions/min_length": 176.8125, "completions/min_terminated_length": 176.8125, "entropy": 0.08938139118254185, "epoch": 5.333333333333333, "frac_reward_zero_std": 0.36328125, "grad_norm": 0.1346571296453476, "learning_rate": 5e-05, "loss": 0.0015, "num_tokens": 450614525.0, "reward": 12.205934286117554, "reward_std": 0.8306602947413921, "rewards/bm25_retrieval_reward_fn/mean": 0.9631715565919876, "rewards/bm25_retrieval_reward_fn/std": 0.1261273269483354, "rewards/event_reward_fn/mean": 10.283203125, "rewards/event_reward_fn/std": 5.946963086724281, "rewards/format_reward_fn/mean": 0.9595597796142101, "rewards/format_reward_fn/std": 0.14464661804959178, "step": 5488 } ], "logging_steps": 16, "max_steps": 10290, "num_input_tokens_seen": 451600012, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }