diff --git "a/lora/lora-stage3/trainer_state.json" "b/lora/lora-stage3/trainer_state.json" new file mode 100644--- /dev/null +++ "b/lora/lora-stage3/trainer_state.json" @@ -0,0 +1,8107 @@ +{ + "best_global_step": 120, + "best_metric": 0.00398228, + "best_model_checkpoint": "/data/haobin/pky_train/qwen3_swift/pky_out/qwen3asr_dapo_reward5_3x8x8_12gen_3GPU/v3-20260410-173721/checkpoint-120", + "epoch": 0.14936519790888722, + "eval_steps": 20, + "global_step": 1400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.5, + "completions/mean_length": 54.968750953674316, + "completions/min_length": 38.5, + "epoch": 0.0001066894270777766, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5366653203964233, + "kl": 0.0, + "learning_rate": 5.924170616113745e-08, + "loss": -0.006252682767808437, + "reward": -0.008284313604235649, + "reward_std": 0.14971118047833443, + "rewards/ASRWerHalluLenRewardV5/mean": -0.00828430987894535, + "rewards/ASRWerHalluLenRewardV5/std": 0.37644892558455467, + "step": 1, + "step_time": 32.383059789426625 + }, + { + "clip_ratio/high_max": 0.010287621029419824, + "clip_ratio/high_mean": 0.005885126573048183, + "clip_ratio/low_mean": 0.010550106904702261, + "clip_ratio/low_min": 0.0038332803414959926, + "clip_ratio/region_mean": 0.016435233688753215, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.5625, + "completions/mean_length": 49.605469942092896, + "completions/min_length": 33.0, + "epoch": 0.000533447135388883, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3757129907608032, + "kl": 0.004078771042259177, + "learning_rate": 2.962085308056872e-07, + "loss": -0.004787730984389782, + "reward": -0.0559620619751513, + "reward_std": 0.11525698751211166, + "rewards/ASRWerHalluLenRewardV5/mean": -0.05596206639893353, + "rewards/ASRWerHalluLenRewardV5/std": 0.33463087398558855, + "step": 5, + "step_time": 32.695561482803896 + }, + { + "clip_ratio/high_max": 0.02050668186857365, + "clip_ratio/high_mean": 0.011801439117698464, + "clip_ratio/low_mean": 0.015632632881170137, + "clip_ratio/low_min": 0.005662010301603004, + "clip_ratio/region_mean": 0.02743407236994244, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.8, + "completions/mean_length": 46.09166793823242, + "completions/min_length": 25.95, + "epoch": 0.001066894270777766, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4245464503765106, + "kl": 0.006405531693599187, + "learning_rate": 5.924170616113744e-07, + "loss": 0.004541196674108505, + "reward": -0.052772215008735655, + "reward_std": 0.1341860804706812, + "rewards/ASRWerHalluLenRewardV5/mean": -0.05277221612632275, + "rewards/ASRWerHalluLenRewardV5/std": 0.3885686233639717, + "step": 10, + "step_time": 30.932701653800905 + }, + { + "clip_ratio/high_max": 0.016934940381906925, + "clip_ratio/high_mean": 0.008885953362914733, + "clip_ratio/low_mean": 0.018656761306920087, + "clip_ratio/low_min": 0.007387336596730165, + "clip_ratio/region_mean": 0.027542714396258817, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.6, + "completions/mean_length": 49.7895845413208, + "completions/min_length": 31.15, + "epoch": 0.0016003414061666488, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35864728689193726, + "kl": 0.005789317585004028, + "learning_rate": 8.886255924170616e-07, + "loss": 0.02560996413230896, + "reward": -0.1326259233057499, + "reward_std": 0.12834879010915756, + "rewards/ASRWerHalluLenRewardV5/mean": -0.1326259197667241, + "rewards/ASRWerHalluLenRewardV5/std": 0.3399562261998653, + "step": 15, + "step_time": 31.15337351411581 + }, + { + "clip_ratio/high_max": 0.018067091691773386, + "clip_ratio/high_mean": 0.010425598973233719, + "clip_ratio/low_mean": 0.016544708141009324, + "clip_ratio/low_min": 0.0060993719496764244, + "clip_ratio/region_mean": 0.026970307307783513, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.6, + "completions/mean_length": 49.05104274749756, + "completions/min_length": 33.7, + "epoch": 0.002133788541555532, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.40666571259498596, + "kl": 0.007977236385340803, + "learning_rate": 1.1848341232227488e-06, + "loss": -3.11974436044693e-05, + "reward": -0.002711239829659462, + "reward_std": 0.13073199540376662, + "rewards/ASRWerHalluLenRewardV5/mean": -0.0027112421579658986, + "rewards/ASRWerHalluLenRewardV5/std": 0.3654151536524296, + "step": 20, + "step_time": 30.926650398410857 + }, + { + "epoch": 0.002133788541555532, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 60.38028169014085, + "eval_completions/mean_length": 47.48709052717182, + "eval_completions/min_length": 33.816901408450704, + "eval_frac_reward_zero_std": 0.14553990960121155, + "eval_kl": 0.0038592269938048894, + "eval_loss": 0.004625469911843538, + "eval_reward": 0.5383826068694323, + "eval_reward_std": 0.07035777712581863, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.5383825907090181, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3283752265950324, + "eval_runtime": 348.7194, + "eval_samples_per_second": 0.614, + "eval_steps_per_second": 0.052, + "step": 20 + }, + { + "clip_ratio/high_max": 0.02173188276938163, + "clip_ratio/high_mean": 0.011891455961449537, + "clip_ratio/low_mean": 0.016158911694947164, + "clip_ratio/low_min": 0.007344893465051428, + "clip_ratio/region_mean": 0.028050367522519083, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.85, + "completions/mean_length": 51.607293128967285, + "completions/min_length": 36.6, + "epoch": 0.002667235676944415, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35587191581726074, + "kl": 0.006316223932662979, + "learning_rate": 1.4810426540284362e-06, + "loss": -0.005569204688072205, + "reward": -0.03285313583910465, + "reward_std": 0.11189563143998385, + "rewards/ASRWerHalluLenRewardV5/mean": -0.0328531313687563, + "rewards/ASRWerHalluLenRewardV5/std": 0.38822815343737604, + "step": 25, + "step_time": 32.50153556242585 + }, + { + "clip_ratio/high_max": 0.019681457325350492, + "clip_ratio/high_mean": 0.011581507485243493, + "clip_ratio/low_mean": 0.014346418644709046, + "clip_ratio/low_min": 0.004216273431666195, + "clip_ratio/region_mean": 0.0259279259305913, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.15, + "completions/mean_length": 50.61875114440918, + "completions/min_length": 35.5, + "epoch": 0.0032006828123332977, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36899587512016296, + "kl": 0.0058037140828673724, + "learning_rate": 1.7772511848341232e-06, + "loss": -0.017619018256664277, + "reward": 0.027506281994283198, + "reward_std": 0.11301573794335126, + "rewards/ASRWerHalluLenRewardV5/mean": 0.027506274916231633, + "rewards/ASRWerHalluLenRewardV5/std": 0.3467141596600413, + "step": 30, + "step_time": 31.22732403241098 + }, + { + "clip_ratio/high_max": 0.017552763904677703, + "clip_ratio/high_mean": 0.009895933838561178, + "clip_ratio/low_mean": 0.01608712183806347, + "clip_ratio/low_min": 0.00540842063492164, + "clip_ratio/region_mean": 0.025983055820688605, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.95, + "completions/mean_length": 52.19375133514404, + "completions/min_length": 32.8, + "epoch": 0.0037341299477221808, + "frac_reward_zero_std": 0.0, + "grad_norm": 15.277986526489258, + "kl": 0.1584522240795195, + "learning_rate": 2.0734597156398104e-06, + "loss": 0.014190158247947693, + "reward": 0.062482820078730586, + "reward_std": 0.11723268739879131, + "rewards/ASRWerHalluLenRewardV5/mean": 0.06248282203450799, + "rewards/ASRWerHalluLenRewardV5/std": 0.3866789147257805, + "step": 35, + "step_time": 31.873961539939046 + }, + { + "clip_ratio/high_max": 0.018186163157224655, + "clip_ratio/high_mean": 0.010431363967654761, + "clip_ratio/low_mean": 0.019862729392480107, + "clip_ratio/low_min": 0.007713729102397338, + "clip_ratio/region_mean": 0.03029409329756163, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.7, + "completions/mean_length": 50.00416793823242, + "completions/min_length": 33.05, + "epoch": 0.004267577083111064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.40463021397590637, + "kl": 0.006458690573344939, + "learning_rate": 2.3696682464454976e-06, + "loss": 0.003388461098074913, + "reward": -0.09057582542300224, + "reward_std": 0.13050497882068157, + "rewards/ASRWerHalluLenRewardV5/mean": -0.0905758316628635, + "rewards/ASRWerHalluLenRewardV5/std": 0.3859805755317211, + "step": 40, + "step_time": 30.857379550114274 + }, + { + "epoch": 0.004267577083111064, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 60.774647887323944, + "eval_completions/mean_length": 47.62206703508404, + "eval_completions/min_length": 33.67605633802817, + "eval_frac_reward_zero_std": 0.12676056715804088, + "eval_kl": 0.003688746271736946, + "eval_loss": 0.00750101450830698, + "eval_reward": 0.5368548375882314, + "eval_reward_std": 0.07582821484497736, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.5368548211392382, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.32948720885414473, + "eval_runtime": 347.711, + "eval_samples_per_second": 0.615, + "eval_steps_per_second": 0.052, + "step": 40 + }, + { + "clip_ratio/high_max": 0.019021007767878472, + "clip_ratio/high_mean": 0.011180857103317976, + "clip_ratio/low_mean": 0.015716493135550992, + "clip_ratio/low_min": 0.00532691280823201, + "clip_ratio/region_mean": 0.026897350046783685, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.55, + "completions/mean_length": 48.30104293823242, + "completions/min_length": 31.9, + "epoch": 0.0048010242184999465, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3601289391517639, + "kl": 0.005931287228304427, + "learning_rate": 2.665876777251185e-06, + "loss": -0.0021273791790008545, + "reward": -0.046650438010692595, + "reward_std": 0.13604085817933081, + "rewards/ASRWerHalluLenRewardV5/mean": -0.04665043377317488, + "rewards/ASRWerHalluLenRewardV5/std": 0.37722305208444595, + "step": 45, + "step_time": 32.072614601254465 + }, + { + "clip_ratio/high_max": 0.021707691345363857, + "clip_ratio/high_mean": 0.01193708685459569, + "clip_ratio/low_mean": 0.015254302670655306, + "clip_ratio/low_min": 0.005419506985344924, + "clip_ratio/region_mean": 0.02719138945103623, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.55, + "completions/mean_length": 48.147917938232425, + "completions/min_length": 32.2, + "epoch": 0.00533447135388883, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31121569871902466, + "kl": 0.006634783888875973, + "learning_rate": 2.9620853080568724e-06, + "loss": -0.0024803981184959413, + "reward": 0.07556772343814373, + "reward_std": 0.11609874516725541, + "rewards/ASRWerHalluLenRewardV5/mean": 0.07556771684903651, + "rewards/ASRWerHalluLenRewardV5/std": 0.4364402830600739, + "step": 50, + "step_time": 30.61926589012146 + }, + { + "clip_ratio/high_max": 0.01914288430707529, + "clip_ratio/high_mean": 0.010745798880816438, + "clip_ratio/low_mean": 0.016522686363896356, + "clip_ratio/low_min": 0.005739973881281913, + "clip_ratio/region_mean": 0.02726848509046249, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.35, + "completions/mean_length": 47.98854312896729, + "completions/min_length": 30.95, + "epoch": 0.005867918489277713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36357131600379944, + "kl": 0.006701874089776539, + "learning_rate": 3.2582938388625596e-06, + "loss": -0.0071951538324356076, + "reward": -0.03313510902225971, + "reward_std": 0.13253900371491908, + "rewards/ASRWerHalluLenRewardV5/mean": -0.03313510593725368, + "rewards/ASRWerHalluLenRewardV5/std": 0.46602033972740176, + "step": 55, + "step_time": 31.07161899022758 + }, + { + "clip_ratio/high_max": 0.017760933103272692, + "clip_ratio/high_mean": 0.009427498732111416, + "clip_ratio/low_mean": 0.01392236549872905, + "clip_ratio/low_min": 0.0035754880256718023, + "clip_ratio/region_mean": 0.02334986422210932, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/mean_length": 47.13020992279053, + "completions/min_length": 28.55, + "epoch": 0.006401365624666595, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4977560341358185, + "kl": 0.006124603931675665, + "learning_rate": 3.5545023696682464e-06, + "loss": 0.001441839709877968, + "reward": 0.13277553468942643, + "reward_std": 0.12494266256690026, + "rewards/ASRWerHalluLenRewardV5/mean": 0.13277551881037652, + "rewards/ASRWerHalluLenRewardV5/std": 0.4259007595479488, + "step": 60, + "step_time": 32.842594062909484 + }, + { + "epoch": 0.006401365624666595, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 60.521126760563384, + "eval_completions/mean_length": 47.40258337746204, + "eval_completions/min_length": 33.732394366197184, + "eval_frac_reward_zero_std": 0.15023474626138178, + "eval_kl": 0.004562647315357077, + "eval_loss": 0.009821569547057152, + "eval_reward": 0.5370557949760221, + "eval_reward_std": 0.07722115885256461, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.5370557765725633, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.33296022585160295, + "eval_runtime": 348.5551, + "eval_samples_per_second": 0.614, + "eval_steps_per_second": 0.052, + "step": 60 + }, + { + "clip_ratio/high_max": 0.020163887232774867, + "clip_ratio/high_mean": 0.01117994584783446, + "clip_ratio/low_mean": 0.01615895590512082, + "clip_ratio/low_min": 0.005512208543950692, + "clip_ratio/region_mean": 0.027338901674374937, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.95, + "completions/mean_length": 51.525000953674315, + "completions/min_length": 32.8, + "epoch": 0.006934812760055479, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42790961265563965, + "kl": 0.007371518810396082, + "learning_rate": 3.850710900473934e-06, + "loss": 0.0018063774332404137, + "reward": -0.014136556535959244, + "reward_std": 0.11159702204167843, + "rewards/ASRWerHalluLenRewardV5/mean": -0.014136549923568964, + "rewards/ASRWerHalluLenRewardV5/std": 0.4193071097135544, + "step": 65, + "step_time": 31.329416749812662 + }, + { + "clip_ratio/high_max": 0.02178742127143778, + "clip_ratio/high_mean": 0.011861002063960768, + "clip_ratio/low_mean": 0.018375673788250425, + "clip_ratio/low_min": 0.007440098785446025, + "clip_ratio/region_mean": 0.030236676055938005, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.95, + "completions/mean_length": 48.18958435058594, + "completions/min_length": 31.45, + "epoch": 0.0074682598954443615, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5728639364242554, + "kl": 0.008765012258663774, + "learning_rate": 4.146919431279621e-06, + "loss": 0.012148426473140716, + "reward": -0.03257287405431271, + "reward_std": 0.13708484694361686, + "rewards/ASRWerHalluLenRewardV5/mean": -0.032572878804057834, + "rewards/ASRWerHalluLenRewardV5/std": 0.39438444674015044, + "step": 70, + "step_time": 29.996175704710186 + }, + { + "clip_ratio/high_max": 0.019824933662312104, + "clip_ratio/high_mean": 0.011659608899208251, + "clip_ratio/low_mean": 0.015122921610600314, + "clip_ratio/low_min": 0.004419159051030875, + "clip_ratio/region_mean": 0.02678253058111295, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.45, + "completions/mean_length": 49.65833435058594, + "completions/min_length": 33.3, + "epoch": 0.008001707030833244, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3458283841609955, + "kl": 0.00814168702927418, + "learning_rate": 4.443127962085308e-06, + "loss": 0.01824360489845276, + "reward": -0.05196817144751549, + "reward_std": 0.10938869081437588, + "rewards/ASRWerHalluLenRewardV5/mean": -0.051968176010996105, + "rewards/ASRWerHalluLenRewardV5/std": 0.3643105633556843, + "step": 75, + "step_time": 33.35227437056601 + }, + { + "clip_ratio/high_max": 0.0185696501925122, + "clip_ratio/high_mean": 0.009912336000707001, + "clip_ratio/low_mean": 0.016785736108431593, + "clip_ratio/low_min": 0.007310686149867252, + "clip_ratio/region_mean": 0.026698072062572464, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.05, + "completions/mean_length": 48.489584350585936, + "completions/min_length": 29.2, + "epoch": 0.008535154166222128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7358887791633606, + "kl": 0.010421078791841865, + "learning_rate": 4.739336492890995e-06, + "loss": 0.0003320907708257437, + "reward": -0.021024424768984317, + "reward_std": 0.14184249006211758, + "rewards/ASRWerHalluLenRewardV5/mean": -0.021024424210190774, + "rewards/ASRWerHalluLenRewardV5/std": 0.3504705406725407, + "step": 80, + "step_time": 32.09752713683993 + }, + { + "epoch": 0.008535154166222128, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 59.33802816901409, + "eval_completions/mean_length": 47.149062250701476, + "eval_completions/min_length": 33.49295774647887, + "eval_frac_reward_zero_std": 0.1173708955167045, + "eval_kl": 0.0072064965005806635, + "eval_loss": 0.005555253475904465, + "eval_reward": 0.5441842313960824, + "eval_reward_std": 0.07381102799529761, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.5441842137665396, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.32423653920561496, + "eval_runtime": 344.1557, + "eval_samples_per_second": 0.622, + "eval_steps_per_second": 0.052, + "step": 80 + }, + { + "clip_ratio/high_max": 0.02332109396811575, + "clip_ratio/high_mean": 0.012450982483278495, + "clip_ratio/low_mean": 0.016875270087621175, + "clip_ratio/low_min": 0.005332258256385103, + "clip_ratio/region_mean": 0.029326252941973507, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/mean_length": 50.14791774749756, + "completions/min_length": 36.0, + "epoch": 0.00906860130161101, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8312454223632812, + "kl": 0.011362565046874806, + "learning_rate": 5.035545023696683e-06, + "loss": 0.0045444205403327945, + "reward": -0.06997929234057665, + "reward_std": 0.10606737714260817, + "rewards/ASRWerHalluLenRewardV5/mean": -0.06997928991913796, + "rewards/ASRWerHalluLenRewardV5/std": 0.4226209603250027, + "step": 85, + "step_time": 31.655663657188416 + }, + { + "clip_ratio/high_max": 0.019430955633288248, + "clip_ratio/high_mean": 0.010579835224780254, + "clip_ratio/low_mean": 0.017467756096448284, + "clip_ratio/low_min": 0.006065765177481808, + "clip_ratio/region_mean": 0.028047591057838873, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.2, + "completions/mean_length": 47.460417938232425, + "completions/min_length": 31.5, + "epoch": 0.009602048436999893, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39156168699264526, + "kl": 0.011836530466098338, + "learning_rate": 5.33175355450237e-06, + "loss": 0.002175554260611534, + "reward": -0.018728679046034813, + "reward_std": 0.12494124695658684, + "rewards/ASRWerHalluLenRewardV5/mean": -0.01872868384234607, + "rewards/ASRWerHalluLenRewardV5/std": 0.4319018803536892, + "step": 90, + "step_time": 31.185784369893373 + }, + { + "clip_ratio/high_max": 0.020193112036213278, + "clip_ratio/high_mean": 0.010662357379624154, + "clip_ratio/low_mean": 0.016402664968336467, + "clip_ratio/low_min": 0.005929990948061459, + "clip_ratio/region_mean": 0.02706502246146556, + "completions/clipped_ratio": 0.0010416666666666667, + "completions/max_length": 83.5, + "completions/mean_length": 47.66562614440918, + "completions/min_length": 27.3, + "epoch": 0.010135495572388777, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4193708002567291, + "kl": 0.012001729826442898, + "learning_rate": 5.627962085308057e-06, + "loss": -0.003953045234084129, + "reward": 0.07124377898871899, + "reward_std": 0.13295698612928392, + "rewards/ASRWerHalluLenRewardV5/mean": 0.07124377889558672, + "rewards/ASRWerHalluLenRewardV5/std": 0.3788189634680748, + "step": 95, + "step_time": 31.573507184907793 + }, + { + "clip_ratio/high_max": 0.02024620698066428, + "clip_ratio/high_mean": 0.011164145612565336, + "clip_ratio/low_mean": 0.015372579709219281, + "clip_ratio/low_min": 0.00503248346503824, + "clip_ratio/region_mean": 0.026536725042387844, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/mean_length": 52.568751907348634, + "completions/min_length": 34.15, + "epoch": 0.01066894270777766, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4211793839931488, + "kl": 0.012691801725304685, + "learning_rate": 5.924170616113745e-06, + "loss": -0.0076587356626987456, + "reward": 0.07220923751592637, + "reward_std": 0.11441648751497269, + "rewards/ASRWerHalluLenRewardV5/mean": 0.07220923444256186, + "rewards/ASRWerHalluLenRewardV5/std": 0.39604233130812644, + "step": 100, + "step_time": 32.535540885291994 + }, + { + "epoch": 0.01066894270777766, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 59.225352112676056, + "eval_completions/mean_length": 46.95892167427171, + "eval_completions/min_length": 33.225352112676056, + "eval_frac_reward_zero_std": 0.12206573133737268, + "eval_kl": 0.008503434528649169, + "eval_loss": 0.00551997683942318, + "eval_reward": 0.546006932398829, + "eval_reward_std": 0.08081549534600385, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.5460069172189269, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3250747287882046, + "eval_runtime": 344.3741, + "eval_samples_per_second": 0.621, + "eval_steps_per_second": 0.052, + "step": 100 + }, + { + "clip_ratio/high_max": 0.021404229156905784, + "clip_ratio/high_mean": 0.012123409798368812, + "clip_ratio/low_mean": 0.017754235699248964, + "clip_ratio/low_min": 0.005321850618929602, + "clip_ratio/region_mean": 0.029877645324449986, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.7, + "completions/mean_length": 47.65000114440918, + "completions/min_length": 28.6, + "epoch": 0.011202389843166542, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4571327567100525, + "kl": 0.01476139816804789, + "learning_rate": 6.2203791469194315e-06, + "loss": -0.004940593987703324, + "reward": 0.0013304379768669604, + "reward_std": 0.12287195809185505, + "rewards/ASRWerHalluLenRewardV5/mean": 0.001330439979210496, + "rewards/ASRWerHalluLenRewardV5/std": 0.3865403100848198, + "step": 105, + "step_time": 30.364824500493704 + }, + { + "clip_ratio/high_max": 0.01767902867286466, + "clip_ratio/high_mean": 0.00977460638678167, + "clip_ratio/low_mean": 0.015170802870125044, + "clip_ratio/low_min": 0.006389756489079445, + "clip_ratio/region_mean": 0.024945409613428636, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.1, + "completions/mean_length": 51.16250114440918, + "completions/min_length": 35.2, + "epoch": 0.011735836978555425, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47762957215309143, + "kl": 0.01329807786969468, + "learning_rate": 6.516587677725119e-06, + "loss": 0.0077606581151485445, + "reward": 0.10777070559561253, + "reward_std": 0.12235365323722362, + "rewards/ASRWerHalluLenRewardV5/mean": 0.10777069800533354, + "rewards/ASRWerHalluLenRewardV5/std": 0.41718641817569735, + "step": 110, + "step_time": 31.97713616490364 + }, + { + "clip_ratio/high_max": 0.020424583030398936, + "clip_ratio/high_mean": 0.011208168351731728, + "clip_ratio/low_mean": 0.018582070560660212, + "clip_ratio/low_min": 0.0064372346503660085, + "clip_ratio/region_mean": 0.029790239024441688, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.2, + "completions/mean_length": 50.21250133514404, + "completions/min_length": 33.2, + "epoch": 0.012269284113944309, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.41945403814315796, + "kl": 0.019057577889179812, + "learning_rate": 6.812796208530806e-06, + "loss": 0.004955665394663811, + "reward": 0.029572534561157226, + "reward_std": 0.12552662529051303, + "rewards/ASRWerHalluLenRewardV5/mean": 0.029572532139718534, + "rewards/ASRWerHalluLenRewardV5/std": 0.44832691326737406, + "step": 115, + "step_time": 30.59291108623147 + }, + { + "clip_ratio/high_max": 0.021269711764762177, + "clip_ratio/high_mean": 0.0114160802346305, + "clip_ratio/low_mean": 0.016993196256225928, + "clip_ratio/low_min": 0.006912675654166378, + "clip_ratio/region_mean": 0.028409276384627445, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.75, + "completions/mean_length": 49.394792556762695, + "completions/min_length": 34.55, + "epoch": 0.01280273124933319, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.40411245822906494, + "kl": 0.023217458743602037, + "learning_rate": 7.109004739336493e-06, + "loss": 0.00032006353139877317, + "reward": 0.08898027762770652, + "reward_std": 0.14333140589296817, + "rewards/ASRWerHalluLenRewardV5/mean": 0.08898027800023556, + "rewards/ASRWerHalluLenRewardV5/std": 0.35283839553594587, + "step": 120, + "step_time": 31.39442367646843 + }, + { + "epoch": 0.01280273124933319, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 59.23943661971831, + "eval_completions/mean_length": 46.897888559690664, + "eval_completions/min_length": 33.394366197183096, + "eval_frac_reward_zero_std": 0.09859155223403178, + "eval_kl": 0.014746761693894652, + "eval_loss": 0.0039822752587497234, + "eval_reward": 0.558984708303297, + "eval_reward_std": 0.07546814373681243, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.5589846969831368, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3246988824135821, + "eval_runtime": 346.2867, + "eval_samples_per_second": 0.618, + "eval_steps_per_second": 0.052, + "step": 120 + }, + { + "clip_ratio/high_max": 0.020703278377186506, + "clip_ratio/high_mean": 0.010940808754821774, + "clip_ratio/low_mean": 0.014421513644629158, + "clip_ratio/low_min": 0.0042489173705689606, + "clip_ratio/region_mean": 0.025362322211731226, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.05, + "completions/mean_length": 47.86979312896729, + "completions/min_length": 31.6, + "epoch": 0.013336178384722074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4666975140571594, + "kl": 0.032711091847158966, + "learning_rate": 7.40521327014218e-06, + "loss": -0.006730391085147858, + "reward": 0.0671119175851345, + "reward_std": 0.11250332370400429, + "rewards/ASRWerHalluLenRewardV5/mean": 0.06711191106587648, + "rewards/ASRWerHalluLenRewardV5/std": 0.4156209543347359, + "step": 125, + "step_time": 32.058955289050935 + }, + { + "clip_ratio/high_max": 0.022658733901334927, + "clip_ratio/high_mean": 0.012122854597691912, + "clip_ratio/low_mean": 0.018534100406395738, + "clip_ratio/low_min": 0.007235012511955574, + "clip_ratio/region_mean": 0.030656955519225447, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.55, + "completions/mean_length": 51.4395845413208, + "completions/min_length": 39.95, + "epoch": 0.013869625520110958, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4283663034439087, + "kl": 0.034679236507508905, + "learning_rate": 7.701421800947868e-06, + "loss": -0.00018229559063911437, + "reward": 0.09558267313987016, + "reward_std": 0.1321804028004408, + "rewards/ASRWerHalluLenRewardV5/mean": 0.09558266680687666, + "rewards/ASRWerHalluLenRewardV5/std": 0.3539563357830048, + "step": 130, + "step_time": 33.39920915141702 + }, + { + "clip_ratio/high_max": 0.02111506321525667, + "clip_ratio/high_mean": 0.012310835978132673, + "clip_ratio/low_mean": 0.020663138732197696, + "clip_ratio/low_min": 0.007812515075784177, + "clip_ratio/region_mean": 0.03297397454734892, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.95, + "completions/mean_length": 49.12187595367432, + "completions/min_length": 32.1, + "epoch": 0.01440307265549984, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.608925998210907, + "kl": 0.044405269692651926, + "learning_rate": 7.997630331753554e-06, + "loss": 0.024940022826194765, + "reward": -0.07751726172864437, + "reward_std": 0.11899946816265583, + "rewards/ASRWerHalluLenRewardV5/mean": -0.07751726545393467, + "rewards/ASRWerHalluLenRewardV5/std": 0.3918007381260395, + "step": 135, + "step_time": 31.242850332707167 + }, + { + "clip_ratio/high_max": 0.017632553266594186, + "clip_ratio/high_mean": 0.009537290791922715, + "clip_ratio/low_mean": 0.01616040244407486, + "clip_ratio/low_min": 0.006446195492753759, + "clip_ratio/region_mean": 0.02569769316469319, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.9, + "completions/mean_length": 48.98229293823242, + "completions/min_length": 34.2, + "epoch": 0.014936519790888723, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37133559584617615, + "kl": 0.03510341187939048, + "learning_rate": 8.293838862559241e-06, + "loss": 0.004928496479988098, + "reward": 0.10123668387532234, + "reward_std": 0.1148852489888668, + "rewards/ASRWerHalluLenRewardV5/mean": 0.10123667917214334, + "rewards/ASRWerHalluLenRewardV5/std": 0.41279625743627546, + "step": 140, + "step_time": 31.36708654742688 + }, + { + "epoch": 0.014936519790888723, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 59.690140845070424, + "eval_completions/mean_length": 47.18075249900281, + "eval_completions/min_length": 33.563380281690144, + "eval_frac_reward_zero_std": 0.1455399104407136, + "eval_kl": 0.020881212009838453, + "eval_loss": 0.007481948938220739, + "eval_reward": 0.5626472770695535, + "eval_reward_std": 0.07122734981313558, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.5626472720194241, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3187212126565651, + "eval_runtime": 350.5257, + "eval_samples_per_second": 0.611, + "eval_steps_per_second": 0.051, + "step": 140 + }, + { + "clip_ratio/high_max": 0.020523724029771984, + "clip_ratio/high_mean": 0.011241446096391883, + "clip_ratio/low_mean": 0.016257574982591904, + "clip_ratio/low_min": 0.005065255504450761, + "clip_ratio/region_mean": 0.02749902132782154, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.55, + "completions/mean_length": 51.103125953674315, + "completions/min_length": 32.0, + "epoch": 0.015469966926277607, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.38445141911506653, + "kl": 0.03802230341825634, + "learning_rate": 8.590047393364929e-06, + "loss": -0.0011128230020403863, + "reward": 0.037374222651124, + "reward_std": 0.12588206715881825, + "rewards/ASRWerHalluLenRewardV5/mean": 0.037374220043420794, + "rewards/ASRWerHalluLenRewardV5/std": 0.3537131272256374, + "step": 145, + "step_time": 32.75153214782476 + }, + { + "clip_ratio/high_max": 0.01970517839654349, + "clip_ratio/high_mean": 0.01175296965666348, + "clip_ratio/low_mean": 0.017909256499842742, + "clip_ratio/low_min": 0.006476977019337938, + "clip_ratio/region_mean": 0.0296622262801975, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.8, + "completions/mean_length": 51.166667747497556, + "completions/min_length": 35.1, + "epoch": 0.01600341406166649, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37139052152633667, + "kl": 0.04876039304072037, + "learning_rate": 8.886255924170617e-06, + "loss": 0.007619699090719223, + "reward": 0.03511717915534973, + "reward_std": 0.1188300896435976, + "rewards/ASRWerHalluLenRewardV5/mean": 0.03511717366054654, + "rewards/ASRWerHalluLenRewardV5/std": 0.3445649579167366, + "step": 150, + "step_time": 31.315648832544685 + }, + { + "clip_ratio/high_max": 0.02097206810140051, + "clip_ratio/high_mean": 0.010429006010235753, + "clip_ratio/low_mean": 0.017066870884445962, + "clip_ratio/low_min": 0.00556317325681448, + "clip_ratio/region_mean": 0.027495877057663164, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.8, + "completions/mean_length": 46.29896011352539, + "completions/min_length": 29.85, + "epoch": 0.016536861197055372, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4377763569355011, + "kl": 0.05460279862163588, + "learning_rate": 9.182464454976304e-06, + "loss": -0.0027389636263251303, + "reward": -0.012504154816269875, + "reward_std": 0.13179093189537525, + "rewards/ASRWerHalluLenRewardV5/mean": -0.01250415570102632, + "rewards/ASRWerHalluLenRewardV5/std": 0.4368879795074463, + "step": 155, + "step_time": 30.4072869323194 + }, + { + "clip_ratio/high_max": 0.02001944375806488, + "clip_ratio/high_mean": 0.011612351617077365, + "clip_ratio/low_mean": 0.015961025602882727, + "clip_ratio/low_min": 0.0058850467787124215, + "clip_ratio/region_mean": 0.02757337714720052, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.9, + "completions/mean_length": 48.7354175567627, + "completions/min_length": 31.6, + "epoch": 0.017070308332444255, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4728398621082306, + "kl": 0.057798243151046334, + "learning_rate": 9.47867298578199e-06, + "loss": -0.0022851999849081038, + "reward": -0.04133806936442852, + "reward_std": 0.12949439659714698, + "rewards/ASRWerHalluLenRewardV5/mean": -0.0413380709476769, + "rewards/ASRWerHalluLenRewardV5/std": 0.3404708057641983, + "step": 160, + "step_time": 31.877997374162078 + }, + { + "epoch": 0.017070308332444255, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 59.54929577464789, + "eval_completions/mean_length": 47.21596371288031, + "eval_completions/min_length": 33.80281690140845, + "eval_frac_reward_zero_std": 0.13145540297870906, + "eval_kl": 0.03429720881657387, + "eval_loss": 0.006774925626814365, + "eval_reward": 0.5550861409537389, + "eval_reward_std": 0.07036787266252746, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.5550861268396109, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3306792957682005, + "eval_runtime": 344.8127, + "eval_samples_per_second": 0.621, + "eval_steps_per_second": 0.052, + "step": 160 + }, + { + "clip_ratio/high_max": 0.021267260549939236, + "clip_ratio/high_mean": 0.011656555433000903, + "clip_ratio/low_mean": 0.016042765446763953, + "clip_ratio/low_min": 0.00495184087776579, + "clip_ratio/region_mean": 0.027699320862302555, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.75, + "completions/mean_length": 49.331250762939455, + "completions/min_length": 30.15, + "epoch": 0.01760375546783314, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27271902561187744, + "kl": 0.06805465505458415, + "learning_rate": 9.774881516587678e-06, + "loss": 0.0035282112658023834, + "reward": 0.11504089878872037, + "reward_std": 0.12474017217755318, + "rewards/ASRWerHalluLenRewardV5/mean": 0.11504089334048331, + "rewards/ASRWerHalluLenRewardV5/std": 0.34675780721008775, + "step": 165, + "step_time": 30.96511156000197 + }, + { + "clip_ratio/high_max": 0.021045279351528733, + "clip_ratio/high_mean": 0.010976889902667607, + "clip_ratio/low_mean": 0.02009057419054443, + "clip_ratio/low_min": 0.0072230004880111665, + "clip_ratio/region_mean": 0.031067463895305993, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.4, + "completions/mean_length": 48.930209732055665, + "completions/min_length": 30.6, + "epoch": 0.01813720260322202, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5032679438591003, + "kl": 0.1021467649610713, + "learning_rate": 1.0071090047393366e-05, + "loss": 0.016357582807540894, + "reward": 0.09934407910332084, + "reward_std": 0.12524006366729737, + "rewards/ASRWerHalluLenRewardV5/mean": 0.09934407541295513, + "rewards/ASRWerHalluLenRewardV5/std": 0.3913145959377289, + "step": 170, + "step_time": 31.04865035675466 + }, + { + "clip_ratio/high_max": 0.021596672298619522, + "clip_ratio/high_mean": 0.011499641911359503, + "clip_ratio/low_mean": 0.02003555408446118, + "clip_ratio/low_min": 0.007630924251861871, + "clip_ratio/region_mean": 0.03153519580373541, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.7, + "completions/mean_length": 49.25520973205566, + "completions/min_length": 30.1, + "epoch": 0.018670649738610903, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6155486106872559, + "kl": 0.084866996249184, + "learning_rate": 1.0367298578199053e-05, + "loss": 0.01888526976108551, + "reward": -0.03181526847183704, + "reward_std": 0.1152847982943058, + "rewards/ASRWerHalluLenRewardV5/mean": -0.03181527073029429, + "rewards/ASRWerHalluLenRewardV5/std": 0.387355250120163, + "step": 175, + "step_time": 32.76760973650962 + }, + { + "clip_ratio/high_max": 0.021229682344710454, + "clip_ratio/high_mean": 0.012255953440035227, + "clip_ratio/low_mean": 0.01749388487660326, + "clip_ratio/low_min": 0.004621020445483737, + "clip_ratio/region_mean": 0.029749838286079466, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.75, + "completions/mean_length": 48.47500095367432, + "completions/min_length": 32.6, + "epoch": 0.019204096873999786, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5064765810966492, + "kl": 0.07167187703307717, + "learning_rate": 1.066350710900474e-05, + "loss": -0.001172581221908331, + "reward": -0.01092434674501419, + "reward_std": 0.11864908784627914, + "rewards/ASRWerHalluLenRewardV5/mean": -0.010924350051209331, + "rewards/ASRWerHalluLenRewardV5/std": 0.46645163744688034, + "step": 180, + "step_time": 32.29532420802862 + }, + { + "epoch": 0.019204096873999786, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 59.478873239436616, + "eval_completions/mean_length": 47.238264191318564, + "eval_completions/min_length": 33.859154929577464, + "eval_frac_reward_zero_std": 0.1173708955167045, + "eval_kl": 0.039044017706748466, + "eval_loss": 0.009075390174984932, + "eval_reward": 0.5598195557233313, + "eval_reward_std": 0.0667502372984735, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.5598195416616721, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3264299427014841, + "eval_runtime": 345.8236, + "eval_samples_per_second": 0.619, + "eval_steps_per_second": 0.052, + "step": 180 + }, + { + "clip_ratio/high_max": 0.019448561139870434, + "clip_ratio/high_mean": 0.01017946095962543, + "clip_ratio/low_mean": 0.016345574788283558, + "clip_ratio/low_min": 0.004689607638283633, + "clip_ratio/region_mean": 0.026525035724625924, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.7, + "completions/mean_length": 51.909376525878905, + "completions/min_length": 33.7, + "epoch": 0.01973754400938867, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5062123537063599, + "kl": 0.08193435487337411, + "learning_rate": 1.0959715639810427e-05, + "loss": 0.016601839661598207, + "reward": 0.059602702409029006, + "reward_std": 0.11529121585190297, + "rewards/ASRWerHalluLenRewardV5/mean": 0.05960269882343709, + "rewards/ASRWerHalluLenRewardV5/std": 0.3536074422299862, + "step": 185, + "step_time": 31.951436912454664 + }, + { + "clip_ratio/high_max": 0.021972546866163612, + "clip_ratio/high_mean": 0.012184980165329762, + "clip_ratio/low_mean": 0.019011904083890842, + "clip_ratio/low_min": 0.0071691074001137165, + "clip_ratio/region_mean": 0.031196883763186634, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.5, + "completions/mean_length": 48.86979293823242, + "completions/min_length": 31.45, + "epoch": 0.020270991144777553, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7144002318382263, + "kl": 0.0823578915791586, + "learning_rate": 1.1255924170616114e-05, + "loss": 0.008663912862539291, + "reward": -0.00699254460632801, + "reward_std": 0.14598618522286416, + "rewards/ASRWerHalluLenRewardV5/mean": -0.006992543768137694, + "rewards/ASRWerHalluLenRewardV5/std": 0.4157458938658237, + "step": 190, + "step_time": 30.725919409655035 + }, + { + "clip_ratio/high_max": 0.024201807042118162, + "clip_ratio/high_mean": 0.01407465010706801, + "clip_ratio/low_mean": 0.016070712779765017, + "clip_ratio/low_min": 0.0047206003626342865, + "clip_ratio/region_mean": 0.03014536271803081, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.35, + "completions/mean_length": 51.16770992279053, + "completions/min_length": 34.35, + "epoch": 0.020804438280166437, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4706801474094391, + "kl": 0.08066340808290988, + "learning_rate": 1.1552132701421802e-05, + "loss": -0.004159260541200638, + "reward": 0.037531185802072285, + "reward_std": 0.12296826727688312, + "rewards/ASRWerHalluLenRewardV5/mean": 0.03753118956228718, + "rewards/ASRWerHalluLenRewardV5/std": 0.4297295793890953, + "step": 195, + "step_time": 31.073168290406464 + }, + { + "clip_ratio/high_max": 0.017542928014881908, + "clip_ratio/high_mean": 0.009487711804104038, + "clip_ratio/low_mean": 0.01679111122211907, + "clip_ratio/low_min": 0.00622684175032191, + "clip_ratio/region_mean": 0.02627882307424443, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.5, + "completions/mean_length": 48.4395845413208, + "completions/min_length": 31.95, + "epoch": 0.02133788541555532, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5741032958030701, + "kl": 0.08856770845595748, + "learning_rate": 1.184834123222749e-05, + "loss": 0.010799595713615417, + "reward": 0.13641602685675025, + "reward_std": 0.11813973672688008, + "rewards/ASRWerHalluLenRewardV5/mean": 0.13641602608840914, + "rewards/ASRWerHalluLenRewardV5/std": 0.40316845923662187, + "step": 200, + "step_time": 32.75277647059411 + }, + { + "epoch": 0.02133788541555532, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 58.71830985915493, + "eval_completions/mean_length": 46.81572900691503, + "eval_completions/min_length": 33.563380281690144, + "eval_frac_reward_zero_std": 0.13145540297870906, + "eval_kl": 0.05823957659041797, + "eval_loss": 0.009357578121125698, + "eval_reward": 0.5740149386332069, + "eval_reward_std": 0.07588579480759275, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.5740149280344936, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3263211923480873, + "eval_runtime": 343.5034, + "eval_samples_per_second": 0.623, + "eval_steps_per_second": 0.052, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0226413459982723, + "clip_ratio/high_mean": 0.01284041968465317, + "clip_ratio/low_mean": 0.017419386001711244, + "clip_ratio/low_min": 0.0066041415673680605, + "clip_ratio/region_mean": 0.030259805487003178, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.8, + "completions/mean_length": 45.386459255218504, + "completions/min_length": 27.6, + "epoch": 0.0218713325509442, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5989184975624084, + "kl": 0.10298110251314938, + "learning_rate": 1.2144549763033177e-05, + "loss": -0.0022663991898298264, + "reward": 0.08932449165731668, + "reward_std": 0.1406340293586254, + "rewards/ASRWerHalluLenRewardV5/mean": 0.08932449016720057, + "rewards/ASRWerHalluLenRewardV5/std": 0.34886891171336176, + "step": 205, + "step_time": 34.06581553611905 + }, + { + "clip_ratio/high_max": 0.021877659653546287, + "clip_ratio/high_mean": 0.012802534487855155, + "clip_ratio/low_mean": 0.016582495227339678, + "clip_ratio/low_min": 0.005952920106938109, + "clip_ratio/region_mean": 0.02938503022887744, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.85, + "completions/mean_length": 50.300001335144046, + "completions/min_length": 36.4, + "epoch": 0.022404779686333084, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4833619296550751, + "kl": 0.1299730094964616, + "learning_rate": 1.2440758293838863e-05, + "loss": 0.0014055415987968445, + "reward": 0.10798986814916134, + "reward_std": 0.10762971676886082, + "rewards/ASRWerHalluLenRewardV5/mean": 0.10798986814916134, + "rewards/ASRWerHalluLenRewardV5/std": 0.36487362533807755, + "step": 210, + "step_time": 30.848442691937088 + }, + { + "clip_ratio/high_max": 0.02080991430557333, + "clip_ratio/high_mean": 0.011740012385416776, + "clip_ratio/low_mean": 0.016564509697491302, + "clip_ratio/low_min": 0.005262044197297655, + "clip_ratio/region_mean": 0.028304521949030458, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.4, + "completions/mean_length": 46.27083492279053, + "completions/min_length": 32.55, + "epoch": 0.022938226821721967, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.550326943397522, + "kl": 0.16126869237050415, + "learning_rate": 1.273696682464455e-05, + "loss": 0.01206182986497879, + "reward": -0.0075092028826475145, + "reward_std": 0.12513151578605175, + "rewards/ASRWerHalluLenRewardV5/mean": -0.007509207556722686, + "rewards/ASRWerHalluLenRewardV5/std": 0.38228999599814417, + "step": 215, + "step_time": 30.316878784261643 + }, + { + "clip_ratio/high_max": 0.022481245303060858, + "clip_ratio/high_mean": 0.011979898530989885, + "clip_ratio/low_mean": 0.019815011450555177, + "clip_ratio/low_min": 0.007697046833345667, + "clip_ratio/region_mean": 0.03179490978363901, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.1, + "completions/mean_length": 49.07291812896729, + "completions/min_length": 36.15, + "epoch": 0.02347167395711085, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44298169016838074, + "kl": 0.15693111587315797, + "learning_rate": 1.3033175355450238e-05, + "loss": 0.008300107717514039, + "reward": -0.03815748654305935, + "reward_std": 0.1169005710631609, + "rewards/ASRWerHalluLenRewardV5/mean": -0.03815748859196901, + "rewards/ASRWerHalluLenRewardV5/std": 0.39476855397224425, + "step": 220, + "step_time": 32.00739878956229 + }, + { + "epoch": 0.02347167395711085, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.971830985915496, + "eval_completions/mean_length": 46.410799429450236, + "eval_completions/min_length": 33.45070422535211, + "eval_frac_reward_zero_std": 0.14084507462004542, + "eval_kl": 0.08753432375444493, + "eval_loss": 0.008204675279557705, + "eval_reward": 0.5845848011298919, + "eval_reward_std": 0.06903467329979783, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.5845847852842908, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3175008967781151, + "eval_runtime": 342.8365, + "eval_samples_per_second": 0.624, + "eval_steps_per_second": 0.053, + "step": 220 + }, + { + "clip_ratio/high_max": 0.018746783927781507, + "clip_ratio/high_mean": 0.01022952087514568, + "clip_ratio/low_mean": 0.018843384785577656, + "clip_ratio/low_min": 0.007253191876225173, + "clip_ratio/region_mean": 0.029072905622888355, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.8, + "completions/mean_length": 46.80104293823242, + "completions/min_length": 31.6, + "epoch": 0.024005121092499734, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45340853929519653, + "kl": 0.1441838828381151, + "learning_rate": 1.3329383886255924e-05, + "loss": 0.013718652725219726, + "reward": 0.02972680814564228, + "reward_std": 0.1285203617066145, + "rewards/ASRWerHalluLenRewardV5/mean": 0.029726804576057475, + "rewards/ASRWerHalluLenRewardV5/std": 0.367801833152771, + "step": 225, + "step_time": 30.16964445654303 + }, + { + "clip_ratio/high_max": 0.02104043265571818, + "clip_ratio/high_mean": 0.011488624896446708, + "clip_ratio/low_mean": 0.01567082425171975, + "clip_ratio/low_min": 0.004665631853276864, + "clip_ratio/region_mean": 0.027159449527971447, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.1, + "completions/mean_length": 48.06666793823242, + "completions/min_length": 32.6, + "epoch": 0.024538568227888618, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7424259185791016, + "kl": 0.1282304996624589, + "learning_rate": 1.3625592417061612e-05, + "loss": 0.010289233922958375, + "reward": 0.1839321617037058, + "reward_std": 0.10838814824819565, + "rewards/ASRWerHalluLenRewardV5/mean": 0.1839321658015251, + "rewards/ASRWerHalluLenRewardV5/std": 0.4655562460422516, + "step": 230, + "step_time": 30.412428098358212 + }, + { + "clip_ratio/high_max": 0.02272765922243707, + "clip_ratio/high_mean": 0.012596981885144487, + "clip_ratio/low_mean": 0.019570054503856227, + "clip_ratio/low_min": 0.007386698527261615, + "clip_ratio/region_mean": 0.03216703654034063, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.5, + "completions/mean_length": 46.125000953674316, + "completions/min_length": 30.35, + "epoch": 0.025072015363277498, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4277493953704834, + "kl": 0.23973431792110206, + "learning_rate": 1.39218009478673e-05, + "loss": 0.008617895096540451, + "reward": -0.03508000522851944, + "reward_std": 0.12539697475731373, + "rewards/ASRWerHalluLenRewardV5/mean": -0.03508000643923879, + "rewards/ASRWerHalluLenRewardV5/std": 0.41510373130440714, + "step": 235, + "step_time": 29.926113635860382 + }, + { + "clip_ratio/high_max": 0.023244463303126395, + "clip_ratio/high_mean": 0.012066030610003508, + "clip_ratio/low_mean": 0.016613682350725866, + "clip_ratio/low_min": 0.004767808597534895, + "clip_ratio/region_mean": 0.02867971296655014, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.45, + "completions/mean_length": 49.6572925567627, + "completions/min_length": 34.7, + "epoch": 0.02560546249866638, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.41614922881126404, + "kl": 0.2162212889175862, + "learning_rate": 1.4218009478672985e-05, + "loss": 0.011000619828701019, + "reward": 0.043707318417727944, + "reward_std": 0.1034819521009922, + "rewards/ASRWerHalluLenRewardV5/mean": 0.04370732121169567, + "rewards/ASRWerHalluLenRewardV5/std": 0.4458952181041241, + "step": 240, + "step_time": 32.233536714874205 + }, + { + "epoch": 0.02560546249866638, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 58.816901408450704, + "eval_completions/mean_length": 46.99413288814921, + "eval_completions/min_length": 33.80281690140845, + "eval_frac_reward_zero_std": 0.15492958208204996, + "eval_kl": 0.0806257817697462, + "eval_loss": 0.009340801276266575, + "eval_reward": 0.5768822846593151, + "eval_reward_std": 0.06400424617172128, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.5768822635668265, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.32334403137505896, + "eval_runtime": 344.3844, + "eval_samples_per_second": 0.621, + "eval_steps_per_second": 0.052, + "step": 240 + }, + { + "clip_ratio/high_max": 0.021500708020175806, + "clip_ratio/high_mean": 0.012750158438575455, + "clip_ratio/low_mean": 0.018546935460472015, + "clip_ratio/low_min": 0.0062771226541372014, + "clip_ratio/region_mean": 0.03129709387430921, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.3, + "completions/mean_length": 52.038542938232425, + "completions/min_length": 34.95, + "epoch": 0.026138909634055265, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.51253342628479, + "kl": 0.14617341808043421, + "learning_rate": 1.4514218009478675e-05, + "loss": 0.0004899534396827221, + "reward": 0.04123313296586275, + "reward_std": 0.11335671842098236, + "rewards/ASRWerHalluLenRewardV5/mean": 0.04123312379233539, + "rewards/ASRWerHalluLenRewardV5/std": 0.31836567521095277, + "step": 245, + "step_time": 31.678637876547874 + }, + { + "clip_ratio/high_max": 0.020946358074434103, + "clip_ratio/high_mean": 0.010827345293364488, + "clip_ratio/low_mean": 0.014354757389810402, + "clip_ratio/low_min": 0.0038679605291690677, + "clip_ratio/region_mean": 0.025182102719554678, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.75, + "completions/mean_length": 45.842709350585935, + "completions/min_length": 27.9, + "epoch": 0.02667235676944415, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4852648675441742, + "kl": 0.12267452226951718, + "learning_rate": 1.481042654028436e-05, + "loss": 0.0011922205798327924, + "reward": 0.09630277752876282, + "reward_std": 0.10908879414200782, + "rewards/ASRWerHalluLenRewardV5/mean": 0.09630278460681438, + "rewards/ASRWerHalluLenRewardV5/std": 0.4050320826470852, + "step": 250, + "step_time": 30.158394424803554 + }, + { + "clip_ratio/high_max": 0.01812214698875323, + "clip_ratio/high_mean": 0.009958000668848398, + "clip_ratio/low_mean": 0.01667711393092759, + "clip_ratio/low_min": 0.005789508746238426, + "clip_ratio/region_mean": 0.026635114412056283, + "completions/clipped_ratio": 0.0010416666666666667, + "completions/max_length": 266.85, + "completions/mean_length": 49.99479331970215, + "completions/min_length": 25.1, + "epoch": 0.027205803904833032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3725917935371399, + "kl": 0.13468601861968638, + "learning_rate": 1.5106635071090047e-05, + "loss": 0.005540101975202561, + "reward": 0.0708458211272955, + "reward_std": 0.11763928793370723, + "rewards/ASRWerHalluLenRewardV5/mean": 0.0708458217792213, + "rewards/ASRWerHalluLenRewardV5/std": 0.4122568920254707, + "step": 255, + "step_time": 90.2933396782726 + }, + { + "clip_ratio/high_max": 0.021962702728342266, + "clip_ratio/high_mean": 0.011973483716428746, + "clip_ratio/low_mean": 0.018519677074800713, + "clip_ratio/low_min": 0.007189543667482212, + "clip_ratio/region_mean": 0.030493160577316304, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.9, + "completions/mean_length": 47.417709541320804, + "completions/min_length": 31.05, + "epoch": 0.027739251040221916, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45739036798477173, + "kl": 0.15281936731189488, + "learning_rate": 1.5402843601895736e-05, + "loss": 0.011719565093517303, + "reward": 0.08652897775173188, + "reward_std": 0.13519105948507787, + "rewards/ASRWerHalluLenRewardV5/mean": 0.08652896916028112, + "rewards/ASRWerHalluLenRewardV5/std": 0.3850595377385616, + "step": 260, + "step_time": 30.24571572840214 + }, + { + "epoch": 0.027739251040221916, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.732394366197184, + "eval_completions/mean_length": 46.12558808125241, + "eval_completions/min_length": 33.25352112676056, + "eval_frac_reward_zero_std": 0.12676056715804088, + "eval_kl": 0.09308078353831047, + "eval_loss": 0.007052826229482889, + "eval_reward": 0.5836867611383049, + "eval_reward_std": 0.0697437851873397, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.5836867378158889, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.32384983214064383, + "eval_runtime": 340.0805, + "eval_samples_per_second": 0.629, + "eval_steps_per_second": 0.053, + "step": 260 + }, + { + "clip_ratio/high_max": 0.026298460020916535, + "clip_ratio/high_mean": 0.01473938374692807, + "clip_ratio/low_mean": 0.01795173070859164, + "clip_ratio/low_min": 0.006558888801373541, + "clip_ratio/region_mean": 0.03269111458212137, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.55, + "completions/mean_length": 51.47500114440918, + "completions/min_length": 38.15, + "epoch": 0.028272698175610796, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.631825864315033, + "kl": 0.1880974371917546, + "learning_rate": 1.5699052132701422e-05, + "loss": 0.005482178181409836, + "reward": -0.0018308978527784348, + "reward_std": 0.10595405511558056, + "rewards/ASRWerHalluLenRewardV5/mean": -0.001830891950521618, + "rewards/ASRWerHalluLenRewardV5/std": 0.42179119810461996, + "step": 265, + "step_time": 31.215971452370287 + }, + { + "clip_ratio/high_max": 0.021007754484890027, + "clip_ratio/high_mean": 0.011053302219079342, + "clip_ratio/low_mean": 0.021223481294873635, + "clip_ratio/low_min": 0.008089623355772346, + "clip_ratio/region_mean": 0.03227678355178796, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.1, + "completions/mean_length": 44.775000953674315, + "completions/min_length": 30.15, + "epoch": 0.02880614531099968, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.41032689809799194, + "kl": 0.2692207901738584, + "learning_rate": 1.5995260663507108e-05, + "loss": 0.021451906859874727, + "reward": 0.008028454706072807, + "reward_std": 0.110575738735497, + "rewards/ASRWerHalluLenRewardV5/mean": 0.008028447441756726, + "rewards/ASRWerHalluLenRewardV5/std": 0.3468805752694607, + "step": 270, + "step_time": 30.250224334560336 + }, + { + "clip_ratio/high_max": 0.02159400873351842, + "clip_ratio/high_mean": 0.012528940475021955, + "clip_ratio/low_mean": 0.01919074750621803, + "clip_ratio/low_min": 0.006923501682467759, + "clip_ratio/region_mean": 0.031719687848817554, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.6, + "completions/mean_length": 45.05937671661377, + "completions/min_length": 27.8, + "epoch": 0.029339592446388563, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5199218988418579, + "kl": 0.21233401466161012, + "learning_rate": 1.6291469194312797e-05, + "loss": 0.0034658245742321013, + "reward": 0.03605337142944336, + "reward_std": 0.1324103705585003, + "rewards/ASRWerHalluLenRewardV5/mean": 0.036053371406160294, + "rewards/ASRWerHalluLenRewardV5/std": 0.42138367481529715, + "step": 275, + "step_time": 29.63183932043612 + }, + { + "clip_ratio/high_max": 0.02239687932305969, + "clip_ratio/high_mean": 0.013148702486068942, + "clip_ratio/low_mean": 0.018185508053284137, + "clip_ratio/low_min": 0.006374962301924825, + "clip_ratio/region_mean": 0.031334210571367295, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.25, + "completions/mean_length": 51.07916793823242, + "completions/min_length": 34.75, + "epoch": 0.029873039581777446, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35126733779907227, + "kl": 0.21495271753519773, + "learning_rate": 1.6587677725118483e-05, + "loss": 0.0006896938197314739, + "reward": 0.03844807334244251, + "reward_std": 0.11394645012915135, + "rewards/ASRWerHalluLenRewardV5/mean": 0.03844807203859091, + "rewards/ASRWerHalluLenRewardV5/std": 0.4134372688829899, + "step": 280, + "step_time": 31.041589968837798 + }, + { + "epoch": 0.029873039581777446, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 59.7887323943662, + "eval_completions/mean_length": 47.32981340650102, + "eval_completions/min_length": 33.929577464788736, + "eval_frac_reward_zero_std": 0.13145540297870906, + "eval_kl": 0.14720006010324602, + "eval_loss": 0.015254839323461056, + "eval_reward": 0.5799436617845958, + "eval_reward_std": 0.06982928677789972, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.5799436424137421, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.31212890163903506, + "eval_runtime": 345.6112, + "eval_samples_per_second": 0.619, + "eval_steps_per_second": 0.052, + "step": 280 + }, + { + "clip_ratio/high_max": 0.019619667006190868, + "clip_ratio/high_mean": 0.010529958056577016, + "clip_ratio/low_mean": 0.015907119476469233, + "clip_ratio/low_min": 0.004902251966996118, + "clip_ratio/region_mean": 0.026437077403534202, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.05, + "completions/mean_length": 48.621876335144044, + "completions/min_length": 36.3, + "epoch": 0.03040648671716633, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.458778738975525, + "kl": 0.2342870099004358, + "learning_rate": 1.6883886255924172e-05, + "loss": 0.004743968695402145, + "reward": 0.1690674439072609, + "reward_std": 0.09840434566140174, + "rewards/ASRWerHalluLenRewardV5/mean": 0.16906743943691255, + "rewards/ASRWerHalluLenRewardV5/std": 0.40843720510602, + "step": 285, + "step_time": 31.453365372121333 + }, + { + "clip_ratio/high_max": 0.021378430462209507, + "clip_ratio/high_mean": 0.012008589821925852, + "clip_ratio/low_mean": 0.02013037682045251, + "clip_ratio/low_min": 0.00747740610386245, + "clip_ratio/region_mean": 0.032138966885395345, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.35, + "completions/mean_length": 48.50625133514404, + "completions/min_length": 25.85, + "epoch": 0.030939933852555213, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9367825388908386, + "kl": 0.2024377293419093, + "learning_rate": 1.7180094786729858e-05, + "loss": 0.004812454432249069, + "reward": -0.03542451001703739, + "reward_std": 0.13334222063422202, + "rewards/ASRWerHalluLenRewardV5/mean": -0.03542451029643416, + "rewards/ASRWerHalluLenRewardV5/std": 0.3439970649778843, + "step": 290, + "step_time": 30.86103101912886 + }, + { + "clip_ratio/high_max": 0.017538529279408976, + "clip_ratio/high_mean": 0.009179015170957427, + "clip_ratio/low_mean": 0.016313550327322447, + "clip_ratio/low_min": 0.004617211961885914, + "clip_ratio/region_mean": 0.025492565508466215, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.4, + "completions/mean_length": 48.16666793823242, + "completions/min_length": 29.7, + "epoch": 0.0314733809879441, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3933960795402527, + "kl": 0.1372756644152105, + "learning_rate": 1.7476303317535544e-05, + "loss": 0.00696822926402092, + "reward": 0.14782994110137224, + "reward_std": 0.11166248116642237, + "rewards/ASRWerHalluLenRewardV5/mean": 0.1478299398906529, + "rewards/ASRWerHalluLenRewardV5/std": 0.4049372121691704, + "step": 295, + "step_time": 30.953101348876952 + }, + { + "clip_ratio/high_max": 0.017246172932209446, + "clip_ratio/high_mean": 0.009157170346588828, + "clip_ratio/low_mean": 0.014090541876794305, + "clip_ratio/low_min": 0.003226381354033947, + "clip_ratio/region_mean": 0.023247712326701732, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.15, + "completions/mean_length": 44.017709732055664, + "completions/min_length": 28.05, + "epoch": 0.03200682812333298, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5804752111434937, + "kl": 0.14309124411083757, + "learning_rate": 1.7772511848341233e-05, + "loss": 0.010077747702598571, + "reward": 0.12062493301928043, + "reward_std": 0.10602599419653416, + "rewards/ASRWerHalluLenRewardV5/mean": 0.12062493171542883, + "rewards/ASRWerHalluLenRewardV5/std": 0.4230868324637413, + "step": 300, + "step_time": 30.823902398720385 + }, + { + "epoch": 0.03200682812333298, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.32394366197183, + "eval_completions/mean_length": 45.99295908968214, + "eval_completions/min_length": 33.056338028169016, + "eval_frac_reward_zero_std": 0.15492958208204996, + "eval_kl": 0.12924325004429885, + "eval_loss": 0.007478647865355015, + "eval_reward": 0.6005360168260587, + "eval_reward_std": 0.06896657724274506, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6005360043909348, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.31353687765207927, + "eval_runtime": 339.0545, + "eval_samples_per_second": 0.631, + "eval_steps_per_second": 0.053, + "step": 300 + }, + { + "clip_ratio/high_max": 0.019868970493553205, + "clip_ratio/high_mean": 0.011052681977162138, + "clip_ratio/low_mean": 0.017859458739985713, + "clip_ratio/low_min": 0.006462924109655432, + "clip_ratio/region_mean": 0.028912140725879, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.8, + "completions/mean_length": 47.195834159851074, + "completions/min_length": 29.75, + "epoch": 0.032540275258721864, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4543176591396332, + "kl": 0.22447667652741074, + "learning_rate": 1.806872037914692e-05, + "loss": 0.00022159740328788757, + "reward": 0.04190865028649569, + "reward_std": 0.12209739051759243, + "rewards/ASRWerHalluLenRewardV5/mean": 0.04190865034470335, + "rewards/ASRWerHalluLenRewardV5/std": 0.3721024259924889, + "step": 305, + "step_time": 31.855755706131458 + }, + { + "clip_ratio/high_max": 0.021095217560650782, + "clip_ratio/high_mean": 0.01217644363641739, + "clip_ratio/low_mean": 0.01688823994627455, + "clip_ratio/low_min": 0.005215707395109348, + "clip_ratio/region_mean": 0.02906468342989683, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.05, + "completions/mean_length": 47.25104331970215, + "completions/min_length": 27.5, + "epoch": 0.033073722394110744, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4617408514022827, + "kl": 0.20546940248459578, + "learning_rate": 1.836492890995261e-05, + "loss": -0.003222481161355972, + "reward": 0.006018774583935738, + "reward_std": 0.11146049126982689, + "rewards/ASRWerHalluLenRewardV5/mean": 0.006018774583935738, + "rewards/ASRWerHalluLenRewardV5/std": 0.4111086279153824, + "step": 310, + "step_time": 30.284994880855084 + }, + { + "clip_ratio/high_max": 0.021509819029597567, + "clip_ratio/high_mean": 0.011835448982310481, + "clip_ratio/low_mean": 0.020054455680656245, + "clip_ratio/low_min": 0.008457369066309183, + "clip_ratio/region_mean": 0.03188990459311754, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/mean_length": 51.630209350585936, + "completions/min_length": 38.0, + "epoch": 0.033607169529499624, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7779387831687927, + "kl": 0.2784342132508755, + "learning_rate": 1.8661137440758295e-05, + "loss": 0.014899882674217223, + "reward": -0.02527285423129797, + "reward_std": 0.11537100933492184, + "rewards/ASRWerHalluLenRewardV5/mean": -0.025272856280207635, + "rewards/ASRWerHalluLenRewardV5/std": 0.3871266581118107, + "step": 315, + "step_time": 34.10723292008042 + }, + { + "clip_ratio/high_max": 0.020931344892596827, + "clip_ratio/high_mean": 0.011544619595224504, + "clip_ratio/low_mean": 0.019589883706066756, + "clip_ratio/low_min": 0.007481394713977352, + "clip_ratio/region_mean": 0.03113450310193002, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.3, + "completions/mean_length": 46.65208435058594, + "completions/min_length": 24.8, + "epoch": 0.03414061666488851, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7282801270484924, + "kl": 0.22058490999042987, + "learning_rate": 1.895734597156398e-05, + "loss": 0.007515702396631241, + "reward": 0.03616289347410202, + "reward_std": 0.11584692541509867, + "rewards/ASRWerHalluLenRewardV5/mean": 0.0361628944054246, + "rewards/ASRWerHalluLenRewardV5/std": 0.37045286893844603, + "step": 320, + "step_time": 30.71590498033911 + }, + { + "epoch": 0.03414061666488851, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.91549295774648, + "eval_completions/mean_length": 46.2957758299062, + "eval_completions/min_length": 33.0, + "eval_frac_reward_zero_std": 0.13145540297870906, + "eval_kl": 0.13895258819028525, + "eval_loss": 0.009752826765179634, + "eval_reward": 0.5938855458746178, + "eval_reward_std": 0.06825050407312286, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.5938855323376475, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3135188783348446, + "eval_runtime": 342.272, + "eval_samples_per_second": 0.625, + "eval_steps_per_second": 0.053, + "step": 320 + }, + { + "clip_ratio/high_max": 0.01953667314373888, + "clip_ratio/high_mean": 0.010227300388214644, + "clip_ratio/low_mean": 0.019918083559605292, + "clip_ratio/low_min": 0.005896864423993975, + "clip_ratio/region_mean": 0.030145384091883897, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.55, + "completions/mean_length": 49.59271011352539, + "completions/min_length": 32.95, + "epoch": 0.03467406380027739, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4353817105293274, + "kl": 0.19100640779361128, + "learning_rate": 1.925355450236967e-05, + "loss": 0.016184966266155242, + "reward": 0.08534082621335984, + "reward_std": 0.10953522324562073, + "rewards/ASRWerHalluLenRewardV5/mean": 0.08534082621335984, + "rewards/ASRWerHalluLenRewardV5/std": 0.3584629908204079, + "step": 325, + "step_time": 30.952420284971595 + }, + { + "clip_ratio/high_max": 0.023207874610670844, + "clip_ratio/high_mean": 0.012591653932759073, + "clip_ratio/low_mean": 0.018332100998668464, + "clip_ratio/low_min": 0.006010329644777812, + "clip_ratio/region_mean": 0.03092375546693802, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.95, + "completions/mean_length": 44.94895944595337, + "completions/min_length": 27.65, + "epoch": 0.03520751093566628, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.49688202142715454, + "kl": 0.18154549980536103, + "learning_rate": 1.9549763033175356e-05, + "loss": 0.0011232475750148295, + "reward": 0.05188002288341522, + "reward_std": 0.12473404407501221, + "rewards/ASRWerHalluLenRewardV5/mean": 0.05188001617789269, + "rewards/ASRWerHalluLenRewardV5/std": 0.4138328179717064, + "step": 330, + "step_time": 30.73091931678355 + }, + { + "clip_ratio/high_max": 0.01981263398192823, + "clip_ratio/high_mean": 0.010976627199852373, + "clip_ratio/low_mean": 0.017838550376472993, + "clip_ratio/low_min": 0.006103446238557808, + "clip_ratio/region_mean": 0.028815177781507374, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.65, + "completions/mean_length": 49.18229331970215, + "completions/min_length": 33.4, + "epoch": 0.03574095807105516, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44854435324668884, + "kl": 0.1876353969797492, + "learning_rate": 1.9845971563981045e-05, + "loss": 0.00046979887410998346, + "reward": 0.12301725028082729, + "reward_std": 0.1174619022756815, + "rewards/ASRWerHalluLenRewardV5/mean": 0.12301724185235799, + "rewards/ASRWerHalluLenRewardV5/std": 0.38606795221567153, + "step": 335, + "step_time": 33.14739841856063 + }, + { + "clip_ratio/high_max": 0.020866259559988977, + "clip_ratio/high_mean": 0.011059475626097992, + "clip_ratio/low_mean": 0.01693558362312615, + "clip_ratio/low_min": 0.005737510620383546, + "clip_ratio/region_mean": 0.027995059581007807, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.6, + "completions/mean_length": 48.517709732055664, + "completions/min_length": 31.1, + "epoch": 0.03627440520644404, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.410971075296402, + "kl": 0.18167107738554478, + "learning_rate": 2.014218009478673e-05, + "loss": 0.008845667541027068, + "reward": 0.09740807060152293, + "reward_std": 0.13027836568653584, + "rewards/ASRWerHalluLenRewardV5/mean": 0.09740806902991608, + "rewards/ASRWerHalluLenRewardV5/std": 0.3436972536146641, + "step": 340, + "step_time": 31.14339899700135 + }, + { + "epoch": 0.03627440520644404, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 58.46478873239437, + "eval_completions/mean_length": 46.54577593736246, + "eval_completions/min_length": 33.394366197183096, + "eval_frac_reward_zero_std": 0.1455399104407136, + "eval_kl": 0.14013115536998694, + "eval_loss": 0.010033881291747093, + "eval_reward": 0.6010966390686135, + "eval_reward_std": 0.07366877353527176, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.601096617503905, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3106916654645137, + "eval_runtime": 342.1914, + "eval_samples_per_second": 0.625, + "eval_steps_per_second": 0.053, + "step": 340 + }, + { + "clip_ratio/high_max": 0.020509986276738345, + "clip_ratio/high_mean": 0.011577656884037424, + "clip_ratio/low_mean": 0.0186627964314539, + "clip_ratio/low_min": 0.0068459727335721254, + "clip_ratio/region_mean": 0.030240453261649237, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.9, + "completions/mean_length": 50.530209350585935, + "completions/min_length": 34.15, + "epoch": 0.036807852341832925, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4806738495826721, + "kl": 0.22247503167018295, + "learning_rate": 2.0438388625592417e-05, + "loss": 0.006836593151092529, + "reward": 0.04531610431149602, + "reward_std": 0.11825492605566978, + "rewards/ASRWerHalluLenRewardV5/mean": 0.04531611138954759, + "rewards/ASRWerHalluLenRewardV5/std": 0.3982472516596317, + "step": 345, + "step_time": 32.993535871617496 + }, + { + "clip_ratio/high_max": 0.022996838542167098, + "clip_ratio/high_mean": 0.01358610480237985, + "clip_ratio/low_mean": 0.020019567426061258, + "clip_ratio/low_min": 0.008917004032991827, + "clip_ratio/region_mean": 0.03360567236086354, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.75, + "completions/mean_length": 48.6239595413208, + "completions/min_length": 33.1, + "epoch": 0.037341299477221805, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8068991303443909, + "kl": 0.22980739297345282, + "learning_rate": 2.0734597156398106e-05, + "loss": 0.0032023530453443526, + "reward": -0.09663721099495888, + "reward_std": 0.1104249358177185, + "rewards/ASRWerHalluLenRewardV5/mean": -0.09663720629177988, + "rewards/ASRWerHalluLenRewardV5/std": 0.3215485565364361, + "step": 350, + "step_time": 30.727315118163823 + }, + { + "clip_ratio/high_max": 0.01812508306466043, + "clip_ratio/high_mean": 0.010760971941635944, + "clip_ratio/low_mean": 0.01693145468161674, + "clip_ratio/low_min": 0.006192501765326597, + "clip_ratio/region_mean": 0.027692426351131873, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.75, + "completions/mean_length": 50.37083473205566, + "completions/min_length": 40.4, + "epoch": 0.03787474661261069, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33700495958328247, + "kl": 0.17393158129416406, + "learning_rate": 2.1030805687203792e-05, + "loss": -0.006173215061426163, + "reward": 0.16220147870481014, + "reward_std": 0.10503681637346744, + "rewards/ASRWerHalluLenRewardV5/mean": 0.16220148112624883, + "rewards/ASRWerHalluLenRewardV5/std": 0.3837303012609482, + "step": 355, + "step_time": 32.17871859166771 + }, + { + "clip_ratio/high_max": 0.02182121151708998, + "clip_ratio/high_mean": 0.011845047387760132, + "clip_ratio/low_mean": 0.018608778732595966, + "clip_ratio/low_min": 0.006827689133933746, + "clip_ratio/region_mean": 0.030453826091252266, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.65, + "completions/mean_length": 50.81666812896729, + "completions/min_length": 35.35, + "epoch": 0.03840819374799957, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.41421857476234436, + "kl": 0.18109886888414622, + "learning_rate": 2.132701421800948e-05, + "loss": 0.0030337393283843992, + "reward": 0.014468875527381898, + "reward_std": 0.11227692253887653, + "rewards/ASRWerHalluLenRewardV5/mean": 0.014468873851001263, + "rewards/ASRWerHalluLenRewardV5/std": 0.42868958711624144, + "step": 360, + "step_time": 30.44110239278525 + }, + { + "epoch": 0.03840819374799957, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.54929577464789, + "eval_completions/mean_length": 46.396715083592376, + "eval_completions/min_length": 33.563380281690144, + "eval_frac_reward_zero_std": 0.1549295812425479, + "eval_kl": 0.14047779473410527, + "eval_loss": 0.007633109577000141, + "eval_reward": 0.6002792186090644, + "eval_reward_std": 0.06266396103138236, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6002792114208282, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.31154800484507855, + "eval_runtime": 340.2327, + "eval_samples_per_second": 0.629, + "eval_steps_per_second": 0.053, + "step": 360 + }, + { + "clip_ratio/high_max": 0.025149832299211992, + "clip_ratio/high_mean": 0.01350426595308818, + "clip_ratio/low_mean": 0.019516514195129276, + "clip_ratio/low_min": 0.006595268787350506, + "clip_ratio/region_mean": 0.033020780375227335, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.25, + "completions/mean_length": 52.4541675567627, + "completions/min_length": 37.45, + "epoch": 0.03894164088338846, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4664393961429596, + "kl": 0.18054678235203028, + "learning_rate": 2.1623222748815167e-05, + "loss": 0.0008491630665957928, + "reward": 0.02307158038020134, + "reward_std": 0.1055484440177679, + "rewards/ASRWerHalluLenRewardV5/mean": 0.023071585595607756, + "rewards/ASRWerHalluLenRewardV5/std": 0.42725510597229005, + "step": 365, + "step_time": 31.600190706923605 + }, + { + "clip_ratio/high_max": 0.020835031574824824, + "clip_ratio/high_mean": 0.011275418857985641, + "clip_ratio/low_mean": 0.01735157405346399, + "clip_ratio/low_min": 0.0067889905883930625, + "clip_ratio/region_mean": 0.028626992966746912, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.05, + "completions/mean_length": 48.07187633514404, + "completions/min_length": 32.3, + "epoch": 0.03947508801877734, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4542257785797119, + "kl": 0.21985585587099193, + "learning_rate": 2.1919431279620853e-05, + "loss": 0.0026127513498067855, + "reward": 0.11128522735089064, + "reward_std": 0.11729211322963237, + "rewards/ASRWerHalluLenRewardV5/mean": 0.11128523107618093, + "rewards/ASRWerHalluLenRewardV5/std": 0.3523942820727825, + "step": 370, + "step_time": 30.394554891996087 + }, + { + "clip_ratio/high_max": 0.022259553708136082, + "clip_ratio/high_mean": 0.011497596402477939, + "clip_ratio/low_mean": 0.016910902221570722, + "clip_ratio/low_min": 0.006154828710714355, + "clip_ratio/region_mean": 0.02840849900385365, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.05, + "completions/mean_length": 47.73020992279053, + "completions/min_length": 36.0, + "epoch": 0.04000853515416622, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3954105079174042, + "kl": 0.26737771835178137, + "learning_rate": 2.2215639810426543e-05, + "loss": 0.011412160843610764, + "reward": 0.044310411997139455, + "reward_std": 0.10574035458266735, + "rewards/ASRWerHalluLenRewardV5/mean": 0.04431041008792817, + "rewards/ASRWerHalluLenRewardV5/std": 0.3793522082269192, + "step": 375, + "step_time": 30.564504916965962 + }, + { + "clip_ratio/high_max": 0.02244724889751524, + "clip_ratio/high_mean": 0.012724023667396977, + "clip_ratio/low_mean": 0.01888705052842852, + "clip_ratio/low_min": 0.005111713008955121, + "clip_ratio/region_mean": 0.03161107443738729, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.95, + "completions/mean_length": 47.23437633514404, + "completions/min_length": 29.2, + "epoch": 0.040541982289555106, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5002712607383728, + "kl": 0.22307770289480686, + "learning_rate": 2.251184834123223e-05, + "loss": 0.005298890918493271, + "reward": 0.13621375896036625, + "reward_std": 0.12127830497920514, + "rewards/ASRWerHalluLenRewardV5/mean": 0.13621375970542432, + "rewards/ASRWerHalluLenRewardV5/std": 0.41299191266298296, + "step": 380, + "step_time": 33.02251593954861 + }, + { + "epoch": 0.040541982289555106, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.985915492957744, + "eval_completions/mean_length": 46.617372056128275, + "eval_completions/min_length": 33.46478873239437, + "eval_frac_reward_zero_std": 0.15023474584163074, + "eval_kl": 0.1593070277865504, + "eval_loss": 0.010079918429255486, + "eval_reward": 0.6097234368429217, + "eval_reward_std": 0.06651315159938286, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6097234184263458, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.31050583646750785, + "eval_runtime": 343.0378, + "eval_samples_per_second": 0.624, + "eval_steps_per_second": 0.052, + "step": 380 + }, + { + "clip_ratio/high_max": 0.020902693987591193, + "clip_ratio/high_mean": 0.011931974873004948, + "clip_ratio/low_mean": 0.017229651397792624, + "clip_ratio/low_min": 0.005912358220666647, + "clip_ratio/region_mean": 0.029161626100540163, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.2, + "completions/mean_length": 49.158334732055664, + "completions/min_length": 31.15, + "epoch": 0.041075429424943986, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4526750445365906, + "kl": 0.21516240620985627, + "learning_rate": 2.2808056872037918e-05, + "loss": 0.0028047297149896623, + "reward": 0.11593936271965503, + "reward_std": 0.1075512720271945, + "rewards/ASRWerHalluLenRewardV5/mean": 0.11593935396522284, + "rewards/ASRWerHalluLenRewardV5/std": 0.4146284721791744, + "step": 385, + "step_time": 31.020328475534917 + }, + { + "clip_ratio/high_max": 0.02479512546560727, + "clip_ratio/high_mean": 0.01416432987753069, + "clip_ratio/low_mean": 0.021364565478870647, + "clip_ratio/low_min": 0.007426180667243898, + "clip_ratio/region_mean": 0.035528895212337375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.5, + "completions/mean_length": 47.41770935058594, + "completions/min_length": 32.1, + "epoch": 0.04160887656033287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6027013063430786, + "kl": 0.21172908842563629, + "learning_rate": 2.3104265402843604e-05, + "loss": 0.0020947476848959925, + "reward": 0.02095988281071186, + "reward_std": 0.12764509171247482, + "rewards/ASRWerHalluLenRewardV5/mean": 0.020959876594133675, + "rewards/ASRWerHalluLenRewardV5/std": 0.42724572867155075, + "step": 390, + "step_time": 31.135389265418052 + }, + { + "clip_ratio/high_max": 0.022429191443370654, + "clip_ratio/high_mean": 0.011938743690552655, + "clip_ratio/low_mean": 0.01889866001001792, + "clip_ratio/low_min": 0.0074936748307663946, + "clip_ratio/region_mean": 0.03083740398287773, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.55, + "completions/mean_length": 49.80520935058594, + "completions/min_length": 32.7, + "epoch": 0.04214232369572175, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3511272370815277, + "kl": 0.21585048083215952, + "learning_rate": 2.340047393364929e-05, + "loss": 0.029727497696876527, + "reward": 0.07535568070597946, + "reward_std": 0.1135896909981966, + "rewards/ASRWerHalluLenRewardV5/mean": 0.0753556783311069, + "rewards/ASRWerHalluLenRewardV5/std": 0.41468155905604365, + "step": 395, + "step_time": 30.905753006227314 + }, + { + "clip_ratio/high_max": 0.0177357608161401, + "clip_ratio/high_mean": 0.00974491600500187, + "clip_ratio/low_mean": 0.017575158427644056, + "clip_ratio/low_min": 0.007020667882170528, + "clip_ratio/region_mean": 0.027320074489398393, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.05, + "completions/mean_length": 47.37083492279053, + "completions/min_length": 32.1, + "epoch": 0.04267577083111064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44491255283355713, + "kl": 0.2422690737992525, + "learning_rate": 2.369668246445498e-05, + "loss": 0.0086495541036129, + "reward": 0.19951059259474277, + "reward_std": 0.10917122773826123, + "rewards/ASRWerHalluLenRewardV5/mean": 0.19951058812439443, + "rewards/ASRWerHalluLenRewardV5/std": 0.38021907284855844, + "step": 400, + "step_time": 32.112432012706996 + }, + { + "epoch": 0.04267577083111064, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 56.647887323943664, + "eval_completions/mean_length": 45.59389794040734, + "eval_completions/min_length": 32.80281690140845, + "eval_frac_reward_zero_std": 0.19248826822764437, + "eval_kl": 0.20677319583548626, + "eval_loss": 0.007039555814117193, + "eval_reward": 0.6038957580294407, + "eval_reward_std": 0.06237843283124163, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.603895743154514, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.30670443720276086, + "eval_runtime": 341.6822, + "eval_samples_per_second": 0.626, + "eval_steps_per_second": 0.053, + "step": 400 + }, + { + "clip_ratio/high_max": 0.021617289871210234, + "clip_ratio/high_mean": 0.011680428861291147, + "clip_ratio/low_mean": 0.01870264754688833, + "clip_ratio/low_min": 0.006379917374579236, + "clip_ratio/region_mean": 0.030383076309226452, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.85, + "completions/mean_length": 49.4708345413208, + "completions/min_length": 35.6, + "epoch": 0.04320921796649952, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4305608570575714, + "kl": 0.2395480598323047, + "learning_rate": 2.3992890995260665e-05, + "loss": -0.00023243899922817946, + "reward": 0.09769210740923881, + "reward_std": 0.10718287285417319, + "rewards/ASRWerHalluLenRewardV5/mean": 0.09769210785161704, + "rewards/ASRWerHalluLenRewardV5/std": 0.4276264920830727, + "step": 405, + "step_time": 31.93263008147478 + }, + { + "clip_ratio/high_max": 0.0203190702770371, + "clip_ratio/high_mean": 0.012009711585415061, + "clip_ratio/low_mean": 0.019411475217202677, + "clip_ratio/low_min": 0.006713703111745417, + "clip_ratio/region_mean": 0.03142118687974289, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.35, + "completions/mean_length": 47.55625114440918, + "completions/min_length": 33.85, + "epoch": 0.0437426651018884, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.55621862411499, + "kl": 0.3494168357923627, + "learning_rate": 2.4289099526066354e-05, + "loss": 0.00867505818605423, + "reward": -0.0051323559135198595, + "reward_std": 0.10740983188152313, + "rewards/ASRWerHalluLenRewardV5/mean": -0.005132353771477938, + "rewards/ASRWerHalluLenRewardV5/std": 0.44295769110322, + "step": 410, + "step_time": 31.79921675771475 + }, + { + "clip_ratio/high_max": 0.023078950896160678, + "clip_ratio/high_mean": 0.012992559345730115, + "clip_ratio/low_mean": 0.018133472124463878, + "clip_ratio/low_min": 0.007470440526958555, + "clip_ratio/region_mean": 0.03112603162880987, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.25, + "completions/mean_length": 47.253125953674314, + "completions/min_length": 29.9, + "epoch": 0.04427611223727729, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37740325927734375, + "kl": 0.23244924512691795, + "learning_rate": 2.458530805687204e-05, + "loss": 0.013270987570285797, + "reward": 0.08285031840205193, + "reward_std": 0.11080278344452381, + "rewards/ASRWerHalluLenRewardV5/mean": 0.08285032534040511, + "rewards/ASRWerHalluLenRewardV5/std": 0.45122388899326327, + "step": 415, + "step_time": 32.04004750773311 + }, + { + "clip_ratio/high_max": 0.024658491817535832, + "clip_ratio/high_mean": 0.01340955867926823, + "clip_ratio/low_mean": 0.019599541492061688, + "clip_ratio/low_min": 0.006757633405504749, + "clip_ratio/region_mean": 0.03300910049583763, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.75, + "completions/mean_length": 44.9239595413208, + "completions/min_length": 27.3, + "epoch": 0.04480955937266617, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5605917572975159, + "kl": 0.20311314500868322, + "learning_rate": 2.4881516587677726e-05, + "loss": 0.0067875199019908905, + "reward": -0.04388252012431622, + "reward_std": 0.11759737655520439, + "rewards/ASRWerHalluLenRewardV5/mean": -0.04388252152130008, + "rewards/ASRWerHalluLenRewardV5/std": 0.41899975538253786, + "step": 420, + "step_time": 31.29044296257198 + }, + { + "epoch": 0.04480955937266617, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 58.91549295774648, + "eval_completions/mean_length": 46.96948478591274, + "eval_completions/min_length": 33.66197183098591, + "eval_frac_reward_zero_std": 0.12206573133737268, + "eval_kl": 0.11346675788747593, + "eval_loss": 0.010507900267839432, + "eval_reward": 0.5782932346345673, + "eval_reward_std": 0.07157933750045552, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.578293207704916, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.31850574826690514, + "eval_runtime": 346.1836, + "eval_samples_per_second": 0.618, + "eval_steps_per_second": 0.052, + "step": 420 + }, + { + "clip_ratio/high_max": 0.023625252000056208, + "clip_ratio/high_mean": 0.01384461446723435, + "clip_ratio/low_mean": 0.021029075907426885, + "clip_ratio/low_min": 0.007703452298301272, + "clip_ratio/region_mean": 0.0348736904328689, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.95, + "completions/mean_length": 49.16250114440918, + "completions/min_length": 34.85, + "epoch": 0.045343006508055055, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3688032925128937, + "kl": 0.19539985093288123, + "learning_rate": 2.5177725118483415e-05, + "loss": 0.01282842755317688, + "reward": -0.016619089245796203, + "reward_std": 0.11938627250492573, + "rewards/ASRWerHalluLenRewardV5/mean": -0.016619088873267174, + "rewards/ASRWerHalluLenRewardV5/std": 0.3780877277255058, + "step": 425, + "step_time": 31.22844870686531 + }, + { + "clip_ratio/high_max": 0.019221331737935544, + "clip_ratio/high_mean": 0.010577328714134637, + "clip_ratio/low_mean": 0.018666726097580976, + "clip_ratio/low_min": 0.0046923253161367025, + "clip_ratio/region_mean": 0.029244054667651655, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.95, + "completions/mean_length": 46.58229284286499, + "completions/min_length": 31.6, + "epoch": 0.045876453643443935, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4987526535987854, + "kl": 0.14921191981993615, + "learning_rate": 2.54739336492891e-05, + "loss": 0.010601651668548585, + "reward": 0.04847665634006262, + "reward_std": 0.12293884567916394, + "rewards/ASRWerHalluLenRewardV5/mean": 0.04847665629349649, + "rewards/ASRWerHalluLenRewardV5/std": 0.4166228473186493, + "step": 430, + "step_time": 29.818835356086492 + }, + { + "clip_ratio/high_max": 0.02378420829772949, + "clip_ratio/high_mean": 0.013653012015856803, + "clip_ratio/low_mean": 0.019039018265902995, + "clip_ratio/low_min": 0.004684268415439874, + "clip_ratio/region_mean": 0.03269203067757189, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.2, + "completions/mean_length": 47.91250095367432, + "completions/min_length": 31.4, + "epoch": 0.046409900778832815, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3795550465583801, + "kl": 0.16458330545574426, + "learning_rate": 2.5770142180094787e-05, + "loss": 0.0008937795646488667, + "reward": -0.005974165722727775, + "reward_std": 0.12296998873353004, + "rewards/ASRWerHalluLenRewardV5/mean": -0.005974165536463261, + "rewards/ASRWerHalluLenRewardV5/std": 0.39773628413677214, + "step": 435, + "step_time": 30.45902319829911 + }, + { + "clip_ratio/high_max": 0.022438760905060917, + "clip_ratio/high_mean": 0.01186781948490534, + "clip_ratio/low_mean": 0.02115765865892172, + "clip_ratio/low_min": 0.007555329249589704, + "clip_ratio/region_mean": 0.03302547838538885, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.35, + "completions/mean_length": 49.83229274749756, + "completions/min_length": 33.1, + "epoch": 0.0469433479142217, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.373346209526062, + "kl": 0.2025768463499844, + "learning_rate": 2.6066350710900477e-05, + "loss": 0.02029934525489807, + "reward": 0.03242668695747852, + "reward_std": 0.12283045016229152, + "rewards/ASRWerHalluLenRewardV5/mean": 0.03242668611928821, + "rewards/ASRWerHalluLenRewardV5/std": 0.40938481986522673, + "step": 440, + "step_time": 31.67016608826816 + }, + { + "epoch": 0.0469433479142217, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.859154929577464, + "eval_completions/mean_length": 46.3826303616376, + "eval_completions/min_length": 33.225352112676056, + "eval_frac_reward_zero_std": 0.15023474626138178, + "eval_kl": 0.15344341440309942, + "eval_loss": 0.009737945161759853, + "eval_reward": 0.5953883403505553, + "eval_reward_std": 0.07476377527190137, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.5953883190809841, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3165517322928973, + "eval_runtime": 345.0557, + "eval_samples_per_second": 0.62, + "eval_steps_per_second": 0.052, + "step": 440 + }, + { + "clip_ratio/high_max": 0.022583939717151225, + "clip_ratio/high_mean": 0.011597000442270656, + "clip_ratio/low_mean": 0.01872272643086035, + "clip_ratio/low_min": 0.005172610521549359, + "clip_ratio/region_mean": 0.030319726816378532, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.75, + "completions/mean_length": 45.12812623977661, + "completions/min_length": 27.9, + "epoch": 0.04747679504961058, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3716302812099457, + "kl": 0.23840653942897916, + "learning_rate": 2.6362559241706163e-05, + "loss": 0.005148108303546906, + "reward": 0.06910029612481594, + "reward_std": 0.11296849399805069, + "rewards/ASRWerHalluLenRewardV5/mean": 0.06910029212012887, + "rewards/ASRWerHalluLenRewardV5/std": 0.4411710128188133, + "step": 445, + "step_time": 30.790177465230226 + }, + { + "clip_ratio/high_max": 0.02656098037259653, + "clip_ratio/high_mean": 0.01469047371065244, + "clip_ratio/low_mean": 0.01919944689143449, + "clip_ratio/low_min": 0.006834919773973524, + "clip_ratio/region_mean": 0.033889920660294594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.55, + "completions/mean_length": 45.8052095413208, + "completions/min_length": 34.3, + "epoch": 0.04801024218499947, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3632916510105133, + "kl": 0.24269365393556655, + "learning_rate": 2.665876777251185e-05, + "loss": -0.012752650678157807, + "reward": 0.024399829655885698, + "reward_std": 0.12994402945041655, + "rewards/ASRWerHalluLenRewardV5/mean": 0.02439982946962118, + "rewards/ASRWerHalluLenRewardV5/std": 0.40448786318302155, + "step": 450, + "step_time": 31.38805052600801 + }, + { + "clip_ratio/high_max": 0.021792676305631177, + "clip_ratio/high_mean": 0.011624695977661758, + "clip_ratio/low_mean": 0.01786996170121711, + "clip_ratio/low_min": 0.005609939387068152, + "clip_ratio/region_mean": 0.02949465757701546, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.55, + "completions/mean_length": 47.97500114440918, + "completions/min_length": 34.2, + "epoch": 0.04854368932038835, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5479908585548401, + "kl": 0.1969360401853919, + "learning_rate": 2.6954976303317538e-05, + "loss": -0.0028670065104961394, + "reward": 0.08409808259457349, + "reward_std": 0.11603860892355441, + "rewards/ASRWerHalluLenRewardV5/mean": 0.08409808063879609, + "rewards/ASRWerHalluLenRewardV5/std": 0.3969815239310265, + "step": 455, + "step_time": 29.846870595775545 + }, + { + "clip_ratio/high_max": 0.022726294590393083, + "clip_ratio/high_mean": 0.011990346990933176, + "clip_ratio/low_mean": 0.01939607025706209, + "clip_ratio/low_min": 0.006769528414588421, + "clip_ratio/region_mean": 0.03138641705736518, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.2, + "completions/mean_length": 47.78541793823242, + "completions/min_length": 33.4, + "epoch": 0.049077136455777236, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.49995097517967224, + "kl": 0.18993512978777288, + "learning_rate": 2.7251184834123224e-05, + "loss": -0.0003039829432964325, + "reward": -0.03299487568438053, + "reward_std": 0.12146597243845463, + "rewards/ASRWerHalluLenRewardV5/mean": -0.03299487535841763, + "rewards/ASRWerHalluLenRewardV5/std": 0.38830015286803243, + "step": 460, + "step_time": 30.284812232479453 + }, + { + "epoch": 0.049077136455777236, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 58.80281690140845, + "eval_completions/mean_length": 46.876761906583546, + "eval_completions/min_length": 33.732394366197184, + "eval_frac_reward_zero_std": 0.19718310446806356, + "eval_kl": 0.1347118248231709, + "eval_loss": 0.009791980497539043, + "eval_reward": 0.5942439833183734, + "eval_reward_std": 0.059676058091957807, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.5942439609666316, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3110725814345437, + "eval_runtime": 343.282, + "eval_samples_per_second": 0.623, + "eval_steps_per_second": 0.052, + "step": 460 + }, + { + "clip_ratio/high_max": 0.02232023217366077, + "clip_ratio/high_mean": 0.01251423822977813, + "clip_ratio/low_mean": 0.01803350745467469, + "clip_ratio/low_min": 0.006281549340928905, + "clip_ratio/region_mean": 0.03054774572374299, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.6, + "completions/mean_length": 46.00416774749756, + "completions/min_length": 28.3, + "epoch": 0.049610583591166116, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4805825650691986, + "kl": 0.19468484641984105, + "learning_rate": 2.7547393364928913e-05, + "loss": 0.01194530948996544, + "reward": 0.035513670183718204, + "reward_std": 0.14877184852957726, + "rewards/ASRWerHalluLenRewardV5/mean": 0.03551367241889238, + "rewards/ASRWerHalluLenRewardV5/std": 0.38714921548962594, + "step": 465, + "step_time": 29.962684253044426 + }, + { + "clip_ratio/high_max": 0.019969686993863433, + "clip_ratio/high_mean": 0.01097516220761463, + "clip_ratio/low_mean": 0.016335581699968314, + "clip_ratio/low_min": 0.004745208163512871, + "clip_ratio/region_mean": 0.02731074399780482, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.95, + "completions/mean_length": 49.19166774749756, + "completions/min_length": 33.1, + "epoch": 0.050144030726554996, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4672549068927765, + "kl": 0.23661346053704618, + "learning_rate": 2.78436018957346e-05, + "loss": -0.008102209120988847, + "reward": 0.10590604916214943, + "reward_std": 0.11388203166425229, + "rewards/ASRWerHalluLenRewardV5/mean": 0.1059060497675091, + "rewards/ASRWerHalluLenRewardV5/std": 0.4313781410455704, + "step": 470, + "step_time": 30.783540468476712 + }, + { + "clip_ratio/high_max": 0.02041216222860385, + "clip_ratio/high_mean": 0.010961738960759249, + "clip_ratio/low_mean": 0.01792231215222273, + "clip_ratio/low_min": 0.006508771487278864, + "clip_ratio/region_mean": 0.028884051309432834, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.7, + "completions/mean_length": 47.33125114440918, + "completions/min_length": 32.8, + "epoch": 0.05067747786194388, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37458136677742004, + "kl": 0.2553060230799019, + "learning_rate": 2.8139810426540285e-05, + "loss": 0.010386411845684052, + "reward": 0.14226419553160669, + "reward_std": 0.11224400326609611, + "rewards/ASRWerHalluLenRewardV5/mean": 0.14226419541519136, + "rewards/ASRWerHalluLenRewardV5/std": 0.47602072209119795, + "step": 475, + "step_time": 31.8354235785082 + }, + { + "clip_ratio/high_max": 0.022367299522738904, + "clip_ratio/high_mean": 0.01169821156654507, + "clip_ratio/low_mean": 0.018003789155045524, + "clip_ratio/low_min": 0.004863916418980807, + "clip_ratio/region_mean": 0.029702000902034342, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.5, + "completions/mean_length": 48.95312614440918, + "completions/min_length": 33.85, + "epoch": 0.05121092499733276, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3145526945590973, + "kl": 0.22688735136762261, + "learning_rate": 2.843601895734597e-05, + "loss": 0.001993742398917675, + "reward": 0.05832275003194809, + "reward_std": 0.10745758637785911, + "rewards/ASRWerHalluLenRewardV5/mean": 0.05832275040447712, + "rewards/ASRWerHalluLenRewardV5/std": 0.4678617656230927, + "step": 480, + "step_time": 32.044637398980555 + }, + { + "epoch": 0.05121092499733276, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.225352112676056, + "eval_completions/mean_length": 46.11032982947121, + "eval_completions/min_length": 33.267605633802816, + "eval_frac_reward_zero_std": 0.17370892452522063, + "eval_kl": 0.16719598008293501, + "eval_loss": 0.00801607221364975, + "eval_reward": 0.6130606065123854, + "eval_reward_std": 0.05823775193273601, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6130605885680293, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.30576335893235573, + "eval_runtime": 340.2459, + "eval_samples_per_second": 0.629, + "eval_steps_per_second": 0.053, + "step": 480 + }, + { + "clip_ratio/high_max": 0.02011008468689397, + "clip_ratio/high_mean": 0.010500257048988715, + "clip_ratio/low_mean": 0.018889597116503863, + "clip_ratio/low_min": 0.0072883399319835, + "clip_ratio/region_mean": 0.029389854148030282, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/mean_length": 46.26354293823242, + "completions/min_length": 29.3, + "epoch": 0.05174437213272165, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5715278387069702, + "kl": 0.254672125633806, + "learning_rate": 2.873222748815166e-05, + "loss": 0.014625081419944763, + "reward": 0.06587618384510278, + "reward_std": 0.11775341294705868, + "rewards/ASRWerHalluLenRewardV5/mean": 0.06587618961930275, + "rewards/ASRWerHalluLenRewardV5/std": 0.4106108158826828, + "step": 485, + "step_time": 29.63619227670133 + }, + { + "clip_ratio/high_max": 0.021401180032989943, + "clip_ratio/high_mean": 0.013180395000381396, + "clip_ratio/low_mean": 0.01715761163504794, + "clip_ratio/low_min": 0.005410656108870171, + "clip_ratio/region_mean": 0.030338006385136396, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.3, + "completions/mean_length": 49.27187614440918, + "completions/min_length": 32.6, + "epoch": 0.05227781926811053, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3949529528617859, + "kl": 0.23177626878023147, + "learning_rate": 2.902843601895735e-05, + "loss": -0.0017434362322092055, + "reward": 0.13056654743850232, + "reward_std": 0.11439156010746956, + "rewards/ASRWerHalluLenRewardV5/mean": 0.13056654911488294, + "rewards/ASRWerHalluLenRewardV5/std": 0.3734717883169651, + "step": 490, + "step_time": 34.21747261285782 + }, + { + "clip_ratio/high_max": 0.022223599202698098, + "clip_ratio/high_mean": 0.011819998377177399, + "clip_ratio/low_mean": 0.01748253827681765, + "clip_ratio/low_min": 0.006578187650302425, + "clip_ratio/region_mean": 0.029302537138573826, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.55, + "completions/mean_length": 48.48541812896728, + "completions/min_length": 32.05, + "epoch": 0.05281126640349941, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3916182816028595, + "kl": 0.23992092795670034, + "learning_rate": 2.9324644549763035e-05, + "loss": 0.007800701260566712, + "reward": 0.12842706292867662, + "reward_std": 0.10616274774074555, + "rewards/ASRWerHalluLenRewardV5/mean": 0.12842705845832825, + "rewards/ASRWerHalluLenRewardV5/std": 0.40283975303173064, + "step": 495, + "step_time": 32.02820341605693 + }, + { + "clip_ratio/high_max": 0.023133136573596856, + "clip_ratio/high_mean": 0.012632602827216033, + "clip_ratio/low_mean": 0.019131099435617215, + "clip_ratio/low_min": 0.007259987416910007, + "clip_ratio/region_mean": 0.03176370224682614, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/mean_length": 51.588542747497556, + "completions/min_length": 38.45, + "epoch": 0.0533447135388883, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3686841130256653, + "kl": 0.26571566788479684, + "learning_rate": 2.962085308056872e-05, + "loss": 0.0011999200098216534, + "reward": -0.0060242898762226105, + "reward_std": 0.10107758715748787, + "rewards/ASRWerHalluLenRewardV5/mean": -0.006024285918101668, + "rewards/ASRWerHalluLenRewardV5/std": 0.35764608792960645, + "step": 500, + "step_time": 32.839676479436456 + }, + { + "epoch": 0.0533447135388883, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.15492957746479, + "eval_completions/mean_length": 45.971832221662495, + "eval_completions/min_length": 33.12676056338028, + "eval_frac_reward_zero_std": 0.18779343240697619, + "eval_kl": 0.1653635070829744, + "eval_loss": 0.008429245091974735, + "eval_reward": 0.6143593808085146, + "eval_reward_std": 0.05940043864588083, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.614359366956731, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3030868285191311, + "eval_runtime": 338.4359, + "eval_samples_per_second": 0.632, + "eval_steps_per_second": 0.053, + "step": 500 + }, + { + "clip_ratio/high_max": 0.02237925141234882, + "clip_ratio/high_mean": 0.012502750697603916, + "clip_ratio/low_mean": 0.01817481203470379, + "clip_ratio/low_min": 0.006049407494720072, + "clip_ratio/region_mean": 0.030677563120843843, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.25, + "completions/mean_length": 48.71354312896729, + "completions/min_length": 33.75, + "epoch": 0.05387816067427718, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.388864666223526, + "kl": 0.23934099152684213, + "learning_rate": 2.9917061611374407e-05, + "loss": 0.001282734703272581, + "reward": 0.11967640332877635, + "reward_std": 0.11058448925614357, + "rewards/ASRWerHalluLenRewardV5/mean": 0.11967639785725623, + "rewards/ASRWerHalluLenRewardV5/std": 0.3519856005907059, + "step": 505, + "step_time": 31.853093682602047 + }, + { + "clip_ratio/high_max": 0.020004623319255187, + "clip_ratio/high_mean": 0.009935402689734474, + "clip_ratio/low_mean": 0.01798907085758401, + "clip_ratio/low_min": 0.005283119360683486, + "clip_ratio/region_mean": 0.027924473735038192, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.55, + "completions/mean_length": 48.673959732055664, + "completions/min_length": 31.35, + "epoch": 0.054411607809666064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34572818875312805, + "kl": 0.4303630146197975, + "learning_rate": 3.0213270142180093e-05, + "loss": 0.013768720626831054, + "reward": 0.14545488432049752, + "reward_std": 0.10346581041812897, + "rewards/ASRWerHalluLenRewardV5/mean": 0.14545488134026527, + "rewards/ASRWerHalluLenRewardV5/std": 0.3842397391796112, + "step": 510, + "step_time": 30.855621638335286 + }, + { + "clip_ratio/high_max": 0.02286638270888943, + "clip_ratio/high_mean": 0.012302048211859073, + "clip_ratio/low_mean": 0.015255889143736568, + "clip_ratio/low_min": 0.004112986111431382, + "clip_ratio/region_mean": 0.027557937242090703, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.85, + "completions/mean_length": 49.73020973205566, + "completions/min_length": 31.1, + "epoch": 0.054945054945054944, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30727970600128174, + "kl": 0.24140588929876686, + "learning_rate": 3.0509478672985786e-05, + "loss": 0.004695157334208488, + "reward": 0.06268044579774142, + "reward_std": 0.09615515545010567, + "rewards/ASRWerHalluLenRewardV5/mean": 0.06268044412136078, + "rewards/ASRWerHalluLenRewardV5/std": 0.3808781638741493, + "step": 515, + "step_time": 31.538363670557736 + }, + { + "clip_ratio/high_max": 0.023842659662477672, + "clip_ratio/high_mean": 0.013582345431495923, + "clip_ratio/low_mean": 0.017292399761208797, + "clip_ratio/low_min": 0.0058399301022291185, + "clip_ratio/region_mean": 0.030874745117034762, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.5, + "completions/mean_length": 50.02083473205566, + "completions/min_length": 35.95, + "epoch": 0.05547850208044383, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3563966155052185, + "kl": 0.2360831851605326, + "learning_rate": 3.080568720379147e-05, + "loss": 0.009933754056692123, + "reward": 0.09444936420768499, + "reward_std": 0.11002213172614575, + "rewards/ASRWerHalluLenRewardV5/mean": 0.09444936234503984, + "rewards/ASRWerHalluLenRewardV5/std": 0.4029719643294811, + "step": 520, + "step_time": 30.597086461260915 + }, + { + "epoch": 0.05547850208044383, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 58.014084507042256, + "eval_completions/mean_length": 46.51995430529957, + "eval_completions/min_length": 33.32394366197183, + "eval_frac_reward_zero_std": 0.15962441790271814, + "eval_kl": 0.1749434970437326, + "eval_loss": 0.009139284491539001, + "eval_reward": 0.6002996472090902, + "eval_reward_std": 0.056479902570726166, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6002996367153147, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3049096435990552, + "eval_runtime": 344.1699, + "eval_samples_per_second": 0.622, + "eval_steps_per_second": 0.052, + "step": 520 + }, + { + "clip_ratio/high_max": 0.017145914706634357, + "clip_ratio/high_mean": 0.0101197033553035, + "clip_ratio/low_mean": 0.019237367241294122, + "clip_ratio/low_min": 0.00744377423543483, + "clip_ratio/region_mean": 0.029357070790138096, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.15, + "completions/mean_length": 47.69166774749756, + "completions/min_length": 32.35, + "epoch": 0.05601194921583271, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.48511844873428345, + "kl": 0.2255495408549905, + "learning_rate": 3.110189573459716e-05, + "loss": 0.01693311631679535, + "reward": 0.1569930087774992, + "reward_std": 0.11556710414588452, + "rewards/ASRWerHalluLenRewardV5/mean": 0.1569930107332766, + "rewards/ASRWerHalluLenRewardV5/std": 0.4239355683326721, + "step": 525, + "step_time": 32.68271917812526 + }, + { + "clip_ratio/high_max": 0.020851266343379395, + "clip_ratio/high_mean": 0.011467358330264688, + "clip_ratio/low_mean": 0.019025085198518355, + "clip_ratio/low_min": 0.0061263565585250035, + "clip_ratio/region_mean": 0.030492443667026237, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.8, + "completions/mean_length": 46.35000095367432, + "completions/min_length": 26.45, + "epoch": 0.05654539635122159, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5144731402397156, + "kl": 0.23321051318198444, + "learning_rate": 3.1398104265402844e-05, + "loss": 0.008485667407512665, + "reward": 0.015590917132794858, + "reward_std": 0.14634969793260097, + "rewards/ASRWerHalluLenRewardV5/mean": 0.015590915840584784, + "rewards/ASRWerHalluLenRewardV5/std": 0.4242253445088863, + "step": 530, + "step_time": 30.899163778871298 + }, + { + "clip_ratio/high_max": 0.022902045625960453, + "clip_ratio/high_mean": 0.012612689939851407, + "clip_ratio/low_mean": 0.0180656985292444, + "clip_ratio/low_min": 0.006787292886292562, + "clip_ratio/region_mean": 0.030678388496744445, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.9, + "completions/mean_length": 48.898959732055665, + "completions/min_length": 34.2, + "epoch": 0.05707884348661048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31684932112693787, + "kl": 0.25611695311963556, + "learning_rate": 3.169431279620853e-05, + "loss": -0.0025979544967412948, + "reward": 0.017908305488526823, + "reward_std": 0.10733051560819148, + "rewards/ASRWerHalluLenRewardV5/mean": 0.017908297944813967, + "rewards/ASRWerHalluLenRewardV5/std": 0.37228437289595606, + "step": 535, + "step_time": 32.085447635687885 + }, + { + "clip_ratio/high_max": 0.0234123777190689, + "clip_ratio/high_mean": 0.012209173294832, + "clip_ratio/low_mean": 0.020918242231709884, + "clip_ratio/low_min": 0.007252169528510421, + "clip_ratio/region_mean": 0.033127415389753875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.2, + "completions/mean_length": 50.492709922790525, + "completions/min_length": 37.8, + "epoch": 0.05761229062199936, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.526032030582428, + "kl": 0.32151210829615595, + "learning_rate": 3.1990521327014215e-05, + "loss": 0.006808710098266601, + "reward": 0.0778658889234066, + "reward_std": 0.10306876935064793, + "rewards/ASRWerHalluLenRewardV5/mean": 0.07786588091403246, + "rewards/ASRWerHalluLenRewardV5/std": 0.5161729797720909, + "step": 540, + "step_time": 31.058302187733354 + }, + { + "epoch": 0.05761229062199936, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 58.28169014084507, + "eval_completions/mean_length": 46.65258332373391, + "eval_completions/min_length": 33.50704225352113, + "eval_frac_reward_zero_std": 0.15492958208204996, + "eval_kl": 0.19839437691573525, + "eval_loss": 0.01148261222988367, + "eval_reward": 0.6110966547169316, + "eval_reward_std": 0.06699898898024374, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6110966390287372, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.30695183816510185, + "eval_runtime": 342.2911, + "eval_samples_per_second": 0.625, + "eval_steps_per_second": 0.053, + "step": 540 + }, + { + "clip_ratio/high_max": 0.02214470295002684, + "clip_ratio/high_mean": 0.012299771278048866, + "clip_ratio/low_mean": 0.018799011953524312, + "clip_ratio/low_min": 0.007060754502890632, + "clip_ratio/region_mean": 0.031098783103516325, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.2, + "completions/mean_length": 47.69166812896729, + "completions/min_length": 28.65, + "epoch": 0.058145737757388245, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8558627963066101, + "kl": 0.25110821733251215, + "learning_rate": 3.228672985781991e-05, + "loss": 0.0024886054918169974, + "reward": 0.07034483924508095, + "reward_std": 0.13485789038240908, + "rewards/ASRWerHalluLenRewardV5/mean": 0.07034483167808503, + "rewards/ASRWerHalluLenRewardV5/std": 0.34680956080555914, + "step": 545, + "step_time": 31.20156016293913 + }, + { + "clip_ratio/high_max": 0.018717341142473743, + "clip_ratio/high_mean": 0.010609136053244584, + "clip_ratio/low_mean": 0.019510470825480297, + "clip_ratio/low_min": 0.0075616588903358204, + "clip_ratio/region_mean": 0.030119607085362078, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.05, + "completions/mean_length": 47.132292938232425, + "completions/min_length": 33.85, + "epoch": 0.058679184892777125, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3625843822956085, + "kl": 0.21931314589455725, + "learning_rate": 3.2582938388625594e-05, + "loss": 0.024904891848564148, + "reward": 0.06367945168167352, + "reward_std": 0.10097450762987137, + "rewards/ASRWerHalluLenRewardV5/mean": 0.06367944905068726, + "rewards/ASRWerHalluLenRewardV5/std": 0.44359691590070727, + "step": 550, + "step_time": 30.88075897227973 + }, + { + "clip_ratio/high_max": 0.020317778023309075, + "clip_ratio/high_mean": 0.01064244468434481, + "clip_ratio/low_mean": 0.017068463723990134, + "clip_ratio/low_min": 0.00525764758058358, + "clip_ratio/region_mean": 0.027710908360313626, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.05, + "completions/mean_length": 44.7270845413208, + "completions/min_length": 27.8, + "epoch": 0.059212632028166005, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5463204383850098, + "kl": 0.24373649745248258, + "learning_rate": 3.287914691943128e-05, + "loss": 0.01165289506316185, + "reward": 0.10453543402254581, + "reward_std": 0.12937226071953772, + "rewards/ASRWerHalluLenRewardV5/mean": 0.10453543420881033, + "rewards/ASRWerHalluLenRewardV5/std": 0.35504634156823156, + "step": 555, + "step_time": 29.472501922957598 + }, + { + "clip_ratio/high_max": 0.022973934857873246, + "clip_ratio/high_mean": 0.011605412187054753, + "clip_ratio/low_mean": 0.016265330405440183, + "clip_ratio/low_min": 0.00464172323117964, + "clip_ratio/region_mean": 0.027870742371305823, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.75, + "completions/mean_length": 48.355209541320804, + "completions/min_length": 32.5, + "epoch": 0.05974607916355489, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5564252734184265, + "kl": 0.2501982622779906, + "learning_rate": 3.3175355450236966e-05, + "loss": 0.003585268929600716, + "reward": 0.17633956652134658, + "reward_std": 0.11173606254160404, + "rewards/ASRWerHalluLenRewardV5/mean": 0.1763395685236901, + "rewards/ASRWerHalluLenRewardV5/std": 0.4262539066374302, + "step": 560, + "step_time": 32.178631962649526 + }, + { + "epoch": 0.05974607916355489, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.971830985915496, + "eval_completions/mean_length": 46.22065847692355, + "eval_completions/min_length": 33.2112676056338, + "eval_frac_reward_zero_std": 0.15492958208204996, + "eval_kl": 0.1760966396615119, + "eval_loss": 0.009206665679812431, + "eval_reward": 0.6047271561979408, + "eval_reward_std": 0.06792651105698355, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6047271396177756, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3119017601957623, + "eval_runtime": 342.5386, + "eval_samples_per_second": 0.625, + "eval_steps_per_second": 0.053, + "step": 560 + }, + { + "clip_ratio/high_max": 0.02162550835637376, + "clip_ratio/high_mean": 0.01156936058687279, + "clip_ratio/low_mean": 0.018173534068046136, + "clip_ratio/low_min": 0.006049840053310618, + "clip_ratio/region_mean": 0.02974289432168007, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.35, + "completions/mean_length": 45.25000171661377, + "completions/min_length": 28.95, + "epoch": 0.06027952629894377, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30521613359451294, + "kl": 0.29180194567888973, + "learning_rate": 3.347156398104265e-05, + "loss": 0.015754535794258118, + "reward": 0.12280540987849235, + "reward_std": 0.11511376984417439, + "rewards/ASRWerHalluLenRewardV5/mean": 0.122805406129919, + "rewards/ASRWerHalluLenRewardV5/std": 0.45000745356082916, + "step": 565, + "step_time": 29.867732682451607 + }, + { + "clip_ratio/high_max": 0.022138673171866685, + "clip_ratio/high_mean": 0.011694567019003444, + "clip_ratio/low_mean": 0.016036504310613965, + "clip_ratio/low_min": 0.005975709049380384, + "clip_ratio/region_mean": 0.027731071243761108, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.6, + "completions/mean_length": 45.53229293823242, + "completions/min_length": 31.05, + "epoch": 0.06081297343433266, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3126011788845062, + "kl": 0.26097582932561636, + "learning_rate": 3.3767772511848345e-05, + "loss": 0.0036033987998962402, + "reward": 0.18469350170344115, + "reward_std": 0.11166517548263073, + "rewards/ASRWerHalluLenRewardV5/mean": 0.18469348791986703, + "rewards/ASRWerHalluLenRewardV5/std": 0.36749946102499964, + "step": 570, + "step_time": 30.009296764060856 + }, + { + "clip_ratio/high_max": 0.02241293775732629, + "clip_ratio/high_mean": 0.012406452524010092, + "clip_ratio/low_mean": 0.01979699451767374, + "clip_ratio/low_min": 0.00811659214377869, + "clip_ratio/region_mean": 0.03220344693399966, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.1, + "completions/mean_length": 46.711459732055665, + "completions/min_length": 31.95, + "epoch": 0.06134642056972154, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35406821966171265, + "kl": 0.2928049575537443, + "learning_rate": 3.406398104265403e-05, + "loss": 0.016846334934234618, + "reward": 0.03556577600538731, + "reward_std": 0.11627664081752301, + "rewards/ASRWerHalluLenRewardV5/mean": 0.03556578052230179, + "rewards/ASRWerHalluLenRewardV5/std": 0.3932749845087528, + "step": 575, + "step_time": 30.090068272314966 + }, + { + "clip_ratio/high_max": 0.022125113243237137, + "clip_ratio/high_mean": 0.012193679723714013, + "clip_ratio/low_mean": 0.016626471566269174, + "clip_ratio/low_min": 0.005476440221536904, + "clip_ratio/region_mean": 0.028820151509717108, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.7, + "completions/mean_length": 48.64687633514404, + "completions/min_length": 34.15, + "epoch": 0.061879867705110427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37835636734962463, + "kl": 0.33047946756705643, + "learning_rate": 3.4360189573459716e-05, + "loss": -0.0008954043500125408, + "reward": 0.1679516418837011, + "reward_std": 0.11390085481107234, + "rewards/ASRWerHalluLenRewardV5/mean": 0.16795163745991887, + "rewards/ASRWerHalluLenRewardV5/std": 0.3749282471835613, + "step": 580, + "step_time": 31.80127391256392 + }, + { + "epoch": 0.061879867705110427, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 56.87323943661972, + "eval_completions/mean_length": 45.80399172071, + "eval_completions/min_length": 33.056338028169016, + "eval_frac_reward_zero_std": 0.19248826864739538, + "eval_kl": 0.2201577213028787, + "eval_loss": 0.010908209718763828, + "eval_reward": 0.6271352848877579, + "eval_reward_std": 0.058539655492444276, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6271352792604709, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.2892845453773166, + "eval_runtime": 337.1758, + "eval_samples_per_second": 0.635, + "eval_steps_per_second": 0.053, + "step": 580 + }, + { + "clip_ratio/high_max": 0.022696605493547396, + "clip_ratio/high_mean": 0.011661511068814434, + "clip_ratio/low_mean": 0.020619381923461334, + "clip_ratio/low_min": 0.0059104506799485534, + "clip_ratio/region_mean": 0.03228089300682768, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.05, + "completions/mean_length": 49.43437671661377, + "completions/min_length": 37.9, + "epoch": 0.06241331484049931, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3230418264865875, + "kl": 0.33794138887897135, + "learning_rate": 3.46563981042654e-05, + "loss": 0.003951956331729889, + "reward": 0.06044807955622673, + "reward_std": 0.10587166510522365, + "rewards/ASRWerHalluLenRewardV5/mean": 0.06044808067381382, + "rewards/ASRWerHalluLenRewardV5/std": 0.41734456941485404, + "step": 585, + "step_time": 32.79560173656792 + }, + { + "clip_ratio/high_max": 0.023281613289145752, + "clip_ratio/high_mean": 0.013538176465954167, + "clip_ratio/low_mean": 0.016996020084479823, + "clip_ratio/low_min": 0.006026901886798441, + "clip_ratio/region_mean": 0.030534196575172246, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.65, + "completions/mean_length": 49.23437614440918, + "completions/min_length": 33.6, + "epoch": 0.0629467619758882, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34404468536376953, + "kl": 0.27442721351981164, + "learning_rate": 3.495260663507109e-05, + "loss": 0.002434891276061535, + "reward": 0.03321332037448883, + "reward_std": 0.10626997910439968, + "rewards/ASRWerHalluLenRewardV5/mean": 0.03321332158520818, + "rewards/ASRWerHalluLenRewardV5/std": 0.3947689987719059, + "step": 590, + "step_time": 31.47174674924463 + }, + { + "clip_ratio/high_max": 0.017913641640916467, + "clip_ratio/high_mean": 0.009397941317001824, + "clip_ratio/low_mean": 0.01739924797147978, + "clip_ratio/low_min": 0.005894706974504516, + "clip_ratio/region_mean": 0.026797189307399095, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.8, + "completions/mean_length": 46.368751525878906, + "completions/min_length": 31.55, + "epoch": 0.06348020911127707, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27799081802368164, + "kl": 0.23883347548544406, + "learning_rate": 3.524881516587678e-05, + "loss": 0.008988563716411591, + "reward": 0.14834741353988648, + "reward_std": 0.10753183960914611, + "rewards/ASRWerHalluLenRewardV5/mean": 0.14834740478545427, + "rewards/ASRWerHalluLenRewardV5/std": 0.40600508749485015, + "step": 595, + "step_time": 29.818507570773363 + }, + { + "clip_ratio/high_max": 0.023492327047279105, + "clip_ratio/high_mean": 0.01371819442283595, + "clip_ratio/low_mean": 0.01634126669669058, + "clip_ratio/low_min": 0.005320473399478942, + "clip_ratio/region_mean": 0.030059461155906318, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.85, + "completions/mean_length": 46.566668319702146, + "completions/min_length": 31.6, + "epoch": 0.06401365624666595, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.464296817779541, + "kl": 0.27970629930496216, + "learning_rate": 3.554502369668247e-05, + "loss": 6.852051010355353e-05, + "reward": 0.15903657171875238, + "reward_std": 0.1259873604401946, + "rewards/ASRWerHalluLenRewardV5/mean": 0.15903656566515564, + "rewards/ASRWerHalluLenRewardV5/std": 0.3648593336343765, + "step": 600, + "step_time": 30.03790509160608 + }, + { + "epoch": 0.06401365624666595, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.25352112676056, + "eval_completions/mean_length": 46.002348483448294, + "eval_completions/min_length": 33.19718309859155, + "eval_frac_reward_zero_std": 0.17840376118539084, + "eval_kl": 0.18258340270455245, + "eval_loss": 0.010698338970541954, + "eval_reward": 0.6093262612662265, + "eval_reward_std": 0.06051116093048747, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6093262411706465, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.31031052204190007, + "eval_runtime": 338.68, + "eval_samples_per_second": 0.632, + "eval_steps_per_second": 0.053, + "step": 600 + }, + { + "clip_ratio/high_max": 0.021194024936994536, + "clip_ratio/high_mean": 0.012150171425309964, + "clip_ratio/low_mean": 0.01850542209431296, + "clip_ratio/low_min": 0.006444886967074126, + "clip_ratio/region_mean": 0.03065559348324314, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.15, + "completions/mean_length": 42.68541831970215, + "completions/min_length": 24.65, + "epoch": 0.06454710338205484, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4192546010017395, + "kl": 0.289040894433856, + "learning_rate": 3.584123222748815e-05, + "loss": 0.0019285209476947785, + "reward": -0.0017972267232835293, + "reward_std": 0.10799319669604301, + "rewards/ASRWerHalluLenRewardV5/mean": -0.0017972231842577457, + "rewards/ASRWerHalluLenRewardV5/std": 0.35183509811758995, + "step": 605, + "step_time": 29.640504439361393 + }, + { + "clip_ratio/high_max": 0.019081020989688113, + "clip_ratio/high_mean": 0.009718730603344739, + "clip_ratio/low_mean": 0.01811964936205186, + "clip_ratio/low_min": 0.005137977050617337, + "clip_ratio/region_mean": 0.027838379959575832, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.4, + "completions/mean_length": 51.38541793823242, + "completions/min_length": 37.65, + "epoch": 0.06508055051744373, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3378324508666992, + "kl": 0.2475280987098813, + "learning_rate": 3.613744075829384e-05, + "loss": 0.00438508614897728, + "reward": 0.13805835284292697, + "reward_std": 0.09650574084371329, + "rewards/ASRWerHalluLenRewardV5/mean": 0.13805834744125606, + "rewards/ASRWerHalluLenRewardV5/std": 0.48214673697948457, + "step": 610, + "step_time": 32.50655211769045 + }, + { + "clip_ratio/high_max": 0.020472144603263586, + "clip_ratio/high_mean": 0.012043322814861313, + "clip_ratio/low_mean": 0.018588542978977784, + "clip_ratio/low_min": 0.008263707562582568, + "clip_ratio/region_mean": 0.03063186599756591, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.3, + "completions/mean_length": 48.981251335144044, + "completions/min_length": 36.8, + "epoch": 0.0656139976528326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3736710548400879, + "kl": 0.3113519393838942, + "learning_rate": 3.6433649289099525e-05, + "loss": 0.005697961896657944, + "reward": 0.08136609569191933, + "reward_std": 0.10320818349719048, + "rewards/ASRWerHalluLenRewardV5/mean": 0.08136609140783549, + "rewards/ASRWerHalluLenRewardV5/std": 0.39358508959412575, + "step": 615, + "step_time": 30.55725451465696 + }, + { + "clip_ratio/high_max": 0.021487910015275703, + "clip_ratio/high_mean": 0.01250315246579703, + "clip_ratio/low_mean": 0.017770207073772325, + "clip_ratio/low_min": 0.005195159092545509, + "clip_ratio/region_mean": 0.030273359757848083, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.6, + "completions/mean_length": 48.237501335144046, + "completions/min_length": 30.7, + "epoch": 0.06614744478822149, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3358868360519409, + "kl": 0.2702534609474242, + "learning_rate": 3.672985781990522e-05, + "loss": 0.017886409163475038, + "reward": 0.17680366858839988, + "reward_std": 0.1162352355197072, + "rewards/ASRWerHalluLenRewardV5/mean": 0.17680366467684508, + "rewards/ASRWerHalluLenRewardV5/std": 0.3753850318491459, + "step": 620, + "step_time": 30.39419283643365 + }, + { + "epoch": 0.06614744478822149, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.225352112676056, + "eval_completions/mean_length": 46.17253628583022, + "eval_completions/min_length": 33.40845070422535, + "eval_frac_reward_zero_std": 0.18779343240697619, + "eval_kl": 0.17970951594097515, + "eval_loss": 0.01016437727957964, + "eval_reward": 0.6172182555417005, + "eval_reward_std": 0.06420033631033041, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.617218236740626, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3051301239346954, + "eval_runtime": 338.905, + "eval_samples_per_second": 0.631, + "eval_steps_per_second": 0.053, + "step": 620 + }, + { + "clip_ratio/high_max": 0.023225166369229555, + "clip_ratio/high_mean": 0.01188777949573705, + "clip_ratio/low_mean": 0.015994009919813833, + "clip_ratio/low_min": 0.004726081533590332, + "clip_ratio/region_mean": 0.027881789760431274, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.05, + "completions/mean_length": 46.597917556762695, + "completions/min_length": 29.25, + "epoch": 0.06668089192361037, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4108600318431854, + "kl": 0.2664557721465826, + "learning_rate": 3.70260663507109e-05, + "loss": -0.006957245618104934, + "reward": 0.051535621471703055, + "reward_std": 0.11327742673456669, + "rewards/ASRWerHalluLenRewardV5/mean": 0.05153561644256115, + "rewards/ASRWerHalluLenRewardV5/std": 0.4485133022069931, + "step": 625, + "step_time": 30.02546842508018 + }, + { + "clip_ratio/high_max": 0.02274189909803681, + "clip_ratio/high_mean": 0.012955022424284835, + "clip_ratio/low_mean": 0.01606419169984292, + "clip_ratio/low_min": 0.004569841336342506, + "clip_ratio/region_mean": 0.02901921406155452, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.4, + "completions/mean_length": 43.10000104904175, + "completions/min_length": 28.8, + "epoch": 0.06721433905899925, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34273192286491394, + "kl": 0.263345611654222, + "learning_rate": 3.732227488151659e-05, + "loss": 0.0019639894366264344, + "reward": 0.1575345316901803, + "reward_std": 0.10451059266924859, + "rewards/ASRWerHalluLenRewardV5/mean": 0.1575345311546698, + "rewards/ASRWerHalluLenRewardV5/std": 0.4238610237836838, + "step": 630, + "step_time": 30.072968155890702 + }, + { + "clip_ratio/high_max": 0.023623739695176484, + "clip_ratio/high_mean": 0.013532632005808409, + "clip_ratio/low_mean": 0.01934681579004973, + "clip_ratio/low_min": 0.007372171705355868, + "clip_ratio/region_mean": 0.032879447942832485, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.05, + "completions/mean_length": 47.690626335144046, + "completions/min_length": 31.55, + "epoch": 0.06774778619438813, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4913213551044464, + "kl": 0.2581340505741537, + "learning_rate": 3.7618483412322275e-05, + "loss": 0.007709268480539322, + "reward": 0.03469167649745941, + "reward_std": 0.113872379809618, + "rewards/ASRWerHalluLenRewardV5/mean": 0.03469167724251747, + "rewards/ASRWerHalluLenRewardV5/std": 0.40701346546411515, + "step": 635, + "step_time": 30.689919892698526 + }, + { + "clip_ratio/high_max": 0.024002998959622344, + "clip_ratio/high_mean": 0.012184102286119014, + "clip_ratio/low_mean": 0.0167181298325886, + "clip_ratio/low_min": 0.004817312632803805, + "clip_ratio/region_mean": 0.028902231651591136, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.4, + "completions/mean_length": 49.00000114440918, + "completions/min_length": 33.7, + "epoch": 0.06828123332977702, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3530075252056122, + "kl": 0.26517444029450415, + "learning_rate": 3.791469194312796e-05, + "loss": 0.01432054340839386, + "reward": 0.13977719731628896, + "reward_std": 0.11881709694862366, + "rewards/ASRWerHalluLenRewardV5/mean": 0.1397771888819989, + "rewards/ASRWerHalluLenRewardV5/std": 0.3819749429821968, + "step": 640, + "step_time": 32.49274497982115 + }, + { + "epoch": 0.06828123332977702, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.352112676056336, + "eval_completions/mean_length": 46.24882741041586, + "eval_completions/min_length": 33.61971830985915, + "eval_frac_reward_zero_std": 0.14084507462004542, + "eval_kl": 0.17225044060655884, + "eval_loss": 0.008993042632937431, + "eval_reward": 0.6124812587046288, + "eval_reward_std": 0.06617196223659205, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6124812327850033, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.312891285630389, + "eval_runtime": 339.6665, + "eval_samples_per_second": 0.63, + "eval_steps_per_second": 0.053, + "step": 640 + }, + { + "clip_ratio/high_max": 0.019773649278795345, + "clip_ratio/high_mean": 0.010880465205991641, + "clip_ratio/low_mean": 0.018680952524300666, + "clip_ratio/low_min": 0.006559328979346901, + "clip_ratio/region_mean": 0.029561417631339282, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.45, + "completions/mean_length": 47.70625171661377, + "completions/min_length": 28.05, + "epoch": 0.06881468046516591, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6145163178443909, + "kl": 0.32210920825600625, + "learning_rate": 3.8210900473933654e-05, + "loss": 0.009874771535396575, + "reward": 0.24261164106428623, + "reward_std": 0.10380438826978207, + "rewards/ASRWerHalluLenRewardV5/mean": 0.24261164106428623, + "rewards/ASRWerHalluLenRewardV5/std": 0.3922431580722332, + "step": 645, + "step_time": 31.97321789432317 + }, + { + "clip_ratio/high_max": 0.02201369690010324, + "clip_ratio/high_mean": 0.012400516387424431, + "clip_ratio/low_mean": 0.018692718591773884, + "clip_ratio/low_min": 0.007734966833959333, + "clip_ratio/region_mean": 0.031093234941363335, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.95, + "completions/mean_length": 48.857293128967285, + "completions/min_length": 35.55, + "epoch": 0.06934812760055478, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.322584331035614, + "kl": 0.2763761390000582, + "learning_rate": 3.850710900473934e-05, + "loss": 0.01167515441775322, + "reward": 0.08125662840902806, + "reward_std": 0.11672686673700809, + "rewards/ASRWerHalluLenRewardV5/mean": 0.08125663232058286, + "rewards/ASRWerHalluLenRewardV5/std": 0.4137906230986118, + "step": 650, + "step_time": 30.823907271027565 + }, + { + "clip_ratio/high_max": 0.021927651515579783, + "clip_ratio/high_mean": 0.012147185998037458, + "clip_ratio/low_mean": 0.019890290404146072, + "clip_ratio/low_min": 0.00653252428455744, + "clip_ratio/region_mean": 0.032037476370169314, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.05, + "completions/mean_length": 47.81458435058594, + "completions/min_length": 29.7, + "epoch": 0.06988157473594367, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.40638309717178345, + "kl": 0.27620368879288437, + "learning_rate": 3.8803317535545026e-05, + "loss": 0.010351645201444626, + "reward": 0.10779664125293494, + "reward_std": 0.1208714235574007, + "rewards/ASRWerHalluLenRewardV5/mean": 0.10779663855209946, + "rewards/ASRWerHalluLenRewardV5/std": 0.42761506959795953, + "step": 655, + "step_time": 30.567219780758023 + }, + { + "clip_ratio/high_max": 0.022149429825367405, + "clip_ratio/high_mean": 0.013224374095443636, + "clip_ratio/low_mean": 0.020413730310974643, + "clip_ratio/low_min": 0.006731331170885824, + "clip_ratio/region_mean": 0.033638104400597515, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/mean_length": 50.310417556762694, + "completions/min_length": 34.4, + "epoch": 0.07041502187133256, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3853471279144287, + "kl": 0.2613600772805512, + "learning_rate": 3.909952606635071e-05, + "loss": 0.013835923373699188, + "reward": 0.036949272640049456, + "reward_std": 0.11364052295684815, + "rewards/ASRWerHalluLenRewardV5/mean": 0.03694926835596561, + "rewards/ASRWerHalluLenRewardV5/std": 0.35944399386644366, + "step": 660, + "step_time": 30.91094062179327 + }, + { + "epoch": 0.07041502187133256, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.53521126760563, + "eval_completions/mean_length": 46.211269056293325, + "eval_completions/min_length": 33.36619718309859, + "eval_frac_reward_zero_std": 0.12206573133737268, + "eval_kl": 0.16266977401371574, + "eval_loss": 0.009027771651744843, + "eval_reward": 0.6174049089387269, + "eval_reward_std": 0.05449886583316494, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6174048933554703, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3004206627030188, + "eval_runtime": 339.9623, + "eval_samples_per_second": 0.629, + "eval_steps_per_second": 0.053, + "step": 660 + }, + { + "clip_ratio/high_max": 0.02173972667660564, + "clip_ratio/high_mean": 0.012059405412583147, + "clip_ratio/low_mean": 0.020654673600802198, + "clip_ratio/low_min": 0.008156920492183417, + "clip_ratio/region_mean": 0.03271407939027995, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.5, + "completions/mean_length": 48.53750133514404, + "completions/min_length": 31.85, + "epoch": 0.07094846900672143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31291285157203674, + "kl": 0.283885217551142, + "learning_rate": 3.93957345971564e-05, + "loss": 0.014269044995307923, + "reward": -0.009116947278380394, + "reward_std": 0.1084834385663271, + "rewards/ASRWerHalluLenRewardV5/mean": -0.009116948675364256, + "rewards/ASRWerHalluLenRewardV5/std": 0.40135885328054427, + "step": 665, + "step_time": 30.688667918741704 + }, + { + "clip_ratio/high_max": 0.021233977715019135, + "clip_ratio/high_mean": 0.013006302548456005, + "clip_ratio/low_mean": 0.02038024311186746, + "clip_ratio/low_min": 0.006295104179298505, + "clip_ratio/region_mean": 0.03338654591934755, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.7, + "completions/mean_length": 49.25000133514404, + "completions/min_length": 34.3, + "epoch": 0.07148191614211032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8308461308479309, + "kl": 0.2855486965738237, + "learning_rate": 3.969194312796209e-05, + "loss": 0.0032494377344846727, + "reward": 0.12061518840491772, + "reward_std": 0.10979985520243644, + "rewards/ASRWerHalluLenRewardV5/mean": 0.12061517932452262, + "rewards/ASRWerHalluLenRewardV5/std": 0.3641715422272682, + "step": 670, + "step_time": 33.703092951513824 + }, + { + "clip_ratio/high_max": 0.01952743492438458, + "clip_ratio/high_mean": 0.010430011604330502, + "clip_ratio/low_mean": 0.015611604726291261, + "clip_ratio/low_min": 0.004728081793291494, + "clip_ratio/region_mean": 0.02604161637427751, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.7, + "completions/mean_length": 48.096876335144046, + "completions/min_length": 33.75, + "epoch": 0.0720153632774992, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26178106665611267, + "kl": 0.25285347606986763, + "learning_rate": 3.9988151658767776e-05, + "loss": -0.0025067027658224107, + "reward": 0.1591363064944744, + "reward_std": 0.1112115815281868, + "rewards/ASRWerHalluLenRewardV5/mean": 0.1591363075276604, + "rewards/ASRWerHalluLenRewardV5/std": 0.47058958634734155, + "step": 675, + "step_time": 31.93210673648864 + }, + { + "clip_ratio/high_max": 0.019698079291265456, + "clip_ratio/high_mean": 0.011339916921861004, + "clip_ratio/low_mean": 0.02069441915518837, + "clip_ratio/low_min": 0.007621663447935134, + "clip_ratio/region_mean": 0.032034336333163084, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.8, + "completions/mean_length": 50.96041831970215, + "completions/min_length": 37.0, + "epoch": 0.07254881041288808, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.591013431549072, + "kl": 0.2938175476156175, + "learning_rate": 4.028436018957346e-05, + "loss": 0.022614394128322602, + "reward": 0.09397105034440756, + "reward_std": 0.11740701571106911, + "rewards/ASRWerHalluLenRewardV5/mean": 0.09397104841191321, + "rewards/ASRWerHalluLenRewardV5/std": 0.4043614901602268, + "step": 680, + "step_time": 30.419725506007673 + }, + { + "epoch": 0.07254881041288808, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.183098591549296, + "eval_completions/mean_length": 46.02934399457045, + "eval_completions/min_length": 33.140845070422536, + "eval_frac_reward_zero_std": 0.17840376118539084, + "eval_kl": 0.17094057292060952, + "eval_loss": 0.008792384527623653, + "eval_reward": 0.6149383100720358, + "eval_reward_std": 0.061270286896410335, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6149382997881359, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3018236717760143, + "eval_runtime": 340.5027, + "eval_samples_per_second": 0.628, + "eval_steps_per_second": 0.053, + "step": 680 + }, + { + "clip_ratio/high_max": 0.020716964232269673, + "clip_ratio/high_mean": 0.011171724858286325, + "clip_ratio/low_mean": 0.017730893027328422, + "clip_ratio/low_min": 0.007173286931356415, + "clip_ratio/region_mean": 0.02890261779539287, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.9, + "completions/mean_length": 45.93229293823242, + "completions/min_length": 28.2, + "epoch": 0.07308225754827696, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4173460006713867, + "kl": 0.2763558914884925, + "learning_rate": 4.058056872037915e-05, + "loss": 0.006424432992935181, + "reward": 0.09623038861900568, + "reward_std": 0.11959435641765595, + "rewards/ASRWerHalluLenRewardV5/mean": 0.09623038303107023, + "rewards/ASRWerHalluLenRewardV5/std": 0.40842180997133254, + "step": 685, + "step_time": 30.043215675279498 + }, + { + "clip_ratio/high_max": 0.0216734854853712, + "clip_ratio/high_mean": 0.012181366520235316, + "clip_ratio/low_mean": 0.018878253940783907, + "clip_ratio/low_min": 0.006577522479346953, + "clip_ratio/region_mean": 0.031059620156884193, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.05, + "completions/mean_length": 45.80625133514404, + "completions/min_length": 26.95, + "epoch": 0.07361570468366585, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5837882161140442, + "kl": 0.2655518026091158, + "learning_rate": 4.0876777251184834e-05, + "loss": 0.01737394779920578, + "reward": 0.007059722673147917, + "reward_std": 0.1253091599792242, + "rewards/ASRWerHalluLenRewardV5/mean": 0.007059722440317273, + "rewards/ASRWerHalluLenRewardV5/std": 0.37109057381749155, + "step": 690, + "step_time": 30.1474107529968 + }, + { + "clip_ratio/high_max": 0.023020185832865536, + "clip_ratio/high_mean": 0.011546825582627207, + "clip_ratio/low_mean": 0.02035856907896232, + "clip_ratio/low_min": 0.006425201427191496, + "clip_ratio/region_mean": 0.03190539430943318, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.65, + "completions/mean_length": 46.06354312896728, + "completions/min_length": 27.45, + "epoch": 0.07414915181905474, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2392386794090271, + "kl": 0.261417699418962, + "learning_rate": 4.1172985781990527e-05, + "loss": 0.011102437973022461, + "reward": 0.15643806755542755, + "reward_std": 0.09997223913669587, + "rewards/ASRWerHalluLenRewardV5/mean": 0.15643806194420903, + "rewards/ASRWerHalluLenRewardV5/std": 0.4565191075205803, + "step": 695, + "step_time": 30.152123070321977 + }, + { + "clip_ratio/high_max": 0.02332821695599705, + "clip_ratio/high_mean": 0.012928775597538333, + "clip_ratio/low_mean": 0.02040278520435095, + "clip_ratio/low_min": 0.0072227849916089324, + "clip_ratio/region_mean": 0.03333156085573137, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.45, + "completions/mean_length": 46.304167938232425, + "completions/min_length": 29.4, + "epoch": 0.07468259895444361, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.297917127609253, + "kl": 0.3585483986884356, + "learning_rate": 4.146919431279621e-05, + "loss": 0.013674327731132507, + "reward": 0.026620905846357346, + "reward_std": 0.13222792819142343, + "rewards/ASRWerHalluLenRewardV5/mean": 0.02662090854719281, + "rewards/ASRWerHalluLenRewardV5/std": 0.38555737063288686, + "step": 700, + "step_time": 31.01221517715603 + }, + { + "epoch": 0.07468259895444361, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.61971830985915, + "eval_completions/mean_length": 46.15845204742862, + "eval_completions/min_length": 33.19718309859155, + "eval_frac_reward_zero_std": 0.15962441790271814, + "eval_kl": 0.23655568545972797, + "eval_loss": 0.012870410457253456, + "eval_reward": 0.6188499106907509, + "eval_reward_std": 0.0624570916865913, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6188498920642994, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.30663550831377506, + "eval_runtime": 342.0213, + "eval_samples_per_second": 0.626, + "eval_steps_per_second": 0.053, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0248217866406776, + "clip_ratio/high_mean": 0.012941092120308894, + "clip_ratio/low_mean": 0.020738190761767326, + "clip_ratio/low_min": 0.00921599391149357, + "clip_ratio/region_mean": 0.033679282816592604, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.1, + "completions/mean_length": 50.804167938232425, + "completions/min_length": 35.5, + "epoch": 0.0752160460898325, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33106470108032227, + "kl": 0.3380347043275833, + "learning_rate": 4.17654028436019e-05, + "loss": -0.0005667050834745168, + "reward": 0.008957982063293457, + "reward_std": 0.12551803551614285, + "rewards/ASRWerHalluLenRewardV5/mean": 0.008957985136657954, + "rewards/ASRWerHalluLenRewardV5/std": 0.4023046359419823, + "step": 705, + "step_time": 31.32777774762362 + }, + { + "clip_ratio/high_max": 0.02002890277071856, + "clip_ratio/high_mean": 0.010628574914881029, + "clip_ratio/low_mean": 0.016105228690139483, + "clip_ratio/low_min": 0.004771949927089736, + "clip_ratio/region_mean": 0.026733803981915116, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.7, + "completions/mean_length": 51.38020992279053, + "completions/min_length": 36.9, + "epoch": 0.07574949322522138, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3123722970485687, + "kl": 0.2551474699750543, + "learning_rate": 4.2061611374407584e-05, + "loss": 0.00637468695640564, + "reward": 0.17726792059838772, + "reward_std": 0.10339569263160228, + "rewards/ASRWerHalluLenRewardV5/mean": 0.17726792097091676, + "rewards/ASRWerHalluLenRewardV5/std": 0.4211523152887821, + "step": 710, + "step_time": 31.54563678186387 + }, + { + "clip_ratio/high_max": 0.023267353122355415, + "clip_ratio/high_mean": 0.01199128530570306, + "clip_ratio/low_mean": 0.0195645465166308, + "clip_ratio/low_min": 0.005578694120049477, + "clip_ratio/region_mean": 0.03155583196785301, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.45, + "completions/mean_length": 50.45625114440918, + "completions/min_length": 33.4, + "epoch": 0.07628294036061026, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31431475281715393, + "kl": 0.27880671694874765, + "learning_rate": 4.235781990521327e-05, + "loss": -0.0010882276110351086, + "reward": 0.09230001755058766, + "reward_std": 0.10387487560510636, + "rewards/ASRWerHalluLenRewardV5/mean": 0.09230001556570641, + "rewards/ASRWerHalluLenRewardV5/std": 0.37054724246263504, + "step": 715, + "step_time": 34.1009834703058 + }, + { + "clip_ratio/high_max": 0.018447322089923546, + "clip_ratio/high_mean": 0.009677607644698582, + "clip_ratio/low_mean": 0.020352532033575697, + "clip_ratio/low_min": 0.006387921515852213, + "clip_ratio/region_mean": 0.030030139506561682, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.6, + "completions/mean_length": 46.82916774749756, + "completions/min_length": 30.1, + "epoch": 0.07681638749599914, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.371951699256897, + "kl": 0.2870246350765228, + "learning_rate": 4.265402843601896e-05, + "loss": 0.020040363073349, + "reward": 0.10473445504903793, + "reward_std": 0.11882488243281841, + "rewards/ASRWerHalluLenRewardV5/mean": 0.10473444722592831, + "rewards/ASRWerHalluLenRewardV5/std": 0.39302921667695045, + "step": 720, + "step_time": 30.13808295428753 + }, + { + "epoch": 0.07681638749599914, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 56.859154929577464, + "eval_completions/mean_length": 45.74765379999725, + "eval_completions/min_length": 32.943661971830984, + "eval_frac_reward_zero_std": 0.15023474626138178, + "eval_kl": 0.2049390861869488, + "eval_loss": 0.008482135832309723, + "eval_reward": 0.6145308356679661, + "eval_reward_std": 0.06709645536374038, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6145308201896472, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3090487418984863, + "eval_runtime": 339.7632, + "eval_samples_per_second": 0.63, + "eval_steps_per_second": 0.053, + "step": 720 + }, + { + "clip_ratio/high_max": 0.025690812547691167, + "clip_ratio/high_mean": 0.013970406429143623, + "clip_ratio/low_mean": 0.01711313129053451, + "clip_ratio/low_min": 0.0055190322920680044, + "clip_ratio/region_mean": 0.03108353787101805, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.9, + "completions/mean_length": 45.99687576293945, + "completions/min_length": 30.85, + "epoch": 0.07734983463138803, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.318103164434433, + "kl": 0.30611806400120256, + "learning_rate": 4.295023696682465e-05, + "loss": 0.003467559814453125, + "reward": 0.10890787020325661, + "reward_std": 0.1223678782582283, + "rewards/ASRWerHalluLenRewardV5/mean": 0.10890787076205015, + "rewards/ASRWerHalluLenRewardV5/std": 0.396470433473587, + "step": 725, + "step_time": 29.88747404459864 + }, + { + "clip_ratio/high_max": 0.023797343525802716, + "clip_ratio/high_mean": 0.012992596725234761, + "clip_ratio/low_mean": 0.017702222312800588, + "clip_ratio/low_min": 0.006080983325955458, + "clip_ratio/region_mean": 0.03069481927668676, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.75, + "completions/mean_length": 46.61875114440918, + "completions/min_length": 28.7, + "epoch": 0.07788328176677692, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31612053513526917, + "kl": 0.23645290751010178, + "learning_rate": 4.3246445497630335e-05, + "loss": 0.0014276852831244468, + "reward": 0.07898065894842148, + "reward_std": 0.12685822919011117, + "rewards/ASRWerHalluLenRewardV5/mean": 0.07898065960034728, + "rewards/ASRWerHalluLenRewardV5/std": 0.4108610488474369, + "step": 730, + "step_time": 29.970438128151 + }, + { + "clip_ratio/high_max": 0.018976704188389704, + "clip_ratio/high_mean": 0.010194417454476934, + "clip_ratio/low_mean": 0.014472838310757652, + "clip_ratio/low_min": 0.004041839964338578, + "clip_ratio/region_mean": 0.02466725551057607, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.15, + "completions/mean_length": 47.048959732055664, + "completions/min_length": 31.0, + "epoch": 0.07841672890216579, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3081192076206207, + "kl": 0.19524778425693512, + "learning_rate": 4.354265402843602e-05, + "loss": -0.005840438231825828, + "reward": 0.1927234146744013, + "reward_std": 0.10635127685964108, + "rewards/ASRWerHalluLenRewardV5/mean": 0.19272340293973683, + "rewards/ASRWerHalluLenRewardV5/std": 0.3868738628923893, + "step": 735, + "step_time": 31.12986420840025 + }, + { + "clip_ratio/high_max": 0.023544983117608352, + "clip_ratio/high_mean": 0.013836860470473767, + "clip_ratio/low_mean": 0.02016718312515877, + "clip_ratio/low_min": 0.0077683114795945585, + "clip_ratio/region_mean": 0.0340040436014533, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.1, + "completions/mean_length": 48.43750114440918, + "completions/min_length": 33.3, + "epoch": 0.07895017603755468, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.808829307556152, + "kl": 0.24791666101664306, + "learning_rate": 4.383886255924171e-05, + "loss": 0.00551338754594326, + "reward": -0.08526089824736119, + "reward_std": 0.12044392004609109, + "rewards/ASRWerHalluLenRewardV5/mean": -0.08526089498773218, + "rewards/ASRWerHalluLenRewardV5/std": 0.37768146470189096, + "step": 740, + "step_time": 30.603634967282414 + }, + { + "epoch": 0.07895017603755468, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.42253521126761, + "eval_completions/mean_length": 46.136151327213774, + "eval_completions/min_length": 33.563380281690144, + "eval_frac_reward_zero_std": 0.17370892536472265, + "eval_kl": 0.22263641289474678, + "eval_loss": 0.012163734994828701, + "eval_reward": 0.6216707150910942, + "eval_reward_std": 0.05960263138059789, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6216706982485845, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.306765762984123, + "eval_runtime": 341.562, + "eval_samples_per_second": 0.627, + "eval_steps_per_second": 0.053, + "step": 740 + }, + { + "clip_ratio/high_max": 0.021262417844263838, + "clip_ratio/high_mean": 0.011564836589968763, + "clip_ratio/low_mean": 0.017852995850262232, + "clip_ratio/low_min": 0.00585376059752889, + "clip_ratio/region_mean": 0.029417832457693294, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.3, + "completions/mean_length": 43.4989595413208, + "completions/min_length": 27.6, + "epoch": 0.07948362317294357, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2719820737838745, + "kl": 0.26359626520425083, + "learning_rate": 4.41350710900474e-05, + "loss": -0.0008883940055966378, + "reward": 0.06820112895220518, + "reward_std": 0.11557617336511612, + "rewards/ASRWerHalluLenRewardV5/mean": 0.0682011280208826, + "rewards/ASRWerHalluLenRewardV5/std": 0.3439674098044634, + "step": 745, + "step_time": 31.3054073844105 + }, + { + "clip_ratio/high_max": 0.024629139830358325, + "clip_ratio/high_mean": 0.014002611002069898, + "clip_ratio/low_mean": 0.019565087312366815, + "clip_ratio/low_min": 0.0067167300847359, + "clip_ratio/region_mean": 0.033567697973921895, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.65, + "completions/mean_length": 48.19583511352539, + "completions/min_length": 33.65, + "epoch": 0.08001707030833244, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3071075975894928, + "kl": 0.26172550916671755, + "learning_rate": 4.4431279620853085e-05, + "loss": -0.0061272382736206055, + "reward": 0.11083377785980701, + "reward_std": 0.12321627847850322, + "rewards/ASRWerHalluLenRewardV5/mean": 0.11083376952446997, + "rewards/ASRWerHalluLenRewardV5/std": 0.37561610117554667, + "step": 750, + "step_time": 32.2521774565801 + }, + { + "clip_ratio/high_max": 0.02223340017371811, + "clip_ratio/high_mean": 0.011095221634604968, + "clip_ratio/low_mean": 0.01689766755589517, + "clip_ratio/low_min": 0.005802395773935131, + "clip_ratio/region_mean": 0.027992889250162988, + "completions/clipped_ratio": 0.0010416666666666667, + "completions/max_length": 95.35, + "completions/mean_length": 48.96666774749756, + "completions/min_length": 33.95, + "epoch": 0.08055051744372133, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6114911437034607, + "kl": 0.22952774716541172, + "learning_rate": 4.472748815165877e-05, + "loss": 0.015685635805130004, + "reward": 0.2579687714576721, + "reward_std": 0.11079133860766888, + "rewards/ASRWerHalluLenRewardV5/mean": 0.2579687640070915, + "rewards/ASRWerHalluLenRewardV5/std": 0.3631763532757759, + "step": 755, + "step_time": 33.52655817456544 + }, + { + "clip_ratio/high_max": 0.026297265267930924, + "clip_ratio/high_mean": 0.013638367381645367, + "clip_ratio/low_mean": 0.01725296649237862, + "clip_ratio/low_min": 0.0048659829306416215, + "clip_ratio/region_mean": 0.030891333962790667, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.65, + "completions/mean_length": 45.3364598274231, + "completions/min_length": 26.95, + "epoch": 0.08108396457911021, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6809604167938232, + "kl": 0.3136562847532332, + "learning_rate": 4.502369668246446e-05, + "loss": -0.004737306386232376, + "reward": 0.025091659650206564, + "reward_std": 0.1142435323446989, + "rewards/ASRWerHalluLenRewardV5/mean": 0.02509165685623884, + "rewards/ASRWerHalluLenRewardV5/std": 0.4764515072107315, + "step": 760, + "step_time": 29.892474091425537 + }, + { + "epoch": 0.08108396457911021, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 56.985915492957744, + "eval_completions/mean_length": 45.86032985633528, + "eval_completions/min_length": 33.25352112676056, + "eval_frac_reward_zero_std": 0.2112676119300681, + "eval_kl": 0.22738273066639061, + "eval_loss": 0.010876202955842018, + "eval_reward": 0.6235506095149568, + "eval_reward_std": 0.059021035005981236, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6235505939317001, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3034193040147214, + "eval_runtime": 338.8655, + "eval_samples_per_second": 0.632, + "eval_steps_per_second": 0.053, + "step": 760 + }, + { + "clip_ratio/high_max": 0.025764879764756186, + "clip_ratio/high_mean": 0.013578112567483914, + "clip_ratio/low_mean": 0.01952515142475022, + "clip_ratio/low_min": 0.006871652620611712, + "clip_ratio/region_mean": 0.033103263820521533, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.45, + "completions/mean_length": 46.98854274749756, + "completions/min_length": 30.7, + "epoch": 0.0816174117144991, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3498550355434418, + "kl": 0.27374145910143854, + "learning_rate": 4.531990521327014e-05, + "loss": 0.015124619007110596, + "reward": 0.05473716668784619, + "reward_std": 0.10741883385926485, + "rewards/ASRWerHalluLenRewardV5/mean": 0.054737169807776806, + "rewards/ASRWerHalluLenRewardV5/std": 0.44308522567152975, + "step": 765, + "step_time": 30.912826699577273 + }, + { + "clip_ratio/high_max": 0.022395493101794273, + "clip_ratio/high_mean": 0.011243251667474396, + "clip_ratio/low_mean": 0.016747385838243645, + "clip_ratio/low_min": 0.005584977165563032, + "clip_ratio/region_mean": 0.02799063740822021, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/mean_length": 48.269793319702146, + "completions/min_length": 28.4, + "epoch": 0.08215085884988797, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2999937832355499, + "kl": 0.28353434670716526, + "learning_rate": 4.5616113744075836e-05, + "loss": 0.0023792609572410583, + "reward": 0.08616620004177093, + "reward_std": 0.11633305661380292, + "rewards/ASRWerHalluLenRewardV5/mean": 0.08616620339453221, + "rewards/ASRWerHalluLenRewardV5/std": 0.4211616463959217, + "step": 770, + "step_time": 31.384623723849653 + }, + { + "clip_ratio/high_max": 0.023283056094078346, + "clip_ratio/high_mean": 0.011686055705649779, + "clip_ratio/low_mean": 0.016799912982969546, + "clip_ratio/low_min": 0.005206077743787318, + "clip_ratio/region_mean": 0.028485968557652085, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.4, + "completions/mean_length": 44.31979303359985, + "completions/min_length": 28.35, + "epoch": 0.08268430598527686, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7229480147361755, + "kl": 0.26768580907955764, + "learning_rate": 4.591232227488152e-05, + "loss": 0.007772815972566604, + "reward": 0.0788853820413351, + "reward_std": 0.10080905519425869, + "rewards/ASRWerHalluLenRewardV5/mean": 0.0788853787118569, + "rewards/ASRWerHalluLenRewardV5/std": 0.4386107809841633, + "step": 775, + "step_time": 32.75537656731903 + }, + { + "clip_ratio/high_max": 0.021901596116367727, + "clip_ratio/high_mean": 0.012168753193691373, + "clip_ratio/low_mean": 0.01649410023819655, + "clip_ratio/low_min": 0.006509490526514128, + "clip_ratio/region_mean": 0.028662853385321797, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.85, + "completions/mean_length": 46.79479331970215, + "completions/min_length": 29.1, + "epoch": 0.08321775312066575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35687005519866943, + "kl": 0.27399832382798195, + "learning_rate": 4.620853080568721e-05, + "loss": 0.005268782377243042, + "reward": 0.17514937706291675, + "reward_std": 0.1314102988690138, + "rewards/ASRWerHalluLenRewardV5/mean": 0.17514937192900107, + "rewards/ASRWerHalluLenRewardV5/std": 0.4188511699438095, + "step": 780, + "step_time": 31.67462933063507 + }, + { + "epoch": 0.08321775312066575, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0011737089201877935, + "eval_completions/max_length": 114.16901408450704, + "eval_completions/mean_length": 50.712442908488526, + "eval_completions/min_length": 32.774647887323944, + "eval_frac_reward_zero_std": 0.14084507462004542, + "eval_kl": 0.1944522564627335, + "eval_loss": 0.009686804376542568, + "eval_reward": 0.5993945192493183, + "eval_reward_std": 0.07725301231454376, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.5993945013574311, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3178830721323759, + "eval_runtime": 480.3915, + "eval_samples_per_second": 0.445, + "eval_steps_per_second": 0.037, + "step": 780 + }, + { + "clip_ratio/high_max": 0.021637799622840247, + "clip_ratio/high_mean": 0.012515211214486044, + "clip_ratio/low_mean": 0.01806165655289078, + "clip_ratio/low_min": 0.006574348444701172, + "clip_ratio/region_mean": 0.030576867930358277, + "completions/clipped_ratio": 0.0010416666666666667, + "completions/max_length": 74.8, + "completions/mean_length": 45.330209732055664, + "completions/min_length": 27.75, + "epoch": 0.08375120025605462, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3198714852333069, + "kl": 0.26540390085428955, + "learning_rate": 4.6504739336492894e-05, + "loss": -0.0010294992476701736, + "reward": 0.0339052814990282, + "reward_std": 0.12838409282267094, + "rewards/ASRWerHalluLenRewardV5/mean": 0.03390527945011854, + "rewards/ASRWerHalluLenRewardV5/std": 0.3617828540503979, + "step": 785, + "step_time": 32.063922211527824 + }, + { + "clip_ratio/high_max": 0.01845945828827098, + "clip_ratio/high_mean": 0.010357660740555729, + "clip_ratio/low_mean": 0.01742584068852011, + "clip_ratio/low_min": 0.005133854865562171, + "clip_ratio/region_mean": 0.027783501567319034, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.3, + "completions/mean_length": 48.708334350585936, + "completions/min_length": 34.15, + "epoch": 0.0842846473914435, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.753769874572754, + "kl": 0.24987627137452365, + "learning_rate": 4.680094786729858e-05, + "loss": 0.0010457640513777734, + "reward": 0.048153916746377944, + "reward_std": 0.09922454431653023, + "rewards/ASRWerHalluLenRewardV5/mean": 0.04815391413867474, + "rewards/ASRWerHalluLenRewardV5/std": 0.41780648604035375, + "step": 790, + "step_time": 31.028755995258688 + }, + { + "clip_ratio/high_max": 0.02471645642654039, + "clip_ratio/high_mean": 0.01349330966186244, + "clip_ratio/low_mean": 0.022141666250536218, + "clip_ratio/low_min": 0.009209002321586012, + "clip_ratio/region_mean": 0.03563497598515823, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.3, + "completions/mean_length": 50.68437671661377, + "completions/min_length": 34.9, + "epoch": 0.0848180945268324, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29080870747566223, + "kl": 0.30422497699037193, + "learning_rate": 4.709715639810427e-05, + "loss": 0.01431596726179123, + "reward": 0.04286404885351658, + "reward_std": 0.1082255445420742, + "rewards/ASRWerHalluLenRewardV5/mean": 0.04286404927261174, + "rewards/ASRWerHalluLenRewardV5/std": 0.4134762302041054, + "step": 795, + "step_time": 30.671217483840884 + }, + { + "clip_ratio/high_max": 0.027172002801671626, + "clip_ratio/high_mean": 0.015410796200740151, + "clip_ratio/low_mean": 0.020351612844388002, + "clip_ratio/low_min": 0.00634141152841039, + "clip_ratio/region_mean": 0.035762409190647305, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.1, + "completions/mean_length": 48.0020845413208, + "completions/min_length": 31.8, + "epoch": 0.08535154166222128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7414382100105286, + "kl": 0.29844519617035986, + "learning_rate": 4.739336492890996e-05, + "loss": -0.013269293308258056, + "reward": 0.01804378591477871, + "reward_std": 0.11465107314288617, + "rewards/ASRWerHalluLenRewardV5/mean": 0.01804378116503358, + "rewards/ASRWerHalluLenRewardV5/std": 0.3888808637857437, + "step": 800, + "step_time": 31.224289292655886 + }, + { + "epoch": 0.08535154166222128, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.45070422535211, + "eval_completions/mean_length": 46.261738387631695, + "eval_completions/min_length": 33.309859154929576, + "eval_frac_reward_zero_std": 0.15023474626138178, + "eval_kl": 0.19098135445949058, + "eval_loss": 0.011154823005199432, + "eval_reward": 0.6088958769826822, + "eval_reward_std": 0.06406614022381918, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6088958528469985, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3114370623419822, + "eval_runtime": 339.8706, + "eval_samples_per_second": 0.63, + "eval_steps_per_second": 0.053, + "step": 800 + }, + { + "clip_ratio/high_max": 0.026999273616820575, + "clip_ratio/high_mean": 0.014635881627327763, + "clip_ratio/low_mean": 0.02138703600212466, + "clip_ratio/low_min": 0.007270783983403817, + "clip_ratio/region_mean": 0.03602291735587641, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.2, + "completions/mean_length": 47.526042747497556, + "completions/min_length": 26.0, + "epoch": 0.08588498879761015, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6138390898704529, + "kl": 0.28475737404078244, + "learning_rate": 4.7689573459715644e-05, + "loss": 0.002216982468962669, + "reward": -0.09176021609455347, + "reward_std": 0.11240072101354599, + "rewards/ASRWerHalluLenRewardV5/mean": -0.09176020887680351, + "rewards/ASRWerHalluLenRewardV5/std": 0.38299219831824305, + "step": 805, + "step_time": 32.341500209271906 + }, + { + "clip_ratio/high_max": 0.02463333792402409, + "clip_ratio/high_mean": 0.012658615924010519, + "clip_ratio/low_mean": 0.018648806889541448, + "clip_ratio/low_min": 0.004655595787335187, + "clip_ratio/region_mean": 0.03130742278881371, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.25, + "completions/mean_length": 47.21875133514404, + "completions/min_length": 32.35, + "epoch": 0.08641843593299904, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3621964156627655, + "kl": 0.25406604474410416, + "learning_rate": 4.798578199052133e-05, + "loss": 0.004613757133483887, + "reward": 0.034868376702070235, + "reward_std": 0.10597746185958386, + "rewards/ASRWerHalluLenRewardV5/mean": 0.03486837185919285, + "rewards/ASRWerHalluLenRewardV5/std": 0.3958124227821827, + "step": 810, + "step_time": 33.62361691724509 + }, + { + "clip_ratio/high_max": 0.024505546333966777, + "clip_ratio/high_mean": 0.012676771721453406, + "clip_ratio/low_mean": 0.021792998898308723, + "clip_ratio/low_min": 0.00880484115332365, + "clip_ratio/region_mean": 0.03446977037237957, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.1, + "completions/mean_length": 51.89166812896728, + "completions/min_length": 37.85, + "epoch": 0.08695188306838793, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30076301097869873, + "kl": 0.2371668958105147, + "learning_rate": 4.8281990521327016e-05, + "loss": 0.002752465009689331, + "reward": 0.07097308337688446, + "reward_std": 0.11152552179992199, + "rewards/ASRWerHalluLenRewardV5/mean": 0.07097308319061994, + "rewards/ASRWerHalluLenRewardV5/std": 0.4284476310014725, + "step": 815, + "step_time": 31.169636190496384 + }, + { + "clip_ratio/high_max": 0.020494458632310853, + "clip_ratio/high_mean": 0.01041731330769835, + "clip_ratio/low_mean": 0.016070150693121833, + "clip_ratio/low_min": 0.0048902436043135825, + "clip_ratio/region_mean": 0.02648746394988848, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.55, + "completions/mean_length": 46.5364595413208, + "completions/min_length": 30.3, + "epoch": 0.0874853302037768, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34770292043685913, + "kl": 0.26069273171015084, + "learning_rate": 4.857819905213271e-05, + "loss": 0.008888962864875793, + "reward": 0.16611819490790367, + "reward_std": 0.12053815703839063, + "rewards/ASRWerHalluLenRewardV5/mean": 0.1661181867297273, + "rewards/ASRWerHalluLenRewardV5/std": 0.404285179823637, + "step": 820, + "step_time": 30.207159887626766 + }, + { + "epoch": 0.0874853302037768, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.070422535211264, + "eval_completions/mean_length": 46.132630307909466, + "eval_completions/min_length": 33.29577464788732, + "eval_frac_reward_zero_std": 0.15023474626138178, + "eval_kl": 0.17493022999293367, + "eval_loss": 0.009612927213311195, + "eval_reward": 0.6110463074185479, + "eval_reward_std": 0.06610740914048863, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6110462888445652, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.30733337141835776, + "eval_runtime": 339.3828, + "eval_samples_per_second": 0.631, + "eval_steps_per_second": 0.053, + "step": 820 + }, + { + "clip_ratio/high_max": 0.021148791606537997, + "clip_ratio/high_mean": 0.011529289357713423, + "clip_ratio/low_mean": 0.017213678019470536, + "clip_ratio/low_min": 0.005491731187794358, + "clip_ratio/region_mean": 0.028742967487778513, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.5, + "completions/mean_length": 47.514584350585935, + "completions/min_length": 30.7, + "epoch": 0.08801877733916569, + "frac_reward_zero_std": 0.0, + "grad_norm": 38.5274658203125, + "kl": 0.2603848031722009, + "learning_rate": 4.8874407582938395e-05, + "loss": 0.03930645883083343, + "reward": 0.15564159415662288, + "reward_std": 0.10669980123639107, + "rewards/ASRWerHalluLenRewardV5/mean": 0.15564158852212132, + "rewards/ASRWerHalluLenRewardV5/std": 0.39224721416831015, + "step": 825, + "step_time": 35.965948692709205 + }, + { + "clip_ratio/high_max": 0.02416170315700583, + "clip_ratio/high_mean": 0.01349994131742278, + "clip_ratio/low_mean": 0.01970202038937714, + "clip_ratio/low_min": 0.0068447641213424506, + "clip_ratio/region_mean": 0.03320196160930209, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.75, + "completions/mean_length": 43.63645973205566, + "completions/min_length": 26.45, + "epoch": 0.08855222447455458, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3226834833621979, + "kl": 0.23037337409332395, + "learning_rate": 4.917061611374408e-05, + "loss": -0.0009315586648881435, + "reward": 0.033081012591719625, + "reward_std": 0.13549160063266755, + "rewards/ASRWerHalluLenRewardV5/mean": 0.0330810074461624, + "rewards/ASRWerHalluLenRewardV5/std": 0.42044909596443175, + "step": 830, + "step_time": 30.66513577736914 + }, + { + "clip_ratio/high_max": 0.023488644463941454, + "clip_ratio/high_mean": 0.013364161843492183, + "clip_ratio/low_mean": 0.02211245144135319, + "clip_ratio/low_min": 0.009246664651436731, + "clip_ratio/region_mean": 0.03547661331249401, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.2, + "completions/mean_length": 49.46770973205567, + "completions/min_length": 31.95, + "epoch": 0.08908567160994345, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3711346387863159, + "kl": 0.3695411872118711, + "learning_rate": 4.9466824644549766e-05, + "loss": 0.010375740379095078, + "reward": -0.06610476654022931, + "reward_std": 0.10221175625920295, + "rewards/ASRWerHalluLenRewardV5/mean": -0.06610477026551961, + "rewards/ASRWerHalluLenRewardV5/std": 0.42339898198843, + "step": 835, + "step_time": 32.125301686301825 + }, + { + "clip_ratio/high_max": 0.02464531031437218, + "clip_ratio/high_mean": 0.014328234342974611, + "clip_ratio/low_mean": 0.01900487542443443, + "clip_ratio/low_min": 0.006871568853966892, + "clip_ratio/region_mean": 0.03333310993621126, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.85, + "completions/mean_length": 47.21354312896729, + "completions/min_length": 28.5, + "epoch": 0.08961911874533234, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3427729606628418, + "kl": 0.3169043993577361, + "learning_rate": 4.976303317535545e-05, + "loss": 3.054825065191835e-05, + "reward": 0.03824124448001385, + "reward_std": 0.1268853757530451, + "rewards/ASRWerHalluLenRewardV5/mean": 0.03824124485254288, + "rewards/ASRWerHalluLenRewardV5/std": 0.3714602891355753, + "step": 840, + "step_time": 30.92905813101679 + }, + { + "epoch": 0.08961911874533234, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.54929577464789, + "eval_completions/mean_length": 46.225353294694926, + "eval_completions/min_length": 33.352112676056336, + "eval_frac_reward_zero_std": 0.18309859700605902, + "eval_kl": 0.18674146465327537, + "eval_loss": 0.009397901594638824, + "eval_reward": 0.6237026801411535, + "eval_reward_std": 0.060517096669006516, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.623702658523976, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.29961956570475873, + "eval_runtime": 340.1683, + "eval_samples_per_second": 0.629, + "eval_steps_per_second": 0.053, + "step": 840 + }, + { + "clip_ratio/high_max": 0.023470206616912036, + "clip_ratio/high_mean": 0.012833312961447518, + "clip_ratio/low_mean": 0.020936741042532958, + "clip_ratio/low_min": 0.005764843651559204, + "clip_ratio/region_mean": 0.03377005406073295, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.25, + "completions/mean_length": 47.664584732055665, + "completions/min_length": 30.35, + "epoch": 0.09015256588072122, + "frac_reward_zero_std": 0.0, + "grad_norm": 166.66944885253906, + "kl": 0.3162549525499344, + "learning_rate": 4.999999983416346e-05, + "loss": 0.05975604653358459, + "reward": 0.11585038229823112, + "reward_std": 0.1184530071914196, + "rewards/ASRWerHalluLenRewardV5/mean": 0.11585037421900779, + "rewards/ASRWerHalluLenRewardV5/std": 0.4093499958515167, + "step": 845, + "step_time": 31.4737987190485 + }, + { + "clip_ratio/high_max": 0.02381321878056042, + "clip_ratio/high_mean": 0.013407529608230107, + "clip_ratio/low_mean": 0.017868909220851492, + "clip_ratio/low_min": 0.005684490362182259, + "clip_ratio/region_mean": 0.031276438990607856, + "completions/clipped_ratio": 0.0010416666666666667, + "completions/max_length": 104.55, + "completions/mean_length": 46.71875152587891, + "completions/min_length": 29.7, + "epoch": 0.09068601301611011, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28892359137535095, + "kl": 0.3486902203410864, + "learning_rate": 4.9999994029884636e-05, + "loss": -0.0076481163501739505, + "reward": 0.048359951749444005, + "reward_std": 0.12417013309895993, + "rewards/ASRWerHalluLenRewardV5/mean": 0.048359953612089154, + "rewards/ASRWerHalluLenRewardV5/std": 0.33849347606301305, + "step": 850, + "step_time": 36.93777986206114 + }, + { + "clip_ratio/high_max": 0.024039508908754215, + "clip_ratio/high_mean": 0.013482060686510523, + "clip_ratio/low_mean": 0.01891568069404457, + "clip_ratio/low_min": 0.004566915862960741, + "clip_ratio/region_mean": 0.03239774147514254, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.95, + "completions/mean_length": 47.151042747497556, + "completions/min_length": 32.65, + "epoch": 0.09121946015149898, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5316139459609985, + "kl": 0.506232968531549, + "learning_rate": 4.999997993378081e-05, + "loss": -0.00855671763420105, + "reward": 0.03939302135258913, + "reward_std": 0.1260345984250307, + "rewards/ASRWerHalluLenRewardV5/mean": 0.039393015578389166, + "rewards/ASRWerHalluLenRewardV5/std": 0.4560299560427666, + "step": 855, + "step_time": 30.564516743831337 + }, + { + "clip_ratio/high_max": 0.020919934404082595, + "clip_ratio/high_mean": 0.011735829389363062, + "clip_ratio/low_mean": 0.022442817204864695, + "clip_ratio/low_min": 0.007674809673335403, + "clip_ratio/region_mean": 0.03417864670045674, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.4, + "completions/mean_length": 50.6770845413208, + "completions/min_length": 37.95, + "epoch": 0.09175290728688787, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27751052379608154, + "kl": 0.276455735322088, + "learning_rate": 4.9999957545856625e-05, + "loss": 0.013569027185440063, + "reward": 0.008026454970240593, + "reward_std": 0.11982978656888008, + "rewards/ASRWerHalluLenRewardV5/mean": 0.008026460930705071, + "rewards/ASRWerHalluLenRewardV5/std": 0.4352497264742851, + "step": 860, + "step_time": 30.309752106107773 + }, + { + "epoch": 0.09175290728688787, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.478873239436616, + "eval_completions/mean_length": 46.177231479698506, + "eval_completions/min_length": 33.28169014084507, + "eval_frac_reward_zero_std": 0.15023474626138178, + "eval_kl": 0.15979394655753398, + "eval_loss": 0.010602003894746304, + "eval_reward": 0.6202454413746444, + "eval_reward_std": 0.060077204513297955, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6202454189704337, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.2973281159262422, + "eval_runtime": 341.4378, + "eval_samples_per_second": 0.627, + "eval_steps_per_second": 0.053, + "step": 860 + }, + { + "clip_ratio/high_max": 0.020572750142309814, + "clip_ratio/high_mean": 0.011223930286359974, + "clip_ratio/low_mean": 0.019214935437776148, + "clip_ratio/low_min": 0.00496776384243276, + "clip_ratio/region_mean": 0.03043886594241485, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.25, + "completions/mean_length": 49.4583345413208, + "completions/min_length": 33.05, + "epoch": 0.09228635442227676, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35372403264045715, + "kl": 0.25987838227301835, + "learning_rate": 4.999992686611954e-05, + "loss": 0.010269951820373536, + "reward": 0.12554704081267118, + "reward_std": 0.11459109224379063, + "rewards/ASRWerHalluLenRewardV5/mean": 0.12554703259374947, + "rewards/ASRWerHalluLenRewardV5/std": 0.3982915602624416, + "step": 865, + "step_time": 30.525779921375214 + }, + { + "clip_ratio/high_max": 0.018873807397903876, + "clip_ratio/high_mean": 0.009488494051038288, + "clip_ratio/low_mean": 0.015125623330823146, + "clip_ratio/low_min": 0.003572114394046366, + "clip_ratio/region_mean": 0.024614117277087642, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.1, + "completions/mean_length": 46.241667938232425, + "completions/min_length": 25.1, + "epoch": 0.09281980155766563, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23411789536476135, + "kl": 0.25946127995848656, + "learning_rate": 4.9999887894579705e-05, + "loss": 0.0066595740616321565, + "reward": 0.27426176369190214, + "reward_std": 0.11678037010133266, + "rewards/ASRWerHalluLenRewardV5/mean": 0.2742617576383054, + "rewards/ASRWerHalluLenRewardV5/std": 0.431386461108923, + "step": 870, + "step_time": 29.95344748869538 + }, + { + "clip_ratio/high_max": 0.021264122566208243, + "clip_ratio/high_mean": 0.01103706323337974, + "clip_ratio/low_mean": 0.01719069368264172, + "clip_ratio/low_min": 0.0034216785221360623, + "clip_ratio/region_mean": 0.02822775685344823, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.45, + "completions/mean_length": 45.28645992279053, + "completions/min_length": 29.75, + "epoch": 0.09335324869305452, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2802973985671997, + "kl": 0.28151264078915117, + "learning_rate": 4.999984063125006e-05, + "loss": -0.003859385848045349, + "reward": 0.16201450861990452, + "reward_std": 0.11605017594993114, + "rewards/ASRWerHalluLenRewardV5/mean": 0.16201450834050773, + "rewards/ASRWerHalluLenRewardV5/std": 0.4367524929344654, + "step": 875, + "step_time": 31.406020840071143 + }, + { + "clip_ratio/high_max": 0.01847642019856721, + "clip_ratio/high_mean": 0.010447293567995074, + "clip_ratio/low_mean": 0.017142313270596787, + "clip_ratio/low_min": 0.004808545595733449, + "clip_ratio/region_mean": 0.027589607110712677, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.5, + "completions/mean_length": 49.93854293823242, + "completions/min_length": 35.7, + "epoch": 0.0938866958284434, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.48308345675468445, + "kl": 0.3009267397224903, + "learning_rate": 4.9999785076146284e-05, + "loss": 0.00927407741546631, + "reward": 0.2309035262092948, + "reward_std": 0.1082698717713356, + "rewards/ASRWerHalluLenRewardV5/mean": 0.23090351596474648, + "rewards/ASRWerHalluLenRewardV5/std": 0.3951416738331318, + "step": 880, + "step_time": 31.45735010355711 + }, + { + "epoch": 0.0938866958284434, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.71830985915493, + "eval_completions/mean_length": 46.3744143096494, + "eval_completions/min_length": 33.53521126760563, + "eval_frac_reward_zero_std": 0.2112676115103171, + "eval_kl": 0.2205452009983046, + "eval_loss": 0.01182747446000576, + "eval_reward": 0.6143311824597103, + "eval_reward_std": 0.06048309351776687, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6143311667190471, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.31100828502274735, + "eval_runtime": 339.8733, + "eval_samples_per_second": 0.63, + "eval_steps_per_second": 0.053, + "step": 880 + }, + { + "clip_ratio/high_max": 0.023890031082555652, + "clip_ratio/high_mean": 0.012245553462707903, + "clip_ratio/low_mean": 0.016009838314494117, + "clip_ratio/low_min": 0.004704255179967731, + "clip_ratio/region_mean": 0.028255391772836445, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.05, + "completions/mean_length": 47.450001335144044, + "completions/min_length": 29.7, + "epoch": 0.09442014296383229, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6497003436088562, + "kl": 0.3144874732010067, + "learning_rate": 4.999972122928679e-05, + "loss": -0.002391589805483818, + "reward": 0.22345699928700924, + "reward_std": 0.10634848959743977, + "rewards/ASRWerHalluLenRewardV5/mean": 0.2234570000320673, + "rewards/ASRWerHalluLenRewardV5/std": 0.4297978326678276, + "step": 885, + "step_time": 31.74558497723192 + }, + { + "clip_ratio/high_max": 0.02394473265740089, + "clip_ratio/high_mean": 0.01430617702426389, + "clip_ratio/low_mean": 0.021205576200736688, + "clip_ratio/low_min": 0.009532165274140424, + "clip_ratio/region_mean": 0.03551175307948142, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.7, + "completions/mean_length": 49.97916793823242, + "completions/min_length": 35.25, + "epoch": 0.09495359009922116, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26687192916870117, + "kl": 0.33671661335974934, + "learning_rate": 4.999964909069276e-05, + "loss": 0.0002921437378972769, + "reward": 0.03536734282970429, + "reward_std": 0.11364920400083064, + "rewards/ASRWerHalluLenRewardV5/mean": 0.0353673430159688, + "rewards/ASRWerHalluLenRewardV5/std": 0.3812817148864269, + "step": 890, + "step_time": 32.337808100879194 + }, + { + "clip_ratio/high_max": 0.026711547491140665, + "clip_ratio/high_mean": 0.014917776100628544, + "clip_ratio/low_mean": 0.02270283347752411, + "clip_ratio/low_min": 0.008338395785540343, + "clip_ratio/region_mean": 0.03762060987064615, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.9, + "completions/mean_length": 45.17604312896729, + "completions/min_length": 28.4, + "epoch": 0.09548703723461005, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3874630630016327, + "kl": 0.3199059063568711, + "learning_rate": 4.999956866038813e-05, + "loss": 0.014637678861618042, + "reward": 0.025701815262436867, + "reward_std": 0.12354252971708775, + "rewards/ASRWerHalluLenRewardV5/mean": 0.025701821067923448, + "rewards/ASRWerHalluLenRewardV5/std": 0.40807018280029295, + "step": 895, + "step_time": 31.29029975589365 + }, + { + "clip_ratio/high_max": 0.02078007413074374, + "clip_ratio/high_mean": 0.011562382863485255, + "clip_ratio/low_mean": 0.01701237463275902, + "clip_ratio/low_min": 0.006176876317476853, + "clip_ratio/region_mean": 0.0285747573885601, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.25, + "completions/mean_length": 48.51354293823242, + "completions/min_length": 35.15, + "epoch": 0.09602048436999894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32491952180862427, + "kl": 0.2683701036497951, + "learning_rate": 4.999947993839956e-05, + "loss": 0.005891662836074829, + "reward": 0.1895364087074995, + "reward_std": 0.10697676911950112, + "rewards/ASRWerHalluLenRewardV5/mean": 0.18953641541302205, + "rewards/ASRWerHalluLenRewardV5/std": 0.400813651829958, + "step": 900, + "step_time": 30.33865807335824 + }, + { + "epoch": 0.09602048436999894, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.08450704225352, + "eval_completions/mean_length": 46.05399196248659, + "eval_completions/min_length": 33.098591549295776, + "eval_frac_reward_zero_std": 0.14084507462004542, + "eval_kl": 0.19648644955120456, + "eval_loss": 0.010407588444650173, + "eval_reward": 0.6234095859223269, + "eval_reward_std": 0.06299057953112142, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6234095724378254, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.2987452344313054, + "eval_runtime": 340.113, + "eval_samples_per_second": 0.629, + "eval_steps_per_second": 0.053, + "step": 900 + }, + { + "clip_ratio/high_max": 0.021393070986960083, + "clip_ratio/high_mean": 0.011877894822100643, + "clip_ratio/low_mean": 0.018454039133212065, + "clip_ratio/low_min": 0.006991079612635076, + "clip_ratio/region_mean": 0.030331933771958575, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.65, + "completions/mean_length": 47.16041793823242, + "completions/min_length": 31.65, + "epoch": 0.09655393150538781, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27997133135795593, + "kl": 0.294532526191324, + "learning_rate": 4.9999382924756486e-05, + "loss": 0.0008189266547560692, + "reward": 0.138704838976264, + "reward_std": 0.11187661308795213, + "rewards/ASRWerHalluLenRewardV5/mean": 0.13870484143626527, + "rewards/ASRWerHalluLenRewardV5/std": 0.4414660811424255, + "step": 905, + "step_time": 30.24457983765751 + }, + { + "clip_ratio/high_max": 0.022196425541187635, + "clip_ratio/high_mean": 0.012941610315465368, + "clip_ratio/low_mean": 0.019022516254335643, + "clip_ratio/low_min": 0.0068977589457063, + "clip_ratio/region_mean": 0.031964126817183566, + "completions/clipped_ratio": 0.0010416666666666667, + "completions/max_length": 263.45, + "completions/mean_length": 51.03750152587891, + "completions/min_length": 32.0, + "epoch": 0.0970873786407767, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2981206476688385, + "kl": 0.34972533825784924, + "learning_rate": 4.999927761949108e-05, + "loss": 0.006517843902111053, + "reward": 0.1598328834399581, + "reward_std": 0.11261984743177891, + "rewards/ASRWerHalluLenRewardV5/mean": 0.15983287980780006, + "rewards/ASRWerHalluLenRewardV5/std": 0.3816841021180153, + "step": 910, + "step_time": 89.39961053635925 + }, + { + "clip_ratio/high_max": 0.025922805588925256, + "clip_ratio/high_mean": 0.013369055613293313, + "clip_ratio/low_mean": 0.02006944935710635, + "clip_ratio/low_min": 0.005590514745563269, + "clip_ratio/region_mean": 0.033438504999503495, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.75, + "completions/mean_length": 44.7989595413208, + "completions/min_length": 31.9, + "epoch": 0.09762082577616558, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3096455931663513, + "kl": 0.3497678738087416, + "learning_rate": 4.9999164022638286e-05, + "loss": 0.01637590825557709, + "reward": 0.03389074224978685, + "reward_std": 0.09586467929184436, + "rewards/ASRWerHalluLenRewardV5/mean": 0.03389074958395213, + "rewards/ASRWerHalluLenRewardV5/std": 0.5292658343911171, + "step": 915, + "step_time": 29.687416091933848 + }, + { + "clip_ratio/high_max": 0.026404830749379472, + "clip_ratio/high_mean": 0.015086549185798503, + "clip_ratio/low_mean": 0.01864751745160902, + "clip_ratio/low_min": 0.0054196515993680805, + "clip_ratio/region_mean": 0.03373406658647582, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/mean_length": 46.60729331970215, + "completions/min_length": 26.75, + "epoch": 0.09815427291155447, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.41559118032455444, + "kl": 0.27533186972141266, + "learning_rate": 4.999904213423575e-05, + "loss": 0.0007354666944593191, + "reward": 0.07353917695581913, + "reward_std": 0.10698754154145718, + "rewards/ASRWerHalluLenRewardV5/mean": 0.0735391778871417, + "rewards/ASRWerHalluLenRewardV5/std": 0.41282984912395476, + "step": 920, + "step_time": 30.618186278641225 + }, + { + "epoch": 0.09815427291155447, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.61971830985915, + "eval_completions/mean_length": 46.19483684486067, + "eval_completions/min_length": 33.42253521126761, + "eval_frac_reward_zero_std": 0.15962441790271814, + "eval_kl": 0.1889393414095254, + "eval_loss": 0.010380066931247711, + "eval_reward": 0.6172630657297624, + "eval_reward_std": 0.05565577392584421, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6172630510384768, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.301023559985866, + "eval_runtime": 340.9844, + "eval_samples_per_second": 0.628, + "eval_steps_per_second": 0.053, + "step": 920 + }, + { + "clip_ratio/high_max": 0.02205558372079395, + "clip_ratio/high_mean": 0.01126630780781852, + "clip_ratio/low_mean": 0.02122448728478048, + "clip_ratio/low_min": 0.006657287132111378, + "clip_ratio/region_mean": 0.032490795233752576, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.1, + "completions/mean_length": 50.25000133514404, + "completions/min_length": 30.7, + "epoch": 0.09868772004694334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2820536494255066, + "kl": 0.2934519348666072, + "learning_rate": 4.999891195432394e-05, + "loss": 0.018843315541744232, + "reward": 0.11014016848057509, + "reward_std": 0.11282789036631584, + "rewards/ASRWerHalluLenRewardV5/mean": 0.11014016093686223, + "rewards/ASRWerHalluLenRewardV5/std": 0.39168190732598307, + "step": 925, + "step_time": 32.6266194364056 + }, + { + "clip_ratio/high_max": 0.027638786181341857, + "clip_ratio/high_mean": 0.015967303566867486, + "clip_ratio/low_mean": 0.021320828874013386, + "clip_ratio/low_min": 0.008668799605220556, + "clip_ratio/region_mean": 0.03728813243797049, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.9, + "completions/mean_length": 50.40729274749756, + "completions/min_length": 33.2, + "epoch": 0.09922116718233223, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3515748083591461, + "kl": 0.2897935027256608, + "learning_rate": 4.9998773482945985e-05, + "loss": 0.0005350843071937561, + "reward": 0.07915806882083416, + "reward_std": 0.11131543852388859, + "rewards/ASRWerHalluLenRewardV5/mean": 0.0791580636287108, + "rewards/ASRWerHalluLenRewardV5/std": 0.384125117957592, + "step": 930, + "step_time": 31.819147821143268 + }, + { + "clip_ratio/high_max": 0.024471336673013867, + "clip_ratio/high_mean": 0.013530932893627324, + "clip_ratio/low_mean": 0.019890383243910036, + "clip_ratio/low_min": 0.006513813571655191, + "clip_ratio/region_mean": 0.03342131616664119, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.8, + "completions/mean_length": 49.2083345413208, + "completions/min_length": 35.9, + "epoch": 0.09975461431772112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39907947182655334, + "kl": 0.3093908028677106, + "learning_rate": 4.999862672014784e-05, + "loss": 0.0002494693268090487, + "reward": 0.11233335006982088, + "reward_std": 0.11110092755407094, + "rewards/ASRWerHalluLenRewardV5/mean": 0.11233334564603865, + "rewards/ASRWerHalluLenRewardV5/std": 0.390214167535305, + "step": 935, + "step_time": 32.82084590811282 + }, + { + "clip_ratio/high_max": 0.019598789606243373, + "clip_ratio/high_mean": 0.010365866485517471, + "clip_ratio/low_mean": 0.019758812976942864, + "clip_ratio/low_min": 0.007948408045922405, + "clip_ratio/region_mean": 0.030124679453729188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.9, + "completions/mean_length": 46.48958473205566, + "completions/min_length": 29.6, + "epoch": 0.10028806145310999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3175654113292694, + "kl": 0.25048509221524, + "learning_rate": 4.99984716659782e-05, + "loss": 0.003846060112118721, + "reward": 0.24552255049347876, + "reward_std": 0.10138763897120953, + "rewards/ASRWerHalluLenRewardV5/mean": 0.24552254602313042, + "rewards/ASRWerHalluLenRewardV5/std": 0.35110059306025504, + "step": 940, + "step_time": 30.21322726123035 + }, + { + "epoch": 0.10028806145310999, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.67605633802817, + "eval_completions/mean_length": 46.23826421818263, + "eval_completions/min_length": 33.352112676056336, + "eval_frac_reward_zero_std": 0.13145540297870906, + "eval_kl": 0.17019054862919827, + "eval_loss": 0.010220457799732685, + "eval_reward": 0.6058769423357198, + "eval_reward_std": 0.06648959654947402, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6058769237617372, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3102952695228684, + "eval_runtime": 339.2183, + "eval_samples_per_second": 0.631, + "eval_steps_per_second": 0.053, + "step": 940 + }, + { + "clip_ratio/high_max": 0.02770029324456118, + "clip_ratio/high_mean": 0.016229458262387197, + "clip_ratio/low_mean": 0.02052436863305047, + "clip_ratio/low_min": 0.006370226200670004, + "clip_ratio/region_mean": 0.036753826681524514, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.7, + "completions/mean_length": 47.78020973205567, + "completions/min_length": 30.25, + "epoch": 0.10082150858849888, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29534658789634705, + "kl": 0.27533718980848787, + "learning_rate": 4.999830832048845e-05, + "loss": 0.004250554367899895, + "reward": -0.027600570768117904, + "reward_std": 0.130075229331851, + "rewards/ASRWerHalluLenRewardV5/mean": -0.027600561804138123, + "rewards/ASRWerHalluLenRewardV5/std": 0.32344200983643534, + "step": 945, + "step_time": 31.53184078130871 + }, + { + "clip_ratio/high_max": 0.025831055705202742, + "clip_ratio/high_mean": 0.013429169866140001, + "clip_ratio/low_mean": 0.016062519329716453, + "clip_ratio/low_min": 0.005054107896285132, + "clip_ratio/region_mean": 0.029491688997950405, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.8, + "completions/mean_length": 44.70312633514404, + "completions/min_length": 26.3, + "epoch": 0.10135495572388777, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.42679500579834, + "kl": 0.2894264360889792, + "learning_rate": 4.99981366837328e-05, + "loss": 0.010750997066497802, + "reward": 0.21473245788365602, + "reward_std": 0.1175777941942215, + "rewards/ASRWerHalluLenRewardV5/mean": 0.21473244815133513, + "rewards/ASRWerHalluLenRewardV5/std": 0.40799026414752004, + "step": 950, + "step_time": 29.75104099623859 + }, + { + "clip_ratio/high_max": 0.022052279370836914, + "clip_ratio/high_mean": 0.011583450392936356, + "clip_ratio/low_mean": 0.019989940198138358, + "clip_ratio/low_min": 0.005567519369651564, + "clip_ratio/region_mean": 0.03157339085591957, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/mean_length": 45.275001525878906, + "completions/min_length": 31.8, + "epoch": 0.10188840285927664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47061818838119507, + "kl": 0.2973035776987672, + "learning_rate": 4.999795675576816e-05, + "loss": 0.012267881631851196, + "reward": 0.12508223801851273, + "reward_std": 0.10854340381920338, + "rewards/ASRWerHalluLenRewardV5/mean": 0.1250822358764708, + "rewards/ASRWerHalluLenRewardV5/std": 0.38184292763471606, + "step": 955, + "step_time": 29.37033666409552 + }, + { + "clip_ratio/high_max": 0.024434027989627793, + "clip_ratio/high_mean": 0.013238809030735866, + "clip_ratio/low_mean": 0.016649297447293064, + "clip_ratio/low_min": 0.005057537782704458, + "clip_ratio/region_mean": 0.02988810651586391, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.05, + "completions/mean_length": 48.158334732055664, + "completions/min_length": 33.95, + "epoch": 0.10242184999466553, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.312502384185791, + "kl": 0.2521849826909602, + "learning_rate": 4.9997768536654214e-05, + "loss": 0.004292924702167511, + "reward": 0.26678850799798964, + "reward_std": 0.10851897187530994, + "rewards/ASRWerHalluLenRewardV5/mean": 0.26678850799798964, + "rewards/ASRWerHalluLenRewardV5/std": 0.35064154043793677, + "step": 960, + "step_time": 30.219220963865517 + }, + { + "epoch": 0.10242184999466553, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.943661971830984, + "eval_completions/mean_length": 46.40258332373391, + "eval_completions/min_length": 33.436619718309856, + "eval_frac_reward_zero_std": 0.16431925372338632, + "eval_kl": 0.18102584230962773, + "eval_loss": 0.011677735485136509, + "eval_reward": 0.6106016087070317, + "eval_reward_std": 0.06019930897647856, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.610601591147994, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.30399624612444726, + "eval_runtime": 343.1896, + "eval_samples_per_second": 0.624, + "eval_steps_per_second": 0.052, + "step": 960 + }, + { + "clip_ratio/high_max": 0.025569424644345418, + "clip_ratio/high_mean": 0.013012524993973784, + "clip_ratio/low_mean": 0.02180939423997188, + "clip_ratio/low_min": 0.0073782768391538415, + "clip_ratio/region_mean": 0.03482191924122162, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/mean_length": 44.825001335144044, + "completions/min_length": 26.35, + "epoch": 0.10295529713005441, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32230344414711, + "kl": 0.325704147759825, + "learning_rate": 4.9997572026453395e-05, + "loss": 0.026039782166481017, + "reward": 0.14645020253956317, + "reward_std": 0.12426164932549, + "rewards/ASRWerHalluLenRewardV5/mean": 0.14645020216703414, + "rewards/ASRWerHalluLenRewardV5/std": 0.39573091715574266, + "step": 965, + "step_time": 30.289928135834636 + }, + { + "clip_ratio/high_max": 0.025450453080702574, + "clip_ratio/high_mean": 0.014292843484145123, + "clip_ratio/low_mean": 0.02083947552164318, + "clip_ratio/low_min": 0.007977789564756676, + "clip_ratio/region_mean": 0.03513231886317954, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.45, + "completions/mean_length": 47.483334350585935, + "completions/min_length": 32.55, + "epoch": 0.1034887442654433, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3444148600101471, + "kl": 0.35566417444497345, + "learning_rate": 4.9997367225230865e-05, + "loss": 0.0038170602172613146, + "reward": 0.0008426906540989875, + "reward_std": 0.11677385345101357, + "rewards/ASRWerHalluLenRewardV5/mean": 0.0008426919637713581, + "rewards/ASRWerHalluLenRewardV5/std": 0.3603787288069725, + "step": 970, + "step_time": 34.2297256199643 + }, + { + "clip_ratio/high_max": 0.024122701853048056, + "clip_ratio/high_mean": 0.013499733406933955, + "clip_ratio/low_mean": 0.022140093630878255, + "clip_ratio/low_min": 0.007897313177818433, + "clip_ratio/region_mean": 0.03563982691848651, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.85, + "completions/mean_length": 48.034376525878905, + "completions/min_length": 32.5, + "epoch": 0.10402219140083217, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35109788179397583, + "kl": 0.2984958853572607, + "learning_rate": 4.999715413305457e-05, + "loss": 0.016291096806526184, + "reward": 0.09988466184586287, + "reward_std": 0.11400905884802341, + "rewards/ASRWerHalluLenRewardV5/mean": 0.09988465821370482, + "rewards/ASRWerHalluLenRewardV5/std": 0.44001480638980867, + "step": 975, + "step_time": 30.2861518939957 + }, + { + "clip_ratio/high_max": 0.02722280129091814, + "clip_ratio/high_mean": 0.014964276278624311, + "clip_ratio/low_mean": 0.021030744115705602, + "clip_ratio/low_min": 0.008419841708382591, + "clip_ratio/region_mean": 0.035995020193513486, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.15, + "completions/mean_length": 45.839584541320804, + "completions/min_length": 28.5, + "epoch": 0.10455563853622106, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5121036767959595, + "kl": 0.46117590740323067, + "learning_rate": 4.999693274999517e-05, + "loss": 0.012672045826911926, + "reward": 0.10367023944854736, + "reward_std": 0.12214499264955521, + "rewards/ASRWerHalluLenRewardV5/mean": 0.1036702362820506, + "rewards/ASRWerHalluLenRewardV5/std": 0.38234301954507827, + "step": 980, + "step_time": 29.58430341910571 + }, + { + "epoch": 0.10455563853622106, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.225352112676056, + "eval_completions/mean_length": 46.03873376443352, + "eval_completions/min_length": 33.19718309859155, + "eval_frac_reward_zero_std": 0.183098596586308, + "eval_kl": 0.26145352793096654, + "eval_loss": 0.014178244397044182, + "eval_reward": 0.6272559991290032, + "eval_reward_std": 0.05908500496298075, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6272559839130287, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.29332094799569797, + "eval_runtime": 341.4566, + "eval_samples_per_second": 0.627, + "eval_steps_per_second": 0.053, + "step": 980 + }, + { + "clip_ratio/high_max": 0.023980378318810835, + "clip_ratio/high_mean": 0.013154192676302046, + "clip_ratio/low_mean": 0.01389996787183918, + "clip_ratio/low_min": 0.004792359448038042, + "clip_ratio/region_mean": 0.027054160740226506, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.4, + "completions/mean_length": 46.36562652587891, + "completions/min_length": 28.7, + "epoch": 0.10508908567160995, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4018096625804901, + "kl": 0.2882715314626694, + "learning_rate": 4.9996703076126094e-05, + "loss": -0.010568474233150483, + "reward": 0.3246448613703251, + "reward_std": 0.10815358869731426, + "rewards/ASRWerHalluLenRewardV5/mean": 0.3246448539197445, + "rewards/ASRWerHalluLenRewardV5/std": 0.35659717842936517, + "step": 985, + "step_time": 30.53259820509702 + }, + { + "clip_ratio/high_max": 0.02346951960353181, + "clip_ratio/high_mean": 0.013316635179216973, + "clip_ratio/low_mean": 0.019580708353896626, + "clip_ratio/low_min": 0.00746672676759772, + "clip_ratio/region_mean": 0.03289734367863275, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.65, + "completions/mean_length": 48.65208435058594, + "completions/min_length": 33.55, + "epoch": 0.10562253280699882, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2726946771144867, + "kl": 0.3229525560513139, + "learning_rate": 4.9996465111523535e-05, + "loss": 0.0040734030306339266, + "reward": 0.15301481168717146, + "reward_std": 0.10347771588712931, + "rewards/ASRWerHalluLenRewardV5/mean": 0.15301480810157955, + "rewards/ASRWerHalluLenRewardV5/std": 0.40071580559015274, + "step": 990, + "step_time": 33.513969491794704 + }, + { + "clip_ratio/high_max": 0.026287898264126853, + "clip_ratio/high_mean": 0.015562549806782044, + "clip_ratio/low_mean": 0.020543808060756417, + "clip_ratio/low_min": 0.007496731745777652, + "clip_ratio/region_mean": 0.03610635814256966, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.7, + "completions/mean_length": 46.104167747497556, + "completions/min_length": 32.65, + "epoch": 0.10615597994238771, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4401520788669586, + "kl": 0.30052103037014605, + "learning_rate": 4.9996218856266396e-05, + "loss": 0.0075774997472763065, + "reward": 0.1084702130407095, + "reward_std": 0.11157154701650143, + "rewards/ASRWerHalluLenRewardV5/mean": 0.10847021182999014, + "rewards/ASRWerHalluLenRewardV5/std": 0.38270739689469335, + "step": 995, + "step_time": 30.18130837418139 + }, + { + "clip_ratio/high_max": 0.024767558672465384, + "clip_ratio/high_mean": 0.01337633337097941, + "clip_ratio/low_mean": 0.02101525948382914, + "clip_ratio/low_min": 0.00776412858394906, + "clip_ratio/region_mean": 0.034391592949396, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.7, + "completions/mean_length": 48.182292747497556, + "completions/min_length": 30.25, + "epoch": 0.1066894270777766, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4630095958709717, + "kl": 0.32148105576634406, + "learning_rate": 4.999596431043637e-05, + "loss": 0.006536516547203064, + "reward": 0.04247091636061669, + "reward_std": 0.09793830774724484, + "rewards/ASRWerHalluLenRewardV5/mean": 0.04247091041179374, + "rewards/ASRWerHalluLenRewardV5/std": 0.4184539385139942, + "step": 1000, + "step_time": 32.483558563701806 + }, + { + "epoch": 0.1066894270777766, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.225352112676056, + "eval_completions/mean_length": 45.91784176356356, + "eval_completions/min_length": 33.16901408450704, + "eval_frac_reward_zero_std": 0.12676056715804088, + "eval_kl": 0.1770296422108798, + "eval_loss": 0.006521000526845455, + "eval_reward": 0.618607794157636, + "eval_reward_std": 0.0661460147304854, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6186077766854998, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3071556248452882, + "eval_runtime": 338.5473, + "eval_samples_per_second": 0.632, + "eval_steps_per_second": 0.053, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.019666367600439116, + "clip_ratio/high_mean": 0.010576394280360547, + "clip_ratio/low_mean": 0.016775101066741627, + "clip_ratio/low_min": 0.0039965350879356265, + "clip_ratio/region_mean": 0.027351495643961242, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.15, + "completions/mean_length": 46.656250953674316, + "completions/min_length": 30.8, + "epoch": 0.10722287421316548, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36642196774482727, + "kl": 0.2715712138451636, + "learning_rate": 4.999570147411788e-05, + "loss": -0.0020648781210184097, + "reward": 0.23836014326661825, + "reward_std": 0.11108321137726307, + "rewards/ASRWerHalluLenRewardV5/mean": 0.23836013451218604, + "rewards/ASRWerHalluLenRewardV5/std": 0.3373882293701172, + "step": 1005, + "step_time": 30.40613023955375 + }, + { + "clip_ratio/high_max": 0.029250265943119302, + "clip_ratio/high_mean": 0.016695603972766548, + "clip_ratio/low_mean": 0.022725914238253608, + "clip_ratio/low_min": 0.008727659200667403, + "clip_ratio/region_mean": 0.03942151791416108, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.55, + "completions/mean_length": 49.3427095413208, + "completions/min_length": 33.5, + "epoch": 0.10775632134855435, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3511591851711273, + "kl": 0.3497096398845315, + "learning_rate": 4.9995430347398096e-05, + "loss": 0.0015764342620968818, + "reward": 0.0267599917948246, + "reward_std": 0.10825997516512871, + "rewards/ASRWerHalluLenRewardV5/mean": 0.026759991841390728, + "rewards/ASRWerHalluLenRewardV5/std": 0.45253841429948805, + "step": 1010, + "step_time": 30.428196039423348 + }, + { + "clip_ratio/high_max": 0.025498886790592222, + "clip_ratio/high_mean": 0.014470948347297962, + "clip_ratio/low_mean": 0.02082006721320795, + "clip_ratio/low_min": 0.006850362831028178, + "clip_ratio/region_mean": 0.035291015659458934, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.45, + "completions/mean_length": 49.52604293823242, + "completions/min_length": 36.3, + "epoch": 0.10828976848394324, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21179193258285522, + "kl": 0.2785156193189323, + "learning_rate": 4.999515093036695e-05, + "loss": 0.0025972379371523857, + "reward": 0.22084318287670612, + "reward_std": 0.10226317290216684, + "rewards/ASRWerHalluLenRewardV5/mean": 0.22084316685795785, + "rewards/ASRWerHalluLenRewardV5/std": 0.4062530815601349, + "step": 1015, + "step_time": 31.00696713011712 + }, + { + "clip_ratio/high_max": 0.023329904896672814, + "clip_ratio/high_mean": 0.011944601440336555, + "clip_ratio/low_mean": 0.020830887692864054, + "clip_ratio/low_min": 0.005732145073125139, + "clip_ratio/region_mean": 0.03277548921760172, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.65, + "completions/mean_length": 46.74166831970215, + "completions/min_length": 30.7, + "epoch": 0.10882321561933213, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3675518333911896, + "kl": 0.3701936863362789, + "learning_rate": 4.999486322311711e-05, + "loss": 0.005817363783717155, + "reward": 0.16965706404298544, + "reward_std": 0.10928754135966301, + "rewards/ASRWerHalluLenRewardV5/mean": 0.16965705249458551, + "rewards/ASRWerHalluLenRewardV5/std": 0.3632862463593483, + "step": 1020, + "step_time": 30.84252065103501 + }, + { + "epoch": 0.10882321561933213, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.61971830985915, + "eval_completions/mean_length": 46.260564669756825, + "eval_completions/min_length": 33.19718309859155, + "eval_frac_reward_zero_std": 0.15962441790271814, + "eval_kl": 0.23503596102162985, + "eval_loss": 0.012478147633373737, + "eval_reward": 0.6074120594896901, + "eval_reward_std": 0.06719974053918686, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6074120473169106, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.30538198566982444, + "eval_runtime": 344.3088, + "eval_samples_per_second": 0.622, + "eval_steps_per_second": 0.052, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0239369930466637, + "clip_ratio/high_mean": 0.012563335482263937, + "clip_ratio/low_mean": 0.019298866277677008, + "clip_ratio/low_min": 0.00787321745592635, + "clip_ratio/region_mean": 0.031862201332114634, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.6, + "completions/mean_length": 47.54791793823242, + "completions/min_length": 32.5, + "epoch": 0.109356662754721, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31570157408714294, + "kl": 0.3827694904059172, + "learning_rate": 4.999456722574401e-05, + "loss": 0.02221662998199463, + "reward": 0.18466314319521188, + "reward_std": 0.1256857205182314, + "rewards/ASRWerHalluLenRewardV5/mean": 0.18466313946992158, + "rewards/ASRWerHalluLenRewardV5/std": 0.33347662091255187, + "step": 1025, + "step_time": 31.525632321089507 + }, + { + "clip_ratio/high_max": 0.026191326731350273, + "clip_ratio/high_mean": 0.014100981841329486, + "clip_ratio/low_mean": 0.022636716527631506, + "clip_ratio/low_min": 0.01044400458340533, + "clip_ratio/region_mean": 0.03673769844463095, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.15, + "completions/mean_length": 51.63021011352539, + "completions/min_length": 35.95, + "epoch": 0.10989010989010989, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3754618465900421, + "kl": 0.3196474280208349, + "learning_rate": 4.9994262938345816e-05, + "loss": 0.012383192777633667, + "reward": 0.16451531127095223, + "reward_std": 0.11978967376053333, + "rewards/ASRWerHalluLenRewardV5/mean": 0.16451530791819097, + "rewards/ASRWerHalluLenRewardV5/std": 0.3181664668023586, + "step": 1030, + "step_time": 31.23764550816268 + }, + { + "clip_ratio/high_max": 0.027258884260663762, + "clip_ratio/high_mean": 0.014798681079992094, + "clip_ratio/low_mean": 0.02431778384343488, + "clip_ratio/low_min": 0.007704682715120726, + "clip_ratio/region_mean": 0.039116465323604646, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.7, + "completions/mean_length": 47.81458492279053, + "completions/min_length": 36.35, + "epoch": 0.11042355702549878, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.48717132210731506, + "kl": 0.42427748870104554, + "learning_rate": 4.999395036102345e-05, + "loss": 0.011985325813293457, + "reward": 0.03515339158475399, + "reward_std": 0.11010743416845799, + "rewards/ASRWerHalluLenRewardV5/mean": 0.03515339286532253, + "rewards/ASRWerHalluLenRewardV5/std": 0.44606786146759986, + "step": 1035, + "step_time": 29.955806829221547 + }, + { + "clip_ratio/high_max": 0.022881265619071202, + "clip_ratio/high_mean": 0.013253708332194946, + "clip_ratio/low_mean": 0.0223832695861347, + "clip_ratio/low_min": 0.0083273735945113, + "clip_ratio/region_mean": 0.03563697785721161, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.6, + "completions/mean_length": 45.309376335144044, + "completions/min_length": 30.75, + "epoch": 0.11095700416088766, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.459381103515625, + "kl": 0.4481667622923851, + "learning_rate": 4.99936294938806e-05, + "loss": 0.009291221201419831, + "reward": -0.0409976065158844, + "reward_std": 0.12162209115922451, + "rewards/ASRWerHalluLenRewardV5/mean": -0.040997607191093265, + "rewards/ASRWerHalluLenRewardV5/std": 0.404738087952137, + "step": 1040, + "step_time": 31.63698167540133 + }, + { + "epoch": 0.11095700416088766, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.309859154929576, + "eval_completions/mean_length": 46.09507187319473, + "eval_completions/min_length": 33.11267605633803, + "eval_frac_reward_zero_std": 0.15962441790271814, + "eval_kl": 0.23188812330260244, + "eval_loss": 0.011642096564173698, + "eval_reward": 0.6215834688267369, + "eval_reward_std": 0.06129339263236649, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6215834585649722, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.2995572885052419, + "eval_runtime": 342.7829, + "eval_samples_per_second": 0.624, + "eval_steps_per_second": 0.053, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.02376522136037238, + "clip_ratio/high_mean": 0.013454942996031605, + "clip_ratio/low_mean": 0.02414673659368418, + "clip_ratio/low_min": 0.009989246088662185, + "clip_ratio/region_mean": 0.03760167992440984, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.6, + "completions/mean_length": 47.91562614440918, + "completions/min_length": 32.7, + "epoch": 0.11149045129627654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5927233099937439, + "kl": 0.3553912464529276, + "learning_rate": 4.999330033702368e-05, + "loss": 0.011381769180297851, + "reward": 0.12258676588535308, + "reward_std": 0.12233428955078125, + "rewards/ASRWerHalluLenRewardV5/mean": 0.12258676085621119, + "rewards/ASRWerHalluLenRewardV5/std": 0.40625343546271325, + "step": 1045, + "step_time": 31.344754235073925 + }, + { + "clip_ratio/high_max": 0.02189263278269209, + "clip_ratio/high_mean": 0.011988387386372779, + "clip_ratio/low_mean": 0.02013256368081784, + "clip_ratio/low_min": 0.006501164290239103, + "clip_ratio/region_mean": 0.03212095107301138, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.4, + "completions/mean_length": 46.87916793823242, + "completions/min_length": 28.7, + "epoch": 0.11202389843166542, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2545875012874603, + "kl": 0.30472447173669936, + "learning_rate": 4.999296289056186e-05, + "loss": 0.017706184089183806, + "reward": 0.1760616822168231, + "reward_std": 0.12486314177513122, + "rewards/ASRWerHalluLenRewardV5/mean": 0.17606167681515217, + "rewards/ASRWerHalluLenRewardV5/std": 0.4055256761610508, + "step": 1050, + "step_time": 30.21775802243501 + }, + { + "clip_ratio/high_max": 0.023598105099517853, + "clip_ratio/high_mean": 0.012445006068446673, + "clip_ratio/low_mean": 0.017572758406458888, + "clip_ratio/low_min": 0.00567761113634333, + "clip_ratio/region_mean": 0.03001776470337063, + "completions/clipped_ratio": 0.0010416666666666667, + "completions/max_length": 85.85, + "completions/mean_length": 46.2989595413208, + "completions/min_length": 29.9, + "epoch": 0.11255734556705431, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36364275217056274, + "kl": 0.3805642744526267, + "learning_rate": 4.999261715460706e-05, + "loss": 0.001104967575520277, + "reward": 0.1739208571612835, + "reward_std": 0.11706736534833909, + "rewards/ASRWerHalluLenRewardV5/mean": 0.17392085250467063, + "rewards/ASRWerHalluLenRewardV5/std": 0.4114412575960159, + "step": 1055, + "step_time": 32.53966623563319 + }, + { + "clip_ratio/high_max": 0.029256251931656152, + "clip_ratio/high_mean": 0.015667952832882293, + "clip_ratio/low_mean": 0.019342000895994714, + "clip_ratio/low_min": 0.005417454539565369, + "clip_ratio/region_mean": 0.03500995370559394, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.9, + "completions/mean_length": 47.165625762939456, + "completions/min_length": 33.4, + "epoch": 0.11309079270244318, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43476101756095886, + "kl": 0.3682957211509347, + "learning_rate": 4.9992263129273956e-05, + "loss": 0.0015579144470393657, + "reward": 0.15141564346849917, + "reward_std": 0.11857310347259045, + "rewards/ASRWerHalluLenRewardV5/mean": 0.15141563564538957, + "rewards/ASRWerHalluLenRewardV5/std": 0.43357725292444227, + "step": 1060, + "step_time": 32.17124021332711 + }, + { + "epoch": 0.11309079270244318, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 58.16901408450704, + "eval_completions/mean_length": 46.33920331068442, + "eval_completions/min_length": 33.183098591549296, + "eval_frac_reward_zero_std": 0.15492958208204996, + "eval_kl": 0.2206858346160029, + "eval_loss": 0.012514096684753895, + "eval_reward": 0.6041626356751986, + "eval_reward_std": 0.06555236984227955, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6041626185178757, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3122544518448937, + "eval_runtime": 342.442, + "eval_samples_per_second": 0.625, + "eval_steps_per_second": 0.053, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.02493780305958353, + "clip_ratio/high_mean": 0.01442487753374735, + "clip_ratio/low_mean": 0.01846228006761521, + "clip_ratio/low_min": 0.005267205991549418, + "clip_ratio/region_mean": 0.03288715743692592, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.15, + "completions/mean_length": 46.6572925567627, + "completions/min_length": 31.1, + "epoch": 0.11362423983783207, + "frac_reward_zero_std": 0.0, + "grad_norm": 24.850757598876953, + "kl": 0.7235194464214146, + "learning_rate": 4.999190081467997e-05, + "loss": -0.0022364987060427667, + "reward": 0.1983611986041069, + "reward_std": 0.10357024911791086, + "rewards/ASRWerHalluLenRewardV5/mean": 0.19836118780076503, + "rewards/ASRWerHalluLenRewardV5/std": 0.3978478252887726, + "step": 1065, + "step_time": 30.13072933740914 + }, + { + "clip_ratio/high_max": 0.024956896784715354, + "clip_ratio/high_mean": 0.013384159906127024, + "clip_ratio/low_mean": 0.01850853177893441, + "clip_ratio/low_min": 0.006323540583252907, + "clip_ratio/region_mean": 0.03189269177382812, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.65, + "completions/mean_length": 49.444793128967284, + "completions/min_length": 34.85, + "epoch": 0.11415768697322096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28445419669151306, + "kl": 0.27203155038878324, + "learning_rate": 4.999153021094527e-05, + "loss": 0.006953977793455124, + "reward": 0.1539104986935854, + "reward_std": 0.10401002708822489, + "rewards/ASRWerHalluLenRewardV5/mean": 0.15391048374585808, + "rewards/ASRWerHalluLenRewardV5/std": 0.3488810911774635, + "step": 1070, + "step_time": 33.09813131447881 + }, + { + "clip_ratio/high_max": 0.0263389409345109, + "clip_ratio/high_mean": 0.01435768062074203, + "clip_ratio/low_mean": 0.020183589527732693, + "clip_ratio/low_min": 0.008136194822145625, + "clip_ratio/region_mean": 0.03454127038130537, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.35, + "completions/mean_length": 47.44166793823242, + "completions/min_length": 35.0, + "epoch": 0.11469113410860984, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24898602068424225, + "kl": 0.24396936474367975, + "learning_rate": 4.9991151318192774e-05, + "loss": -0.005133867263793945, + "reward": 0.14383738934993745, + "reward_std": 0.094856471195817, + "rewards/ASRWerHalluLenRewardV5/mean": 0.14383738525211812, + "rewards/ASRWerHalluLenRewardV5/std": 0.4465418465435505, + "step": 1075, + "step_time": 29.59662895407528 + }, + { + "clip_ratio/high_max": 0.027175239683128894, + "clip_ratio/high_mean": 0.015536522050388158, + "clip_ratio/low_mean": 0.023409055793308653, + "clip_ratio/low_min": 0.007715654757339508, + "clip_ratio/region_mean": 0.03894557787571103, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/mean_length": 48.81666793823242, + "completions/min_length": 33.85, + "epoch": 0.11522458124399872, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6180169582366943, + "kl": 0.25690340660512445, + "learning_rate": 4.999076413654815e-05, + "loss": 0.0061361856758594515, + "reward": 0.044824971817433834, + "reward_std": 0.11100129224359989, + "rewards/ASRWerHalluLenRewardV5/mean": 0.04482496936107054, + "rewards/ASRWerHalluLenRewardV5/std": 0.43621088936924934, + "step": 1080, + "step_time": 32.11319844573736 + }, + { + "epoch": 0.11522458124399872, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.11267605633803, + "eval_completions/mean_length": 45.89671470749546, + "eval_completions/min_length": 33.183098591549296, + "eval_frac_reward_zero_std": 0.16431925372338632, + "eval_kl": 0.18301644590748867, + "eval_loss": 0.007790931034833193, + "eval_reward": 0.6184040933427676, + "eval_reward_std": 0.06352685882486928, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6184040784416064, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.30737316600558623, + "eval_runtime": 339.4831, + "eval_samples_per_second": 0.63, + "eval_steps_per_second": 0.053, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.02326093673473224, + "clip_ratio/high_mean": 0.012610175722511486, + "clip_ratio/low_mean": 0.0222633455588948, + "clip_ratio/low_min": 0.005767047114204615, + "clip_ratio/region_mean": 0.03487352109514177, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.9, + "completions/mean_length": 47.565626335144046, + "completions/min_length": 30.25, + "epoch": 0.1157580283793876, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30452659726142883, + "kl": 0.26774182920344175, + "learning_rate": 4.9990368666139826e-05, + "loss": 0.00804363638162613, + "reward": 0.13157696537673474, + "reward_std": 0.1117436034604907, + "rewards/ASRWerHalluLenRewardV5/mean": 0.13157695059198887, + "rewards/ASRWerHalluLenRewardV5/std": 0.35536216869950293, + "step": 1085, + "step_time": 30.980526034533977 + }, + { + "clip_ratio/high_max": 0.024001082289032638, + "clip_ratio/high_mean": 0.012725757615407928, + "clip_ratio/low_mean": 0.019756372952542733, + "clip_ratio/low_min": 0.007706936250906437, + "clip_ratio/region_mean": 0.03248213030747138, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.35, + "completions/mean_length": 46.03229312896728, + "completions/min_length": 27.2, + "epoch": 0.11629147551477649, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46225181221961975, + "kl": 0.3631205702200532, + "learning_rate": 4.998996490709895e-05, + "loss": 0.007047584652900696, + "reward": 0.15126287750899792, + "reward_std": 0.12113595008850098, + "rewards/ASRWerHalluLenRewardV5/mean": 0.15126287690363824, + "rewards/ASRWerHalluLenRewardV5/std": 0.4020110800862312, + "step": 1090, + "step_time": 30.34209956936538 + }, + { + "clip_ratio/high_max": 0.02740416003507562, + "clip_ratio/high_mean": 0.01519650105619803, + "clip_ratio/low_mean": 0.024452463306079152, + "clip_ratio/low_min": 0.007902904687216505, + "clip_ratio/region_mean": 0.039648964174557474, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.4, + "completions/mean_length": 47.12395973205567, + "completions/min_length": 29.7, + "epoch": 0.11682492265016536, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2798287868499756, + "kl": 0.32814883524551985, + "learning_rate": 4.9989552859559455e-05, + "loss": 0.02117571234703064, + "reward": 0.05591340251266956, + "reward_std": 0.12100807651877403, + "rewards/ASRWerHalluLenRewardV5/mean": 0.055913408286869526, + "rewards/ASRWerHalluLenRewardV5/std": 0.3719463016837835, + "step": 1095, + "step_time": 32.30928267259151 + }, + { + "clip_ratio/high_max": 0.027316589083056897, + "clip_ratio/high_mean": 0.01496668714680709, + "clip_ratio/low_mean": 0.01819621278118575, + "clip_ratio/low_min": 0.00605871407315135, + "clip_ratio/region_mean": 0.033162900118622926, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.8, + "completions/mean_length": 49.09062614440918, + "completions/min_length": 31.15, + "epoch": 0.11735836978555425, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4440866708755493, + "kl": 0.3430633223615587, + "learning_rate": 4.9989132523658e-05, + "loss": 0.0057637464255094525, + "reward": 0.1490264743566513, + "reward_std": 0.11954507008194923, + "rewards/ASRWerHalluLenRewardV5/mean": 0.1490264670457691, + "rewards/ASRWerHalluLenRewardV5/std": 0.39893965646624563, + "step": 1100, + "step_time": 32.65607395172119 + }, + { + "epoch": 0.11735836978555425, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.71830985915493, + "eval_completions/mean_length": 46.328639796082406, + "eval_completions/min_length": 33.352112676056336, + "eval_frac_reward_zero_std": 0.15492958208204996, + "eval_kl": 0.21381330934786041, + "eval_loss": 0.013864434324204922, + "eval_reward": 0.6168303807121767, + "eval_reward_std": 0.05771518200242393, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6168303726844384, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.30469051194967517, + "eval_runtime": 341.1709, + "eval_samples_per_second": 0.627, + "eval_steps_per_second": 0.053, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.020909963169833644, + "clip_ratio/high_mean": 0.01224248957587406, + "clip_ratio/low_mean": 0.01673181930091232, + "clip_ratio/low_min": 0.00650051639531739, + "clip_ratio/region_mean": 0.028974308763281442, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.35, + "completions/mean_length": 45.090626049041745, + "completions/min_length": 29.9, + "epoch": 0.11789181692094314, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4239621162414551, + "kl": 0.38632871229201554, + "learning_rate": 4.9988703899533994e-05, + "loss": 0.021788929402828217, + "reward": 0.21075276136398316, + "reward_std": 0.11833531521260739, + "rewards/ASRWerHalluLenRewardV5/mean": 0.21075275014154612, + "rewards/ASRWerHalluLenRewardV5/std": 0.3043745331466198, + "step": 1105, + "step_time": 30.361268295906484 + }, + { + "clip_ratio/high_max": 0.02746724121971056, + "clip_ratio/high_mean": 0.015095782515709288, + "clip_ratio/low_mean": 0.02019167414982803, + "clip_ratio/low_min": 0.006401392776751891, + "clip_ratio/region_mean": 0.03528745692456141, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.45, + "completions/mean_length": 46.64583396911621, + "completions/min_length": 28.5, + "epoch": 0.11842526405633201, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42970210313796997, + "kl": 0.33843108220025897, + "learning_rate": 4.9988266987329614e-05, + "loss": -0.0012818853370845317, + "reward": 0.09741018693894148, + "reward_std": 0.11980042904615402, + "rewards/ASRWerHalluLenRewardV5/mean": 0.09741018237546087, + "rewards/ASRWerHalluLenRewardV5/std": 0.3985237330198288, + "step": 1110, + "step_time": 31.296657408960165 + }, + { + "clip_ratio/high_max": 0.021738068538252265, + "clip_ratio/high_mean": 0.011762558693590108, + "clip_ratio/low_mean": 0.022389974516408985, + "clip_ratio/low_min": 0.007008293835679069, + "clip_ratio/region_mean": 0.03415253341081552, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.6, + "completions/mean_length": 43.62500133514404, + "completions/min_length": 24.75, + "epoch": 0.1189587111917209, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4870377779006958, + "kl": 0.29114672280848025, + "learning_rate": 4.998782178718975e-05, + "loss": 0.00017531675985082984, + "reward": 0.08315838854759931, + "reward_std": 0.11678501963615417, + "rewards/ASRWerHalluLenRewardV5/mean": 0.08315838682465256, + "rewards/ASRWerHalluLenRewardV5/std": 0.382754036039114, + "step": 1115, + "step_time": 31.035597792640328 + }, + { + "clip_ratio/high_max": 0.027659124048659577, + "clip_ratio/high_mean": 0.015460326451284345, + "clip_ratio/low_mean": 0.022838341241003944, + "clip_ratio/low_min": 0.005416290782159194, + "clip_ratio/region_mean": 0.03829866779269651, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.15, + "completions/mean_length": 44.532292652130124, + "completions/min_length": 28.05, + "epoch": 0.11949215832710978, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5747174024581909, + "kl": 0.4482402901165187, + "learning_rate": 4.998736829926208e-05, + "loss": -0.008696176856756211, + "reward": 0.12234907783567905, + "reward_std": 0.12138295099139214, + "rewards/ASRWerHalluLenRewardV5/mean": 0.12234907725360245, + "rewards/ASRWerHalluLenRewardV5/std": 0.3604292690753937, + "step": 1120, + "step_time": 30.051762696169316 + }, + { + "epoch": 0.11949215832710978, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.267605633802816, + "eval_completions/mean_length": 46.1009401670644, + "eval_completions/min_length": 32.91549295774648, + "eval_frac_reward_zero_std": 0.183098596586308, + "eval_kl": 0.21147997320537837, + "eval_loss": 0.012333904393017292, + "eval_reward": 0.6240429901583514, + "eval_reward_std": 0.057972929138742704, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6240429750210802, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.2963143443610047, + "eval_runtime": 338.6764, + "eval_samples_per_second": 0.632, + "eval_steps_per_second": 0.053, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.027075197134399785, + "clip_ratio/high_mean": 0.014773923740722238, + "clip_ratio/low_mean": 0.023644780814356635, + "clip_ratio/low_min": 0.009027763630729168, + "clip_ratio/region_mean": 0.03841870439937338, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.2, + "completions/mean_length": 46.513543128967285, + "completions/min_length": 27.95, + "epoch": 0.12002560546249867, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3434796631336212, + "kl": 0.333338226377964, + "learning_rate": 4.9986906523697016e-05, + "loss": -0.010706178843975067, + "reward": 8.749254047870635e-05, + "reward_std": 0.1013871394097805, + "rewards/ASRWerHalluLenRewardV5/mean": 8.748576510697603e-05, + "rewards/ASRWerHalluLenRewardV5/std": 0.4111168935894966, + "step": 1125, + "step_time": 31.73693456556648 + }, + { + "clip_ratio/high_max": 0.025755579891847447, + "clip_ratio/high_mean": 0.014464987193059642, + "clip_ratio/low_mean": 0.02143947858712636, + "clip_ratio/low_min": 0.00544068239396438, + "clip_ratio/region_mean": 0.0359044658136554, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.95, + "completions/mean_length": 44.336459922790525, + "completions/min_length": 21.75, + "epoch": 0.12055905259788754, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3709721267223358, + "kl": 0.2783936608582735, + "learning_rate": 4.99864364606477e-05, + "loss": 0.006696180254220962, + "reward": 0.14240834414958953, + "reward_std": 0.1171993337571621, + "rewards/ASRWerHalluLenRewardV5/mean": 0.14240833756048232, + "rewards/ASRWerHalluLenRewardV5/std": 0.4270940274000168, + "step": 1130, + "step_time": 31.051088786870242 + }, + { + "clip_ratio/high_max": 0.02739778161048889, + "clip_ratio/high_mean": 0.01557918162434362, + "clip_ratio/low_mean": 0.023324621966457924, + "clip_ratio/low_min": 0.009087929048109799, + "clip_ratio/region_mean": 0.03890380362281576, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.3, + "completions/mean_length": 46.050000953674314, + "completions/min_length": 24.1, + "epoch": 0.12109249973327643, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4197031259536743, + "kl": 0.3038671233691275, + "learning_rate": 4.998595811027006e-05, + "loss": 0.01765294522047043, + "reward": 0.0981090385466814, + "reward_std": 0.11961323618888856, + "rewards/ASRWerHalluLenRewardV5/mean": 0.09810904138721525, + "rewards/ASRWerHalluLenRewardV5/std": 0.38983526080846786, + "step": 1135, + "step_time": 33.40298830773681 + }, + { + "clip_ratio/high_max": 0.023461745789973065, + "clip_ratio/high_mean": 0.0117980861759861, + "clip_ratio/low_mean": 0.019536697497824208, + "clip_ratio/low_min": 0.00522548605222255, + "clip_ratio/region_mean": 0.03133478332310915, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.6, + "completions/mean_length": 48.75625114440918, + "completions/min_length": 31.85, + "epoch": 0.12162594686866532, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34263402223587036, + "kl": 0.2835341364145279, + "learning_rate": 4.998547147272274e-05, + "loss": 0.01185780018568039, + "reward": 0.1517514854669571, + "reward_std": 0.10872828289866447, + "rewards/ASRWerHalluLenRewardV5/mean": 0.15175148379057646, + "rewards/ASRWerHalluLenRewardV5/std": 0.4460065931081772, + "step": 1140, + "step_time": 31.037483246438207 + }, + { + "epoch": 0.12162594686866532, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.352112676056336, + "eval_completions/mean_length": 45.96596371288031, + "eval_completions/min_length": 33.23943661971831, + "eval_frac_reward_zero_std": 0.15023474626138178, + "eval_kl": 0.21924526382132736, + "eval_loss": 0.011900043115019798, + "eval_reward": 0.6125448266385307, + "eval_reward_std": 0.06481492049782209, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6125448110028052, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3142316920394209, + "eval_runtime": 341.4511, + "eval_samples_per_second": 0.627, + "eval_steps_per_second": 0.053, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.023714054422453047, + "clip_ratio/high_mean": 0.012803823647846001, + "clip_ratio/low_mean": 0.02214908110909164, + "clip_ratio/low_min": 0.006463136035017669, + "clip_ratio/region_mean": 0.03495290477294475, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.35, + "completions/mean_length": 47.27187614440918, + "completions/min_length": 30.2, + "epoch": 0.12215939400405419, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2971256375312805, + "kl": 0.33590664602816106, + "learning_rate": 4.9984976548167134e-05, + "loss": 0.00826924592256546, + "reward": 0.08525845296680927, + "reward_std": 0.10978813730180263, + "rewards/ASRWerHalluLenRewardV5/mean": 0.0852584526874125, + "rewards/ASRWerHalluLenRewardV5/std": 0.4458322286605835, + "step": 1145, + "step_time": 31.975166478939354 + }, + { + "clip_ratio/high_max": 0.023993525418336503, + "clip_ratio/high_mean": 0.013171355893427971, + "clip_ratio/low_mean": 0.023700285688391887, + "clip_ratio/low_min": 0.008629048126749694, + "clip_ratio/region_mean": 0.036871641653124246, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.45, + "completions/mean_length": 46.239584350585936, + "completions/min_length": 30.5, + "epoch": 0.12269284113944308, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3657069206237793, + "kl": 0.41308804545551536, + "learning_rate": 4.998447333676741e-05, + "loss": 0.014252057671546936, + "reward": 0.040065056085586546, + "reward_std": 0.11467994451522827, + "rewards/ASRWerHalluLenRewardV5/mean": 0.040065054563456215, + "rewards/ASRWerHalluLenRewardV5/std": 0.4702542081475258, + "step": 1150, + "step_time": 30.07740504499525 + }, + { + "clip_ratio/high_max": 0.026794564272859134, + "clip_ratio/high_mean": 0.014579377029440366, + "clip_ratio/low_mean": 0.021407196778454818, + "clip_ratio/low_min": 0.006161132891429588, + "clip_ratio/region_mean": 0.035986573772970584, + "completions/clipped_ratio": 0.0020833333333333333, + "completions/max_length": 467.1, + "completions/mean_length": 54.40729293823242, + "completions/min_length": 26.1, + "epoch": 0.12322628827483197, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3319120407104492, + "kl": 0.5093825986608863, + "learning_rate": 4.9983961838690473e-05, + "loss": 0.00801520049571991, + "reward": 0.08879910409450531, + "reward_std": 0.12629093527793883, + "rewards/ASRWerHalluLenRewardV5/mean": 0.08879910316318274, + "rewards/ASRWerHalluLenRewardV5/std": 0.39248122945427893, + "step": 1155, + "step_time": 85.50344787165523 + }, + { + "clip_ratio/high_max": 0.024573741760104895, + "clip_ratio/high_mean": 0.013284446459147147, + "clip_ratio/low_mean": 0.024801372905494647, + "clip_ratio/low_min": 0.007677417452214286, + "clip_ratio/region_mean": 0.03808581959456205, + "completions/clipped_ratio": 0.0010416666666666667, + "completions/max_length": 262.6, + "completions/mean_length": 51.99479312896729, + "completions/min_length": 34.1, + "epoch": 0.12375973541022085, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2985908091068268, + "kl": 0.3737185955047607, + "learning_rate": 4.9983442054105966e-05, + "loss": 0.024840639531612398, + "reward": 0.09498805850744248, + "reward_std": 0.10549605377018452, + "rewards/ASRWerHalluLenRewardV5/mean": 0.09498805403709412, + "rewards/ASRWerHalluLenRewardV5/std": 0.41430003494024276, + "step": 1160, + "step_time": 88.35626623574645 + }, + { + "epoch": 0.12375973541022085, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0011737089201877935, + "eval_completions/max_length": 70.28169014084507, + "eval_completions/mean_length": 47.322771126115825, + "eval_completions/min_length": 32.985915492957744, + "eval_frac_reward_zero_std": 0.15492958208204996, + "eval_kl": 0.2777229215897305, + "eval_loss": 0.01671767607331276, + "eval_reward": 0.6204228038624139, + "eval_reward_std": 0.058911492274155916, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6204227893810037, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3012523698607381, + "eval_runtime": 387.8609, + "eval_samples_per_second": 0.552, + "eval_steps_per_second": 0.046, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.023299489298369735, + "clip_ratio/high_mean": 0.012782835931284353, + "clip_ratio/low_mean": 0.02144681747886352, + "clip_ratio/low_min": 0.00875283406348899, + "clip_ratio/region_mean": 0.034229653491638604, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.3, + "completions/mean_length": 49.86979331970215, + "completions/min_length": 35.7, + "epoch": 0.12429318254560973, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2809962034225464, + "kl": 0.482571822963655, + "learning_rate": 4.9982913983186285e-05, + "loss": 0.010915641486644746, + "reward": 0.08683606274425984, + "reward_std": 0.1129381686449051, + "rewards/ASRWerHalluLenRewardV5/mean": 0.08683606350095943, + "rewards/ASRWerHalluLenRewardV5/std": 0.40758472979068755, + "step": 1165, + "step_time": 31.49570974241942 + }, + { + "clip_ratio/high_max": 0.024759801261825486, + "clip_ratio/high_mean": 0.012781311088474467, + "clip_ratio/low_mean": 0.01614216133602895, + "clip_ratio/low_min": 0.004444583595613949, + "clip_ratio/region_mean": 0.028923472529277204, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.15, + "completions/mean_length": 49.64583444595337, + "completions/min_length": 32.6, + "epoch": 0.12482662968099861, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25434786081314087, + "kl": 0.3815100058913231, + "learning_rate": 4.998237762610658e-05, + "loss": 0.0008874764665961265, + "reward": 0.24487357176840305, + "reward_std": 0.10994000546634197, + "rewards/ASRWerHalluLenRewardV5/mean": 0.24487356101162733, + "rewards/ASRWerHalluLenRewardV5/std": 0.46770170629024505, + "step": 1170, + "step_time": 33.09689524527639 + }, + { + "clip_ratio/high_max": 0.024459327006479726, + "clip_ratio/high_mean": 0.013836747560708318, + "clip_ratio/low_mean": 0.022623501547786872, + "clip_ratio/low_min": 0.007435000906116329, + "clip_ratio/region_mean": 0.0364602490561083, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.5, + "completions/mean_length": 46.946875953674315, + "completions/min_length": 30.8, + "epoch": 0.1253600768163875, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2875978350639343, + "kl": 0.34472307162359356, + "learning_rate": 4.9981832983044745e-05, + "loss": -0.0010014991275966167, + "reward": 0.057512760162353516, + "reward_std": 0.11909103281795978, + "rewards/ASRWerHalluLenRewardV5/mean": 0.057512761780526486, + "rewards/ASRWerHalluLenRewardV5/std": 0.43290549218654634, + "step": 1175, + "step_time": 31.893299024738372 + }, + { + "clip_ratio/high_max": 0.024333265650784596, + "clip_ratio/high_mean": 0.013574296922888607, + "clip_ratio/low_mean": 0.021828537838882767, + "clip_ratio/low_min": 0.008287718280917034, + "clip_ratio/region_mean": 0.035402834857814015, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.5, + "completions/mean_length": 49.12500114440918, + "completions/min_length": 35.25, + "epoch": 0.1258935239517764, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27340903878211975, + "kl": 0.30496250772848726, + "learning_rate": 4.9981280054181425e-05, + "loss": 0.006509742140769959, + "reward": 0.1455186627805233, + "reward_std": 0.09812142439186573, + "rewards/ASRWerHalluLenRewardV5/mean": 0.14551866014953702, + "rewards/ASRWerHalluLenRewardV5/std": 0.427775913476944, + "step": 1180, + "step_time": 30.552203050628304 + }, + { + "epoch": 0.1258935239517764, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 56.901408450704224, + "eval_completions/mean_length": 45.75234845658423, + "eval_completions/min_length": 32.84507042253521, + "eval_frac_reward_zero_std": 0.16431925372338632, + "eval_kl": 0.20222635739023836, + "eval_loss": 0.010492527857422829, + "eval_reward": 0.6209416520637525, + "eval_reward_std": 0.060067099512515355, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6209416316008903, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3043528008996181, + "eval_runtime": 337.056, + "eval_samples_per_second": 0.635, + "eval_steps_per_second": 0.053, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.028101132600568235, + "clip_ratio/high_mean": 0.015340362576534972, + "clip_ratio/low_mean": 0.018864900959306395, + "clip_ratio/low_min": 0.006496647209860385, + "clip_ratio/region_mean": 0.034205263678450136, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.6, + "completions/mean_length": 45.61145963668823, + "completions/min_length": 28.45, + "epoch": 0.12642697108716527, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2771006226539612, + "kl": 0.3206426206044853, + "learning_rate": 4.998071883970002e-05, + "loss": 0.0010825096629559993, + "reward": 0.049767949432134626, + "reward_std": 0.1227209497243166, + "rewards/ASRWerHalluLenRewardV5/mean": 0.04976795027032495, + "rewards/ASRWerHalluLenRewardV5/std": 0.4432234913110733, + "step": 1185, + "step_time": 30.204979922249912 + }, + { + "clip_ratio/high_max": 0.028220631362637506, + "clip_ratio/high_mean": 0.016649660177063198, + "clip_ratio/low_mean": 0.017792708548950032, + "clip_ratio/low_min": 0.006431512493873015, + "clip_ratio/region_mean": 0.03444236838258803, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.05, + "completions/mean_length": 49.30625133514404, + "completions/min_length": 32.3, + "epoch": 0.12696041822255413, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33391302824020386, + "kl": 0.3133445616811514, + "learning_rate": 4.998014933978665e-05, + "loss": -0.0008558402769267559, + "reward": 0.07649024613201619, + "reward_std": 0.11089640855789185, + "rewards/ASRWerHalluLenRewardV5/mean": 0.07649024704005569, + "rewards/ASRWerHalluLenRewardV5/std": 0.3675775021314621, + "step": 1190, + "step_time": 30.704920135810973 + }, + { + "clip_ratio/high_max": 0.02598376620153431, + "clip_ratio/high_mean": 0.014138944375736174, + "clip_ratio/low_mean": 0.022425020608352497, + "clip_ratio/low_min": 0.007572715551941655, + "clip_ratio/region_mean": 0.036563965235836805, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.7, + "completions/mean_length": 47.09479293823242, + "completions/min_length": 24.75, + "epoch": 0.12749386535794302, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44957050681114197, + "kl": 0.38864260651171206, + "learning_rate": 4.997957155463022e-05, + "loss": 0.00525350533425808, + "reward": 0.0015701942145824432, + "reward_std": 0.1390396200120449, + "rewards/ASRWerHalluLenRewardV5/mean": 0.0015701924296081416, + "rewards/ASRWerHalluLenRewardV5/std": 0.4534612409770489, + "step": 1195, + "step_time": 31.172508512437343 + }, + { + "clip_ratio/high_max": 0.026518847479019315, + "clip_ratio/high_mean": 0.01534777895139996, + "clip_ratio/low_mean": 0.021937844873173164, + "clip_ratio/low_min": 0.008440735263866373, + "clip_ratio/region_mean": 0.03728562365286052, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.6, + "completions/mean_length": 47.15208492279053, + "completions/min_length": 30.05, + "epoch": 0.1280273124933319, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4437207579612732, + "kl": 0.2638074651360512, + "learning_rate": 4.997898548442236e-05, + "loss": 0.02040847986936569, + "reward": 0.030245497077703475, + "reward_std": 0.1180131733417511, + "rewards/ASRWerHalluLenRewardV5/mean": 0.03024550140835345, + "rewards/ASRWerHalluLenRewardV5/std": 0.35368256717920304, + "step": 1200, + "step_time": 32.83905965536833 + }, + { + "epoch": 0.1280273124933319, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.25352112676056, + "eval_completions/mean_length": 46.06690245614925, + "eval_completions/min_length": 33.056338028169016, + "eval_frac_reward_zero_std": 0.16901408954405447, + "eval_kl": 0.19799550566297602, + "eval_loss": 0.009318606927990913, + "eval_reward": 0.6247281597953447, + "eval_reward_std": 0.05574072972202385, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6247281409065488, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.29841708003635137, + "eval_runtime": 340.7571, + "eval_samples_per_second": 0.628, + "eval_steps_per_second": 0.053, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.02576664023217745, + "clip_ratio/high_mean": 0.014380328699189704, + "clip_ratio/low_mean": 0.0192615910054883, + "clip_ratio/low_min": 0.0068488700140733275, + "clip_ratio/region_mean": 0.03364191979635507, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.15, + "completions/mean_length": 47.30104274749756, + "completions/min_length": 32.45, + "epoch": 0.1285607596287208, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.316606730222702, + "kl": 0.31076571261510255, + "learning_rate": 4.9978391129357446e-05, + "loss": 0.009427210688591004, + "reward": 0.15924068875610828, + "reward_std": 0.12651926502585412, + "rewards/ASRWerHalluLenRewardV5/mean": 0.15924068428575994, + "rewards/ASRWerHalluLenRewardV5/std": 0.34100558459758756, + "step": 1205, + "step_time": 30.250768317468463 + }, + { + "clip_ratio/high_max": 0.02887065636459738, + "clip_ratio/high_mean": 0.01654268127749674, + "clip_ratio/low_mean": 0.018563222838565707, + "clip_ratio/low_min": 0.00584659151500091, + "clip_ratio/region_mean": 0.03510590431978926, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.65, + "completions/mean_length": 47.017709732055664, + "completions/min_length": 31.8, + "epoch": 0.12909420676410968, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4529380202293396, + "kl": 0.3286305209621787, + "learning_rate": 4.997778848963263e-05, + "loss": -0.00019555340986698866, + "reward": 0.1656231466680765, + "reward_std": 0.11293609365820885, + "rewards/ASRWerHalluLenRewardV5/mean": 0.16562314406037332, + "rewards/ASRWerHalluLenRewardV5/std": 0.3759664423763752, + "step": 1210, + "step_time": 31.76740676742047 + }, + { + "clip_ratio/high_max": 0.021309600863605737, + "clip_ratio/high_mean": 0.012015803981921636, + "clip_ratio/low_mean": 0.019314767234027384, + "clip_ratio/low_min": 0.00609475378296338, + "clip_ratio/region_mean": 0.03133057126542553, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.3, + "completions/mean_length": 47.24479293823242, + "completions/min_length": 29.7, + "epoch": 0.12962765389949857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26109567284584045, + "kl": 0.38700951980426906, + "learning_rate": 4.9977177565447764e-05, + "loss": 0.00940287634730339, + "reward": 0.2391682021319866, + "reward_std": 0.1143830768764019, + "rewards/ASRWerHalluLenRewardV5/mean": 0.2391681969165802, + "rewards/ASRWerHalluLenRewardV5/std": 0.3876656599342823, + "step": 1215, + "step_time": 32.18145955931395 + }, + { + "clip_ratio/high_max": 0.027537487272638828, + "clip_ratio/high_mean": 0.013544260145863518, + "clip_ratio/low_mean": 0.021159316353441683, + "clip_ratio/low_min": 0.0067362473113462325, + "clip_ratio/region_mean": 0.03470357647165656, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.85, + "completions/mean_length": 48.275000953674315, + "completions/min_length": 34.9, + "epoch": 0.13016110103488746, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32878392934799194, + "kl": 0.37743106819689276, + "learning_rate": 4.99765583570055e-05, + "loss": 0.00839809626340866, + "reward": 0.1475730324164033, + "reward_std": 0.12000857163220643, + "rewards/ASRWerHalluLenRewardV5/mean": 0.14757302361540497, + "rewards/ASRWerHalluLenRewardV5/std": 0.42323567271232604, + "step": 1220, + "step_time": 30.316489760391413 + }, + { + "epoch": 0.13016110103488746, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.70422535211268, + "eval_completions/mean_length": 46.23474287650954, + "eval_completions/min_length": 33.098591549295776, + "eval_frac_reward_zero_std": 0.15492958166229892, + "eval_kl": 0.21531444832577673, + "eval_loss": 0.012708719819784164, + "eval_reward": 0.6156432797056688, + "eval_reward_std": 0.06682443645008852, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6156432604233564, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3131788386110689, + "eval_runtime": 340.6064, + "eval_samples_per_second": 0.628, + "eval_steps_per_second": 0.053, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.023019167582970113, + "clip_ratio/high_mean": 0.012910409580217674, + "clip_ratio/low_mean": 0.01918815485551022, + "clip_ratio/low_min": 0.005401855520904064, + "clip_ratio/region_mean": 0.03209856442990713, + "completions/clipped_ratio": 0.0020833333333333333, + "completions/max_length": 288.25, + "completions/mean_length": 52.91458511352539, + "completions/min_length": 34.65, + "epoch": 0.13069454817027631, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0794962644577026, + "kl": 0.30390668334439397, + "learning_rate": 4.99759308645112e-05, + "loss": 0.008363225311040879, + "reward": 0.16694361343979836, + "reward_std": 0.12943546213209628, + "rewards/ASRWerHalluLenRewardV5/mean": 0.1669436059310101, + "rewards/ASRWerHalluLenRewardV5/std": 0.43711976110935213, + "step": 1225, + "step_time": 93.61355560086668 + }, + { + "clip_ratio/high_max": 0.022369395871646703, + "clip_ratio/high_mean": 0.012659403547877446, + "clip_ratio/low_mean": 0.01923122357111424, + "clip_ratio/low_min": 0.006129666219931096, + "clip_ratio/region_mean": 0.031890627008397135, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.9, + "completions/mean_length": 48.7208345413208, + "completions/min_length": 32.9, + "epoch": 0.1312279953056652, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4312813878059387, + "kl": 0.3158006416633725, + "learning_rate": 4.997529508817298e-05, + "loss": -0.0033010341227054595, + "reward": 0.20353309232741595, + "reward_std": 0.12541102655231953, + "rewards/ASRWerHalluLenRewardV5/mean": 0.20353308459743857, + "rewards/ASRWerHalluLenRewardV5/std": 0.3425862491130829, + "step": 1230, + "step_time": 31.628210032358766 + }, + { + "clip_ratio/high_max": 0.023971352580701932, + "clip_ratio/high_mean": 0.013156012824038044, + "clip_ratio/low_mean": 0.017038739709823857, + "clip_ratio/low_min": 0.0041122311202343555, + "clip_ratio/region_mean": 0.03019475258188322, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.1, + "completions/mean_length": 48.54375095367432, + "completions/min_length": 32.5, + "epoch": 0.1317614424410541, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2647292912006378, + "kl": 0.273429268412292, + "learning_rate": 4.9974651028201725e-05, + "loss": 0.012289105355739594, + "reward": 0.27402831800282, + "reward_std": 0.10544358640909195, + "rewards/ASRWerHalluLenRewardV5/mean": 0.27402831638464703, + "rewards/ASRWerHalluLenRewardV5/std": 0.3522832810878754, + "step": 1235, + "step_time": 30.891505305469035 + }, + { + "clip_ratio/high_max": 0.026968799624592067, + "clip_ratio/high_mean": 0.015487970133835915, + "clip_ratio/low_mean": 0.022009541984880344, + "clip_ratio/low_min": 0.009288272622507065, + "clip_ratio/region_mean": 0.03749751234427094, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.55, + "completions/mean_length": 49.15000057220459, + "completions/min_length": 35.6, + "epoch": 0.13229488957644298, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35991910099983215, + "kl": 0.3513230012729764, + "learning_rate": 4.9973998684811055e-05, + "loss": 0.00018060635775327683, + "reward": 0.10322809331119061, + "reward_std": 0.11262127757072449, + "rewards/ASRWerHalluLenRewardV5/mean": 0.10322808458586223, + "rewards/ASRWerHalluLenRewardV5/std": 0.4070937216281891, + "step": 1240, + "step_time": 35.35676127187908 + }, + { + "epoch": 0.13229488957644298, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.394366197183096, + "eval_completions/mean_length": 45.881456348258006, + "eval_completions/min_length": 33.098591549295776, + "eval_frac_reward_zero_std": 0.17370892494497164, + "eval_kl": 0.19513369910418987, + "eval_loss": 0.00987114105373621, + "eval_reward": 0.6216730121785486, + "eval_reward_std": 0.05827437991827306, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6216729952311012, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.29758883815940834, + "eval_runtime": 342.4496, + "eval_samples_per_second": 0.625, + "eval_steps_per_second": 0.053, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.024224631959805266, + "clip_ratio/high_mean": 0.013340575515758245, + "clip_ratio/low_mean": 0.02118625457806047, + "clip_ratio/low_min": 0.008449738769559189, + "clip_ratio/region_mean": 0.03452683070499916, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.9, + "completions/mean_length": 45.77708492279053, + "completions/min_length": 29.35, + "epoch": 0.13282833671183186, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4698885977268219, + "kl": 0.3203980565536767, + "learning_rate": 4.997333805821731e-05, + "loss": 0.004951692745089531, + "reward": 0.10557614639401436, + "reward_std": 0.1273523647338152, + "rewards/ASRWerHalluLenRewardV5/mean": 0.1055761409457773, + "rewards/ASRWerHalluLenRewardV5/std": 0.3299987480044365, + "step": 1245, + "step_time": 29.58505427762866 + }, + { + "clip_ratio/high_max": 0.02453693404677324, + "clip_ratio/high_mean": 0.01315730177302612, + "clip_ratio/low_mean": 0.019144508968747687, + "clip_ratio/low_min": 0.007217649300582707, + "clip_ratio/region_mean": 0.032301810634089635, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.95, + "completions/mean_length": 47.73958415985108, + "completions/min_length": 33.45, + "epoch": 0.13336178384722075, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30732837319374084, + "kl": 0.3912776374258101, + "learning_rate": 4.997266914863963e-05, + "loss": 0.0036495476961135864, + "reward": 0.1943881966173649, + "reward_std": 0.10876386091113091, + "rewards/ASRWerHalluLenRewardV5/mean": 0.19438819583738223, + "rewards/ASRWerHalluLenRewardV5/std": 0.41325746327638624, + "step": 1250, + "step_time": 30.91806163266301 + }, + { + "clip_ratio/high_max": 0.024634597118711098, + "clip_ratio/high_mean": 0.0143041955656372, + "clip_ratio/low_mean": 0.021930703401449137, + "clip_ratio/low_min": 0.007635849370853975, + "clip_ratio/region_mean": 0.036234899004921314, + "completions/clipped_ratio": 0.0010416666666666667, + "completions/max_length": 266.0, + "completions/mean_length": 51.758334922790525, + "completions/min_length": 31.8, + "epoch": 0.13389523098260964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30202335119247437, + "kl": 0.2950546517968178, + "learning_rate": 4.997199195629985e-05, + "loss": 0.009187065064907074, + "reward": 0.11988993491977454, + "reward_std": 0.13062361851334572, + "rewards/ASRWerHalluLenRewardV5/mean": 0.1198899330891436, + "rewards/ASRWerHalluLenRewardV5/std": 0.37107628360390665, + "step": 1255, + "step_time": 57.363330910541116 + }, + { + "clip_ratio/high_max": 0.029460000252583995, + "clip_ratio/high_mean": 0.015667620513704607, + "clip_ratio/low_mean": 0.02383956937555922, + "clip_ratio/low_min": 0.008318894251715392, + "clip_ratio/region_mean": 0.03950718977721408, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.3, + "completions/mean_length": 44.80833435058594, + "completions/min_length": 25.8, + "epoch": 0.1344286781179985, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.605584979057312, + "kl": 0.35548952044919135, + "learning_rate": 4.99713064814226e-05, + "loss": 0.007641059160232544, + "reward": -0.061611310578882694, + "reward_std": 0.11511753853410482, + "rewards/ASRWerHalluLenRewardV5/mean": -0.06161130731925368, + "rewards/ASRWerHalluLenRewardV5/std": 0.3784534603357315, + "step": 1260, + "step_time": 30.131939112208784 + }, + { + "epoch": 0.1344286781179985, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0011737089201877935, + "eval_completions/max_length": 115.67605633802818, + "eval_completions/mean_length": 51.34741936267262, + "eval_completions/min_length": 33.098591549295776, + "eval_frac_reward_zero_std": 0.16431925372338632, + "eval_kl": 0.2294890349599677, + "eval_loss": 0.016894973814487457, + "eval_reward": 0.6015672613407524, + "eval_reward_std": 0.06595889036871598, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6015672424913082, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3135418942276861, + "eval_runtime": 479.373, + "eval_samples_per_second": 0.446, + "eval_steps_per_second": 0.038, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.027055584412300958, + "clip_ratio/high_mean": 0.01439632269903086, + "clip_ratio/low_mean": 0.02159155089757405, + "clip_ratio/low_min": 0.008258974965428934, + "clip_ratio/region_mean": 0.03598787382943556, + "completions/clipped_ratio": 0.0020833333333333333, + "completions/max_length": 465.1, + "completions/mean_length": 55.65625171661377, + "completions/min_length": 31.75, + "epoch": 0.13496212525338738, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35004130005836487, + "kl": 0.3961567535996437, + "learning_rate": 4.997061272423521e-05, + "loss": 0.010065703839063644, + "reward": 0.12715473733842372, + "reward_std": 0.11806702874600887, + "rewards/ASRWerHalluLenRewardV5/mean": 0.12715473687276244, + "rewards/ASRWerHalluLenRewardV5/std": 0.3261526241898537, + "step": 1265, + "step_time": 118.25449157115072 + }, + { + "clip_ratio/high_max": 0.028169340302702038, + "clip_ratio/high_mean": 0.015184389523346909, + "clip_ratio/low_mean": 0.020241755084134637, + "clip_ratio/low_min": 0.006082823208998889, + "clip_ratio/region_mean": 0.0354261445463635, + "completions/clipped_ratio": 0.0010416666666666667, + "completions/max_length": 263.55, + "completions/mean_length": 51.09062614440918, + "completions/min_length": 32.1, + "epoch": 0.13549557238877627, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23104676604270935, + "kl": 0.4541174890473485, + "learning_rate": 4.99699106849678e-05, + "loss": -0.003235117718577385, + "reward": 0.09087373167276383, + "reward_std": 0.10959376357495784, + "rewards/ASRWerHalluLenRewardV5/mean": 0.09087372962385416, + "rewards/ASRWerHalluLenRewardV5/std": 0.44979212805628777, + "step": 1270, + "step_time": 92.06207816656679 + }, + { + "clip_ratio/high_max": 0.022540211339946836, + "clip_ratio/high_mean": 0.011426641282741911, + "clip_ratio/low_mean": 0.01827425871306332, + "clip_ratio/low_min": 0.004790582304121926, + "clip_ratio/region_mean": 0.02970090004382655, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.25, + "completions/mean_length": 48.60729217529297, + "completions/min_length": 31.45, + "epoch": 0.13602901952416516, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5420587062835693, + "kl": 0.30298791686072946, + "learning_rate": 4.996920036385321e-05, + "loss": 0.0033202867954969404, + "reward": 0.20412076991051437, + "reward_std": 0.10080176182091236, + "rewards/ASRWerHalluLenRewardV5/mean": 0.2041207628673874, + "rewards/ASRWerHalluLenRewardV5/std": 0.3971730910241604, + "step": 1275, + "step_time": 30.34056798890233 + }, + { + "clip_ratio/high_max": 0.025638208084274083, + "clip_ratio/high_mean": 0.014390665908285883, + "clip_ratio/low_mean": 0.026691457876586354, + "clip_ratio/low_min": 0.00947794567910023, + "clip_ratio/region_mean": 0.04108212366700172, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.55, + "completions/mean_length": 45.78541793823242, + "completions/min_length": 26.2, + "epoch": 0.13656246665955404, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.48816367983818054, + "kl": 0.3716315316036344, + "learning_rate": 4.996848176112703e-05, + "loss": 0.010780350863933563, + "reward": -0.030049724131822587, + "reward_std": 0.11893180496990681, + "rewards/ASRWerHalluLenRewardV5/mean": -0.030049724597483875, + "rewards/ASRWerHalluLenRewardV5/std": 0.41721241250634195, + "step": 1280, + "step_time": 31.94088184274733 + }, + { + "epoch": 0.13656246665955404, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.394366197183096, + "eval_completions/mean_length": 45.86854569341095, + "eval_completions/min_length": 32.91549295774648, + "eval_frac_reward_zero_std": 0.16431925372338632, + "eval_kl": 0.21277233152847055, + "eval_loss": 0.010081085376441479, + "eval_reward": 0.617811921249393, + "eval_reward_std": 0.06217461792935788, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6178119159238019, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3068736702151282, + "eval_runtime": 341.5806, + "eval_samples_per_second": 0.626, + "eval_steps_per_second": 0.053, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.02648161641554907, + "clip_ratio/high_mean": 0.014808848916436546, + "clip_ratio/low_mean": 0.020122458718833513, + "clip_ratio/low_min": 0.006028273311676458, + "clip_ratio/region_mean": 0.034931307384977114, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.5, + "completions/mean_length": 45.35000133514404, + "completions/min_length": 26.1, + "epoch": 0.13709591379494293, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3721654713153839, + "kl": 0.32821476748213174, + "learning_rate": 4.996775487702762e-05, + "loss": 0.0010109092108905315, + "reward": 0.06592083834111691, + "reward_std": 0.11227640397846698, + "rewards/ASRWerHalluLenRewardV5/mean": 0.06592083907162305, + "rewards/ASRWerHalluLenRewardV5/std": 0.39092144593596456, + "step": 1285, + "step_time": 32.12527762241662 + }, + { + "clip_ratio/high_max": 0.022671167663065716, + "clip_ratio/high_mean": 0.012637928056938108, + "clip_ratio/low_mean": 0.02459215850394685, + "clip_ratio/low_min": 0.0071766000881325455, + "clip_ratio/region_mean": 0.03723008644301444, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.6, + "completions/mean_length": 49.335418128967284, + "completions/min_length": 35.4, + "epoch": 0.13762936093033182, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2869282066822052, + "kl": 0.3276601077988744, + "learning_rate": 4.9967019711796036e-05, + "loss": 0.017004695534706116, + "reward": 0.1444264341145754, + "reward_std": 0.1153901007026434, + "rewards/ASRWerHalluLenRewardV5/mean": 0.14442643113434314, + "rewards/ASRWerHalluLenRewardV5/std": 0.44024357199668884, + "step": 1290, + "step_time": 32.071263059601186 + }, + { + "clip_ratio/high_max": 0.026060979487374424, + "clip_ratio/high_mean": 0.014538901859486942, + "clip_ratio/low_mean": 0.02040668166009709, + "clip_ratio/low_min": 0.005881027402938344, + "clip_ratio/region_mean": 0.034945583413355054, + "completions/clipped_ratio": 0.0020833333333333333, + "completions/max_length": 467.4, + "completions/mean_length": 56.63541812896729, + "completions/min_length": 29.1, + "epoch": 0.13816280806572068, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4621104896068573, + "kl": 0.44171452783048154, + "learning_rate": 4.996627626567614e-05, + "loss": 0.0071450725197792055, + "reward": 0.06452074255794286, + "reward_std": 0.11783649884164334, + "rewards/ASRWerHalluLenRewardV5/mean": 0.06452074381522835, + "rewards/ASRWerHalluLenRewardV5/std": 0.41502648442983625, + "step": 1295, + "step_time": 116.45308880731463 + }, + { + "clip_ratio/high_max": 0.026100362266879527, + "clip_ratio/high_mean": 0.01595587091287598, + "clip_ratio/low_mean": 0.024825791543116794, + "clip_ratio/low_min": 0.008403081016149372, + "clip_ratio/region_mean": 0.04078166231047362, + "completions/clipped_ratio": 0.0010416666666666667, + "completions/max_length": 265.5, + "completions/mean_length": 53.49062595367432, + "completions/min_length": 36.95, + "epoch": 0.13869625520110956, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36724162101745605, + "kl": 0.511685904674232, + "learning_rate": 4.99655245389145e-05, + "loss": 0.0045750241726636885, + "reward": 0.04267330160364509, + "reward_std": 0.12589988596737384, + "rewards/ASRWerHalluLenRewardV5/mean": 0.042673300951719284, + "rewards/ASRWerHalluLenRewardV5/std": 0.39222835302352904, + "step": 1300, + "step_time": 92.08458511326462 + }, + { + "epoch": 0.13869625520110956, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0011737089201877935, + "eval_completions/max_length": 114.78873239436619, + "eval_completions/mean_length": 51.185447639142964, + "eval_completions/min_length": 33.38028169014085, + "eval_frac_reward_zero_std": 0.15023474626138178, + "eval_kl": 0.23450892867231873, + "eval_loss": 0.012663780711591244, + "eval_reward": 0.6152253104242641, + "eval_reward_std": 0.05600079061778288, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6152252887546177, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.30695990688154395, + "eval_runtime": 475.9355, + "eval_samples_per_second": 0.45, + "eval_steps_per_second": 0.038, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.030914877343457194, + "clip_ratio/high_mean": 0.016870503636891954, + "clip_ratio/low_mean": 0.01881950867100386, + "clip_ratio/low_min": 0.007649436284555122, + "clip_ratio/region_mean": 0.03569001239957288, + "completions/clipped_ratio": 0.0010416666666666667, + "completions/max_length": 268.85, + "completions/mean_length": 54.64687709808349, + "completions/min_length": 36.05, + "epoch": 0.13922970233649845, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3064101040363312, + "kl": 0.3605028113350272, + "learning_rate": 4.996476453176044e-05, + "loss": 0.009418854117393493, + "reward": 0.13159683533012867, + "reward_std": 0.11575601808726788, + "rewards/ASRWerHalluLenRewardV5/mean": 0.13159682715777307, + "rewards/ASRWerHalluLenRewardV5/std": 0.3788574308156967, + "step": 1305, + "step_time": 68.01624109838158 + }, + { + "clip_ratio/high_max": 0.028191327396780253, + "clip_ratio/high_mean": 0.014576009678421542, + "clip_ratio/low_mean": 0.020551545231137426, + "clip_ratio/low_min": 0.006047491042409092, + "clip_ratio/region_mean": 0.0351275549735874, + "completions/clipped_ratio": 0.0010416666666666667, + "completions/max_length": 267.4, + "completions/mean_length": 49.42812671661377, + "completions/min_length": 25.0, + "epoch": 0.13976314947188734, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8332604765892029, + "kl": 0.49096572268754246, + "learning_rate": 4.996399624446605e-05, + "loss": -0.0005515488795936107, + "reward": 0.07349770069122315, + "reward_std": 0.12066669277846813, + "rewards/ASRWerHalluLenRewardV5/mean": 0.07349770314176567, + "rewards/ASRWerHalluLenRewardV5/std": 0.4124982178211212, + "step": 1310, + "step_time": 90.02997033353896 + }, + { + "clip_ratio/high_max": 0.029439423425355927, + "clip_ratio/high_mean": 0.016244137020839843, + "clip_ratio/low_mean": 0.023364294634666292, + "clip_ratio/low_min": 0.009141319466289132, + "clip_ratio/region_mean": 0.0396084317821078, + "completions/clipped_ratio": 0.0020833333333333333, + "completions/max_length": 326.8, + "completions/mean_length": 56.19687671661377, + "completions/min_length": 33.25, + "epoch": 0.14029659660727622, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4033411741256714, + "kl": 0.3407327619381249, + "learning_rate": 4.996321967728614e-05, + "loss": 0.015583875775337219, + "reward": 0.051403230987489223, + "reward_std": 0.12631602212786674, + "rewards/ASRWerHalluLenRewardV5/mean": 0.051403222419321534, + "rewards/ASRWerHalluLenRewardV5/std": 0.36937597692012786, + "step": 1315, + "step_time": 50.59734730962664 + }, + { + "clip_ratio/high_max": 0.030162248603301123, + "clip_ratio/high_mean": 0.017776705222786404, + "clip_ratio/low_mean": 0.02522097058244981, + "clip_ratio/low_min": 0.01008501430042088, + "clip_ratio/region_mean": 0.04299767559859902, + "completions/clipped_ratio": 0.0010416666666666667, + "completions/max_length": 268.3, + "completions/mean_length": 51.57812652587891, + "completions/min_length": 28.45, + "epoch": 0.1408300437426651, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37155604362487793, + "kl": 0.4516682349145412, + "learning_rate": 4.996243483047828e-05, + "loss": 0.005852239951491356, + "reward": -0.037479805015027526, + "reward_std": 0.13242180906236173, + "rewards/ASRWerHalluLenRewardV5/mean": -0.037479796539992095, + "rewards/ASRWerHalluLenRewardV5/std": 0.41191285401582717, + "step": 1320, + "step_time": 91.9268301397562 + }, + { + "epoch": 0.1408300437426651, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 59.74647887323944, + "eval_completions/mean_length": 47.28169132286394, + "eval_completions/min_length": 33.61971830985915, + "eval_frac_reward_zero_std": 0.13145540297870906, + "eval_kl": 0.2080066032628988, + "eval_loss": 0.01647188887000084, + "eval_reward": 0.576849519156459, + "eval_reward_std": 0.07102981603271524, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.5768495083392621, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3182901808808387, + "eval_runtime": 349.5212, + "eval_samples_per_second": 0.612, + "eval_steps_per_second": 0.051, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.02835335458512418, + "clip_ratio/high_mean": 0.01604567348549608, + "clip_ratio/low_mean": 0.017914352130901534, + "clip_ratio/low_min": 0.005359313177177682, + "clip_ratio/region_mean": 0.033960025606211273, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.95, + "completions/mean_length": 49.24375114440918, + "completions/min_length": 27.8, + "epoch": 0.14136349087805397, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26811447739601135, + "kl": 0.299271617596969, + "learning_rate": 4.996164170430277e-05, + "loss": -0.004128267988562584, + "reward": 0.08020335659384728, + "reward_std": 0.11420706994831561, + "rewards/ASRWerHalluLenRewardV5/mean": 0.08020335361361504, + "rewards/ASRWerHalluLenRewardV5/std": 0.4145127959549427, + "step": 1325, + "step_time": 31.64810836631805 + }, + { + "clip_ratio/high_max": 0.02591177910217084, + "clip_ratio/high_mean": 0.015134943471639416, + "clip_ratio/low_mean": 0.022334377781953664, + "clip_ratio/low_min": 0.00770921382936649, + "clip_ratio/region_mean": 0.03746932108188048, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.9, + "completions/mean_length": 44.92708473205566, + "completions/min_length": 31.7, + "epoch": 0.14189693801344286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3182419240474701, + "kl": 0.2128689386881888, + "learning_rate": 4.996084029902268e-05, + "loss": -0.0049121897667646405, + "reward": 0.1441415935754776, + "reward_std": 0.12290981225669384, + "rewards/ASRWerHalluLenRewardV5/mean": 0.14414158748695627, + "rewards/ASRWerHalluLenRewardV5/std": 0.41640149876475335, + "step": 1330, + "step_time": 30.051158401556314 + }, + { + "clip_ratio/high_max": 0.027632137283217163, + "clip_ratio/high_mean": 0.015697227187047247, + "clip_ratio/low_mean": 0.021800728743255605, + "clip_ratio/low_min": 0.008995487843640149, + "clip_ratio/region_mean": 0.03749795609619468, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.05, + "completions/mean_length": 47.41354236602783, + "completions/min_length": 29.95, + "epoch": 0.14243038514883175, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5072255730628967, + "kl": 0.2825756220147014, + "learning_rate": 4.996003061490382e-05, + "loss": 0.004158083721995354, + "reward": 0.015180091932415962, + "reward_std": 0.12399710360914469, + "rewards/ASRWerHalluLenRewardV5/mean": 0.01518009351566434, + "rewards/ASRWerHalluLenRewardV5/std": 0.36958366110920904, + "step": 1335, + "step_time": 31.383092961646618 + }, + { + "clip_ratio/high_max": 0.027815924567403272, + "clip_ratio/high_mean": 0.014422058819036465, + "clip_ratio/low_mean": 0.020998114941176028, + "clip_ratio/low_min": 0.006548709896742366, + "clip_ratio/region_mean": 0.035420173429884017, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.05, + "completions/mean_length": 47.1458345413208, + "completions/min_length": 29.8, + "epoch": 0.14296383228422063, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2935876250267029, + "kl": 0.2691140349954367, + "learning_rate": 4.9959212652214726e-05, + "loss": -0.0010920217260718345, + "reward": 0.1435886027291417, + "reward_std": 0.10769168063998222, + "rewards/ASRWerHalluLenRewardV5/mean": 0.1435885988175869, + "rewards/ASRWerHalluLenRewardV5/std": 0.443392850458622, + "step": 1340, + "step_time": 33.12686550337821 + }, + { + "epoch": 0.14296383228422063, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.32394366197183, + "eval_completions/mean_length": 45.95540027887049, + "eval_completions/min_length": 33.056338028169016, + "eval_frac_reward_zero_std": 0.1455399104407136, + "eval_kl": 0.16346069881823702, + "eval_loss": 0.00832957774400711, + "eval_reward": 0.6127914450013302, + "eval_reward_std": 0.06218251018640651, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6127914295230114, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3083276652376836, + "eval_runtime": 340.3947, + "eval_samples_per_second": 0.629, + "eval_steps_per_second": 0.053, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.02787293738219887, + "clip_ratio/high_mean": 0.01575737335951999, + "clip_ratio/low_mean": 0.020793028781190513, + "clip_ratio/low_min": 0.0076123127684695644, + "clip_ratio/region_mean": 0.036550402035936715, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/mean_length": 49.88229331970215, + "completions/min_length": 37.05, + "epoch": 0.14349727941960952, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.464756041765213, + "kl": 0.2696240635588765, + "learning_rate": 4.995838641122671e-05, + "loss": -0.001264297217130661, + "reward": 0.1272611130028963, + "reward_std": 0.12108649350702763, + "rewards/ASRWerHalluLenRewardV5/mean": 0.127261113433633, + "rewards/ASRWerHalluLenRewardV5/std": 0.4405731275677681, + "step": 1345, + "step_time": 30.468586936965586 + }, + { + "clip_ratio/high_max": 0.031956243875902145, + "clip_ratio/high_mean": 0.017506475058326032, + "clip_ratio/low_mean": 0.02343797172361519, + "clip_ratio/low_min": 0.008903066156199202, + "clip_ratio/region_mean": 0.040944446565117684, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.2, + "completions/mean_length": 47.869792747497556, + "completions/min_length": 27.65, + "epoch": 0.1440307265549984, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2767680883407593, + "kl": 0.24955155663192272, + "learning_rate": 4.99575518922138e-05, + "loss": 0.014605671167373657, + "reward": 0.02193791847676039, + "reward_std": 0.14328977540135385, + "rewards/ASRWerHalluLenRewardV5/mean": 0.021937917452305555, + "rewards/ASRWerHalluLenRewardV5/std": 0.40916320905089376, + "step": 1350, + "step_time": 32.14109262581915 + }, + { + "clip_ratio/high_max": 0.027722464123507963, + "clip_ratio/high_mean": 0.01535752065537963, + "clip_ratio/low_mean": 0.02480277278809808, + "clip_ratio/low_min": 0.011036006192443892, + "clip_ratio/region_mean": 0.04016029336489737, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.8, + "completions/mean_length": 48.670834159851076, + "completions/min_length": 33.95, + "epoch": 0.1445641736903873, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45575806498527527, + "kl": 0.30565385995432737, + "learning_rate": 4.995670909545279e-05, + "loss": 0.005275604128837585, + "reward": 0.04107871502637863, + "reward_std": 0.1264207147061825, + "rewards/ASRWerHalluLenRewardV5/mean": 0.04107871549203992, + "rewards/ASRWerHalluLenRewardV5/std": 0.3557856485247612, + "step": 1355, + "step_time": 31.83995347842574 + }, + { + "clip_ratio/high_max": 0.025769633462186903, + "clip_ratio/high_mean": 0.014566395358997397, + "clip_ratio/low_mean": 0.02054567814047914, + "clip_ratio/low_min": 0.005539662519004196, + "clip_ratio/region_mean": 0.035112073575146494, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.05, + "completions/mean_length": 44.56875152587891, + "completions/min_length": 27.25, + "epoch": 0.14509762082577615, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.622713565826416, + "kl": 0.35802572248503567, + "learning_rate": 4.995585802122321e-05, + "loss": 0.0012725348584353923, + "reward": 0.08210737369954586, + "reward_std": 0.11070127449929715, + "rewards/ASRWerHalluLenRewardV5/mean": 0.0821073715109378, + "rewards/ASRWerHalluLenRewardV5/std": 0.4052552193403244, + "step": 1360, + "step_time": 29.217933963239194 + }, + { + "epoch": 0.14509762082577615, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0011737089201877935, + "eval_completions/max_length": 113.7605633802817, + "eval_completions/mean_length": 50.58802935103296, + "eval_completions/min_length": 33.028169014084504, + "eval_frac_reward_zero_std": 0.15023474626138178, + "eval_kl": 0.2099703281304576, + "eval_loss": 0.008831443265080452, + "eval_reward": 0.6169154863857048, + "eval_reward_std": 0.057062348272179216, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.616915474737614, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3051442146616083, + "eval_runtime": 561.632, + "eval_samples_per_second": 0.381, + "eval_steps_per_second": 0.032, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.029657999443588778, + "clip_ratio/high_mean": 0.01780400431889575, + "clip_ratio/low_mean": 0.023832027561729772, + "clip_ratio/low_min": 0.01137040569446981, + "clip_ratio/region_mean": 0.041636032017413525, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.65, + "completions/mean_length": 47.57187633514404, + "completions/min_length": 31.2, + "epoch": 0.14563106796116504, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3022272288799286, + "kl": 0.3575389666482806, + "learning_rate": 4.995499866980734e-05, + "loss": -0.004753073304891586, + "reward": -0.07459599152207375, + "reward_std": 0.11470406875014305, + "rewards/ASRWerHalluLenRewardV5/mean": -0.07459598863497377, + "rewards/ASRWerHalluLenRewardV5/std": 0.3669079877436161, + "step": 1365, + "step_time": 30.502131809666754 + }, + { + "clip_ratio/high_max": 0.027769188553793355, + "clip_ratio/high_mean": 0.015449245809577405, + "clip_ratio/low_mean": 0.023089025646913796, + "clip_ratio/low_min": 0.008234375272877514, + "clip_ratio/region_mean": 0.038538270845310765, + "completions/clipped_ratio": 0.0020833333333333333, + "completions/max_length": 109.85, + "completions/mean_length": 46.708334827423094, + "completions/min_length": 28.75, + "epoch": 0.14616451509655393, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45264750719070435, + "kl": 0.4920773675665259, + "learning_rate": 4.99541310414902e-05, + "loss": 0.005164693295955658, + "reward": -0.01980448067188263, + "reward_std": 0.11222024112939835, + "rewards/ASRWerHalluLenRewardV5/mean": -0.019804472848773003, + "rewards/ASRWerHalluLenRewardV5/std": 0.3988423034548759, + "step": 1370, + "step_time": 34.760743415541945 + }, + { + "clip_ratio/high_max": 0.02486411952995695, + "clip_ratio/high_mean": 0.013199488207465037, + "clip_ratio/low_mean": 0.022422710643149914, + "clip_ratio/low_min": 0.0079383178264834, + "clip_ratio/region_mean": 0.03562219884479419, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.7, + "completions/mean_length": 47.26041793823242, + "completions/min_length": 32.05, + "epoch": 0.1466979622319428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30860435962677, + "kl": 0.4511614849790931, + "learning_rate": 4.995325513655957e-05, + "loss": 0.015288282930850983, + "reward": 0.10682128947228194, + "reward_std": 0.117414765432477, + "rewards/ASRWerHalluLenRewardV5/mean": 0.10682128481566906, + "rewards/ASRWerHalluLenRewardV5/std": 0.3852162331342697, + "step": 1375, + "step_time": 31.763111441023646 + }, + { + "clip_ratio/high_max": 0.026390076597454026, + "clip_ratio/high_mean": 0.013788673812814522, + "clip_ratio/low_mean": 0.022438164404593407, + "clip_ratio/low_min": 0.006422380334697664, + "clip_ratio/region_mean": 0.03622683802386746, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/mean_length": 48.02187614440918, + "completions/min_length": 31.55, + "epoch": 0.1472314093673317, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4640622138977051, + "kl": 0.4821148807182908, + "learning_rate": 4.995237095530596e-05, + "loss": 0.0035938411951065064, + "reward": 0.050990286748856306, + "reward_std": 0.1159694217145443, + "rewards/ASRWerHalluLenRewardV5/mean": 0.05099029056727886, + "rewards/ASRWerHalluLenRewardV5/std": 0.4383368924260139, + "step": 1380, + "step_time": 31.17426394391805 + }, + { + "epoch": 0.1472314093673317, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0011737089201877935, + "eval_completions/max_length": 114.88732394366197, + "eval_completions/mean_length": 51.072771367892415, + "eval_completions/min_length": 33.40845070422535, + "eval_frac_reward_zero_std": 0.12676056673828984, + "eval_kl": 0.253546126265551, + "eval_loss": 0.015316903591156006, + "eval_reward": 0.6121550698200582, + "eval_reward_std": 0.06648664437496746, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6121550548139593, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.3070061602432963, + "eval_runtime": 475.2679, + "eval_samples_per_second": 0.45, + "eval_steps_per_second": 0.038, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.025888049957575276, + "clip_ratio/high_mean": 0.014850559514889028, + "clip_ratio/low_mean": 0.021945696615148336, + "clip_ratio/low_min": 0.007690343115245924, + "clip_ratio/region_mean": 0.036796255875378846, + "completions/clipped_ratio": 0.0010416666666666667, + "completions/max_length": 263.8, + "completions/mean_length": 50.31875152587891, + "completions/min_length": 26.95, + "epoch": 0.1477648565027206, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43305161595344543, + "kl": 0.430271135084331, + "learning_rate": 4.995147849802262e-05, + "loss": 0.009131060540676117, + "reward": 0.0733426794409752, + "reward_std": 0.1311087805777788, + "rewards/ASRWerHalluLenRewardV5/mean": 0.07334268470294773, + "rewards/ASRWerHalluLenRewardV5/std": 0.314959929138422, + "step": 1385, + "step_time": 91.06063344106079 + }, + { + "clip_ratio/high_max": 0.03204437834210694, + "clip_ratio/high_mean": 0.01745714933495037, + "clip_ratio/low_mean": 0.024523872550344095, + "clip_ratio/low_min": 0.010228942657704465, + "clip_ratio/region_mean": 0.04198102173395455, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.7, + "completions/mean_length": 48.70416812896728, + "completions/min_length": 31.55, + "epoch": 0.14829830363810947, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.48832741379737854, + "kl": 0.48549016900360586, + "learning_rate": 4.995057776500556e-05, + "loss": 0.015371555089950561, + "reward": 0.019163154996931553, + "reward_std": 0.1171959787607193, + "rewards/ASRWerHalluLenRewardV5/mean": 0.01916315294802189, + "rewards/ASRWerHalluLenRewardV5/std": 0.43001808822154997, + "step": 1390, + "step_time": 31.155499988794325 + }, + { + "clip_ratio/high_max": 0.02913985784398392, + "clip_ratio/high_mean": 0.0160744785782299, + "clip_ratio/low_mean": 0.023852365912171082, + "clip_ratio/low_min": 0.008081013755872845, + "clip_ratio/region_mean": 0.039926844323053955, + "completions/clipped_ratio": 0.0020833333333333333, + "completions/max_length": 264.55, + "completions/mean_length": 58.16562633514404, + "completions/min_length": 37.75, + "epoch": 0.14883175077349833, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2802684009075165, + "kl": 0.40779998283833263, + "learning_rate": 4.994966875655354e-05, + "loss": -0.0011006228625774384, + "reward": 0.08246221356093883, + "reward_std": 0.12109558843076229, + "rewards/ASRWerHalluLenRewardV5/mean": 0.08246221486479044, + "rewards/ASRWerHalluLenRewardV5/std": 0.3511412963271141, + "step": 1395, + "step_time": 91.98435200098902 + }, + { + "clip_ratio/high_max": 0.024811976216733457, + "clip_ratio/high_mean": 0.014143715618411079, + "clip_ratio/low_mean": 0.023264350730460136, + "clip_ratio/low_min": 0.009089946612948551, + "clip_ratio/region_mean": 0.037408066494390366, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.2, + "completions/mean_length": 46.183334541320804, + "completions/min_length": 31.75, + "epoch": 0.14936519790888722, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7672404050827026, + "kl": 0.342572021111846, + "learning_rate": 4.9948751472968024e-05, + "loss": 0.009508252143859863, + "reward": 0.08012031279504299, + "reward_std": 0.1274383846670389, + "rewards/ASRWerHalluLenRewardV5/mean": 0.08012030944228173, + "rewards/ASRWerHalluLenRewardV5/std": 0.3808165766298771, + "step": 1400, + "step_time": 29.39507036563009 + }, + { + "epoch": 0.14936519790888722, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 57.309859154929576, + "eval_completions/mean_length": 45.97183230225469, + "eval_completions/min_length": 33.0, + "eval_frac_reward_zero_std": 0.17370892536472265, + "eval_kl": 0.21030021102075847, + "eval_loss": 0.009321942925453186, + "eval_reward": 0.6229815750159848, + "eval_reward_std": 0.06499039343702541, + "eval_rewards/ASRWerHalluLenRewardV5/mean": 0.6229815533988072, + "eval_rewards/ASRWerHalluLenRewardV5/std": 0.30382886904121287, + "eval_runtime": 340.5054, + "eval_samples_per_second": 0.628, + "eval_steps_per_second": 0.053, + "step": 1400 + } + ], + "logging_steps": 5, + "max_steps": 28119, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}