| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 10.0, |
| "eval_steps": 500, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 935.5, |
| "completions/mean_length": 571.30859375, |
| "completions/min_length": 264.5, |
| "epoch": 0.02, |
| "grad_norm": 1.2956373691558838, |
| "kl": 0.0006160736083984375, |
| "learning_rate": 2e-07, |
| "loss": 0.11099594086408615, |
| "memory(GiB)": 18.17, |
| "reward": 0.18179254233837128, |
| "reward_std": 0.021205796860158443, |
| "rewards/MCQ_Reward/mean": 0.18179254233837128, |
| "rewards/MCQ_Reward/std": 0.0575394481420517, |
| "step": 1, |
| "train_speed(iter/s)": 0.017384 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.04, |
| "grad_norm": 1.2956030368804932, |
| "kl": 0.0006160736083984375, |
| "learning_rate": 4e-07, |
| "loss": 0.11099594086408615, |
| "memory(GiB)": 18.17, |
| "step": 2, |
| "train_speed(iter/s)": 0.033769 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1004.0, |
| "completions/mean_length": 582.2890625, |
| "completions/min_length": 126.5, |
| "epoch": 0.06, |
| "grad_norm": 1.1973260641098022, |
| "kl": 0.00061798095703125, |
| "learning_rate": 6e-07, |
| "loss": 0.09401366859674454, |
| "memory(GiB)": 18.17, |
| "reward": 0.1757229119539261, |
| "reward_std": 0.02308646310120821, |
| "rewards/MCQ_Reward/mean": 0.1757229119539261, |
| "rewards/MCQ_Reward/std": 0.06555243954062462, |
| "step": 3, |
| "train_speed(iter/s)": 0.029478 |
| }, |
| { |
| "clip_ratio": 0.0011098573449999094, |
| "epoch": 0.08, |
| "grad_norm": 1.206025242805481, |
| "kl": 0.0006008148193359375, |
| "learning_rate": 8e-07, |
| "loss": 0.09423406422138214, |
| "memory(GiB)": 18.17, |
| "step": 4, |
| "train_speed(iter/s)": 0.038797 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1025.0, |
| "completions/mean_length": 587.22265625, |
| "completions/min_length": 50.0, |
| "epoch": 0.1, |
| "grad_norm": 1.1425890922546387, |
| "kl": 0.0006389617919921875, |
| "learning_rate": 1e-06, |
| "loss": 0.10835893452167511, |
| "memory(GiB)": 18.17, |
| "reward": 0.20135290175676346, |
| "reward_std": 0.026336468756198883, |
| "rewards/MCQ_Reward/mean": 0.20135290175676346, |
| "rewards/MCQ_Reward/std": 0.04013596661388874, |
| "step": 5, |
| "train_speed(iter/s)": 0.033455 |
| }, |
| { |
| "clip_ratio": 0.000744842371204868, |
| "epoch": 0.12, |
| "grad_norm": 1.1426688432693481, |
| "kl": 0.0006389617919921875, |
| "learning_rate": 9.999899300364532e-07, |
| "loss": 0.10809706896543503, |
| "memory(GiB)": 18.17, |
| "step": 6, |
| "train_speed(iter/s)": 0.039768 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 986.0, |
| "completions/mean_length": 554.33203125, |
| "completions/min_length": 187.5, |
| "epoch": 0.14, |
| "grad_norm": 1.2598297595977783, |
| "kl": 0.000637054443359375, |
| "learning_rate": 9.999597205514296e-07, |
| "loss": 0.10747133195400238, |
| "memory(GiB)": 18.17, |
| "reward": 0.18709591031074524, |
| "reward_std": 0.022870728746056557, |
| "rewards/MCQ_Reward/mean": 0.18709591031074524, |
| "rewards/MCQ_Reward/std": 0.061255430802702904, |
| "step": 7, |
| "train_speed(iter/s)": 0.036272 |
| }, |
| { |
| "clip_ratio": 0.0011600544094108045, |
| "epoch": 0.16, |
| "grad_norm": 1.2500499486923218, |
| "kl": 0.0007114410400390625, |
| "learning_rate": 9.999093727617628e-07, |
| "loss": 0.10704316943883896, |
| "memory(GiB)": 18.17, |
| "step": 8, |
| "train_speed(iter/s)": 0.041177 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1011.5, |
| "completions/mean_length": 562.61328125, |
| "completions/min_length": 231.5, |
| "epoch": 0.18, |
| "grad_norm": 1.4137037992477417, |
| "kl": 0.00092315673828125, |
| "learning_rate": 9.998388886954545e-07, |
| "loss": 0.1194264367222786, |
| "memory(GiB)": 18.17, |
| "reward": 0.20057281106710434, |
| "reward_std": 0.02457202784717083, |
| "rewards/MCQ_Reward/mean": 0.20057281106710434, |
| "rewards/MCQ_Reward/std": 0.0581410713493824, |
| "step": 9, |
| "train_speed(iter/s)": 0.037627 |
| }, |
| { |
| "clip_ratio": 0.0008636733400635421, |
| "epoch": 0.2, |
| "grad_norm": 1.4122164249420166, |
| "kl": 0.001087188720703125, |
| "learning_rate": 9.997482711915925e-07, |
| "loss": 0.11916504055261612, |
| "memory(GiB)": 18.17, |
| "step": 10, |
| "train_speed(iter/s)": 0.041584 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 988.0, |
| "completions/mean_length": 545.3203125, |
| "completions/min_length": 13.0, |
| "epoch": 0.22, |
| "grad_norm": 1.1587789058685303, |
| "kl": 0.001285552978515625, |
| "learning_rate": 9.996375239002368e-07, |
| "loss": 0.06654135137796402, |
| "memory(GiB)": 18.17, |
| "reward": 0.18803076446056366, |
| "reward_std": 0.027116701006889343, |
| "rewards/MCQ_Reward/mean": 0.18803076446056366, |
| "rewards/MCQ_Reward/std": 0.06116201728582382, |
| "step": 11, |
| "train_speed(iter/s)": 0.037797 |
| }, |
| { |
| "clip_ratio": 0.0012727798894047737, |
| "epoch": 0.24, |
| "grad_norm": 1.1393318176269531, |
| "kl": 0.001819610595703125, |
| "learning_rate": 9.995066512822718e-07, |
| "loss": 0.0661393254995346, |
| "memory(GiB)": 18.17, |
| "step": 12, |
| "train_speed(iter/s)": 0.041011 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 928.0, |
| "completions/mean_length": 502.984375, |
| "completions/min_length": 181.5, |
| "epoch": 0.26, |
| "grad_norm": 1.3736039400100708, |
| "kl": 0.00341796875, |
| "learning_rate": 9.99355658609228e-07, |
| "loss": 0.09961968660354614, |
| "memory(GiB)": 18.17, |
| "reward": 0.2046608179807663, |
| "reward_std": 0.02339835651218891, |
| "rewards/MCQ_Reward/mean": 0.2046608179807663, |
| "rewards/MCQ_Reward/std": 0.07441236078739166, |
| "step": 13, |
| "train_speed(iter/s)": 0.038941 |
| }, |
| { |
| "clip_ratio": 0.0013542931410484016, |
| "epoch": 0.28, |
| "grad_norm": 1.341399073600769, |
| "kl": 0.004730224609375, |
| "learning_rate": 9.991845519630676e-07, |
| "loss": 0.09878668189048767, |
| "memory(GiB)": 18.17, |
| "step": 14, |
| "train_speed(iter/s)": 0.041763 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 939.0, |
| "completions/mean_length": 479.08984375, |
| "completions/min_length": 201.5, |
| "epoch": 0.3, |
| "grad_norm": 1.2583457231521606, |
| "kl": 0.005706787109375, |
| "learning_rate": 9.989933382359422e-07, |
| "loss": 0.09561844170093536, |
| "memory(GiB)": 18.17, |
| "reward": 0.23959992825984955, |
| "reward_std": 0.024829759262502193, |
| "rewards/MCQ_Reward/mean": 0.23959992825984955, |
| "rewards/MCQ_Reward/std": 0.059385696426033974, |
| "step": 15, |
| "train_speed(iter/s)": 0.040033 |
| }, |
| { |
| "clip_ratio": 0.0012090829550288618, |
| "epoch": 0.32, |
| "grad_norm": 1.2485970258712769, |
| "kl": 0.0069122314453125, |
| "learning_rate": 9.98782025129912e-07, |
| "loss": 0.09502086043357849, |
| "memory(GiB)": 18.17, |
| "step": 16, |
| "train_speed(iter/s)": 0.042555 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 785.0, |
| "completions/mean_length": 446.19140625, |
| "completions/min_length": 186.5, |
| "epoch": 0.34, |
| "grad_norm": 1.4837766885757446, |
| "kl": 0.0080718994140625, |
| "learning_rate": 9.985506211566386e-07, |
| "loss": 0.11237534880638123, |
| "memory(GiB)": 18.17, |
| "reward": 0.204755961894989, |
| "reward_std": 0.025960725732147694, |
| "rewards/MCQ_Reward/mean": 0.204755961894989, |
| "rewards/MCQ_Reward/std": 0.05882856249809265, |
| "step": 17, |
| "train_speed(iter/s)": 0.041421 |
| }, |
| { |
| "clip_ratio": 0.0012163713108748198, |
| "epoch": 0.36, |
| "grad_norm": 1.4663207530975342, |
| "kl": 0.00933837890625, |
| "learning_rate": 9.982991356370403e-07, |
| "loss": 0.11209464073181152, |
| "memory(GiB)": 18.17, |
| "step": 18, |
| "train_speed(iter/s)": 0.043701 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 771.0, |
| "completions/mean_length": 451.41015625, |
| "completions/min_length": 101.0, |
| "epoch": 0.38, |
| "grad_norm": 1.2070645093917847, |
| "kl": 0.010772705078125, |
| "learning_rate": 9.98027578700917e-07, |
| "loss": 0.0659424215555191, |
| "memory(GiB)": 18.17, |
| "reward": 0.18814751505851746, |
| "reward_std": 0.024471789598464966, |
| "rewards/MCQ_Reward/mean": 0.18814751505851746, |
| "rewards/MCQ_Reward/std": 0.062104713171720505, |
| "step": 19, |
| "train_speed(iter/s)": 0.042657 |
| }, |
| { |
| "clip_ratio": 0.0017630973597988486, |
| "epoch": 0.4, |
| "grad_norm": 1.1632057428359985, |
| "kl": 0.014007568359375, |
| "learning_rate": 9.977359612865422e-07, |
| "loss": 0.0650935024023056, |
| "memory(GiB)": 18.17, |
| "step": 20, |
| "train_speed(iter/s)": 0.044775 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 808.0, |
| "completions/mean_length": 392.30078125, |
| "completions/min_length": 84.0, |
| "epoch": 0.42, |
| "grad_norm": 1.313915491104126, |
| "kl": 0.019775390625, |
| "learning_rate": 9.974242951402235e-07, |
| "loss": 0.07705788314342499, |
| "memory(GiB)": 18.17, |
| "reward": 0.23380683362483978, |
| "reward_std": 0.03150738961994648, |
| "rewards/MCQ_Reward/mean": 0.23380683362483978, |
| "rewards/MCQ_Reward/std": 0.057576023042201996, |
| "step": 21, |
| "train_speed(iter/s)": 0.043224 |
| }, |
| { |
| "clip_ratio": 0.0028022455517202616, |
| "epoch": 0.44, |
| "grad_norm": 1.242121934890747, |
| "kl": 0.02642822265625, |
| "learning_rate": 9.970925928158272e-07, |
| "loss": 0.07613129168748856, |
| "memory(GiB)": 18.17, |
| "step": 22, |
| "train_speed(iter/s)": 0.045118 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 621.0, |
| "completions/mean_length": 355.48828125, |
| "completions/min_length": 144.0, |
| "epoch": 0.46, |
| "grad_norm": 1.3318829536437988, |
| "kl": 0.034423828125, |
| "learning_rate": 9.967408676742751e-07, |
| "loss": 0.07269842177629471, |
| "memory(GiB)": 18.17, |
| "reward": 0.22312550246715546, |
| "reward_std": 0.031231535598635674, |
| "rewards/MCQ_Reward/mean": 0.22312550246715546, |
| "rewards/MCQ_Reward/std": 0.05438939481973648, |
| "step": 23, |
| "train_speed(iter/s)": 0.044616 |
| }, |
| { |
| "clip_ratio": 0.0020711172837764025, |
| "epoch": 0.48, |
| "grad_norm": 1.2974779605865479, |
| "kl": 0.0413818359375, |
| "learning_rate": 9.963691338830042e-07, |
| "loss": 0.07173984497785568, |
| "memory(GiB)": 18.17, |
| "step": 24, |
| "train_speed(iter/s)": 0.046444 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 651.5, |
| "completions/mean_length": 318.5234375, |
| "completions/min_length": 92.0, |
| "epoch": 0.5, |
| "grad_norm": 1.397636890411377, |
| "kl": 0.047119140625, |
| "learning_rate": 9.959774064153975e-07, |
| "loss": 0.03884683549404144, |
| "memory(GiB)": 18.17, |
| "reward": 0.23498350381851196, |
| "reward_std": 0.03053601924329996, |
| "rewards/MCQ_Reward/mean": 0.23498350381851196, |
| "rewards/MCQ_Reward/std": 0.05711263045668602, |
| "step": 25, |
| "train_speed(iter/s)": 0.045888 |
| }, |
| { |
| "clip_ratio": 0.0013737165136262774, |
| "epoch": 0.52, |
| "grad_norm": 1.379469394683838, |
| "kl": 0.052734375, |
| "learning_rate": 9.955657010501806e-07, |
| "loss": 0.038122277706861496, |
| "memory(GiB)": 18.17, |
| "step": 26, |
| "train_speed(iter/s)": 0.047611 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 611.5, |
| "completions/mean_length": 293.42578125, |
| "completions/min_length": 110.5, |
| "epoch": 0.54, |
| "grad_norm": 1.3771414756774902, |
| "kl": 0.0574951171875, |
| "learning_rate": 9.95134034370785e-07, |
| "loss": 0.05064291134476662, |
| "memory(GiB)": 18.17, |
| "reward": 0.257246270775795, |
| "reward_std": 0.03051395993679762, |
| "rewards/MCQ_Reward/mean": 0.257246270775795, |
| "rewards/MCQ_Reward/std": 0.05405682139098644, |
| "step": 27, |
| "train_speed(iter/s)": 0.046967 |
| }, |
| { |
| "clip_ratio": 0.0015082518220879138, |
| "epoch": 0.56, |
| "grad_norm": 1.3394073247909546, |
| "kl": 0.063720703125, |
| "learning_rate": 9.946824237646824e-07, |
| "loss": 0.04972712695598602, |
| "memory(GiB)": 18.17, |
| "step": 28, |
| "train_speed(iter/s)": 0.048554 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 505.5, |
| "completions/mean_length": 259.3515625, |
| "completions/min_length": 76.0, |
| "epoch": 0.58, |
| "grad_norm": 1.4677767753601074, |
| "kl": 0.070556640625, |
| "learning_rate": 9.94210887422681e-07, |
| "loss": -0.01695432886481285, |
| "memory(GiB)": 18.17, |
| "reward": 0.25767549127340317, |
| "reward_std": 0.03901047818362713, |
| "rewards/MCQ_Reward/mean": 0.25767549127340317, |
| "rewards/MCQ_Reward/std": 0.05495491810142994, |
| "step": 29, |
| "train_speed(iter/s)": 0.048377 |
| }, |
| { |
| "clip_ratio": 0.001286374346818775, |
| "epoch": 0.6, |
| "grad_norm": 1.4747378826141357, |
| "kl": 0.076904296875, |
| "learning_rate": 9.93719444338197e-07, |
| "loss": -0.017460569739341736, |
| "memory(GiB)": 18.17, |
| "step": 30, |
| "train_speed(iter/s)": 0.04994 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 561.5, |
| "completions/mean_length": 250.26171875, |
| "completions/min_length": 96.5, |
| "epoch": 0.62, |
| "grad_norm": 1.6029585599899292, |
| "kl": 0.07763671875, |
| "learning_rate": 9.932081143064858e-07, |
| "loss": 0.042436983436346054, |
| "memory(GiB)": 18.17, |
| "reward": 0.23062269389629364, |
| "reward_std": 0.036025889217853546, |
| "rewards/MCQ_Reward/mean": 0.23062269389629364, |
| "rewards/MCQ_Reward/std": 0.0671730749309063, |
| "step": 31, |
| "train_speed(iter/s)": 0.048974 |
| }, |
| { |
| "clip_ratio": 0.00158036028733477, |
| "epoch": 0.64, |
| "grad_norm": 1.5435467958450317, |
| "kl": 0.08349609375, |
| "learning_rate": 9.926769179238464e-07, |
| "loss": 0.04148583859205246, |
| "memory(GiB)": 18.17, |
| "step": 32, |
| "train_speed(iter/s)": 0.050428 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 522.5, |
| "completions/mean_length": 246.3984375, |
| "completions/min_length": 89.0, |
| "epoch": 0.66, |
| "grad_norm": 1.466068983078003, |
| "kl": 0.093994140625, |
| "learning_rate": 9.921258765867919e-07, |
| "loss": 0.008220436982810497, |
| "memory(GiB)": 18.17, |
| "reward": 0.22424693405628204, |
| "reward_std": 0.03309958428144455, |
| "rewards/MCQ_Reward/mean": 0.22424693405628204, |
| "rewards/MCQ_Reward/std": 0.06848622299730778, |
| "step": 33, |
| "train_speed(iter/s)": 0.050299 |
| }, |
| { |
| "clip_ratio": 0.0012578482856042683, |
| "epoch": 0.68, |
| "grad_norm": 1.4434019327163696, |
| "kl": 0.10009765625, |
| "learning_rate": 9.915550124911866e-07, |
| "loss": 0.007482614368200302, |
| "memory(GiB)": 18.17, |
| "step": 34, |
| "train_speed(iter/s)": 0.051722 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 520.0, |
| "completions/mean_length": 226.12109375, |
| "completions/min_length": 47.5, |
| "epoch": 0.7, |
| "grad_norm": 1.529449224472046, |
| "kl": 0.10546875, |
| "learning_rate": 9.909643486313533e-07, |
| "loss": -0.024700753390789032, |
| "memory(GiB)": 18.17, |
| "reward": 0.24431276321411133, |
| "reward_std": 0.03709370456635952, |
| "rewards/MCQ_Reward/mean": 0.24431276321411133, |
| "rewards/MCQ_Reward/std": 0.06565525010228157, |
| "step": 35, |
| "train_speed(iter/s)": 0.051572 |
| }, |
| { |
| "clip_ratio": 0.0013001365587115288, |
| "epoch": 0.72, |
| "grad_norm": 1.524826169013977, |
| "kl": 0.110595703125, |
| "learning_rate": 9.903539087991461e-07, |
| "loss": -0.025061530992388725, |
| "memory(GiB)": 18.17, |
| "step": 36, |
| "train_speed(iter/s)": 0.052951 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 461.0, |
| "completions/mean_length": 206.1328125, |
| "completions/min_length": 63.0, |
| "epoch": 0.74, |
| "grad_norm": 1.5648741722106934, |
| "kl": 0.11474609375, |
| "learning_rate": 9.897237175829926e-07, |
| "loss": -0.010986058972775936, |
| "memory(GiB)": 18.17, |
| "reward": 0.26653096079826355, |
| "reward_std": 0.03736630827188492, |
| "rewards/MCQ_Reward/mean": 0.26653096079826355, |
| "rewards/MCQ_Reward/std": 0.065978042781353, |
| "step": 37, |
| "train_speed(iter/s)": 0.052793 |
| }, |
| { |
| "clip_ratio": 0.0015517690917477012, |
| "epoch": 0.76, |
| "grad_norm": 1.5597436428070068, |
| "kl": 0.122802734375, |
| "learning_rate": 9.890738003669027e-07, |
| "loss": -0.011755033396184444, |
| "memory(GiB)": 18.17, |
| "step": 38, |
| "train_speed(iter/s)": 0.054118 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 412.5, |
| "completions/mean_length": 203.34375, |
| "completions/min_length": 35.5, |
| "epoch": 0.78, |
| "grad_norm": 1.6045058965682983, |
| "kl": 0.125244140625, |
| "learning_rate": 9.884041833294475e-07, |
| "loss": -0.04164643585681915, |
| "memory(GiB)": 18.17, |
| "reward": 0.2605663910508156, |
| "reward_std": 0.03675983473658562, |
| "rewards/MCQ_Reward/mean": 0.2605663910508156, |
| "rewards/MCQ_Reward/std": 0.06591521203517914, |
| "step": 39, |
| "train_speed(iter/s)": 0.054082 |
| }, |
| { |
| "clip_ratio": 0.0013205534196458757, |
| "epoch": 0.8, |
| "grad_norm": 1.608991265296936, |
| "kl": 0.1337890625, |
| "learning_rate": 9.877148934427035e-07, |
| "loss": -0.042494483292102814, |
| "memory(GiB)": 18.17, |
| "step": 40, |
| "train_speed(iter/s)": 0.055369 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 385.0, |
| "completions/mean_length": 189.2734375, |
| "completions/min_length": 60.5, |
| "epoch": 0.82, |
| "grad_norm": 1.8442962169647217, |
| "kl": 0.14208984375, |
| "learning_rate": 9.870059584711668e-07, |
| "loss": -0.07683762162923813, |
| "memory(GiB)": 18.17, |
| "reward": 0.26815178990364075, |
| "reward_std": 0.04410684481263161, |
| "rewards/MCQ_Reward/mean": 0.26815178990364075, |
| "rewards/MCQ_Reward/std": 0.06000189855694771, |
| "step": 41, |
| "train_speed(iter/s)": 0.055022 |
| }, |
| { |
| "clip_ratio": 0.0013334141112864017, |
| "epoch": 0.84, |
| "grad_norm": 1.8422967195510864, |
| "kl": 0.14599609375, |
| "learning_rate": 9.862774069706345e-07, |
| "loss": -0.0775442123413086, |
| "memory(GiB)": 18.17, |
| "step": 42, |
| "train_speed(iter/s)": 0.056271 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 447.5, |
| "completions/mean_length": 185.765625, |
| "completions/min_length": 65.5, |
| "epoch": 0.86, |
| "grad_norm": 1.7880198955535889, |
| "kl": 0.14453125, |
| "learning_rate": 9.85529268287055e-07, |
| "loss": 0.009722323156893253, |
| "memory(GiB)": 18.17, |
| "reward": 0.26024360954761505, |
| "reward_std": 0.04201339744031429, |
| "rewards/MCQ_Reward/mean": 0.26024360954761505, |
| "rewards/MCQ_Reward/std": 0.0699400007724762, |
| "step": 43, |
| "train_speed(iter/s)": 0.056122 |
| }, |
| { |
| "clip_ratio": 0.0013897960307076573, |
| "epoch": 0.88, |
| "grad_norm": 1.7613471746444702, |
| "kl": 0.14599609375, |
| "learning_rate": 9.847615725553455e-07, |
| "loss": 0.008702307008206844, |
| "memory(GiB)": 18.17, |
| "step": 44, |
| "train_speed(iter/s)": 0.057328 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 329.5, |
| "completions/mean_length": 180.44921875, |
| "completions/min_length": 71.5, |
| "epoch": 0.9, |
| "grad_norm": 1.8986045122146606, |
| "kl": 0.16357421875, |
| "learning_rate": 9.83974350698178e-07, |
| "loss": -0.01265439111739397, |
| "memory(GiB)": 18.17, |
| "reward": 0.24561913311481476, |
| "reward_std": 0.041749605908989906, |
| "rewards/MCQ_Reward/mean": 0.24561913311481476, |
| "rewards/MCQ_Reward/std": 0.0692291297018528, |
| "step": 45, |
| "train_speed(iter/s)": 0.057564 |
| }, |
| { |
| "clip_ratio": 0.0017767796525731683, |
| "epoch": 0.92, |
| "grad_norm": 1.8627526760101318, |
| "kl": 0.1669921875, |
| "learning_rate": 9.831676344247342e-07, |
| "loss": -0.013573069125413895, |
| "memory(GiB)": 18.17, |
| "step": 46, |
| "train_speed(iter/s)": 0.058753 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 360.5, |
| "completions/mean_length": 181.046875, |
| "completions/min_length": 58.0, |
| "epoch": 0.94, |
| "grad_norm": 1.8329010009765625, |
| "kl": 0.1689453125, |
| "learning_rate": 9.82341456229428e-07, |
| "loss": -0.009910675697028637, |
| "memory(GiB)": 18.17, |
| "reward": 0.2712182253599167, |
| "reward_std": 0.03875480592250824, |
| "rewards/MCQ_Reward/mean": 0.2712182253599167, |
| "rewards/MCQ_Reward/std": 0.05874207057058811, |
| "step": 47, |
| "train_speed(iter/s)": 0.05881 |
| }, |
| { |
| "clip_ratio": 0.0020254994742572308, |
| "epoch": 0.96, |
| "grad_norm": 1.7636630535125732, |
| "kl": 0.17529296875, |
| "learning_rate": 9.814958493905962e-07, |
| "loss": -0.011010742746293545, |
| "memory(GiB)": 18.17, |
| "step": 48, |
| "train_speed(iter/s)": 0.05997 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 373.0, |
| "completions/mean_length": 198.5, |
| "completions/min_length": 83.0, |
| "epoch": 0.98, |
| "grad_norm": 1.9754475355148315, |
| "kl": 0.15625, |
| "learning_rate": 9.806308479691594e-07, |
| "loss": 0.026388226076960564, |
| "memory(GiB)": 18.17, |
| "reward": 0.2969816029071808, |
| "reward_std": 0.033485451713204384, |
| "rewards/MCQ_Reward/mean": 0.2969816029071808, |
| "rewards/MCQ_Reward/std": 0.06154371425509453, |
| "step": 49, |
| "train_speed(iter/s)": 0.059869 |
| }, |
| { |
| "clip_ratio": 0.002143923775292933, |
| "epoch": 1.0, |
| "grad_norm": 1.9168144464492798, |
| "kl": 0.16455078125, |
| "learning_rate": 9.797464868072486e-07, |
| "loss": 0.025302505120635033, |
| "memory(GiB)": 18.17, |
| "step": 50, |
| "train_speed(iter/s)": 0.060949 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 349.5, |
| "completions/mean_length": 175.86328125, |
| "completions/min_length": 67.5, |
| "epoch": 1.02, |
| "grad_norm": 1.949724793434143, |
| "kl": 0.18359375, |
| "learning_rate": 9.788428015268026e-07, |
| "loss": 0.016914475709199905, |
| "memory(GiB)": 18.17, |
| "reward": 0.28643812239170074, |
| "reward_std": 0.038882166147232056, |
| "rewards/MCQ_Reward/mean": 0.28643812239170074, |
| "rewards/MCQ_Reward/std": 0.05762592889368534, |
| "step": 51, |
| "train_speed(iter/s)": 0.06051 |
| }, |
| { |
| "clip_ratio": 0.0030939964344725013, |
| "epoch": 1.04, |
| "grad_norm": 1.873901128768921, |
| "kl": 0.1962890625, |
| "learning_rate": 9.779198285281326e-07, |
| "loss": 0.015664130449295044, |
| "memory(GiB)": 18.17, |
| "step": 52, |
| "train_speed(iter/s)": 0.061602 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 301.0, |
| "completions/mean_length": 173.26953125, |
| "completions/min_length": 50.5, |
| "epoch": 1.06, |
| "grad_norm": 1.748197317123413, |
| "kl": 0.20361328125, |
| "learning_rate": 9.769776049884563e-07, |
| "loss": -0.012495264410972595, |
| "memory(GiB)": 18.17, |
| "reward": 0.2694673240184784, |
| "reward_std": 0.03306659869849682, |
| "rewards/MCQ_Reward/mean": 0.2694673240184784, |
| "rewards/MCQ_Reward/std": 0.06984242424368858, |
| "step": 53, |
| "train_speed(iter/s)": 0.061749 |
| }, |
| { |
| "clip_ratio": 0.003254209994338453, |
| "epoch": 1.08, |
| "grad_norm": 1.7254936695098877, |
| "kl": 0.22021484375, |
| "learning_rate": 9.760161688604007e-07, |
| "loss": -0.012979630380868912, |
| "memory(GiB)": 18.17, |
| "step": 54, |
| "train_speed(iter/s)": 0.062813 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 323.0, |
| "completions/mean_length": 164.23046875, |
| "completions/min_length": 74.0, |
| "epoch": 1.1, |
| "grad_norm": 1.8942813873291016, |
| "kl": 0.21044921875, |
| "learning_rate": 9.750355588704727e-07, |
| "loss": -0.009442738257348537, |
| "memory(GiB)": 18.17, |
| "reward": 0.29137177765369415, |
| "reward_std": 0.03919493593275547, |
| "rewards/MCQ_Reward/mean": 0.29137177765369415, |
| "rewards/MCQ_Reward/std": 0.055357255041599274, |
| "step": 55, |
| "train_speed(iter/s)": 0.062825 |
| }, |
| { |
| "clip_ratio": 0.0029244048055261374, |
| "epoch": 1.12, |
| "grad_norm": 1.8403282165527344, |
| "kl": 0.2255859375, |
| "learning_rate": 9.740358145174997e-07, |
| "loss": -0.010412258096039295, |
| "memory(GiB)": 18.17, |
| "step": 56, |
| "train_speed(iter/s)": 0.063885 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 291.5, |
| "completions/mean_length": 159.5703125, |
| "completions/min_length": 68.5, |
| "epoch": 1.1400000000000001, |
| "grad_norm": 1.9502640962600708, |
| "kl": 0.24072265625, |
| "learning_rate": 9.730169760710385e-07, |
| "loss": -0.01350313052535057, |
| "memory(GiB)": 18.17, |
| "reward": 0.3086051344871521, |
| "reward_std": 0.036856647580862045, |
| "rewards/MCQ_Reward/mean": 0.3086051344871521, |
| "rewards/MCQ_Reward/std": 0.05716245248913765, |
| "step": 57, |
| "train_speed(iter/s)": 0.064059 |
| }, |
| { |
| "clip_ratio": 0.0026392132276669145, |
| "epoch": 1.16, |
| "grad_norm": 1.8639681339263916, |
| "kl": 0.244140625, |
| "learning_rate": 9.719790845697532e-07, |
| "loss": -0.014377694576978683, |
| "memory(GiB)": 18.17, |
| "step": 58, |
| "train_speed(iter/s)": 0.065093 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 236.0, |
| "completions/mean_length": 133.83984375, |
| "completions/min_length": 52.5, |
| "epoch": 1.18, |
| "grad_norm": 2.159579038619995, |
| "kl": 0.2607421875, |
| "learning_rate": 9.709221818197623e-07, |
| "loss": -0.03235793486237526, |
| "memory(GiB)": 18.17, |
| "reward": 0.3192738890647888, |
| "reward_std": 0.03647255524992943, |
| "rewards/MCQ_Reward/mean": 0.3192738890647888, |
| "rewards/MCQ_Reward/std": 0.04580973833799362, |
| "step": 59, |
| "train_speed(iter/s)": 0.065376 |
| }, |
| { |
| "clip_ratio": 0.0033569036750122905, |
| "epoch": 1.2, |
| "grad_norm": 2.0858945846557617, |
| "kl": 0.2685546875, |
| "learning_rate": 9.698463103929541e-07, |
| "loss": -0.03384597226977348, |
| "memory(GiB)": 18.17, |
| "step": 60, |
| "train_speed(iter/s)": 0.066397 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 275.5, |
| "completions/mean_length": 152.6640625, |
| "completions/min_length": 54.0, |
| "epoch": 1.22, |
| "grad_norm": 1.9752745628356934, |
| "kl": 0.2509765625, |
| "learning_rate": 9.68751513625273e-07, |
| "loss": -0.012610888108611107, |
| "memory(GiB)": 18.17, |
| "reward": 0.30408790707588196, |
| "reward_std": 0.03896576911211014, |
| "rewards/MCQ_Reward/mean": 0.30408790707588196, |
| "rewards/MCQ_Reward/std": 0.059865519404411316, |
| "step": 61, |
| "train_speed(iter/s)": 0.066047 |
| }, |
| { |
| "clip_ratio": 0.0028306948952376842, |
| "epoch": 1.24, |
| "grad_norm": 1.8911457061767578, |
| "kl": 0.2509765625, |
| "learning_rate": 9.676378356149732e-07, |
| "loss": -0.014004014432430267, |
| "memory(GiB)": 18.17, |
| "step": 62, |
| "train_speed(iter/s)": 0.067044 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 275.5, |
| "completions/mean_length": 147.6953125, |
| "completions/min_length": 69.0, |
| "epoch": 1.26, |
| "grad_norm": 2.153862953186035, |
| "kl": 0.265625, |
| "learning_rate": 9.665053212208426e-07, |
| "loss": -0.027626825496554375, |
| "memory(GiB)": 18.17, |
| "reward": 0.31602054834365845, |
| "reward_std": 0.03946657292544842, |
| "rewards/MCQ_Reward/mean": 0.31602054834365845, |
| "rewards/MCQ_Reward/std": 0.06625748611986637, |
| "step": 63, |
| "train_speed(iter/s)": 0.067162 |
| }, |
| { |
| "clip_ratio": 0.004200217663310468, |
| "epoch": 1.28, |
| "grad_norm": 2.027595281600952, |
| "kl": 0.2626953125, |
| "learning_rate": 9.653540160603955e-07, |
| "loss": -0.028667613863945007, |
| "memory(GiB)": 18.17, |
| "step": 64, |
| "train_speed(iter/s)": 0.06814 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 300.5, |
| "completions/mean_length": 153.3828125, |
| "completions/min_length": 42.0, |
| "epoch": 1.3, |
| "grad_norm": 2.058096170425415, |
| "kl": 0.26318359375, |
| "learning_rate": 9.641839665080363e-07, |
| "loss": 0.019130591303110123, |
| "memory(GiB)": 18.17, |
| "reward": 0.3058909475803375, |
| "reward_std": 0.03743278048932552, |
| "rewards/MCQ_Reward/mean": 0.3058909475803375, |
| "rewards/MCQ_Reward/std": 0.06633425317704678, |
| "step": 65, |
| "train_speed(iter/s)": 0.068294 |
| }, |
| { |
| "clip_ratio": 0.0030368451261892915, |
| "epoch": 1.32, |
| "grad_norm": 2.0810675621032715, |
| "kl": 0.26708984375, |
| "learning_rate": 9.6299521969319e-07, |
| "loss": 0.01858600787818432, |
| "memory(GiB)": 18.17, |
| "step": 66, |
| "train_speed(iter/s)": 0.069245 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 310.5, |
| "completions/mean_length": 170.65625, |
| "completions/min_length": 70.0, |
| "epoch": 1.34, |
| "grad_norm": 1.9177082777023315, |
| "kl": 0.25390625, |
| "learning_rate": 9.617878234984054e-07, |
| "loss": 0.013776745647192001, |
| "memory(GiB)": 18.17, |
| "reward": 0.32124653458595276, |
| "reward_std": 0.03586815297603607, |
| "rewards/MCQ_Reward/mean": 0.32124653458595276, |
| "rewards/MCQ_Reward/std": 0.05279739946126938, |
| "step": 67, |
| "train_speed(iter/s)": 0.069258 |
| }, |
| { |
| "clip_ratio": 0.003581640077754855, |
| "epoch": 1.3599999999999999, |
| "grad_norm": 1.800355076789856, |
| "kl": 0.271484375, |
| "learning_rate": 9.60561826557425e-07, |
| "loss": 0.01218567043542862, |
| "memory(GiB)": 18.17, |
| "step": 68, |
| "train_speed(iter/s)": 0.070198 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 320.5, |
| "completions/mean_length": 165.45703125, |
| "completions/min_length": 84.5, |
| "epoch": 1.38, |
| "grad_norm": 1.9321861267089844, |
| "kl": 0.2734375, |
| "learning_rate": 9.593172782532267e-07, |
| "loss": -0.06093820929527283, |
| "memory(GiB)": 18.17, |
| "reward": 0.33785562217235565, |
| "reward_std": 0.03626340813934803, |
| "rewards/MCQ_Reward/mean": 0.33785562217235565, |
| "rewards/MCQ_Reward/std": 0.04918426461517811, |
| "step": 69, |
| "train_speed(iter/s)": 0.070079 |
| }, |
| { |
| "clip_ratio": 0.002684593666344881, |
| "epoch": 1.4, |
| "grad_norm": 1.9250681400299072, |
| "kl": 0.2822265625, |
| "learning_rate": 9.580542287160346e-07, |
| "loss": -0.06187870353460312, |
| "memory(GiB)": 18.17, |
| "step": 70, |
| "train_speed(iter/s)": 0.071007 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 315.5, |
| "completions/mean_length": 167.71875, |
| "completions/min_length": 60.0, |
| "epoch": 1.42, |
| "grad_norm": 1.9310671091079712, |
| "kl": 0.26953125, |
| "learning_rate": 9.567727288213004e-07, |
| "loss": -0.03052324429154396, |
| "memory(GiB)": 18.17, |
| "reward": 0.3391506224870682, |
| "reward_std": 0.037205325439572334, |
| "rewards/MCQ_Reward/mean": 0.3391506224870682, |
| "rewards/MCQ_Reward/std": 0.06270403787493706, |
| "step": 71, |
| "train_speed(iter/s)": 0.070595 |
| }, |
| { |
| "clip_ratio": 0.004182511591352522, |
| "epoch": 1.44, |
| "grad_norm": 1.808637261390686, |
| "kl": 0.26953125, |
| "learning_rate": 9.554728301876524e-07, |
| "loss": -0.031438540667295456, |
| "memory(GiB)": 18.17, |
| "step": 72, |
| "train_speed(iter/s)": 0.071499 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 330.0, |
| "completions/mean_length": 171.5859375, |
| "completions/min_length": 73.5, |
| "epoch": 1.46, |
| "grad_norm": 2.1356284618377686, |
| "kl": 0.2666015625, |
| "learning_rate": 9.541545851748185e-07, |
| "loss": 0.06165466085076332, |
| "memory(GiB)": 18.17, |
| "reward": 0.3267658054828644, |
| "reward_std": 0.03793729655444622, |
| "rewards/MCQ_Reward/mean": 0.3267658054828644, |
| "rewards/MCQ_Reward/std": 0.06866181083023548, |
| "step": 73, |
| "train_speed(iter/s)": 0.071359 |
| }, |
| { |
| "clip_ratio": 0.0023740422911942005, |
| "epoch": 1.48, |
| "grad_norm": 2.081942319869995, |
| "kl": 0.2724609375, |
| "learning_rate": 9.528180468815154e-07, |
| "loss": 0.06085401773452759, |
| "memory(GiB)": 18.17, |
| "step": 74, |
| "train_speed(iter/s)": 0.072254 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 388.0, |
| "completions/mean_length": 176.4140625, |
| "completions/min_length": 60.0, |
| "epoch": 1.5, |
| "grad_norm": 1.819736361503601, |
| "kl": 0.291015625, |
| "learning_rate": 9.514632691433106e-07, |
| "loss": 0.041995078325271606, |
| "memory(GiB)": 18.17, |
| "reward": 0.34543414413928986, |
| "reward_std": 0.03658975474536419, |
| "rewards/MCQ_Reward/mean": 0.34543414413928986, |
| "rewards/MCQ_Reward/std": 0.0643342137336731, |
| "step": 75, |
| "train_speed(iter/s)": 0.072103 |
| }, |
| { |
| "clip_ratio": 0.0024005533196032047, |
| "epoch": 1.52, |
| "grad_norm": 1.7825483083724976, |
| "kl": 0.302734375, |
| "learning_rate": 9.500903065304539e-07, |
| "loss": 0.04098404943943024, |
| "memory(GiB)": 18.17, |
| "step": 76, |
| "train_speed(iter/s)": 0.072975 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 324.0, |
| "completions/mean_length": 179.35546875, |
| "completions/min_length": 71.5, |
| "epoch": 1.54, |
| "grad_norm": 1.83073091506958, |
| "kl": 0.2919921875, |
| "learning_rate": 9.486992143456791e-07, |
| "loss": 0.026145532727241516, |
| "memory(GiB)": 18.17, |
| "reward": 0.33697785437107086, |
| "reward_std": 0.033385418355464935, |
| "rewards/MCQ_Reward/mean": 0.33697785437107086, |
| "rewards/MCQ_Reward/std": 0.06162330321967602, |
| "step": 77, |
| "train_speed(iter/s)": 0.072818 |
| }, |
| { |
| "clip_ratio": 0.0029612210346385837, |
| "epoch": 1.56, |
| "grad_norm": 1.7568435668945312, |
| "kl": 0.3046875, |
| "learning_rate": 9.472900486219768e-07, |
| "loss": 0.02535586804151535, |
| "memory(GiB)": 18.17, |
| "step": 78, |
| "train_speed(iter/s)": 0.07364 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 297.0, |
| "completions/mean_length": 181.63671875, |
| "completions/min_length": 86.0, |
| "epoch": 1.58, |
| "grad_norm": 1.763022541999817, |
| "kl": 0.296875, |
| "learning_rate": 9.458628661203366e-07, |
| "loss": -0.016155043616890907, |
| "memory(GiB)": 18.17, |
| "reward": 0.3397578001022339, |
| "reward_std": 0.030555096454918385, |
| "rewards/MCQ_Reward/mean": 0.3397578001022339, |
| "rewards/MCQ_Reward/std": 0.0736413523554802, |
| "step": 79, |
| "train_speed(iter/s)": 0.073639 |
| }, |
| { |
| "clip_ratio": 0.003752505173906684, |
| "epoch": 1.6, |
| "grad_norm": 1.75266695022583, |
| "kl": 0.314453125, |
| "learning_rate": 9.444177243274617e-07, |
| "loss": -0.016932127997279167, |
| "memory(GiB)": 18.17, |
| "step": 80, |
| "train_speed(iter/s)": 0.074482 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 316.0, |
| "completions/mean_length": 173.53515625, |
| "completions/min_length": 82.5, |
| "epoch": 1.62, |
| "grad_norm": 1.813202142715454, |
| "kl": 0.3193359375, |
| "learning_rate": 9.429546814534528e-07, |
| "loss": 0.014175940304994583, |
| "memory(GiB)": 18.17, |
| "reward": 0.35451021790504456, |
| "reward_std": 0.0316955391317606, |
| "rewards/MCQ_Reward/mean": 0.35451021790504456, |
| "rewards/MCQ_Reward/std": 0.058956997469067574, |
| "step": 81, |
| "train_speed(iter/s)": 0.073923 |
| }, |
| { |
| "clip_ratio": 0.003929685335606337, |
| "epoch": 1.6400000000000001, |
| "grad_norm": 1.7315208911895752, |
| "kl": 0.337890625, |
| "learning_rate": 9.414737964294634e-07, |
| "loss": 0.013125661760568619, |
| "memory(GiB)": 18.17, |
| "step": 82, |
| "train_speed(iter/s)": 0.074757 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 265.5, |
| "completions/mean_length": 159.95703125, |
| "completions/min_length": 68.5, |
| "epoch": 1.6600000000000001, |
| "grad_norm": 1.86507248878479, |
| "kl": 0.333984375, |
| "learning_rate": 9.399751289053266e-07, |
| "loss": 0.0190749391913414, |
| "memory(GiB)": 18.17, |
| "reward": 0.32107532024383545, |
| "reward_std": 0.03531700000166893, |
| "rewards/MCQ_Reward/mean": 0.32107532024383545, |
| "rewards/MCQ_Reward/std": 0.06730588898062706, |
| "step": 83, |
| "train_speed(iter/s)": 0.074766 |
| }, |
| { |
| "clip_ratio": 0.005602485965937376, |
| "epoch": 1.6800000000000002, |
| "grad_norm": 1.8452680110931396, |
| "kl": 0.3515625, |
| "learning_rate": 9.384587392471514e-07, |
| "loss": 0.018391648307442665, |
| "memory(GiB)": 18.17, |
| "step": 84, |
| "train_speed(iter/s)": 0.075562 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 274.5, |
| "completions/mean_length": 146.36328125, |
| "completions/min_length": 51.5, |
| "epoch": 1.7, |
| "grad_norm": 2.060523271560669, |
| "kl": 0.3564453125, |
| "learning_rate": 9.369246885348925e-07, |
| "loss": 0.00966290757060051, |
| "memory(GiB)": 18.17, |
| "reward": 0.34230072796344757, |
| "reward_std": 0.03451686259359121, |
| "rewards/MCQ_Reward/mean": 0.34230072796344757, |
| "rewards/MCQ_Reward/std": 0.07506715506315231, |
| "step": 85, |
| "train_speed(iter/s)": 0.075608 |
| }, |
| { |
| "clip_ratio": 0.0025914940051734447, |
| "epoch": 1.72, |
| "grad_norm": 2.089233875274658, |
| "kl": 0.357421875, |
| "learning_rate": 9.353730385598886e-07, |
| "loss": 0.008917246013879776, |
| "memory(GiB)": 18.17, |
| "step": 86, |
| "train_speed(iter/s)": 0.076403 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 251.0, |
| "completions/mean_length": 149.41796875, |
| "completions/min_length": 72.0, |
| "epoch": 1.74, |
| "grad_norm": 2.100825071334839, |
| "kl": 0.3642578125, |
| "learning_rate": 9.338038518223745e-07, |
| "loss": 0.0011688023805618286, |
| "memory(GiB)": 18.17, |
| "reward": 0.29714760184288025, |
| "reward_std": 0.03046888206154108, |
| "rewards/MCQ_Reward/mean": 0.29714760184288025, |
| "rewards/MCQ_Reward/std": 0.0724717304110527, |
| "step": 87, |
| "train_speed(iter/s)": 0.076468 |
| }, |
| { |
| "clip_ratio": 0.0029116831719875336, |
| "epoch": 1.76, |
| "grad_norm": 2.091975688934326, |
| "kl": 0.3740234375, |
| "learning_rate": 9.322171915289633e-07, |
| "loss": 0.0007365690544247627, |
| "memory(GiB)": 18.17, |
| "step": 88, |
| "train_speed(iter/s)": 0.077267 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 243.0, |
| "completions/mean_length": 149.0546875, |
| "completions/min_length": 74.5, |
| "epoch": 1.78, |
| "grad_norm": 2.0660133361816406, |
| "kl": 0.5546875, |
| "learning_rate": 9.306131215901003e-07, |
| "loss": -0.002558637410402298, |
| "memory(GiB)": 18.17, |
| "reward": 0.3453996330499649, |
| "reward_std": 0.030298423022031784, |
| "rewards/MCQ_Reward/mean": 0.3453996330499649, |
| "rewards/MCQ_Reward/std": 0.05576108209788799, |
| "step": 89, |
| "train_speed(iter/s)": 0.07741 |
| }, |
| { |
| "clip_ratio": 0.0030759836081415415, |
| "epoch": 1.8, |
| "grad_norm": 1.9661788940429688, |
| "kl": 0.5439453125, |
| "learning_rate": 9.289917066174885e-07, |
| "loss": -0.003219339996576309, |
| "memory(GiB)": 18.17, |
| "step": 90, |
| "train_speed(iter/s)": 0.078204 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 279.0, |
| "completions/mean_length": 137.28125, |
| "completions/min_length": 57.0, |
| "epoch": 1.8199999999999998, |
| "grad_norm": 2.1432077884674072, |
| "kl": 0.4169921875, |
| "learning_rate": 9.273530119214867e-07, |
| "loss": -0.019994597882032394, |
| "memory(GiB)": 18.17, |
| "reward": 0.3450734615325928, |
| "reward_std": 0.03698188066482544, |
| "rewards/MCQ_Reward/mean": 0.3450734615325928, |
| "rewards/MCQ_Reward/std": 0.06834666058421135, |
| "step": 91, |
| "train_speed(iter/s)": 0.077823 |
| }, |
| { |
| "clip_ratio": 0.006807451136410236, |
| "epoch": 1.8399999999999999, |
| "grad_norm": 2.026726484298706, |
| "kl": 0.4423828125, |
| "learning_rate": 9.256971035084784e-07, |
| "loss": -0.02127775177359581, |
| "memory(GiB)": 18.17, |
| "step": 92, |
| "train_speed(iter/s)": 0.078595 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 258.0, |
| "completions/mean_length": 144.11328125, |
| "completions/min_length": 62.5, |
| "epoch": 1.8599999999999999, |
| "grad_norm": 2.5080695152282715, |
| "kl": 0.44140625, |
| "learning_rate": 9.240240480782129e-07, |
| "loss": 0.038984864950180054, |
| "memory(GiB)": 18.17, |
| "reward": 0.34395235776901245, |
| "reward_std": 0.030767593532800674, |
| "rewards/MCQ_Reward/mean": 0.34395235776901245, |
| "rewards/MCQ_Reward/std": 0.08772432059049606, |
| "step": 93, |
| "train_speed(iter/s)": 0.07864 |
| }, |
| { |
| "clip_ratio": 0.0038948373403400183, |
| "epoch": 1.88, |
| "grad_norm": 2.293992042541504, |
| "kl": 0.466796875, |
| "learning_rate": 9.223339130211192e-07, |
| "loss": 0.03854737430810928, |
| "memory(GiB)": 18.17, |
| "step": 94, |
| "train_speed(iter/s)": 0.0794 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 288.0, |
| "completions/mean_length": 144.3671875, |
| "completions/min_length": 66.5, |
| "epoch": 1.9, |
| "grad_norm": 2.3717093467712402, |
| "kl": 0.4423828125, |
| "learning_rate": 9.206267664155906e-07, |
| "loss": 0.02822975069284439, |
| "memory(GiB)": 18.17, |
| "reward": 0.35692907869815826, |
| "reward_std": 0.033766910433769226, |
| "rewards/MCQ_Reward/mean": 0.35692907869815826, |
| "rewards/MCQ_Reward/std": 0.055017637088894844, |
| "step": 95, |
| "train_speed(iter/s)": 0.079264 |
| }, |
| { |
| "clip_ratio": 0.01540788309648633, |
| "epoch": 1.92, |
| "grad_norm": 2.8082501888275146, |
| "kl": 0.4873046875, |
| "learning_rate": 9.189026770252436e-07, |
| "loss": 0.027400558814406395, |
| "memory(GiB)": 18.17, |
| "step": 96, |
| "train_speed(iter/s)": 0.080015 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 216.5, |
| "completions/mean_length": 131.2265625, |
| "completions/min_length": 64.0, |
| "epoch": 1.94, |
| "grad_norm": 2.578866481781006, |
| "kl": 0.458984375, |
| "learning_rate": 9.171617142961476e-07, |
| "loss": -0.028647061437368393, |
| "memory(GiB)": 18.17, |
| "reward": 0.35198159515857697, |
| "reward_std": 0.036471933126449585, |
| "rewards/MCQ_Reward/mean": 0.35198159515857697, |
| "rewards/MCQ_Reward/std": 0.09679177403450012, |
| "step": 97, |
| "train_speed(iter/s)": 0.080136 |
| }, |
| { |
| "clip_ratio": 0.007482210174202919, |
| "epoch": 1.96, |
| "grad_norm": 2.6245126724243164, |
| "kl": 0.455078125, |
| "learning_rate": 9.154039483540272e-07, |
| "loss": -0.02990054339170456, |
| "memory(GiB)": 18.17, |
| "step": 98, |
| "train_speed(iter/s)": 0.080877 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 254.5, |
| "completions/mean_length": 140.546875, |
| "completions/min_length": 70.0, |
| "epoch": 1.98, |
| "grad_norm": 2.0212841033935547, |
| "kl": 0.4462890625, |
| "learning_rate": 9.136294500014385e-07, |
| "loss": 0.007645269390195608, |
| "memory(GiB)": 18.17, |
| "reward": 0.3687240034341812, |
| "reward_std": 0.0377286896109581, |
| "rewards/MCQ_Reward/mean": 0.3687240034341812, |
| "rewards/MCQ_Reward/std": 0.09235312044620514, |
| "step": 99, |
| "train_speed(iter/s)": 0.080838 |
| }, |
| { |
| "clip_ratio": 0.004757207585498691, |
| "epoch": 2.0, |
| "grad_norm": 1.9354287385940552, |
| "kl": 0.4638671875, |
| "learning_rate": 9.118382907149163e-07, |
| "loss": 0.006971254944801331, |
| "memory(GiB)": 18.17, |
| "step": 100, |
| "train_speed(iter/s)": 0.08155 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 252.5, |
| "completions/mean_length": 123.4140625, |
| "completions/min_length": 54.0, |
| "epoch": 2.02, |
| "grad_norm": 2.3176586627960205, |
| "kl": 0.4755859375, |
| "learning_rate": 9.100305426420956e-07, |
| "loss": -0.016116395592689514, |
| "memory(GiB)": 18.17, |
| "reward": 0.38898809254169464, |
| "reward_std": 0.038034453988075256, |
| "rewards/MCQ_Reward/mean": 0.38898809254169464, |
| "rewards/MCQ_Reward/std": 0.07776015624403954, |
| "step": 101, |
| "train_speed(iter/s)": 0.081234 |
| }, |
| { |
| "clip_ratio": 0.004006300354376435, |
| "epoch": 2.04, |
| "grad_norm": 2.1871023178100586, |
| "kl": 0.4931640625, |
| "learning_rate": 9.082062785988048e-07, |
| "loss": -0.01703297346830368, |
| "memory(GiB)": 18.17, |
| "step": 102, |
| "train_speed(iter/s)": 0.081962 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 199.0, |
| "completions/mean_length": 113.1484375, |
| "completions/min_length": 56.5, |
| "epoch": 2.06, |
| "grad_norm": 2.5120768547058105, |
| "kl": 0.517578125, |
| "learning_rate": 9.06365572066134e-07, |
| "loss": -0.027387384325265884, |
| "memory(GiB)": 18.17, |
| "reward": 0.357058048248291, |
| "reward_std": 0.031020362861454487, |
| "rewards/MCQ_Reward/mean": 0.357058048248291, |
| "rewards/MCQ_Reward/std": 0.06582547165453434, |
| "step": 103, |
| "train_speed(iter/s)": 0.082061 |
| }, |
| { |
| "clip_ratio": 0.014288442209362984, |
| "epoch": 2.08, |
| "grad_norm": 3.2106845378875732, |
| "kl": 0.5009765625, |
| "learning_rate": 9.045084971874737e-07, |
| "loss": -0.02823379635810852, |
| "memory(GiB)": 18.17, |
| "step": 104, |
| "train_speed(iter/s)": 0.082761 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 211.0, |
| "completions/mean_length": 126.953125, |
| "completions/min_length": 70.0, |
| "epoch": 2.1, |
| "grad_norm": 2.2478950023651123, |
| "kl": 0.48828125, |
| "learning_rate": 9.026351287655293e-07, |
| "loss": 0.02888938970863819, |
| "memory(GiB)": 18.17, |
| "reward": 0.3573220670223236, |
| "reward_std": 0.03388269431889057, |
| "rewards/MCQ_Reward/mean": 0.3573220670223236, |
| "rewards/MCQ_Reward/std": 0.08621830865740776, |
| "step": 105, |
| "train_speed(iter/s)": 0.082851 |
| }, |
| { |
| "clip_ratio": 0.005271225702017546, |
| "epoch": 2.12, |
| "grad_norm": 2.07523250579834, |
| "kl": 0.513671875, |
| "learning_rate": 9.007455422593075e-07, |
| "loss": 0.028001034632325172, |
| "memory(GiB)": 18.17, |
| "step": 106, |
| "train_speed(iter/s)": 0.083561 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 244.0, |
| "completions/mean_length": 143.50390625, |
| "completions/min_length": 62.5, |
| "epoch": 2.14, |
| "grad_norm": 2.149932861328125, |
| "kl": 0.474609375, |
| "learning_rate": 8.988398137810776e-07, |
| "loss": -0.0027789073064923286, |
| "memory(GiB)": 18.17, |
| "reward": 0.37795157730579376, |
| "reward_std": 0.03415030054748058, |
| "rewards/MCQ_Reward/mean": 0.37795157730579376, |
| "rewards/MCQ_Reward/std": 0.07794364914298058, |
| "step": 107, |
| "train_speed(iter/s)": 0.083617 |
| }, |
| { |
| "clip_ratio": 0.008057619212195277, |
| "epoch": 2.16, |
| "grad_norm": 2.7377026081085205, |
| "kl": 0.5078125, |
| "learning_rate": 8.969180200933047e-07, |
| "loss": -0.003491489216685295, |
| "memory(GiB)": 18.17, |
| "step": 108, |
| "train_speed(iter/s)": 0.084274 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 226.5, |
| "completions/mean_length": 133.1875, |
| "completions/min_length": 58.5, |
| "epoch": 2.18, |
| "grad_norm": 2.826488494873047, |
| "kl": 0.5390625, |
| "learning_rate": 8.94980238605558e-07, |
| "loss": 0.02833351120352745, |
| "memory(GiB)": 18.17, |
| "reward": 0.39782722294330597, |
| "reward_std": 0.031135279685258865, |
| "rewards/MCQ_Reward/mean": 0.39782722294330597, |
| "rewards/MCQ_Reward/std": 0.07045348361134529, |
| "step": 109, |
| "train_speed(iter/s)": 0.084336 |
| }, |
| { |
| "clip_ratio": 0.00684792990796268, |
| "epoch": 2.2, |
| "grad_norm": 2.434086322784424, |
| "kl": 0.5703125, |
| "learning_rate": 8.930265473713937e-07, |
| "loss": 0.027658611536026, |
| "memory(GiB)": 18.17, |
| "step": 110, |
| "train_speed(iter/s)": 0.085034 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 216.0, |
| "completions/mean_length": 131.703125, |
| "completions/min_length": 67.0, |
| "epoch": 2.22, |
| "grad_norm": 2.134516716003418, |
| "kl": 0.48828125, |
| "learning_rate": 8.910570250852096e-07, |
| "loss": 0.006394753232598305, |
| "memory(GiB)": 18.17, |
| "reward": 0.3707956522703171, |
| "reward_std": 0.03248129412531853, |
| "rewards/MCQ_Reward/mean": 0.3707956522703171, |
| "rewards/MCQ_Reward/std": 0.10541465878486633, |
| "step": 111, |
| "train_speed(iter/s)": 0.084685 |
| }, |
| { |
| "clip_ratio": 0.00865771621465683, |
| "epoch": 2.24, |
| "grad_norm": 2.2900125980377197, |
| "kl": 0.513671875, |
| "learning_rate": 8.890717510790762e-07, |
| "loss": 0.00539240799844265, |
| "memory(GiB)": 18.17, |
| "step": 112, |
| "train_speed(iter/s)": 0.085353 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 264.5, |
| "completions/mean_length": 126.9140625, |
| "completions/min_length": 62.0, |
| "epoch": 2.26, |
| "grad_norm": 2.6178812980651855, |
| "kl": 0.546875, |
| "learning_rate": 8.870708053195413e-07, |
| "loss": 0.019267559051513672, |
| "memory(GiB)": 18.17, |
| "reward": 0.3922416865825653, |
| "reward_std": 0.03025819268077612, |
| "rewards/MCQ_Reward/mean": 0.3922416865825653, |
| "rewards/MCQ_Reward/std": 0.08424495533108711, |
| "step": 113, |
| "train_speed(iter/s)": 0.085338 |
| }, |
| { |
| "clip_ratio": 0.006454117828980088, |
| "epoch": 2.2800000000000002, |
| "grad_norm": 2.1509737968444824, |
| "kl": 0.57421875, |
| "learning_rate": 8.850542684044078e-07, |
| "loss": 0.01820582151412964, |
| "memory(GiB)": 18.17, |
| "step": 114, |
| "train_speed(iter/s)": 0.085985 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 212.5, |
| "completions/mean_length": 118.85546875, |
| "completions/min_length": 59.5, |
| "epoch": 2.3, |
| "grad_norm": 2.528681755065918, |
| "kl": 0.525390625, |
| "learning_rate": 8.83022221559489e-07, |
| "loss": 0.008160990662872791, |
| "memory(GiB)": 18.17, |
| "reward": 0.404242143034935, |
| "reward_std": 0.03400178253650665, |
| "rewards/MCQ_Reward/mean": 0.404242143034935, |
| "rewards/MCQ_Reward/std": 0.09943690523505211, |
| "step": 115, |
| "train_speed(iter/s)": 0.086069 |
| }, |
| { |
| "clip_ratio": 0.005366077646613121, |
| "epoch": 2.32, |
| "grad_norm": 2.1966934204101562, |
| "kl": 0.546875, |
| "learning_rate": 8.809747466353355e-07, |
| "loss": 0.007157166488468647, |
| "memory(GiB)": 18.17, |
| "step": 116, |
| "train_speed(iter/s)": 0.086734 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 239.0, |
| "completions/mean_length": 125.3359375, |
| "completions/min_length": 59.5, |
| "epoch": 2.34, |
| "grad_norm": 2.4033124446868896, |
| "kl": 0.537109375, |
| "learning_rate": 8.789119261039384e-07, |
| "loss": 0.017890973016619682, |
| "memory(GiB)": 18.17, |
| "reward": 0.36347851157188416, |
| "reward_std": 0.027591521851718426, |
| "rewards/MCQ_Reward/mean": 0.36347851157188416, |
| "rewards/MCQ_Reward/std": 0.09114562720060349, |
| "step": 117, |
| "train_speed(iter/s)": 0.086687 |
| }, |
| { |
| "clip_ratio": 0.011405623517930508, |
| "epoch": 2.36, |
| "grad_norm": 2.8501975536346436, |
| "kl": 0.587890625, |
| "learning_rate": 8.768338430554082e-07, |
| "loss": 0.016866052523255348, |
| "memory(GiB)": 18.17, |
| "step": 118, |
| "train_speed(iter/s)": 0.08735 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 194.0, |
| "completions/mean_length": 122.23046875, |
| "completions/min_length": 65.0, |
| "epoch": 2.38, |
| "grad_norm": 2.5570151805877686, |
| "kl": 0.5126953125, |
| "learning_rate": 8.74740581194627e-07, |
| "loss": -0.011926580220460892, |
| "memory(GiB)": 18.17, |
| "reward": 0.40480077266693115, |
| "reward_std": 0.03289741463959217, |
| "rewards/MCQ_Reward/mean": 0.40480077266693115, |
| "rewards/MCQ_Reward/std": 0.08261778578162193, |
| "step": 119, |
| "train_speed(iter/s)": 0.087419 |
| }, |
| { |
| "clip_ratio": 0.007963848765939474, |
| "epoch": 2.4, |
| "grad_norm": 2.1802773475646973, |
| "kl": 0.5009765625, |
| "learning_rate": 8.726322248378774e-07, |
| "loss": -0.0127539848908782, |
| "memory(GiB)": 18.17, |
| "step": 120, |
| "train_speed(iter/s)": 0.088053 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 244.0, |
| "completions/mean_length": 130.2421875, |
| "completions/min_length": 60.5, |
| "epoch": 2.42, |
| "grad_norm": 2.4936065673828125, |
| "kl": 0.537109375, |
| "learning_rate": 8.705088589094458e-07, |
| "loss": 0.008000252768397331, |
| "memory(GiB)": 18.17, |
| "reward": 0.36072438955307007, |
| "reward_std": 0.030319811776280403, |
| "rewards/MCQ_Reward/mean": 0.36072438955307007, |
| "rewards/MCQ_Reward/std": 0.1019350104033947, |
| "step": 121, |
| "train_speed(iter/s)": 0.08768 |
| }, |
| { |
| "clip_ratio": 0.006943409331142902, |
| "epoch": 2.44, |
| "grad_norm": 2.4447567462921143, |
| "kl": 0.544921875, |
| "learning_rate": 8.683705689382024e-07, |
| "loss": 0.0072016119956970215, |
| "memory(GiB)": 18.17, |
| "step": 122, |
| "train_speed(iter/s)": 0.088326 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 191.5, |
| "completions/mean_length": 112.40234375, |
| "completions/min_length": 53.0, |
| "epoch": 2.46, |
| "grad_norm": 2.279759168624878, |
| "kl": 0.55859375, |
| "learning_rate": 8.662174410541554e-07, |
| "loss": 0.00623547937721014, |
| "memory(GiB)": 18.17, |
| "reward": 0.3670702576637268, |
| "reward_std": 0.02890967670828104, |
| "rewards/MCQ_Reward/mean": 0.3670702576637268, |
| "rewards/MCQ_Reward/std": 0.0740283839404583, |
| "step": 123, |
| "train_speed(iter/s)": 0.088484 |
| }, |
| { |
| "clip_ratio": 0.007923177909106016, |
| "epoch": 2.48, |
| "grad_norm": 2.789609909057617, |
| "kl": 0.587890625, |
| "learning_rate": 8.64049561984982e-07, |
| "loss": 0.005373558960855007, |
| "memory(GiB)": 18.17, |
| "step": 124, |
| "train_speed(iter/s)": 0.089133 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 194.5, |
| "completions/mean_length": 124.91796875, |
| "completions/min_length": 73.0, |
| "epoch": 2.5, |
| "grad_norm": 2.2765557765960693, |
| "kl": 0.498046875, |
| "learning_rate": 8.61867019052535e-07, |
| "loss": -0.0031618811190128326, |
| "memory(GiB)": 18.17, |
| "reward": 0.3880574107170105, |
| "reward_std": 0.02767461072653532, |
| "rewards/MCQ_Reward/mean": 0.3880574107170105, |
| "rewards/MCQ_Reward/std": 0.11312882974743843, |
| "step": 125, |
| "train_speed(iter/s)": 0.089217 |
| }, |
| { |
| "clip_ratio": 0.006887951632961631, |
| "epoch": 2.52, |
| "grad_norm": 2.2742230892181396, |
| "kl": 0.509765625, |
| "learning_rate": 8.596699001693255e-07, |
| "loss": -0.004048643633723259, |
| "memory(GiB)": 18.17, |
| "step": 126, |
| "train_speed(iter/s)": 0.089838 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 202.5, |
| "completions/mean_length": 117.484375, |
| "completions/min_length": 56.5, |
| "epoch": 2.54, |
| "grad_norm": 2.340428113937378, |
| "kl": 0.546875, |
| "learning_rate": 8.574582938349817e-07, |
| "loss": -0.009344515390694141, |
| "memory(GiB)": 18.17, |
| "reward": 0.38609637320041656, |
| "reward_std": 0.033216655254364014, |
| "rewards/MCQ_Reward/mean": 0.38609637320041656, |
| "rewards/MCQ_Reward/std": 0.09242032468318939, |
| "step": 127, |
| "train_speed(iter/s)": 0.089914 |
| }, |
| { |
| "clip_ratio": 0.007429210003465414, |
| "epoch": 2.56, |
| "grad_norm": 2.3134751319885254, |
| "kl": 0.57421875, |
| "learning_rate": 8.552322891326844e-07, |
| "loss": -0.010545218363404274, |
| "memory(GiB)": 18.17, |
| "step": 128, |
| "train_speed(iter/s)": 0.090544 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 238.0, |
| "completions/mean_length": 119.9765625, |
| "completions/min_length": 57.0, |
| "epoch": 2.58, |
| "grad_norm": 2.265873670578003, |
| "kl": 0.4931640625, |
| "learning_rate": 8.529919757255781e-07, |
| "loss": -0.007635302376002073, |
| "memory(GiB)": 18.17, |
| "reward": 0.41428878903388977, |
| "reward_std": 0.028425303287804127, |
| "rewards/MCQ_Reward/mean": 0.41428878903388977, |
| "rewards/MCQ_Reward/std": 0.07786687836050987, |
| "step": 129, |
| "train_speed(iter/s)": 0.09048 |
| }, |
| { |
| "clip_ratio": 0.006183756981045008, |
| "epoch": 2.6, |
| "grad_norm": 2.283554792404175, |
| "kl": 0.498046875, |
| "learning_rate": 8.507374438531606e-07, |
| "loss": -0.008446864783763885, |
| "memory(GiB)": 18.17, |
| "step": 130, |
| "train_speed(iter/s)": 0.091107 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 196.0, |
| "completions/mean_length": 119.125, |
| "completions/min_length": 59.0, |
| "epoch": 2.62, |
| "grad_norm": 2.8296353816986084, |
| "kl": 0.525390625, |
| "learning_rate": 8.484687843276468e-07, |
| "loss": 0.003696079831570387, |
| "memory(GiB)": 18.17, |
| "reward": 0.40898391604423523, |
| "reward_std": 0.02961808815598488, |
| "rewards/MCQ_Reward/mean": 0.40898391604423523, |
| "rewards/MCQ_Reward/std": 0.09117832407355309, |
| "step": 131, |
| "train_speed(iter/s)": 0.09081 |
| }, |
| { |
| "clip_ratio": 0.010138689540326595, |
| "epoch": 2.64, |
| "grad_norm": 2.565761089324951, |
| "kl": 0.53515625, |
| "learning_rate": 8.461860885303113e-07, |
| "loss": 0.003048412501811981, |
| "memory(GiB)": 18.17, |
| "step": 132, |
| "train_speed(iter/s)": 0.091425 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 209.0, |
| "completions/mean_length": 129.40234375, |
| "completions/min_length": 70.0, |
| "epoch": 2.66, |
| "grad_norm": 2.344294786453247, |
| "kl": 0.513671875, |
| "learning_rate": 8.438894484078085e-07, |
| "loss": 0.005981519352644682, |
| "memory(GiB)": 18.17, |
| "reward": 0.40958625078201294, |
| "reward_std": 0.027244774624705315, |
| "rewards/MCQ_Reward/mean": 0.40958625078201294, |
| "rewards/MCQ_Reward/std": 0.07108591124415398, |
| "step": 133, |
| "train_speed(iter/s)": 0.091506 |
| }, |
| { |
| "clip_ratio": 0.006955728633329272, |
| "epoch": 2.68, |
| "grad_norm": 2.667799949645996, |
| "kl": 0.50390625, |
| "learning_rate": 8.415789564684673e-07, |
| "loss": 0.0052396636456251144, |
| "memory(GiB)": 18.17, |
| "step": 134, |
| "train_speed(iter/s)": 0.092113 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 193.0, |
| "completions/mean_length": 132.30859375, |
| "completions/min_length": 79.0, |
| "epoch": 2.7, |
| "grad_norm": 2.6722846031188965, |
| "kl": 0.5029296875, |
| "learning_rate": 8.392547057785661e-07, |
| "loss": 0.0176947470754385, |
| "memory(GiB)": 18.17, |
| "reward": 0.39249348640441895, |
| "reward_std": 0.024370728991925716, |
| "rewards/MCQ_Reward/mean": 0.39249348640441895, |
| "rewards/MCQ_Reward/std": 0.10880232974886894, |
| "step": 135, |
| "train_speed(iter/s)": 0.092158 |
| }, |
| { |
| "clip_ratio": 0.009976111352443695, |
| "epoch": 2.7199999999999998, |
| "grad_norm": 2.80319881439209, |
| "kl": 0.548828125, |
| "learning_rate": 8.369167899585839e-07, |
| "loss": 0.01698880083858967, |
| "memory(GiB)": 18.17, |
| "step": 136, |
| "train_speed(iter/s)": 0.092755 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 186.0, |
| "completions/mean_length": 117.91015625, |
| "completions/min_length": 53.5, |
| "epoch": 2.74, |
| "grad_norm": 2.5274980068206787, |
| "kl": 0.5087890625, |
| "learning_rate": 8.34565303179429e-07, |
| "loss": -0.004888280760496855, |
| "memory(GiB)": 18.17, |
| "reward": 0.3668254613876343, |
| "reward_std": 0.02390660159289837, |
| "rewards/MCQ_Reward/mean": 0.3668254613876343, |
| "rewards/MCQ_Reward/std": 0.06858384422957897, |
| "step": 137, |
| "train_speed(iter/s)": 0.092788 |
| }, |
| { |
| "clip_ratio": 0.00792233063839376, |
| "epoch": 2.76, |
| "grad_norm": 2.6973214149475098, |
| "kl": 0.513671875, |
| "learning_rate": 8.322003401586461e-07, |
| "loss": -0.0054510245099663734, |
| "memory(GiB)": 18.17, |
| "step": 138, |
| "train_speed(iter/s)": 0.093386 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 212.5, |
| "completions/mean_length": 128.76953125, |
| "completions/min_length": 74.0, |
| "epoch": 2.7800000000000002, |
| "grad_norm": 2.22070574760437, |
| "kl": 0.4912109375, |
| "learning_rate": 8.298219961566008e-07, |
| "loss": -0.001897591631859541, |
| "memory(GiB)": 18.17, |
| "reward": 0.3943639397621155, |
| "reward_std": 0.021683918312191963, |
| "rewards/MCQ_Reward/mean": 0.3943639397621155, |
| "rewards/MCQ_Reward/std": 0.08081439509987831, |
| "step": 139, |
| "train_speed(iter/s)": 0.093426 |
| }, |
| { |
| "clip_ratio": 0.005092586623504758, |
| "epoch": 2.8, |
| "grad_norm": 2.3254384994506836, |
| "kl": 0.5009765625, |
| "learning_rate": 8.274303669726426e-07, |
| "loss": -0.0023171789944171906, |
| "memory(GiB)": 18.17, |
| "step": 140, |
| "train_speed(iter/s)": 0.094018 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 236.0, |
| "completions/mean_length": 131.94140625, |
| "completions/min_length": 76.0, |
| "epoch": 2.82, |
| "grad_norm": 2.8199474811553955, |
| "kl": 0.513671875, |
| "learning_rate": 8.250255489412462e-07, |
| "loss": 0.03072257712483406, |
| "memory(GiB)": 18.17, |
| "reward": 0.4145784378051758, |
| "reward_std": 0.026746340095996857, |
| "rewards/MCQ_Reward/mean": 0.4145784378051758, |
| "rewards/MCQ_Reward/std": 0.1253884807229042, |
| "step": 141, |
| "train_speed(iter/s)": 0.093563 |
| }, |
| { |
| "clip_ratio": 0.01698949094861746, |
| "epoch": 2.84, |
| "grad_norm": 3.6371665000915527, |
| "kl": 0.5654296875, |
| "learning_rate": 8.226076389281314e-07, |
| "loss": 0.030751001089811325, |
| "memory(GiB)": 18.17, |
| "step": 142, |
| "train_speed(iter/s)": 0.094156 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 222.0, |
| "completions/mean_length": 122.05859375, |
| "completions/min_length": 41.0, |
| "epoch": 2.86, |
| "grad_norm": 3.697355031967163, |
| "kl": 0.529296875, |
| "learning_rate": 8.201767343263611e-07, |
| "loss": 0.001254035159945488, |
| "memory(GiB)": 18.17, |
| "reward": 0.4235128164291382, |
| "reward_std": 0.02945070993155241, |
| "rewards/MCQ_Reward/mean": 0.4235128164291382, |
| "rewards/MCQ_Reward/std": 0.0826257448643446, |
| "step": 143, |
| "train_speed(iter/s)": 0.094158 |
| }, |
| { |
| "clip_ratio": 0.010704205837100744, |
| "epoch": 2.88, |
| "grad_norm": 2.6047918796539307, |
| "kl": 0.556640625, |
| "learning_rate": 8.177329330524181e-07, |
| "loss": 0.0003689592704176903, |
| "memory(GiB)": 18.17, |
| "step": 144, |
| "train_speed(iter/s)": 0.09474 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 314.5, |
| "completions/mean_length": 147.65234375, |
| "completions/min_length": 84.0, |
| "epoch": 2.9, |
| "grad_norm": 2.0444202423095703, |
| "kl": 0.4521484375, |
| "learning_rate": 8.152763335422612e-07, |
| "loss": 0.009064443409442902, |
| "memory(GiB)": 18.17, |
| "reward": 0.38259103894233704, |
| "reward_std": 0.023838728666305542, |
| "rewards/MCQ_Reward/mean": 0.38259103894233704, |
| "rewards/MCQ_Reward/std": 0.0847747940570116, |
| "step": 145, |
| "train_speed(iter/s)": 0.09459 |
| }, |
| { |
| "clip_ratio": 0.013846603687852621, |
| "epoch": 2.92, |
| "grad_norm": 3.0148403644561768, |
| "kl": 0.47265625, |
| "learning_rate": 8.128070347473608e-07, |
| "loss": 0.008937995880842209, |
| "memory(GiB)": 18.17, |
| "step": 146, |
| "train_speed(iter/s)": 0.095167 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 212.0, |
| "completions/mean_length": 131.4609375, |
| "completions/min_length": 58.5, |
| "epoch": 2.94, |
| "grad_norm": 2.3035802841186523, |
| "kl": 0.515625, |
| "learning_rate": 8.103251361307118e-07, |
| "loss": -0.003920593298971653, |
| "memory(GiB)": 18.17, |
| "reward": 0.46591490507125854, |
| "reward_std": 0.02803555503487587, |
| "rewards/MCQ_Reward/mean": 0.46591490507125854, |
| "rewards/MCQ_Reward/std": 0.08151933178305626, |
| "step": 147, |
| "train_speed(iter/s)": 0.095144 |
| }, |
| { |
| "clip_ratio": 0.008604592643678188, |
| "epoch": 2.96, |
| "grad_norm": 3.269644021987915, |
| "kl": 0.498046875, |
| "learning_rate": 8.07830737662829e-07, |
| "loss": -0.004623805172741413, |
| "memory(GiB)": 18.17, |
| "step": 148, |
| "train_speed(iter/s)": 0.095712 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 229.0, |
| "completions/mean_length": 115.5859375, |
| "completions/min_length": 47.5, |
| "epoch": 2.98, |
| "grad_norm": 2.762554883956909, |
| "kl": 0.55859375, |
| "learning_rate": 8.053239398177191e-07, |
| "loss": -0.002270375844091177, |
| "memory(GiB)": 18.17, |
| "reward": 0.40475866198539734, |
| "reward_std": 0.02323055360466242, |
| "rewards/MCQ_Reward/mean": 0.40475866198539734, |
| "rewards/MCQ_Reward/std": 0.11423858627676964, |
| "step": 149, |
| "train_speed(iter/s)": 0.095646 |
| }, |
| { |
| "clip_ratio": 0.005962205119431019, |
| "epoch": 3.0, |
| "grad_norm": 2.495875358581543, |
| "kl": 0.5625, |
| "learning_rate": 8.028048435688333e-07, |
| "loss": -0.0031687067821621895, |
| "memory(GiB)": 18.17, |
| "step": 150, |
| "train_speed(iter/s)": 0.0962 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 167.5, |
| "completions/mean_length": 117.21484375, |
| "completions/min_length": 58.0, |
| "epoch": 3.02, |
| "grad_norm": 3.30179762840271, |
| "kl": 0.572265625, |
| "learning_rate": 8.002735503850015e-07, |
| "loss": -0.0032917922362685204, |
| "memory(GiB)": 18.17, |
| "reward": 0.39226125180721283, |
| "reward_std": 0.025511370040476322, |
| "rewards/MCQ_Reward/mean": 0.39226125180721283, |
| "rewards/MCQ_Reward/std": 0.08468513377010822, |
| "step": 151, |
| "train_speed(iter/s)": 0.095897 |
| }, |
| { |
| "clip_ratio": 0.007298078387975693, |
| "epoch": 3.04, |
| "grad_norm": 2.3152873516082764, |
| "kl": 0.56640625, |
| "learning_rate": 7.97730162226344e-07, |
| "loss": -0.004036391619592905, |
| "memory(GiB)": 18.17, |
| "step": 152, |
| "train_speed(iter/s)": 0.096461 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 200.0, |
| "completions/mean_length": 121.8984375, |
| "completions/min_length": 63.5, |
| "epoch": 3.06, |
| "grad_norm": 2.2318758964538574, |
| "kl": 0.51171875, |
| "learning_rate": 7.951747815401649e-07, |
| "loss": 0.008308425545692444, |
| "memory(GiB)": 18.17, |
| "reward": 0.425733745098114, |
| "reward_std": 0.02289827074855566, |
| "rewards/MCQ_Reward/mean": 0.425733745098114, |
| "rewards/MCQ_Reward/std": 0.12863966077566147, |
| "step": 153, |
| "train_speed(iter/s)": 0.096546 |
| }, |
| { |
| "clip_ratio": 0.009599440731108189, |
| "epoch": 3.08, |
| "grad_norm": 3.2350826263427734, |
| "kl": 0.5009765625, |
| "learning_rate": 7.926075112568258e-07, |
| "loss": 0.00774328364059329, |
| "memory(GiB)": 18.17, |
| "step": 154, |
| "train_speed(iter/s)": 0.0971 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 315.5, |
| "completions/mean_length": 129.765625, |
| "completions/min_length": 63.5, |
| "epoch": 3.1, |
| "grad_norm": 2.8958089351654053, |
| "kl": 0.5146484375, |
| "learning_rate": 7.900284547855991e-07, |
| "loss": 0.005472003482282162, |
| "memory(GiB)": 18.17, |
| "reward": 0.3814770430326462, |
| "reward_std": 0.021100854501128197, |
| "rewards/MCQ_Reward/mean": 0.3814770430326462, |
| "rewards/MCQ_Reward/std": 0.08354593068361282, |
| "step": 155, |
| "train_speed(iter/s)": 0.096733 |
| }, |
| { |
| "clip_ratio": 0.008797692600637674, |
| "epoch": 3.12, |
| "grad_norm": 2.330720901489258, |
| "kl": 0.5107421875, |
| "learning_rate": 7.874377160105036e-07, |
| "loss": 0.00483354227617383, |
| "memory(GiB)": 18.17, |
| "step": 156, |
| "train_speed(iter/s)": 0.097282 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 208.0, |
| "completions/mean_length": 123.1640625, |
| "completions/min_length": 68.0, |
| "epoch": 3.14, |
| "grad_norm": 2.1395411491394043, |
| "kl": 0.515625, |
| "learning_rate": 7.848353992861194e-07, |
| "loss": 0.009709931910037994, |
| "memory(GiB)": 18.17, |
| "reward": 0.4426523745059967, |
| "reward_std": 0.024569914676249027, |
| "rewards/MCQ_Reward/mean": 0.4426523745059967, |
| "rewards/MCQ_Reward/std": 0.10452848672866821, |
| "step": 157, |
| "train_speed(iter/s)": 0.097277 |
| }, |
| { |
| "clip_ratio": 0.008177514653652906, |
| "epoch": 3.16, |
| "grad_norm": 2.8377902507781982, |
| "kl": 0.49609375, |
| "learning_rate": 7.822216094333847e-07, |
| "loss": 0.00888834334909916, |
| "memory(GiB)": 18.17, |
| "step": 158, |
| "train_speed(iter/s)": 0.097824 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 192.5, |
| "completions/mean_length": 121.08203125, |
| "completions/min_length": 59.0, |
| "epoch": 3.18, |
| "grad_norm": 2.439819574356079, |
| "kl": 0.5009765625, |
| "learning_rate": 7.795964517353733e-07, |
| "loss": -0.005721232853829861, |
| "memory(GiB)": 18.17, |
| "reward": 0.4260745346546173, |
| "reward_std": 0.024243751540780067, |
| "rewards/MCQ_Reward/mean": 0.4260745346546173, |
| "rewards/MCQ_Reward/std": 0.08284034207463264, |
| "step": 159, |
| "train_speed(iter/s)": 0.09781 |
| }, |
| { |
| "clip_ratio": 0.006790396990254521, |
| "epoch": 3.2, |
| "grad_norm": 1.9817484617233276, |
| "kl": 0.4970703125, |
| "learning_rate": 7.769600319330552e-07, |
| "loss": -0.006797813344746828, |
| "memory(GiB)": 18.17, |
| "step": 160, |
| "train_speed(iter/s)": 0.098355 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 200.5, |
| "completions/mean_length": 112.234375, |
| "completions/min_length": 54.0, |
| "epoch": 3.22, |
| "grad_norm": 2.4277918338775635, |
| "kl": 0.60546875, |
| "learning_rate": 7.743124562210351e-07, |
| "loss": 0.011250641196966171, |
| "memory(GiB)": 18.17, |
| "reward": 0.4286917597055435, |
| "reward_std": 0.023968255147337914, |
| "rewards/MCQ_Reward/mean": 0.4286917597055435, |
| "rewards/MCQ_Reward/std": 0.08755803853273392, |
| "step": 161, |
| "train_speed(iter/s)": 0.097905 |
| }, |
| { |
| "clip_ratio": 0.008228898979723454, |
| "epoch": 3.24, |
| "grad_norm": 2.4396235942840576, |
| "kl": 0.63671875, |
| "learning_rate": 7.716538312432765e-07, |
| "loss": 0.009992354549467564, |
| "memory(GiB)": 18.17, |
| "step": 162, |
| "train_speed(iter/s)": 0.098438 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 182.0, |
| "completions/mean_length": 128.484375, |
| "completions/min_length": 65.5, |
| "epoch": 3.26, |
| "grad_norm": 2.378303289413452, |
| "kl": 0.4560546875, |
| "learning_rate": 7.689842640888063e-07, |
| "loss": 0.014578643254935741, |
| "memory(GiB)": 18.17, |
| "reward": 0.4368235617876053, |
| "reward_std": 0.024292019195854664, |
| "rewards/MCQ_Reward/mean": 0.4368235617876053, |
| "rewards/MCQ_Reward/std": 0.10128979757428169, |
| "step": 163, |
| "train_speed(iter/s)": 0.098485 |
| }, |
| { |
| "clip_ratio": 0.006144619081169367, |
| "epoch": 3.2800000000000002, |
| "grad_norm": 2.336179733276367, |
| "kl": 0.455078125, |
| "learning_rate": 7.663038622873999e-07, |
| "loss": 0.014264167286455631, |
| "memory(GiB)": 18.17, |
| "step": 164, |
| "train_speed(iter/s)": 0.09902 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 190.0, |
| "completions/mean_length": 127.5546875, |
| "completions/min_length": 68.0, |
| "epoch": 3.3, |
| "grad_norm": 2.3888978958129883, |
| "kl": 0.51953125, |
| "learning_rate": 7.636127338052511e-07, |
| "loss": 0.0008876635693013668, |
| "memory(GiB)": 18.17, |
| "reward": 0.3655773550271988, |
| "reward_std": 0.023151511326432228, |
| "rewards/MCQ_Reward/mean": 0.3655773550271988, |
| "rewards/MCQ_Reward/std": 0.08209535107016563, |
| "step": 165, |
| "train_speed(iter/s)": 0.099067 |
| }, |
| { |
| "clip_ratio": 0.009708862751722336, |
| "epoch": 3.32, |
| "grad_norm": 2.849376678466797, |
| "kl": 0.53515625, |
| "learning_rate": 7.60910987040623e-07, |
| "loss": 0.0005215085111558437, |
| "memory(GiB)": 18.17, |
| "step": 166, |
| "train_speed(iter/s)": 0.099591 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 183.0, |
| "completions/mean_length": 114.09375, |
| "completions/min_length": 68.5, |
| "epoch": 3.34, |
| "grad_norm": 2.3568837642669678, |
| "kl": 0.568359375, |
| "learning_rate": 7.581987308194809e-07, |
| "loss": 0.009412365034222603, |
| "memory(GiB)": 18.17, |
| "reward": 0.38831935822963715, |
| "reward_std": 0.024401471950113773, |
| "rewards/MCQ_Reward/mean": 0.38831935822963715, |
| "rewards/MCQ_Reward/std": 0.07682501710951328, |
| "step": 167, |
| "train_speed(iter/s)": 0.099643 |
| }, |
| { |
| "clip_ratio": 0.009874043520539999, |
| "epoch": 3.36, |
| "grad_norm": 4.141200542449951, |
| "kl": 0.548828125, |
| "learning_rate": 7.554760743911103e-07, |
| "loss": 0.008638818748295307, |
| "memory(GiB)": 18.17, |
| "step": 168, |
| "train_speed(iter/s)": 0.100139 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 177.5, |
| "completions/mean_length": 116.015625, |
| "completions/min_length": 68.0, |
| "epoch": 3.38, |
| "grad_norm": 2.3995447158813477, |
| "kl": 0.5390625, |
| "learning_rate": 7.527431274237149e-07, |
| "loss": 0.009148918092250824, |
| "memory(GiB)": 18.17, |
| "reward": 0.43169474601745605, |
| "reward_std": 0.023636899888515472, |
| "rewards/MCQ_Reward/mean": 0.43169474601745605, |
| "rewards/MCQ_Reward/std": 0.08781928941607475, |
| "step": 169, |
| "train_speed(iter/s)": 0.100207 |
| }, |
| { |
| "clip_ratio": 0.011634313501417637, |
| "epoch": 3.4, |
| "grad_norm": 3.3103132247924805, |
| "kl": 0.580078125, |
| "learning_rate": 7.5e-07, |
| "loss": 0.008654891513288021, |
| "memory(GiB)": 18.17, |
| "step": 170, |
| "train_speed(iter/s)": 0.100725 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 198.5, |
| "completions/mean_length": 116.5859375, |
| "completions/min_length": 61.0, |
| "epoch": 3.42, |
| "grad_norm": 2.4376144409179688, |
| "kl": 0.51171875, |
| "learning_rate": 7.472468026127384e-07, |
| "loss": 0.0037187309935688972, |
| "memory(GiB)": 18.17, |
| "reward": 0.4193449318408966, |
| "reward_std": 0.024272997863590717, |
| "rewards/MCQ_Reward/mean": 0.4193449318408966, |
| "rewards/MCQ_Reward/std": 0.08024471625685692, |
| "step": 171, |
| "train_speed(iter/s)": 0.100337 |
| }, |
| { |
| "clip_ratio": 0.004286584910005331, |
| "epoch": 3.44, |
| "grad_norm": 2.298527479171753, |
| "kl": 0.501953125, |
| "learning_rate": 7.444836461603194e-07, |
| "loss": 0.0035052020102739334, |
| "memory(GiB)": 18.17, |
| "step": 172, |
| "train_speed(iter/s)": 0.10083 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 192.0, |
| "completions/mean_length": 107.78515625, |
| "completions/min_length": 54.0, |
| "epoch": 3.46, |
| "grad_norm": 2.706815004348755, |
| "kl": 0.572265625, |
| "learning_rate": 7.417106419422818e-07, |
| "loss": 0.001836567185819149, |
| "memory(GiB)": 18.17, |
| "reward": 0.4373796284198761, |
| "reward_std": 0.024632513523101807, |
| "rewards/MCQ_Reward/mean": 0.4373796284198761, |
| "rewards/MCQ_Reward/std": 0.10328296199440956, |
| "step": 173, |
| "train_speed(iter/s)": 0.100842 |
| }, |
| { |
| "clip_ratio": 0.00837572431191802, |
| "epoch": 3.48, |
| "grad_norm": 2.7765517234802246, |
| "kl": 0.55859375, |
| "learning_rate": 7.389279016548316e-07, |
| "loss": 0.0008762972429394722, |
| "memory(GiB)": 18.17, |
| "step": 174, |
| "train_speed(iter/s)": 0.10133 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 235.5, |
| "completions/mean_length": 141.59375, |
| "completions/min_length": 93.0, |
| "epoch": 3.5, |
| "grad_norm": 2.0208756923675537, |
| "kl": 0.494140625, |
| "learning_rate": 7.361355373863413e-07, |
| "loss": -0.0017252122052013874, |
| "memory(GiB)": 18.17, |
| "reward": 0.4430805742740631, |
| "reward_std": 0.023134860210120678, |
| "rewards/MCQ_Reward/mean": 0.4430805742740631, |
| "rewards/MCQ_Reward/std": 0.10230642557144165, |
| "step": 175, |
| "train_speed(iter/s)": 0.101269 |
| }, |
| { |
| "clip_ratio": 0.008417821954935789, |
| "epoch": 3.52, |
| "grad_norm": 2.5541892051696777, |
| "kl": 0.498046875, |
| "learning_rate": 7.333336616128369e-07, |
| "loss": -0.0020766020752489567, |
| "memory(GiB)": 18.17, |
| "step": 176, |
| "train_speed(iter/s)": 0.101776 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 237.5, |
| "completions/mean_length": 140.26953125, |
| "completions/min_length": 61.5, |
| "epoch": 3.54, |
| "grad_norm": 2.090574264526367, |
| "kl": 0.455078125, |
| "learning_rate": 7.305223871934656e-07, |
| "loss": -0.004062575753778219, |
| "memory(GiB)": 18.17, |
| "reward": 0.4077337831258774, |
| "reward_std": 0.021388554014265537, |
| "rewards/MCQ_Reward/mean": 0.4077337831258774, |
| "rewards/MCQ_Reward/std": 0.1092216707766056, |
| "step": 177, |
| "train_speed(iter/s)": 0.101717 |
| }, |
| { |
| "clip_ratio": 0.009097482077777386, |
| "epoch": 3.56, |
| "grad_norm": 2.031277894973755, |
| "kl": 0.4638671875, |
| "learning_rate": 7.277018273659516e-07, |
| "loss": -0.005147318355739117, |
| "memory(GiB)": 18.17, |
| "step": 178, |
| "train_speed(iter/s)": 0.102192 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 176.5, |
| "completions/mean_length": 103.77734375, |
| "completions/min_length": 56.0, |
| "epoch": 3.58, |
| "grad_norm": 2.28383731842041, |
| "kl": 0.55078125, |
| "learning_rate": 7.248720957420329e-07, |
| "loss": 0.0054731229320168495, |
| "memory(GiB)": 18.17, |
| "reward": 0.37708504498004913, |
| "reward_std": 0.022474835626780987, |
| "rewards/MCQ_Reward/mean": 0.37708504498004913, |
| "rewards/MCQ_Reward/std": 0.10817139223217964, |
| "step": 179, |
| "train_speed(iter/s)": 0.102207 |
| }, |
| { |
| "clip_ratio": 0.005004609236493707, |
| "epoch": 3.6, |
| "grad_norm": 2.2720046043395996, |
| "kl": 0.552734375, |
| "learning_rate": 7.220333063028871e-07, |
| "loss": 0.004853987134993076, |
| "memory(GiB)": 18.17, |
| "step": 180, |
| "train_speed(iter/s)": 0.10258 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 267.0, |
| "completions/mean_length": 135.375, |
| "completions/min_length": 64.5, |
| "epoch": 3.62, |
| "grad_norm": 2.0278213024139404, |
| "kl": 0.537109375, |
| "learning_rate": 7.191855733945386e-07, |
| "loss": 0.007204895373433828, |
| "memory(GiB)": 18.17, |
| "reward": 0.37996095418930054, |
| "reward_std": 0.024972867220640182, |
| "rewards/MCQ_Reward/mean": 0.37996095418930054, |
| "rewards/MCQ_Reward/std": 0.06211347132921219, |
| "step": 181, |
| "train_speed(iter/s)": 0.102022 |
| }, |
| { |
| "clip_ratio": 0.0050066676922142506, |
| "epoch": 3.64, |
| "grad_norm": 2.026421308517456, |
| "kl": 0.54296875, |
| "learning_rate": 7.163290117232541e-07, |
| "loss": 0.006550833582878113, |
| "memory(GiB)": 18.17, |
| "step": 182, |
| "train_speed(iter/s)": 0.102515 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 231.0, |
| "completions/mean_length": 132.80078125, |
| "completions/min_length": 70.0, |
| "epoch": 3.66, |
| "grad_norm": 2.322474479675293, |
| "kl": 0.4560546875, |
| "learning_rate": 7.134637363509209e-07, |
| "loss": 0.00408747885376215, |
| "memory(GiB)": 18.17, |
| "reward": 0.42590010166168213, |
| "reward_std": 0.02117757499217987, |
| "rewards/MCQ_Reward/mean": 0.42590010166168213, |
| "rewards/MCQ_Reward/std": 0.10450495779514313, |
| "step": 183, |
| "train_speed(iter/s)": 0.102439 |
| }, |
| { |
| "clip_ratio": 0.005717001855373383, |
| "epoch": 3.68, |
| "grad_norm": 2.0725347995758057, |
| "kl": 0.4501953125, |
| "learning_rate": 7.105898626904134e-07, |
| "loss": 0.003590245731174946, |
| "memory(GiB)": 18.17, |
| "step": 184, |
| "train_speed(iter/s)": 0.10291 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 185.0, |
| "completions/mean_length": 107.73828125, |
| "completions/min_length": 67.5, |
| "epoch": 3.7, |
| "grad_norm": 2.94624662399292, |
| "kl": 0.578125, |
| "learning_rate": 7.077075065009433e-07, |
| "loss": -0.0015533820260316133, |
| "memory(GiB)": 18.17, |
| "reward": 0.4082287549972534, |
| "reward_std": 0.023994137533009052, |
| "rewards/MCQ_Reward/mean": 0.4082287549972534, |
| "rewards/MCQ_Reward/std": 0.09996674209833145, |
| "step": 185, |
| "train_speed(iter/s)": 0.102951 |
| }, |
| { |
| "clip_ratio": 0.006125608924776316, |
| "epoch": 3.7199999999999998, |
| "grad_norm": 2.3971669673919678, |
| "kl": 0.572265625, |
| "learning_rate": 7.048167838833976e-07, |
| "loss": -0.0021633533760905266, |
| "memory(GiB)": 18.17, |
| "step": 186, |
| "train_speed(iter/s)": 0.103425 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 227.5, |
| "completions/mean_length": 131.453125, |
| "completions/min_length": 59.0, |
| "epoch": 3.74, |
| "grad_norm": 2.0767407417297363, |
| "kl": 0.513671875, |
| "learning_rate": 7.019178112756625e-07, |
| "loss": 0.005040531512349844, |
| "memory(GiB)": 18.17, |
| "reward": 0.43931877613067627, |
| "reward_std": 0.02542781364172697, |
| "rewards/MCQ_Reward/mean": 0.43931877613067627, |
| "rewards/MCQ_Reward/std": 0.0755577739328146, |
| "step": 187, |
| "train_speed(iter/s)": 0.103367 |
| }, |
| { |
| "clip_ratio": 0.007456609280779958, |
| "epoch": 3.76, |
| "grad_norm": 2.0555458068847656, |
| "kl": 0.513671875, |
| "learning_rate": 6.990107054479312e-07, |
| "loss": 0.004873338155448437, |
| "memory(GiB)": 18.17, |
| "step": 188, |
| "train_speed(iter/s)": 0.103852 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 187.0, |
| "completions/mean_length": 120.015625, |
| "completions/min_length": 56.0, |
| "epoch": 3.7800000000000002, |
| "grad_norm": 2.1511483192443848, |
| "kl": 0.546875, |
| "learning_rate": 6.960955834980027e-07, |
| "loss": -0.007258214056491852, |
| "memory(GiB)": 18.17, |
| "reward": 0.3652060180902481, |
| "reward_std": 0.023877170868217945, |
| "rewards/MCQ_Reward/mean": 0.3652060180902481, |
| "rewards/MCQ_Reward/std": 0.09329301491379738, |
| "step": 189, |
| "train_speed(iter/s)": 0.103851 |
| }, |
| { |
| "clip_ratio": 0.006274498999118805, |
| "epoch": 3.8, |
| "grad_norm": 2.204212188720703, |
| "kl": 0.5546875, |
| "learning_rate": 6.931725628465642e-07, |
| "loss": -0.0077828834764659405, |
| "memory(GiB)": 18.17, |
| "step": 190, |
| "train_speed(iter/s)": 0.104325 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 179.5, |
| "completions/mean_length": 119.02734375, |
| "completions/min_length": 68.0, |
| "epoch": 3.82, |
| "grad_norm": 2.489328384399414, |
| "kl": 0.5625, |
| "learning_rate": 6.902417612324615e-07, |
| "loss": -0.004156440030783415, |
| "memory(GiB)": 18.17, |
| "reward": 0.41069237887859344, |
| "reward_std": 0.02522939257323742, |
| "rewards/MCQ_Reward/mean": 0.41069237887859344, |
| "rewards/MCQ_Reward/std": 0.10438777878880501, |
| "step": 191, |
| "train_speed(iter/s)": 0.103961 |
| }, |
| { |
| "clip_ratio": 0.006902764085680246, |
| "epoch": 3.84, |
| "grad_norm": 2.573939085006714, |
| "kl": 0.53125, |
| "learning_rate": 6.87303296707956e-07, |
| "loss": -0.004263042006641626, |
| "memory(GiB)": 18.17, |
| "step": 192, |
| "train_speed(iter/s)": 0.104434 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 199.0, |
| "completions/mean_length": 119.2109375, |
| "completions/min_length": 63.5, |
| "epoch": 3.86, |
| "grad_norm": 2.4605846405029297, |
| "kl": 0.537109375, |
| "learning_rate": 6.843572876339704e-07, |
| "loss": -0.006107931490987539, |
| "memory(GiB)": 18.17, |
| "reward": 0.41506680846214294, |
| "reward_std": 0.025901762768626213, |
| "rewards/MCQ_Reward/mean": 0.41506680846214294, |
| "rewards/MCQ_Reward/std": 0.11812347918748856, |
| "step": 193, |
| "train_speed(iter/s)": 0.104435 |
| }, |
| { |
| "clip_ratio": 0.006947604939341545, |
| "epoch": 3.88, |
| "grad_norm": 2.9201459884643555, |
| "kl": 0.533203125, |
| "learning_rate": 6.814038526753204e-07, |
| "loss": -0.006667410954833031, |
| "memory(GiB)": 18.17, |
| "step": 194, |
| "train_speed(iter/s)": 0.104911 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 204.0, |
| "completions/mean_length": 123.375, |
| "completions/min_length": 58.5, |
| "epoch": 3.9, |
| "grad_norm": 2.481006145477295, |
| "kl": 0.638671875, |
| "learning_rate": 6.784431107959358e-07, |
| "loss": -0.00256272591650486, |
| "memory(GiB)": 18.17, |
| "reward": 0.4147709757089615, |
| "reward_std": 0.023487260565161705, |
| "rewards/MCQ_Reward/mean": 0.4147709757089615, |
| "rewards/MCQ_Reward/std": 0.08765164762735367, |
| "step": 195, |
| "train_speed(iter/s)": 0.104938 |
| }, |
| { |
| "clip_ratio": 0.00836537522263825, |
| "epoch": 3.92, |
| "grad_norm": 2.211996078491211, |
| "kl": 0.62109375, |
| "learning_rate": 6.754751812540679e-07, |
| "loss": -0.0026485356502234936, |
| "memory(GiB)": 18.17, |
| "step": 196, |
| "train_speed(iter/s)": 0.105375 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 161.0, |
| "completions/mean_length": 113.68359375, |
| "completions/min_length": 58.0, |
| "epoch": 3.94, |
| "grad_norm": 2.5469682216644287, |
| "kl": 0.556640625, |
| "learning_rate": 6.725001835974852e-07, |
| "loss": -0.005141774192452431, |
| "memory(GiB)": 18.17, |
| "reward": 0.39422211050987244, |
| "reward_std": 0.022977779619395733, |
| "rewards/MCQ_Reward/mean": 0.39422211050987244, |
| "rewards/MCQ_Reward/std": 0.09659452736377716, |
| "step": 197, |
| "train_speed(iter/s)": 0.105428 |
| }, |
| { |
| "clip_ratio": 0.007515270030125976, |
| "epoch": 3.96, |
| "grad_norm": 2.603193998336792, |
| "kl": 0.57421875, |
| "learning_rate": 6.695182376586602e-07, |
| "loss": -0.00558980368077755, |
| "memory(GiB)": 18.17, |
| "step": 198, |
| "train_speed(iter/s)": 0.105897 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 251.0, |
| "completions/mean_length": 124.140625, |
| "completions/min_length": 66.5, |
| "epoch": 3.98, |
| "grad_norm": 2.8109734058380127, |
| "kl": 0.5703125, |
| "learning_rate": 6.665294635499403e-07, |
| "loss": -0.008472483605146408, |
| "memory(GiB)": 18.17, |
| "reward": 0.3954710364341736, |
| "reward_std": 0.026893282309174538, |
| "rewards/MCQ_Reward/mean": 0.3954710364341736, |
| "rewards/MCQ_Reward/std": 0.07466300576925278, |
| "step": 199, |
| "train_speed(iter/s)": 0.10569 |
| }, |
| { |
| "clip_ratio": 0.007555491756647825, |
| "epoch": 4.0, |
| "grad_norm": 3.981370687484741, |
| "kl": 0.5625, |
| "learning_rate": 6.635339816587108e-07, |
| "loss": -0.008467345498502254, |
| "memory(GiB)": 18.17, |
| "step": 200, |
| "train_speed(iter/s)": 0.106122 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 189.0, |
| "completions/mean_length": 114.1640625, |
| "completions/min_length": 67.0, |
| "epoch": 4.02, |
| "grad_norm": 3.464586019515991, |
| "kl": 1.001953125, |
| "learning_rate": 6.605319126425453e-07, |
| "loss": 0.010952511802315712, |
| "memory(GiB)": 18.17, |
| "reward": 0.4330308884382248, |
| "reward_std": 0.022406785748898983, |
| "rewards/MCQ_Reward/mean": 0.4330308884382248, |
| "rewards/MCQ_Reward/std": 0.09031685814261436, |
| "step": 201, |
| "train_speed(iter/s)": 0.10573 |
| }, |
| { |
| "clip_ratio": 0.010695958975702524, |
| "epoch": 4.04, |
| "grad_norm": 3.2848002910614014, |
| "kl": 1.3125, |
| "learning_rate": 6.575233774243464e-07, |
| "loss": 0.010859224945306778, |
| "memory(GiB)": 18.17, |
| "step": 202, |
| "train_speed(iter/s)": 0.106187 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 173.5, |
| "completions/mean_length": 115.0625, |
| "completions/min_length": 64.5, |
| "epoch": 4.06, |
| "grad_norm": 2.5354137420654297, |
| "kl": 0.521484375, |
| "learning_rate": 6.545084971874736e-07, |
| "loss": 0.008116345852613449, |
| "memory(GiB)": 18.17, |
| "reward": 0.4043910503387451, |
| "reward_std": 0.023216267116367817, |
| "rewards/MCQ_Reward/mean": 0.4043910503387451, |
| "rewards/MCQ_Reward/std": 0.09529644250869751, |
| "step": 203, |
| "train_speed(iter/s)": 0.106255 |
| }, |
| { |
| "clip_ratio": 0.005409660283476114, |
| "epoch": 4.08, |
| "grad_norm": 2.4091176986694336, |
| "kl": 0.52734375, |
| "learning_rate": 6.514873933708637e-07, |
| "loss": 0.007959958165884018, |
| "memory(GiB)": 18.17, |
| "step": 204, |
| "train_speed(iter/s)": 0.10667 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 191.5, |
| "completions/mean_length": 103.03515625, |
| "completions/min_length": 53.0, |
| "epoch": 4.1, |
| "grad_norm": 2.983665704727173, |
| "kl": 0.62109375, |
| "learning_rate": 6.484601876641375e-07, |
| "loss": -0.014035141095519066, |
| "memory(GiB)": 18.17, |
| "reward": 0.4240594506263733, |
| "reward_std": 0.025937434285879135, |
| "rewards/MCQ_Reward/mean": 0.4240594506263733, |
| "rewards/MCQ_Reward/std": 0.07473786175251007, |
| "step": 205, |
| "train_speed(iter/s)": 0.106723 |
| }, |
| { |
| "clip_ratio": 0.018164899200201035, |
| "epoch": 4.12, |
| "grad_norm": 6.4920454025268555, |
| "kl": 0.5859375, |
| "learning_rate": 6.454270020026995e-07, |
| "loss": -0.013708272948861122, |
| "memory(GiB)": 18.17, |
| "step": 206, |
| "train_speed(iter/s)": 0.107162 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 242.0, |
| "completions/mean_length": 129.375, |
| "completions/min_length": 58.5, |
| "epoch": 4.14, |
| "grad_norm": 2.714660882949829, |
| "kl": 0.5625, |
| "learning_rate": 6.423879585628261e-07, |
| "loss": -0.014167927205562592, |
| "memory(GiB)": 18.17, |
| "reward": 0.396339014172554, |
| "reward_std": 0.02192540653049946, |
| "rewards/MCQ_Reward/mean": 0.396339014172554, |
| "rewards/MCQ_Reward/std": 0.11277944594621658, |
| "step": 207, |
| "train_speed(iter/s)": 0.106875 |
| }, |
| { |
| "clip_ratio": 0.007178165018558502, |
| "epoch": 4.16, |
| "grad_norm": 2.4650375843048096, |
| "kl": 0.560546875, |
| "learning_rate": 6.393431797567439e-07, |
| "loss": -0.014689125120639801, |
| "memory(GiB)": 18.17, |
| "step": 208, |
| "train_speed(iter/s)": 0.107325 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 255.0, |
| "completions/mean_length": 131.01953125, |
| "completions/min_length": 64.5, |
| "epoch": 4.18, |
| "grad_norm": 2.1339519023895264, |
| "kl": 0.58203125, |
| "learning_rate": 6.362927882276989e-07, |
| "loss": -0.017007270827889442, |
| "memory(GiB)": 18.17, |
| "reward": 0.42686355113983154, |
| "reward_std": 0.023915644735097885, |
| "rewards/MCQ_Reward/mean": 0.42686355113983154, |
| "rewards/MCQ_Reward/std": 0.10529575496912003, |
| "step": 209, |
| "train_speed(iter/s)": 0.107141 |
| }, |
| { |
| "clip_ratio": 0.005084275268018246, |
| "epoch": 4.2, |
| "grad_norm": 2.0464680194854736, |
| "kl": 0.59375, |
| "learning_rate": 6.332369068450174e-07, |
| "loss": -0.0175747312605381, |
| "memory(GiB)": 18.17, |
| "step": 210, |
| "train_speed(iter/s)": 0.107586 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 197.5, |
| "completions/mean_length": 116.63671875, |
| "completions/min_length": 61.5, |
| "epoch": 4.22, |
| "grad_norm": 2.4869492053985596, |
| "kl": 0.544921875, |
| "learning_rate": 6.30175658699156e-07, |
| "loss": -0.0016960185021162033, |
| "memory(GiB)": 18.17, |
| "reward": 0.43242450058460236, |
| "reward_std": 0.02396441251039505, |
| "rewards/MCQ_Reward/mean": 0.43242450058460236, |
| "rewards/MCQ_Reward/std": 0.07406600937247276, |
| "step": 211, |
| "train_speed(iter/s)": 0.107182 |
| }, |
| { |
| "clip_ratio": 0.006936221849173307, |
| "epoch": 4.24, |
| "grad_norm": 2.2954320907592773, |
| "kl": 0.5390625, |
| "learning_rate": 6.271091670967436e-07, |
| "loss": -0.001955235842615366, |
| "memory(GiB)": 18.17, |
| "step": 212, |
| "train_speed(iter/s)": 0.10762 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 212.5, |
| "completions/mean_length": 132.296875, |
| "completions/min_length": 90.0, |
| "epoch": 4.26, |
| "grad_norm": 2.5567421913146973, |
| "kl": 0.548828125, |
| "learning_rate": 6.240375555556145e-07, |
| "loss": -0.010683618485927582, |
| "memory(GiB)": 18.17, |
| "reward": 0.3712979108095169, |
| "reward_std": 0.022392110899090767, |
| "rewards/MCQ_Reward/mean": 0.3712979108095169, |
| "rewards/MCQ_Reward/std": 0.0758376233279705, |
| "step": 213, |
| "train_speed(iter/s)": 0.107578 |
| }, |
| { |
| "clip_ratio": 0.01051389379426837, |
| "epoch": 4.28, |
| "grad_norm": 3.9029605388641357, |
| "kl": 0.529296875, |
| "learning_rate": 6.209609477998338e-07, |
| "loss": -0.010750237852334976, |
| "memory(GiB)": 18.17, |
| "step": 214, |
| "train_speed(iter/s)": 0.108018 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 181.5, |
| "completions/mean_length": 117.71875, |
| "completions/min_length": 60.5, |
| "epoch": 4.3, |
| "grad_norm": 2.3913040161132812, |
| "kl": 0.6015625, |
| "learning_rate": 6.178794677547137e-07, |
| "loss": -0.012967615388333797, |
| "memory(GiB)": 18.17, |
| "reward": 0.3914954960346222, |
| "reward_std": 0.021691203117370605, |
| "rewards/MCQ_Reward/mean": 0.3914954960346222, |
| "rewards/MCQ_Reward/std": 0.10047328472137451, |
| "step": 215, |
| "train_speed(iter/s)": 0.108034 |
| }, |
| { |
| "clip_ratio": 0.005430733785033226, |
| "epoch": 4.32, |
| "grad_norm": 2.3732998371124268, |
| "kl": 0.61328125, |
| "learning_rate": 6.147932395418205e-07, |
| "loss": -0.013309886679053307, |
| "memory(GiB)": 18.17, |
| "step": 216, |
| "train_speed(iter/s)": 0.108474 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 209.5, |
| "completions/mean_length": 123.4765625, |
| "completions/min_length": 65.0, |
| "epoch": 4.34, |
| "grad_norm": 2.7147343158721924, |
| "kl": 0.552734375, |
| "learning_rate": 6.117023874739771e-07, |
| "loss": -0.0006074332632124424, |
| "memory(GiB)": 18.17, |
| "reward": 0.4220256060361862, |
| "reward_std": 0.0257421238347888, |
| "rewards/MCQ_Reward/mean": 0.4220256060361862, |
| "rewards/MCQ_Reward/std": 0.12063978612422943, |
| "step": 217, |
| "train_speed(iter/s)": 0.10841 |
| }, |
| { |
| "clip_ratio": 0.006779439281672239, |
| "epoch": 4.36, |
| "grad_norm": 2.3169238567352295, |
| "kl": 0.544921875, |
| "learning_rate": 6.086070360502539e-07, |
| "loss": -0.0006955214776098728, |
| "memory(GiB)": 18.17, |
| "step": 218, |
| "train_speed(iter/s)": 0.108822 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 203.5, |
| "completions/mean_length": 116.0625, |
| "completions/min_length": 53.5, |
| "epoch": 4.38, |
| "grad_norm": 2.7408437728881836, |
| "kl": 0.615234375, |
| "learning_rate": 6.055073099509549e-07, |
| "loss": -0.007178765721619129, |
| "memory(GiB)": 18.17, |
| "reward": 0.41480791568756104, |
| "reward_std": 0.028133179992437363, |
| "rewards/MCQ_Reward/mean": 0.41480791568756104, |
| "rewards/MCQ_Reward/std": 0.1095062680542469, |
| "step": 219, |
| "train_speed(iter/s)": 0.108796 |
| }, |
| { |
| "clip_ratio": 0.007214481011033058, |
| "epoch": 4.4, |
| "grad_norm": 2.457122802734375, |
| "kl": 0.6171875, |
| "learning_rate": 6.024033340325954e-07, |
| "loss": -0.008253653533756733, |
| "memory(GiB)": 18.17, |
| "step": 220, |
| "train_speed(iter/s)": 0.109227 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 209.5, |
| "completions/mean_length": 118.609375, |
| "completions/min_length": 59.0, |
| "epoch": 4.42, |
| "grad_norm": 2.8679587841033936, |
| "kl": 0.568359375, |
| "learning_rate": 5.992952333228726e-07, |
| "loss": 0.013627042062580585, |
| "memory(GiB)": 18.17, |
| "reward": 0.4350634217262268, |
| "reward_std": 0.0218770457431674, |
| "rewards/MCQ_Reward/mean": 0.4350634217262268, |
| "rewards/MCQ_Reward/std": 0.07635831832885742, |
| "step": 221, |
| "train_speed(iter/s)": 0.108811 |
| }, |
| { |
| "clip_ratio": 0.005678659770637751, |
| "epoch": 4.44, |
| "grad_norm": 2.187412738800049, |
| "kl": 0.58203125, |
| "learning_rate": 5.961831330156305e-07, |
| "loss": 0.013213744387030602, |
| "memory(GiB)": 18.17, |
| "step": 222, |
| "train_speed(iter/s)": 0.109221 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 226.5, |
| "completions/mean_length": 125.9765625, |
| "completions/min_length": 48.5, |
| "epoch": 4.46, |
| "grad_norm": 3.5221126079559326, |
| "kl": 0.587890625, |
| "learning_rate": 5.93067158465815e-07, |
| "loss": -0.0011408873833715916, |
| "memory(GiB)": 18.17, |
| "reward": 0.44135691225528717, |
| "reward_std": 0.025366419926285744, |
| "rewards/MCQ_Reward/mean": 0.44135691225528717, |
| "rewards/MCQ_Reward/std": 0.07711124420166016, |
| "step": 223, |
| "train_speed(iter/s)": 0.109176 |
| }, |
| { |
| "clip_ratio": 0.007937990361824632, |
| "epoch": 4.48, |
| "grad_norm": 2.513356924057007, |
| "kl": 0.5703125, |
| "learning_rate": 5.899474351844269e-07, |
| "loss": -0.0011316398158669472, |
| "memory(GiB)": 18.17, |
| "step": 224, |
| "train_speed(iter/s)": 0.109601 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 228.0, |
| "completions/mean_length": 120.234375, |
| "completions/min_length": 54.0, |
| "epoch": 4.5, |
| "grad_norm": 2.853579044342041, |
| "kl": 0.744140625, |
| "learning_rate": 5.868240888334652e-07, |
| "loss": -0.0010898616164922714, |
| "memory(GiB)": 18.17, |
| "reward": 0.41750770807266235, |
| "reward_std": 0.024566995911300182, |
| "rewards/MCQ_Reward/mean": 0.41750770807266235, |
| "rewards/MCQ_Reward/std": 0.09383138827979565, |
| "step": 225, |
| "train_speed(iter/s)": 0.109546 |
| }, |
| { |
| "clip_ratio": 0.012675716076046228, |
| "epoch": 4.52, |
| "grad_norm": 5.211337089538574, |
| "kl": 0.658203125, |
| "learning_rate": 5.836972452208654e-07, |
| "loss": -0.001642034389078617, |
| "memory(GiB)": 18.17, |
| "step": 226, |
| "train_speed(iter/s)": 0.109972 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 183.5, |
| "completions/mean_length": 126.68359375, |
| "completions/min_length": 64.0, |
| "epoch": 4.54, |
| "grad_norm": 2.3116183280944824, |
| "kl": 0.505859375, |
| "learning_rate": 5.805670302954321e-07, |
| "loss": 0.017429981380701065, |
| "memory(GiB)": 18.17, |
| "reward": 0.41671665012836456, |
| "reward_std": 0.02627546712756157, |
| "rewards/MCQ_Reward/mean": 0.41671665012836456, |
| "rewards/MCQ_Reward/std": 0.09354511648416519, |
| "step": 227, |
| "train_speed(iter/s)": 0.109937 |
| }, |
| { |
| "clip_ratio": 0.005898691713809967, |
| "epoch": 4.5600000000000005, |
| "grad_norm": 2.306483507156372, |
| "kl": 0.5087890625, |
| "learning_rate": 5.774335701417662e-07, |
| "loss": 0.016744598746299744, |
| "memory(GiB)": 18.17, |
| "step": 228, |
| "train_speed(iter/s)": 0.110353 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 213.5, |
| "completions/mean_length": 124.5546875, |
| "completions/min_length": 61.0, |
| "epoch": 4.58, |
| "grad_norm": 2.3084402084350586, |
| "kl": 0.552734375, |
| "learning_rate": 5.742969909751858e-07, |
| "loss": -0.009621858596801758, |
| "memory(GiB)": 18.17, |
| "reward": 0.45828977227211, |
| "reward_std": 0.023471640422940254, |
| "rewards/MCQ_Reward/mean": 0.45828977227211, |
| "rewards/MCQ_Reward/std": 0.09269878640770912, |
| "step": 229, |
| "train_speed(iter/s)": 0.110326 |
| }, |
| { |
| "clip_ratio": 0.005610911408439279, |
| "epoch": 4.6, |
| "grad_norm": 2.163801431655884, |
| "kl": 0.552734375, |
| "learning_rate": 5.711574191366427e-07, |
| "loss": -0.010531945154070854, |
| "memory(GiB)": 18.17, |
| "step": 230, |
| "train_speed(iter/s)": 0.110743 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 187.5, |
| "completions/mean_length": 116.93359375, |
| "completions/min_length": 62.5, |
| "epoch": 4.62, |
| "grad_norm": 3.1812872886657715, |
| "kl": 2.26171875, |
| "learning_rate": 5.680149810876322e-07, |
| "loss": 0.006941274274140596, |
| "memory(GiB)": 18.17, |
| "reward": 0.45568907260894775, |
| "reward_std": 0.023496804758906364, |
| "rewards/MCQ_Reward/mean": 0.45568907260894775, |
| "rewards/MCQ_Reward/std": 0.09556515514850616, |
| "step": 231, |
| "train_speed(iter/s)": 0.110377 |
| }, |
| { |
| "clip_ratio": 0.006443677702918649, |
| "epoch": 4.64, |
| "grad_norm": 2.733854293823242, |
| "kl": 2.2734375, |
| "learning_rate": 5.648698034051008e-07, |
| "loss": 0.006462510209530592, |
| "memory(GiB)": 18.17, |
| "step": 232, |
| "train_speed(iter/s)": 0.110787 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 233.0, |
| "completions/mean_length": 133.1015625, |
| "completions/min_length": 70.5, |
| "epoch": 4.66, |
| "grad_norm": 2.4281585216522217, |
| "kl": 0.55859375, |
| "learning_rate": 5.617220127763474e-07, |
| "loss": 0.013438165187835693, |
| "memory(GiB)": 18.17, |
| "reward": 0.43506887555122375, |
| "reward_std": 0.025797616690397263, |
| "rewards/MCQ_Reward/mean": 0.43506887555122375, |
| "rewards/MCQ_Reward/std": 0.09859243780374527, |
| "step": 233, |
| "train_speed(iter/s)": 0.110691 |
| }, |
| { |
| "clip_ratio": 0.0072706313803792, |
| "epoch": 4.68, |
| "grad_norm": 2.526357889175415, |
| "kl": 0.55859375, |
| "learning_rate": 5.585717359939192e-07, |
| "loss": 0.012631012126803398, |
| "memory(GiB)": 18.17, |
| "step": 234, |
| "train_speed(iter/s)": 0.111101 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 214.5, |
| "completions/mean_length": 133.1328125, |
| "completions/min_length": 57.0, |
| "epoch": 4.7, |
| "grad_norm": 2.639338731765747, |
| "kl": 0.552734375, |
| "learning_rate": 5.554190999505055e-07, |
| "loss": -0.008054563775658607, |
| "memory(GiB)": 18.17, |
| "reward": 0.40963128209114075, |
| "reward_std": 0.024876238778233528, |
| "rewards/MCQ_Reward/mean": 0.40963128209114075, |
| "rewards/MCQ_Reward/std": 0.06643268279731274, |
| "step": 235, |
| "train_speed(iter/s)": 0.111027 |
| }, |
| { |
| "clip_ratio": 0.008271400351077318, |
| "epoch": 4.72, |
| "grad_norm": 2.7264564037323, |
| "kl": 0.568359375, |
| "learning_rate": 5.522642316338268e-07, |
| "loss": -0.008453292772173882, |
| "memory(GiB)": 18.17, |
| "step": 236, |
| "train_speed(iter/s)": 0.111434 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 188.0, |
| "completions/mean_length": 123.84765625, |
| "completions/min_length": 65.5, |
| "epoch": 4.74, |
| "grad_norm": 2.405317544937134, |
| "kl": 0.5400390625, |
| "learning_rate": 5.491072581215186e-07, |
| "loss": 0.00114892004057765, |
| "memory(GiB)": 18.17, |
| "reward": 0.4337426722049713, |
| "reward_std": 0.020247386768460274, |
| "rewards/MCQ_Reward/mean": 0.4337426722049713, |
| "rewards/MCQ_Reward/std": 0.07973705604672432, |
| "step": 237, |
| "train_speed(iter/s)": 0.111369 |
| }, |
| { |
| "clip_ratio": 0.006459691561758518, |
| "epoch": 4.76, |
| "grad_norm": 2.8662662506103516, |
| "kl": 0.5400390625, |
| "learning_rate": 5.459483065760138e-07, |
| "loss": 0.0009391154162585735, |
| "memory(GiB)": 18.17, |
| "step": 238, |
| "train_speed(iter/s)": 0.111775 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 244.0, |
| "completions/mean_length": 133.96875, |
| "completions/min_length": 75.0, |
| "epoch": 4.78, |
| "grad_norm": 2.400651216506958, |
| "kl": 0.5078125, |
| "learning_rate": 5.427875042394199e-07, |
| "loss": 0.002962369006127119, |
| "memory(GiB)": 18.17, |
| "reward": 0.4192984253168106, |
| "reward_std": 0.023103597573935986, |
| "rewards/MCQ_Reward/mean": 0.4192984253168106, |
| "rewards/MCQ_Reward/std": 0.08515846729278564, |
| "step": 239, |
| "train_speed(iter/s)": 0.11166 |
| }, |
| { |
| "clip_ratio": 0.00794414198026061, |
| "epoch": 4.8, |
| "grad_norm": 3.1118853092193604, |
| "kl": 0.5029296875, |
| "learning_rate": 5.396249784283942e-07, |
| "loss": 0.0026899795047938824, |
| "memory(GiB)": 18.17, |
| "step": 240, |
| "train_speed(iter/s)": 0.112066 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 176.5, |
| "completions/mean_length": 114.4609375, |
| "completions/min_length": 47.5, |
| "epoch": 4.82, |
| "grad_norm": 2.5313034057617188, |
| "kl": 0.5390625, |
| "learning_rate": 5.364608565290154e-07, |
| "loss": -0.0074430471286177635, |
| "memory(GiB)": 18.17, |
| "reward": 0.4074428677558899, |
| "reward_std": 0.02112921793013811, |
| "rewards/MCQ_Reward/mean": 0.4074428677558899, |
| "rewards/MCQ_Reward/std": 0.07994595915079117, |
| "step": 241, |
| "train_speed(iter/s)": 0.111745 |
| }, |
| { |
| "clip_ratio": 0.007256179815158248, |
| "epoch": 4.84, |
| "grad_norm": 2.768711566925049, |
| "kl": 0.5625, |
| "learning_rate": 5.33295265991652e-07, |
| "loss": -0.0077315750531852245, |
| "memory(GiB)": 18.17, |
| "step": 242, |
| "train_speed(iter/s)": 0.112147 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 167.0, |
| "completions/mean_length": 115.47265625, |
| "completions/min_length": 67.5, |
| "epoch": 4.86, |
| "grad_norm": 2.561013698577881, |
| "kl": 0.57421875, |
| "learning_rate": 5.301283343258292e-07, |
| "loss": -0.0039140088483691216, |
| "memory(GiB)": 18.17, |
| "reward": 0.42967718839645386, |
| "reward_std": 0.020259867422282696, |
| "rewards/MCQ_Reward/mean": 0.42967718839645386, |
| "rewards/MCQ_Reward/std": 0.09365658834576607, |
| "step": 243, |
| "train_speed(iter/s)": 0.112166 |
| }, |
| { |
| "clip_ratio": 0.008353757206350565, |
| "epoch": 4.88, |
| "grad_norm": 3.9286372661590576, |
| "kl": 0.560546875, |
| "learning_rate": 5.26960189095093e-07, |
| "loss": -0.003905682824552059, |
| "memory(GiB)": 18.17, |
| "step": 244, |
| "train_speed(iter/s)": 0.112566 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 206.0, |
| "completions/mean_length": 130.84375, |
| "completions/min_length": 77.0, |
| "epoch": 4.9, |
| "grad_norm": 2.3792028427124023, |
| "kl": 0.515625, |
| "learning_rate": 5.237909579118712e-07, |
| "loss": 0.0075805773958563805, |
| "memory(GiB)": 18.17, |
| "reward": 0.37578998506069183, |
| "reward_std": 0.022264255210757256, |
| "rewards/MCQ_Reward/mean": 0.37578998506069183, |
| "rewards/MCQ_Reward/std": 0.09643128886818886, |
| "step": 245, |
| "train_speed(iter/s)": 0.112504 |
| }, |
| { |
| "clip_ratio": 0.006022685440257192, |
| "epoch": 4.92, |
| "grad_norm": 2.490131378173828, |
| "kl": 0.501953125, |
| "learning_rate": 5.206207684323335e-07, |
| "loss": 0.007525968365371227, |
| "memory(GiB)": 18.17, |
| "step": 246, |
| "train_speed(iter/s)": 0.112901 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 177.5, |
| "completions/mean_length": 112.49609375, |
| "completions/min_length": 62.5, |
| "epoch": 4.9399999999999995, |
| "grad_norm": 2.270827293395996, |
| "kl": 0.580078125, |
| "learning_rate": 5.174497483512505e-07, |
| "loss": 0.011211629025638103, |
| "memory(GiB)": 18.17, |
| "reward": 0.39156346023082733, |
| "reward_std": 0.02191222459077835, |
| "rewards/MCQ_Reward/mean": 0.39156346023082733, |
| "rewards/MCQ_Reward/std": 0.12107554450631142, |
| "step": 247, |
| "train_speed(iter/s)": 0.112883 |
| }, |
| { |
| "clip_ratio": 0.006176856812089682, |
| "epoch": 4.96, |
| "grad_norm": 2.373053550720215, |
| "kl": 0.57421875, |
| "learning_rate": 5.142780253968481e-07, |
| "loss": 0.010641951113939285, |
| "memory(GiB)": 18.17, |
| "step": 248, |
| "train_speed(iter/s)": 0.11328 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 204.0, |
| "completions/mean_length": 131.14453125, |
| "completions/min_length": 62.5, |
| "epoch": 4.98, |
| "grad_norm": 2.2482690811157227, |
| "kl": 0.525390625, |
| "learning_rate": 5.111057273256647e-07, |
| "loss": 0.0050743343308568, |
| "memory(GiB)": 18.17, |
| "reward": 0.40770605206489563, |
| "reward_std": 0.022150222212076187, |
| "rewards/MCQ_Reward/mean": 0.40770605206489563, |
| "rewards/MCQ_Reward/std": 0.11748149991035461, |
| "step": 249, |
| "train_speed(iter/s)": 0.113183 |
| }, |
| { |
| "clip_ratio": 0.006638662423938513, |
| "epoch": 5.0, |
| "grad_norm": 2.2492520809173584, |
| "kl": 0.5390625, |
| "learning_rate": 5.07932981917404e-07, |
| "loss": 0.004837746266275644, |
| "memory(GiB)": 18.17, |
| "step": 250, |
| "train_speed(iter/s)": 0.113563 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 214.0, |
| "completions/mean_length": 125.765625, |
| "completions/min_length": 68.5, |
| "epoch": 5.02, |
| "grad_norm": 2.556406259536743, |
| "kl": 0.5078125, |
| "learning_rate": 5.047599169697883e-07, |
| "loss": 0.017076797783374786, |
| "memory(GiB)": 18.17, |
| "reward": 0.4466231018304825, |
| "reward_std": 0.0222383551299572, |
| "rewards/MCQ_Reward/mean": 0.4466231018304825, |
| "rewards/MCQ_Reward/std": 0.11308542639017105, |
| "step": 251, |
| "train_speed(iter/s)": 0.113109 |
| }, |
| { |
| "clip_ratio": 0.007436602842062712, |
| "epoch": 5.04, |
| "grad_norm": 2.0482616424560547, |
| "kl": 0.515625, |
| "learning_rate": 5.015866602934111e-07, |
| "loss": 0.01610303670167923, |
| "memory(GiB)": 18.17, |
| "step": 252, |
| "train_speed(iter/s)": 0.113475 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 168.0, |
| "completions/mean_length": 109.3359375, |
| "completions/min_length": 65.0, |
| "epoch": 5.06, |
| "grad_norm": 2.6583385467529297, |
| "kl": 0.6015625, |
| "learning_rate": 4.984133397065888e-07, |
| "loss": 0.005715301260352135, |
| "memory(GiB)": 18.17, |
| "reward": 0.3956441879272461, |
| "reward_std": 0.02386545669287443, |
| "rewards/MCQ_Reward/mean": 0.3956441879272461, |
| "rewards/MCQ_Reward/std": 0.0772719755768776, |
| "step": 253, |
| "train_speed(iter/s)": 0.113471 |
| }, |
| { |
| "clip_ratio": 0.006691478192806244, |
| "epoch": 5.08, |
| "grad_norm": 2.478234052658081, |
| "kl": 0.5859375, |
| "learning_rate": 4.952400830302116e-07, |
| "loss": 0.00553365983068943, |
| "memory(GiB)": 18.17, |
| "step": 254, |
| "train_speed(iter/s)": 0.113858 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 235.0, |
| "completions/mean_length": 144.1796875, |
| "completions/min_length": 78.0, |
| "epoch": 5.1, |
| "grad_norm": 2.308807373046875, |
| "kl": 0.5009765625, |
| "learning_rate": 4.92067018082596e-07, |
| "loss": -0.0058871605433523655, |
| "memory(GiB)": 18.17, |
| "reward": 0.4203776866197586, |
| "reward_std": 0.022159602493047714, |
| "rewards/MCQ_Reward/mean": 0.4203776866197586, |
| "rewards/MCQ_Reward/std": 0.09526496008038521, |
| "step": 255, |
| "train_speed(iter/s)": 0.113761 |
| }, |
| { |
| "clip_ratio": 0.007533560739830136, |
| "epoch": 5.12, |
| "grad_norm": 2.9820773601531982, |
| "kl": 0.4921875, |
| "learning_rate": 4.888942726743353e-07, |
| "loss": -0.006009383127093315, |
| "memory(GiB)": 18.17, |
| "step": 256, |
| "train_speed(iter/s)": 0.114127 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 189.5, |
| "completions/mean_length": 115.0078125, |
| "completions/min_length": 65.0, |
| "epoch": 5.14, |
| "grad_norm": 2.3862602710723877, |
| "kl": 0.57421875, |
| "learning_rate": 4.857219746031519e-07, |
| "loss": -0.010767871513962746, |
| "memory(GiB)": 18.17, |
| "reward": 0.43338486552238464, |
| "reward_std": 0.025110138580203056, |
| "rewards/MCQ_Reward/mean": 0.43338486552238464, |
| "rewards/MCQ_Reward/std": 0.08122389577329159, |
| "step": 257, |
| "train_speed(iter/s)": 0.114083 |
| }, |
| { |
| "clip_ratio": 0.005816203076392412, |
| "epoch": 5.16, |
| "grad_norm": 2.2391088008880615, |
| "kl": 0.57421875, |
| "learning_rate": 4.825502516487496e-07, |
| "loss": -0.011337889358401299, |
| "memory(GiB)": 18.17, |
| "step": 258, |
| "train_speed(iter/s)": 0.11446 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 241.0, |
| "completions/mean_length": 121.109375, |
| "completions/min_length": 68.0, |
| "epoch": 5.18, |
| "grad_norm": 3.2198102474212646, |
| "kl": 0.642578125, |
| "learning_rate": 4.793792315676664e-07, |
| "loss": -0.0017241109162569046, |
| "memory(GiB)": 18.17, |
| "reward": 0.41922956705093384, |
| "reward_std": 0.02394416555762291, |
| "rewards/MCQ_Reward/mean": 0.41922956705093384, |
| "rewards/MCQ_Reward/std": 0.08786309324204922, |
| "step": 259, |
| "train_speed(iter/s)": 0.11433 |
| }, |
| { |
| "clip_ratio": 0.008633819408714771, |
| "epoch": 5.2, |
| "grad_norm": 2.5045688152313232, |
| "kl": 0.611328125, |
| "learning_rate": 4.762090420881288e-07, |
| "loss": -0.0024092746898531914, |
| "memory(GiB)": 18.17, |
| "step": 260, |
| "train_speed(iter/s)": 0.11471 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 203.5, |
| "completions/mean_length": 121.0, |
| "completions/min_length": 59.5, |
| "epoch": 5.22, |
| "grad_norm": 3.3788204193115234, |
| "kl": 0.65625, |
| "learning_rate": 4.7303981090490706e-07, |
| "loss": 0.0016009537503123283, |
| "memory(GiB)": 18.17, |
| "reward": 0.4228467643260956, |
| "reward_std": 0.02382771298289299, |
| "rewards/MCQ_Reward/mean": 0.4228467643260956, |
| "rewards/MCQ_Reward/std": 0.08922314271330833, |
| "step": 261, |
| "train_speed(iter/s)": 0.114325 |
| }, |
| { |
| "clip_ratio": 0.009796116035431623, |
| "epoch": 5.24, |
| "grad_norm": 3.2910051345825195, |
| "kl": 0.603515625, |
| "learning_rate": 4.698716656741708e-07, |
| "loss": 0.0013471171259880066, |
| "memory(GiB)": 18.17, |
| "step": 262, |
| "train_speed(iter/s)": 0.114703 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 178.0, |
| "completions/mean_length": 117.85546875, |
| "completions/min_length": 58.5, |
| "epoch": 5.26, |
| "grad_norm": 3.0833852291107178, |
| "kl": 0.607421875, |
| "learning_rate": 4.66704734008348e-07, |
| "loss": 0.01880352757871151, |
| "memory(GiB)": 18.17, |
| "reward": 0.4038514196872711, |
| "reward_std": 0.024144282564520836, |
| "rewards/MCQ_Reward/mean": 0.4038514196872711, |
| "rewards/MCQ_Reward/std": 0.11032669246196747, |
| "step": 263, |
| "train_speed(iter/s)": 0.114712 |
| }, |
| { |
| "clip_ratio": 0.0071860982570797205, |
| "epoch": 5.28, |
| "grad_norm": 2.223651885986328, |
| "kl": 0.62109375, |
| "learning_rate": 4.6353914347098467e-07, |
| "loss": 0.018028832972049713, |
| "memory(GiB)": 18.17, |
| "step": 264, |
| "train_speed(iter/s)": 0.115068 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 178.0, |
| "completions/mean_length": 126.16796875, |
| "completions/min_length": 63.0, |
| "epoch": 5.3, |
| "grad_norm": 2.7954585552215576, |
| "kl": 0.521484375, |
| "learning_rate": 4.6037502157160567e-07, |
| "loss": 0.008576348423957825, |
| "memory(GiB)": 18.17, |
| "reward": 0.4126065671443939, |
| "reward_std": 0.02162686362862587, |
| "rewards/MCQ_Reward/mean": 0.4126065671443939, |
| "rewards/MCQ_Reward/std": 0.08540061488747597, |
| "step": 265, |
| "train_speed(iter/s)": 0.115013 |
| }, |
| { |
| "clip_ratio": 0.00956161879003048, |
| "epoch": 5.32, |
| "grad_norm": 4.209680557250977, |
| "kl": 0.544921875, |
| "learning_rate": 4.5721249576058027e-07, |
| "loss": 0.009101202711462975, |
| "memory(GiB)": 18.17, |
| "step": 266, |
| "train_speed(iter/s)": 0.115384 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 181.5, |
| "completions/mean_length": 113.3515625, |
| "completions/min_length": 71.5, |
| "epoch": 5.34, |
| "grad_norm": 2.6387808322906494, |
| "kl": 0.595703125, |
| "learning_rate": 4.540516934239863e-07, |
| "loss": 0.008354030549526215, |
| "memory(GiB)": 18.17, |
| "reward": 0.4057372510433197, |
| "reward_std": 0.025215539149940014, |
| "rewards/MCQ_Reward/mean": 0.4057372510433197, |
| "rewards/MCQ_Reward/std": 0.10797113552689552, |
| "step": 267, |
| "train_speed(iter/s)": 0.115352 |
| }, |
| { |
| "clip_ratio": 0.004749758169054985, |
| "epoch": 5.36, |
| "grad_norm": 2.726827383041382, |
| "kl": 0.59765625, |
| "learning_rate": 4.508927418784814e-07, |
| "loss": 0.008263107389211655, |
| "memory(GiB)": 18.17, |
| "step": 268, |
| "train_speed(iter/s)": 0.115721 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 253.0, |
| "completions/mean_length": 128.75, |
| "completions/min_length": 65.0, |
| "epoch": 5.38, |
| "grad_norm": 2.4489338397979736, |
| "kl": 0.5859375, |
| "learning_rate": 4.477357683661733e-07, |
| "loss": 0.0003694836050271988, |
| "memory(GiB)": 18.17, |
| "reward": 0.39796915650367737, |
| "reward_std": 0.0229190643876791, |
| "rewards/MCQ_Reward/mean": 0.39796915650367737, |
| "rewards/MCQ_Reward/std": 0.06984946131706238, |
| "step": 269, |
| "train_speed(iter/s)": 0.115538 |
| }, |
| { |
| "clip_ratio": 0.0044297389686107635, |
| "epoch": 5.4, |
| "grad_norm": 2.187133312225342, |
| "kl": 0.587890625, |
| "learning_rate": 4.445809000494945e-07, |
| "loss": 6.162561476230621e-06, |
| "memory(GiB)": 18.17, |
| "step": 270, |
| "train_speed(iter/s)": 0.115873 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 207.5, |
| "completions/mean_length": 120.48046875, |
| "completions/min_length": 76.5, |
| "epoch": 5.42, |
| "grad_norm": 2.354365348815918, |
| "kl": 0.595703125, |
| "learning_rate": 4.4142826400608085e-07, |
| "loss": -0.011774084530770779, |
| "memory(GiB)": 18.17, |
| "reward": 0.4731539338827133, |
| "reward_std": 0.025172382593154907, |
| "rewards/MCQ_Reward/mean": 0.4731539338827133, |
| "rewards/MCQ_Reward/std": 0.09358260780572891, |
| "step": 271, |
| "train_speed(iter/s)": 0.115479 |
| }, |
| { |
| "clip_ratio": 0.007754836697131395, |
| "epoch": 5.44, |
| "grad_norm": 2.9754416942596436, |
| "kl": 0.568359375, |
| "learning_rate": 4.382779872236526e-07, |
| "loss": -0.01219811663031578, |
| "memory(GiB)": 18.17, |
| "step": 272, |
| "train_speed(iter/s)": 0.115843 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 193.0, |
| "completions/mean_length": 127.671875, |
| "completions/min_length": 82.0, |
| "epoch": 5.46, |
| "grad_norm": 2.66938853263855, |
| "kl": 0.587890625, |
| "learning_rate": 4.3513019659489906e-07, |
| "loss": -0.01641671359539032, |
| "memory(GiB)": 18.17, |
| "reward": 0.3951749950647354, |
| "reward_std": 0.026222089305520058, |
| "rewards/MCQ_Reward/mean": 0.3951749950647354, |
| "rewards/MCQ_Reward/std": 0.07432432845234871, |
| "step": 273, |
| "train_speed(iter/s)": 0.11581 |
| }, |
| { |
| "clip_ratio": 0.006316621555015445, |
| "epoch": 5.48, |
| "grad_norm": 2.3686916828155518, |
| "kl": 0.595703125, |
| "learning_rate": 4.31985018912368e-07, |
| "loss": -0.01686863601207733, |
| "memory(GiB)": 18.17, |
| "step": 274, |
| "train_speed(iter/s)": 0.116173 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 212.0, |
| "completions/mean_length": 127.2578125, |
| "completions/min_length": 64.5, |
| "epoch": 5.5, |
| "grad_norm": 2.3570117950439453, |
| "kl": 0.5390625, |
| "learning_rate": 4.2884258086335745e-07, |
| "loss": 0.0007358621805906296, |
| "memory(GiB)": 18.17, |
| "reward": 0.44543667137622833, |
| "reward_std": 0.024644173681735992, |
| "rewards/MCQ_Reward/mean": 0.44543667137622833, |
| "rewards/MCQ_Reward/std": 0.09130855649709702, |
| "step": 275, |
| "train_speed(iter/s)": 0.116062 |
| }, |
| { |
| "clip_ratio": 0.009702229872345924, |
| "epoch": 5.52, |
| "grad_norm": 4.230794906616211, |
| "kl": 0.517578125, |
| "learning_rate": 4.257030090248142e-07, |
| "loss": 0.0004968619905412197, |
| "memory(GiB)": 18.17, |
| "step": 276, |
| "train_speed(iter/s)": 0.116424 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 188.5, |
| "completions/mean_length": 124.16796875, |
| "completions/min_length": 66.5, |
| "epoch": 5.54, |
| "grad_norm": 2.1478097438812256, |
| "kl": 0.607421875, |
| "learning_rate": 4.2256642985823387e-07, |
| "loss": 0.012350899167358875, |
| "memory(GiB)": 18.17, |
| "reward": 0.4112658351659775, |
| "reward_std": 0.023498238995671272, |
| "rewards/MCQ_Reward/mean": 0.4112658351659775, |
| "rewards/MCQ_Reward/std": 0.08520639687776566, |
| "step": 277, |
| "train_speed(iter/s)": 0.116375 |
| }, |
| { |
| "clip_ratio": 0.004101653583347797, |
| "epoch": 5.5600000000000005, |
| "grad_norm": 2.062098503112793, |
| "kl": 0.62109375, |
| "learning_rate": 4.19432969704568e-07, |
| "loss": 0.012091840617358685, |
| "memory(GiB)": 18.17, |
| "step": 278, |
| "train_speed(iter/s)": 0.116723 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 252.5, |
| "completions/mean_length": 122.69921875, |
| "completions/min_length": 59.0, |
| "epoch": 5.58, |
| "grad_norm": 2.9315075874328613, |
| "kl": 0.5390625, |
| "learning_rate": 4.1630275477913465e-07, |
| "loss": -0.013242216780781746, |
| "memory(GiB)": 18.17, |
| "reward": 0.39477604627609253, |
| "reward_std": 0.02283278852701187, |
| "rewards/MCQ_Reward/mean": 0.39477604627609253, |
| "rewards/MCQ_Reward/std": 0.09505810588598251, |
| "step": 279, |
| "train_speed(iter/s)": 0.116608 |
| }, |
| { |
| "clip_ratio": 0.006070411531254649, |
| "epoch": 5.6, |
| "grad_norm": 2.2812304496765137, |
| "kl": 0.53515625, |
| "learning_rate": 4.131759111665348e-07, |
| "loss": -0.013854868710041046, |
| "memory(GiB)": 18.17, |
| "step": 280, |
| "train_speed(iter/s)": 0.116971 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 206.5, |
| "completions/mean_length": 129.95703125, |
| "completions/min_length": 60.5, |
| "epoch": 5.62, |
| "grad_norm": 2.015717029571533, |
| "kl": 0.513671875, |
| "learning_rate": 4.1005256481557306e-07, |
| "loss": 0.0003234475152567029, |
| "memory(GiB)": 18.17, |
| "reward": 0.40168674290180206, |
| "reward_std": 0.020120804198086262, |
| "rewards/MCQ_Reward/mean": 0.40168674290180206, |
| "rewards/MCQ_Reward/std": 0.09599081426858902, |
| "step": 281, |
| "train_speed(iter/s)": 0.116542 |
| }, |
| { |
| "clip_ratio": 0.0076590063981711864, |
| "epoch": 5.64, |
| "grad_norm": 2.828334331512451, |
| "kl": 0.5009765625, |
| "learning_rate": 4.0693284153418497e-07, |
| "loss": 0.00015916512347757816, |
| "memory(GiB)": 18.17, |
| "step": 282, |
| "train_speed(iter/s)": 0.116903 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 201.5, |
| "completions/mean_length": 121.16015625, |
| "completions/min_length": 71.5, |
| "epoch": 5.66, |
| "grad_norm": 2.985908269882202, |
| "kl": 0.58203125, |
| "learning_rate": 4.038168669843697e-07, |
| "loss": -0.0021479236893355846, |
| "memory(GiB)": 18.17, |
| "reward": 0.4441321939229965, |
| "reward_std": 0.021154197864234447, |
| "rewards/MCQ_Reward/mean": 0.4441321939229965, |
| "rewards/MCQ_Reward/std": 0.10662735998630524, |
| "step": 283, |
| "train_speed(iter/s)": 0.116806 |
| }, |
| { |
| "clip_ratio": 0.00845325831323862, |
| "epoch": 5.68, |
| "grad_norm": 2.2008328437805176, |
| "kl": 0.5703125, |
| "learning_rate": 4.0070476667712736e-07, |
| "loss": -0.0024233213625848293, |
| "memory(GiB)": 18.17, |
| "step": 284, |
| "train_speed(iter/s)": 0.117157 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 192.5, |
| "completions/mean_length": 131.40625, |
| "completions/min_length": 65.5, |
| "epoch": 5.7, |
| "grad_norm": 2.1404271125793457, |
| "kl": 0.609375, |
| "learning_rate": 3.9759666596740473e-07, |
| "loss": 0.009725593030452728, |
| "memory(GiB)": 18.17, |
| "reward": 0.4451696425676346, |
| "reward_std": 0.02477285359054804, |
| "rewards/MCQ_Reward/mean": 0.4451696425676346, |
| "rewards/MCQ_Reward/std": 0.07242370769381523, |
| "step": 285, |
| "train_speed(iter/s)": 0.117116 |
| }, |
| { |
| "clip_ratio": 0.004681814229115844, |
| "epoch": 5.72, |
| "grad_norm": 2.289313316345215, |
| "kl": 0.61328125, |
| "learning_rate": 3.9449269004904516e-07, |
| "loss": 0.009346994571387768, |
| "memory(GiB)": 18.17, |
| "step": 286, |
| "train_speed(iter/s)": 0.117466 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 160.0, |
| "completions/mean_length": 104.30078125, |
| "completions/min_length": 51.0, |
| "epoch": 5.74, |
| "grad_norm": 2.770270347595215, |
| "kl": 1.189453125, |
| "learning_rate": 3.913929639497462e-07, |
| "loss": 0.009477443993091583, |
| "memory(GiB)": 18.17, |
| "reward": 0.43081943690776825, |
| "reward_std": 0.025431891903281212, |
| "rewards/MCQ_Reward/mean": 0.43081943690776825, |
| "rewards/MCQ_Reward/std": 0.10991119593381882, |
| "step": 287, |
| "train_speed(iter/s)": 0.117471 |
| }, |
| { |
| "clip_ratio": 0.006838085595518351, |
| "epoch": 5.76, |
| "grad_norm": 2.8960061073303223, |
| "kl": 1.087890625, |
| "learning_rate": 3.882976125260229e-07, |
| "loss": 0.008670520968735218, |
| "memory(GiB)": 18.17, |
| "step": 288, |
| "train_speed(iter/s)": 0.117827 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 172.5, |
| "completions/mean_length": 114.71484375, |
| "completions/min_length": 56.5, |
| "epoch": 5.78, |
| "grad_norm": 2.4359030723571777, |
| "kl": 0.552734375, |
| "learning_rate": 3.852067604581794e-07, |
| "loss": 0.006409616209566593, |
| "memory(GiB)": 18.17, |
| "reward": 0.41095563769340515, |
| "reward_std": 0.02436618786305189, |
| "rewards/MCQ_Reward/mean": 0.41095563769340515, |
| "rewards/MCQ_Reward/std": 0.09878598526120186, |
| "step": 289, |
| "train_speed(iter/s)": 0.117814 |
| }, |
| { |
| "clip_ratio": 0.007955410983413458, |
| "epoch": 5.8, |
| "grad_norm": 3.950528383255005, |
| "kl": 0.5390625, |
| "learning_rate": 3.821205322452863e-07, |
| "loss": 0.0066283950582146645, |
| "memory(GiB)": 18.17, |
| "step": 290, |
| "train_speed(iter/s)": 0.118161 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 225.5, |
| "completions/mean_length": 134.59375, |
| "completions/min_length": 63.0, |
| "epoch": 5.82, |
| "grad_norm": 2.4326717853546143, |
| "kl": 0.5263671875, |
| "learning_rate": 3.790390522001662e-07, |
| "loss": 0.002648044377565384, |
| "memory(GiB)": 18.17, |
| "reward": 0.4533398002386093, |
| "reward_std": 0.023892495781183243, |
| "rewards/MCQ_Reward/mean": 0.4533398002386093, |
| "rewards/MCQ_Reward/std": 0.08347899466753006, |
| "step": 291, |
| "train_speed(iter/s)": 0.117724 |
| }, |
| { |
| "clip_ratio": 0.004736665170639753, |
| "epoch": 5.84, |
| "grad_norm": 2.2011497020721436, |
| "kl": 0.541015625, |
| "learning_rate": 3.7596244444438574e-07, |
| "loss": 0.002431286498904228, |
| "memory(GiB)": 18.17, |
| "step": 292, |
| "train_speed(iter/s)": 0.118068 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 191.5, |
| "completions/mean_length": 117.24609375, |
| "completions/min_length": 63.5, |
| "epoch": 5.86, |
| "grad_norm": 2.58125376701355, |
| "kl": 0.541015625, |
| "learning_rate": 3.728908329032566e-07, |
| "loss": -0.003335139248520136, |
| "memory(GiB)": 18.17, |
| "reward": 0.4097088426351547, |
| "reward_std": 0.022918211296200752, |
| "rewards/MCQ_Reward/mean": 0.4097088426351547, |
| "rewards/MCQ_Reward/std": 0.1199105829000473, |
| "step": 293, |
| "train_speed(iter/s)": 0.118029 |
| }, |
| { |
| "clip_ratio": 0.007036251947283745, |
| "epoch": 5.88, |
| "grad_norm": 2.4533321857452393, |
| "kl": 0.5625, |
| "learning_rate": 3.6982434130084396e-07, |
| "loss": -0.0037924423813819885, |
| "memory(GiB)": 18.17, |
| "step": 294, |
| "train_speed(iter/s)": 0.118366 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 180.5, |
| "completions/mean_length": 127.00390625, |
| "completions/min_length": 75.0, |
| "epoch": 5.9, |
| "grad_norm": 2.2269814014434814, |
| "kl": 0.5, |
| "learning_rate": 3.6676309315498255e-07, |
| "loss": 0.012001181952655315, |
| "memory(GiB)": 18.17, |
| "reward": 0.42691150307655334, |
| "reward_std": 0.021617514081299305, |
| "rewards/MCQ_Reward/mean": 0.42691150307655334, |
| "rewards/MCQ_Reward/std": 0.11347687244415283, |
| "step": 295, |
| "train_speed(iter/s)": 0.11833 |
| }, |
| { |
| "clip_ratio": 0.004536686465144157, |
| "epoch": 5.92, |
| "grad_norm": 2.593670129776001, |
| "kl": 0.513671875, |
| "learning_rate": 3.6370721177230115e-07, |
| "loss": 0.011945893988013268, |
| "memory(GiB)": 18.17, |
| "step": 296, |
| "train_speed(iter/s)": 0.118674 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 231.5, |
| "completions/mean_length": 123.5234375, |
| "completions/min_length": 71.5, |
| "epoch": 5.9399999999999995, |
| "grad_norm": 2.1928629875183105, |
| "kl": 0.4970703125, |
| "learning_rate": 3.6065682024325617e-07, |
| "loss": 0.015498391352593899, |
| "memory(GiB)": 18.17, |
| "reward": 0.41268619894981384, |
| "reward_std": 0.02419480960816145, |
| "rewards/MCQ_Reward/mean": 0.41268619894981384, |
| "rewards/MCQ_Reward/std": 0.09195958822965622, |
| "step": 297, |
| "train_speed(iter/s)": 0.118532 |
| }, |
| { |
| "clip_ratio": 0.0050865779630839825, |
| "epoch": 5.96, |
| "grad_norm": 2.1392431259155273, |
| "kl": 0.494140625, |
| "learning_rate": 3.5761204143717385e-07, |
| "loss": 0.014891544356942177, |
| "memory(GiB)": 18.17, |
| "step": 298, |
| "train_speed(iter/s)": 0.118872 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 226.5, |
| "completions/mean_length": 124.328125, |
| "completions/min_length": 64.5, |
| "epoch": 5.98, |
| "grad_norm": 2.7249698638916016, |
| "kl": 0.880859375, |
| "learning_rate": 3.5457299799730045e-07, |
| "loss": -0.010070513002574444, |
| "memory(GiB)": 18.17, |
| "reward": 0.4588439464569092, |
| "reward_std": 0.029408703558146954, |
| "rewards/MCQ_Reward/mean": 0.4588439464569092, |
| "rewards/MCQ_Reward/std": 0.09774744883179665, |
| "step": 299, |
| "train_speed(iter/s)": 0.118723 |
| }, |
| { |
| "clip_ratio": 0.01025686739012599, |
| "epoch": 6.0, |
| "grad_norm": 3.8231394290924072, |
| "kl": 0.7529296875, |
| "learning_rate": 3.5153981233586274e-07, |
| "loss": -0.009807607159018517, |
| "memory(GiB)": 18.17, |
| "step": 300, |
| "train_speed(iter/s)": 0.119048 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 185.5, |
| "completions/mean_length": 109.19140625, |
| "completions/min_length": 56.0, |
| "epoch": 6.02, |
| "grad_norm": 2.6895663738250732, |
| "kl": 0.599609375, |
| "learning_rate": 3.485126066291364e-07, |
| "loss": -0.010052207857370377, |
| "memory(GiB)": 18.17, |
| "reward": 0.4080576002597809, |
| "reward_std": 0.02562197484076023, |
| "rewards/MCQ_Reward/mean": 0.4080576002597809, |
| "rewards/MCQ_Reward/std": 0.09971121698617935, |
| "step": 301, |
| "train_speed(iter/s)": 0.118697 |
| }, |
| { |
| "clip_ratio": 0.005149862729012966, |
| "epoch": 6.04, |
| "grad_norm": 2.655897855758667, |
| "kl": 0.607421875, |
| "learning_rate": 3.454915028125263e-07, |
| "loss": -0.010359197854995728, |
| "memory(GiB)": 18.17, |
| "step": 302, |
| "train_speed(iter/s)": 0.11903 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 179.5, |
| "completions/mean_length": 117.90234375, |
| "completions/min_length": 56.5, |
| "epoch": 6.06, |
| "grad_norm": 2.423926591873169, |
| "kl": 0.546875, |
| "learning_rate": 3.4247662257565366e-07, |
| "loss": 0.018125958740711212, |
| "memory(GiB)": 18.17, |
| "reward": 0.4407869875431061, |
| "reward_std": 0.025757532566785812, |
| "rewards/MCQ_Reward/mean": 0.4407869875431061, |
| "rewards/MCQ_Reward/std": 0.12692639231681824, |
| "step": 303, |
| "train_speed(iter/s)": 0.118923 |
| }, |
| { |
| "clip_ratio": 0.00550723378546536, |
| "epoch": 6.08, |
| "grad_norm": 2.2029030323028564, |
| "kl": 0.5546875, |
| "learning_rate": 3.394680873574546e-07, |
| "loss": 0.017929650843143463, |
| "memory(GiB)": 18.17, |
| "step": 304, |
| "train_speed(iter/s)": 0.119254 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 195.0, |
| "completions/mean_length": 124.44140625, |
| "completions/min_length": 54.5, |
| "epoch": 6.1, |
| "grad_norm": 2.3613805770874023, |
| "kl": 0.5703125, |
| "learning_rate": 3.3646601834128916e-07, |
| "loss": -0.007877168245613575, |
| "memory(GiB)": 18.17, |
| "reward": 0.49866482615470886, |
| "reward_std": 0.024780258536338806, |
| "rewards/MCQ_Reward/mean": 0.49866482615470886, |
| "rewards/MCQ_Reward/std": 0.07562171667814255, |
| "step": 305, |
| "train_speed(iter/s)": 0.11921 |
| }, |
| { |
| "clip_ratio": 0.004300985252484679, |
| "epoch": 6.12, |
| "grad_norm": 2.1242995262145996, |
| "kl": 0.576171875, |
| "learning_rate": 3.3347053645005965e-07, |
| "loss": -0.008408917114138603, |
| "memory(GiB)": 18.17, |
| "step": 306, |
| "train_speed(iter/s)": 0.119519 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 159.5, |
| "completions/mean_length": 105.72265625, |
| "completions/min_length": 64.5, |
| "epoch": 6.14, |
| "grad_norm": 2.5641608238220215, |
| "kl": 0.560546875, |
| "learning_rate": 3.3048176234133963e-07, |
| "loss": 0.0034052138216793537, |
| "memory(GiB)": 18.17, |
| "reward": 0.3926085978746414, |
| "reward_std": 0.01911616325378418, |
| "rewards/MCQ_Reward/mean": 0.3926085978746414, |
| "rewards/MCQ_Reward/std": 0.06766298227012157, |
| "step": 307, |
| "train_speed(iter/s)": 0.119522 |
| }, |
| { |
| "clip_ratio": 0.007244990672916174, |
| "epoch": 6.16, |
| "grad_norm": 2.7589051723480225, |
| "kl": 0.572265625, |
| "learning_rate": 3.274998164025148e-07, |
| "loss": 0.0031583395320922136, |
| "memory(GiB)": 18.17, |
| "step": 308, |
| "train_speed(iter/s)": 0.119856 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 236.0, |
| "completions/mean_length": 119.86328125, |
| "completions/min_length": 57.0, |
| "epoch": 6.18, |
| "grad_norm": 2.9221317768096924, |
| "kl": 0.611328125, |
| "learning_rate": 3.245248187459323e-07, |
| "loss": -0.019380319863557816, |
| "memory(GiB)": 18.17, |
| "reward": 0.386982798576355, |
| "reward_std": 0.026672961190342903, |
| "rewards/MCQ_Reward/mean": 0.386982798576355, |
| "rewards/MCQ_Reward/std": 0.10517054051160812, |
| "step": 309, |
| "train_speed(iter/s)": 0.119747 |
| }, |
| { |
| "clip_ratio": 0.005416512954980135, |
| "epoch": 6.2, |
| "grad_norm": 2.7965259552001953, |
| "kl": 0.61328125, |
| "learning_rate": 3.215568892040641e-07, |
| "loss": -0.019356630742549896, |
| "memory(GiB)": 18.17, |
| "step": 310, |
| "train_speed(iter/s)": 0.120077 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 200.5, |
| "completions/mean_length": 118.21484375, |
| "completions/min_length": 57.0, |
| "epoch": 6.22, |
| "grad_norm": 2.8668336868286133, |
| "kl": 0.607421875, |
| "learning_rate": 3.1859614732467954e-07, |
| "loss": -0.013122756965458393, |
| "memory(GiB)": 18.17, |
| "reward": 0.4595968574285507, |
| "reward_std": 0.024624092504382133, |
| "rewards/MCQ_Reward/mean": 0.4595968574285507, |
| "rewards/MCQ_Reward/std": 0.08434771373867989, |
| "step": 311, |
| "train_speed(iter/s)": 0.119696 |
| }, |
| { |
| "clip_ratio": 0.00573662668466568, |
| "epoch": 6.24, |
| "grad_norm": 2.4580280780792236, |
| "kl": 0.609375, |
| "learning_rate": 3.156427123660297e-07, |
| "loss": -0.013560149818658829, |
| "memory(GiB)": 18.17, |
| "step": 312, |
| "train_speed(iter/s)": 0.120023 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 202.5, |
| "completions/mean_length": 121.68359375, |
| "completions/min_length": 73.5, |
| "epoch": 6.26, |
| "grad_norm": 2.6274502277374268, |
| "kl": 0.58984375, |
| "learning_rate": 3.1269670329204393e-07, |
| "loss": 0.0022671520709991455, |
| "memory(GiB)": 18.17, |
| "reward": 0.44664010405540466, |
| "reward_std": 0.024377938359975815, |
| "rewards/MCQ_Reward/mean": 0.44664010405540466, |
| "rewards/MCQ_Reward/std": 0.08575410395860672, |
| "step": 313, |
| "train_speed(iter/s)": 0.119945 |
| }, |
| { |
| "clip_ratio": 0.0052670135628432035, |
| "epoch": 6.28, |
| "grad_norm": 2.753713607788086, |
| "kl": 0.578125, |
| "learning_rate": 3.097582387675385e-07, |
| "loss": 0.0018416689708828926, |
| "memory(GiB)": 18.17, |
| "step": 314, |
| "train_speed(iter/s)": 0.120272 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 207.5, |
| "completions/mean_length": 127.57421875, |
| "completions/min_length": 77.0, |
| "epoch": 6.3, |
| "grad_norm": 2.4003334045410156, |
| "kl": 0.583984375, |
| "learning_rate": 3.068274371534356e-07, |
| "loss": 0.0005114064551889896, |
| "memory(GiB)": 18.17, |
| "reward": 0.44641484320163727, |
| "reward_std": 0.024146192707121372, |
| "rewards/MCQ_Reward/mean": 0.44641484320163727, |
| "rewards/MCQ_Reward/std": 0.08713827468454838, |
| "step": 315, |
| "train_speed(iter/s)": 0.120168 |
| }, |
| { |
| "clip_ratio": 0.008136166725307703, |
| "epoch": 6.32, |
| "grad_norm": 2.3975117206573486, |
| "kl": 0.619140625, |
| "learning_rate": 3.039044165019972e-07, |
| "loss": 0.0004498562775552273, |
| "memory(GiB)": 18.17, |
| "step": 316, |
| "train_speed(iter/s)": 0.120495 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 196.5, |
| "completions/mean_length": 117.9609375, |
| "completions/min_length": 58.5, |
| "epoch": 6.34, |
| "grad_norm": 2.348710060119629, |
| "kl": 0.548828125, |
| "learning_rate": 3.00989294552069e-07, |
| "loss": 0.00850888341665268, |
| "memory(GiB)": 18.17, |
| "reward": 0.42280539870262146, |
| "reward_std": 0.02416596282273531, |
| "rewards/MCQ_Reward/mean": 0.42280539870262146, |
| "rewards/MCQ_Reward/std": 0.0933729000389576, |
| "step": 317, |
| "train_speed(iter/s)": 0.120401 |
| }, |
| { |
| "clip_ratio": 0.005974379135295749, |
| "epoch": 6.36, |
| "grad_norm": 2.630732774734497, |
| "kl": 0.5390625, |
| "learning_rate": 2.9808218872433766e-07, |
| "loss": 0.008482606150209904, |
| "memory(GiB)": 18.17, |
| "step": 318, |
| "train_speed(iter/s)": 0.120723 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 200.0, |
| "completions/mean_length": 123.66796875, |
| "completions/min_length": 75.5, |
| "epoch": 6.38, |
| "grad_norm": 2.1341052055358887, |
| "kl": 0.517578125, |
| "learning_rate": 2.9518321611660234e-07, |
| "loss": -0.0021673766896128654, |
| "memory(GiB)": 18.17, |
| "reward": 0.4051154851913452, |
| "reward_std": 0.020906205289065838, |
| "rewards/MCQ_Reward/mean": 0.4051154851913452, |
| "rewards/MCQ_Reward/std": 0.09874700754880905, |
| "step": 319, |
| "train_speed(iter/s)": 0.12062 |
| }, |
| { |
| "clip_ratio": 0.00719631533138454, |
| "epoch": 6.4, |
| "grad_norm": 3.2350962162017822, |
| "kl": 0.5390625, |
| "learning_rate": 2.922924934990568e-07, |
| "loss": -0.0024176109582185745, |
| "memory(GiB)": 18.17, |
| "step": 320, |
| "train_speed(iter/s)": 0.120919 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 170.0, |
| "completions/mean_length": 117.234375, |
| "completions/min_length": 69.0, |
| "epoch": 6.42, |
| "grad_norm": 74.83729553222656, |
| "kl": 20.791015625, |
| "learning_rate": 2.894101373095867e-07, |
| "loss": 0.04349440336227417, |
| "memory(GiB)": 18.17, |
| "reward": 0.44527527689933777, |
| "reward_std": 0.021908948197960854, |
| "rewards/MCQ_Reward/mean": 0.44527527689933777, |
| "rewards/MCQ_Reward/std": 0.08160104416310787, |
| "step": 321, |
| "train_speed(iter/s)": 0.120602 |
| }, |
| { |
| "clip_ratio": 0.004950069589540362, |
| "epoch": 6.44, |
| "grad_norm": 99.64342498779297, |
| "kl": 26.54296875, |
| "learning_rate": 2.8653626364907914e-07, |
| "loss": 0.04914519935846329, |
| "memory(GiB)": 18.17, |
| "step": 322, |
| "train_speed(iter/s)": 0.120907 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 232.0, |
| "completions/mean_length": 128.45703125, |
| "completions/min_length": 52.5, |
| "epoch": 6.46, |
| "grad_norm": 2.5322988033294678, |
| "kl": 0.529296875, |
| "learning_rate": 2.8367098827674576e-07, |
| "loss": 0.009952299296855927, |
| "memory(GiB)": 18.17, |
| "reward": 0.4740261733531952, |
| "reward_std": 0.023401367478072643, |
| "rewards/MCQ_Reward/mean": 0.4740261733531952, |
| "rewards/MCQ_Reward/std": 0.08106581121683121, |
| "step": 323, |
| "train_speed(iter/s)": 0.12071 |
| }, |
| { |
| "clip_ratio": 0.005782874301075935, |
| "epoch": 6.48, |
| "grad_norm": 2.591923952102661, |
| "kl": 0.53125, |
| "learning_rate": 2.808144266054612e-07, |
| "loss": 0.009899303317070007, |
| "memory(GiB)": 18.17, |
| "step": 324, |
| "train_speed(iter/s)": 0.121029 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 202.5, |
| "completions/mean_length": 133.33203125, |
| "completions/min_length": 81.5, |
| "epoch": 6.5, |
| "grad_norm": 2.113783121109009, |
| "kl": 0.537109375, |
| "learning_rate": 2.779666936971129e-07, |
| "loss": -0.0006487010978162289, |
| "memory(GiB)": 18.17, |
| "reward": 0.39647024869918823, |
| "reward_std": 0.02249709703028202, |
| "rewards/MCQ_Reward/mean": 0.39647024869918823, |
| "rewards/MCQ_Reward/std": 0.0880400650203228, |
| "step": 325, |
| "train_speed(iter/s)": 0.120986 |
| }, |
| { |
| "clip_ratio": 0.006350549403578043, |
| "epoch": 6.52, |
| "grad_norm": 2.4789633750915527, |
| "kl": 0.525390625, |
| "learning_rate": 2.751279042579672e-07, |
| "loss": -0.0002095792442560196, |
| "memory(GiB)": 18.17, |
| "step": 326, |
| "train_speed(iter/s)": 0.121304 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 199.0, |
| "completions/mean_length": 126.234375, |
| "completions/min_length": 54.0, |
| "epoch": 6.54, |
| "grad_norm": 2.4260339736938477, |
| "kl": 0.548828125, |
| "learning_rate": 2.7229817263404864e-07, |
| "loss": -0.0033088945783674717, |
| "memory(GiB)": 18.17, |
| "reward": 0.4554037004709244, |
| "reward_std": 0.02187604457139969, |
| "rewards/MCQ_Reward/mean": 0.4554037004709244, |
| "rewards/MCQ_Reward/std": 0.09804989397525787, |
| "step": 327, |
| "train_speed(iter/s)": 0.121167 |
| }, |
| { |
| "clip_ratio": 0.008008664939552546, |
| "epoch": 6.5600000000000005, |
| "grad_norm": 4.365505695343018, |
| "kl": 0.533203125, |
| "learning_rate": 2.6947761280653447e-07, |
| "loss": -0.00283604022115469, |
| "memory(GiB)": 18.17, |
| "step": 328, |
| "train_speed(iter/s)": 0.121483 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 180.0, |
| "completions/mean_length": 117.9609375, |
| "completions/min_length": 69.5, |
| "epoch": 6.58, |
| "grad_norm": 2.2564356327056885, |
| "kl": 0.5283203125, |
| "learning_rate": 2.6666633838716314e-07, |
| "loss": -0.0077381255105137825, |
| "memory(GiB)": 18.17, |
| "reward": 0.4396722763776779, |
| "reward_std": 0.022700872272253036, |
| "rewards/MCQ_Reward/mean": 0.4396722763776779, |
| "rewards/MCQ_Reward/std": 0.10192850604653358, |
| "step": 329, |
| "train_speed(iter/s)": 0.12143 |
| }, |
| { |
| "clip_ratio": 0.0047557426150888205, |
| "epoch": 6.6, |
| "grad_norm": 2.172281503677368, |
| "kl": 0.5322265625, |
| "learning_rate": 2.638644626136587e-07, |
| "loss": -0.008173219859600067, |
| "memory(GiB)": 18.17, |
| "step": 330, |
| "train_speed(iter/s)": 0.121737 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 183.0, |
| "completions/mean_length": 126.0859375, |
| "completions/min_length": 68.5, |
| "epoch": 6.62, |
| "grad_norm": 2.167248010635376, |
| "kl": 0.4873046875, |
| "learning_rate": 2.610720983451685e-07, |
| "loss": 0.018461888656020164, |
| "memory(GiB)": 18.17, |
| "reward": 0.44843943417072296, |
| "reward_std": 0.02303914539515972, |
| "rewards/MCQ_Reward/mean": 0.44843943417072296, |
| "rewards/MCQ_Reward/std": 0.08497340604662895, |
| "step": 331, |
| "train_speed(iter/s)": 0.121397 |
| }, |
| { |
| "clip_ratio": 0.0052658268250525, |
| "epoch": 6.64, |
| "grad_norm": 2.136260509490967, |
| "kl": 0.4921875, |
| "learning_rate": 2.58289358057718e-07, |
| "loss": 0.01842992939054966, |
| "memory(GiB)": 18.17, |
| "step": 332, |
| "train_speed(iter/s)": 0.121707 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 225.5, |
| "completions/mean_length": 127.5546875, |
| "completions/min_length": 65.5, |
| "epoch": 6.66, |
| "grad_norm": 2.595977306365967, |
| "kl": 0.578125, |
| "learning_rate": 2.555163538396806e-07, |
| "loss": -0.011687211692333221, |
| "memory(GiB)": 18.17, |
| "reward": 0.4103027582168579, |
| "reward_std": 0.02552829496562481, |
| "rewards/MCQ_Reward/mean": 0.4103027582168579, |
| "rewards/MCQ_Reward/std": 0.0971563570201397, |
| "step": 333, |
| "train_speed(iter/s)": 0.1216 |
| }, |
| { |
| "clip_ratio": 0.0067884225863963366, |
| "epoch": 6.68, |
| "grad_norm": 3.2224881649017334, |
| "kl": 0.59765625, |
| "learning_rate": 2.5275319738726165e-07, |
| "loss": -0.011430272832512856, |
| "memory(GiB)": 18.17, |
| "step": 334, |
| "train_speed(iter/s)": 0.121912 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 220.0, |
| "completions/mean_length": 123.2578125, |
| "completions/min_length": 75.0, |
| "epoch": 6.7, |
| "grad_norm": 2.387573480606079, |
| "kl": 0.56640625, |
| "learning_rate": 2.500000000000001e-07, |
| "loss": -0.006422008387744427, |
| "memory(GiB)": 18.17, |
| "reward": 0.4134673774242401, |
| "reward_std": 0.022745592519640923, |
| "rewards/MCQ_Reward/mean": 0.4134673774242401, |
| "rewards/MCQ_Reward/std": 0.10698199272155762, |
| "step": 335, |
| "train_speed(iter/s)": 0.121789 |
| }, |
| { |
| "clip_ratio": 0.007158383261412382, |
| "epoch": 6.72, |
| "grad_norm": 2.7240705490112305, |
| "kl": 0.564453125, |
| "learning_rate": 2.472568725762853e-07, |
| "loss": -0.0065142130479216576, |
| "memory(GiB)": 18.17, |
| "step": 336, |
| "train_speed(iter/s)": 0.122088 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 153.5, |
| "completions/mean_length": 108.890625, |
| "completions/min_length": 63.5, |
| "epoch": 6.74, |
| "grad_norm": 2.2466800212860107, |
| "kl": 0.7421875, |
| "learning_rate": 2.4452392560888976e-07, |
| "loss": -0.00018489733338356018, |
| "memory(GiB)": 18.17, |
| "reward": 0.42812955379486084, |
| "reward_std": 0.0208740271627903, |
| "rewards/MCQ_Reward/mean": 0.42812955379486084, |
| "rewards/MCQ_Reward/std": 0.08048268780112267, |
| "step": 337, |
| "train_speed(iter/s)": 0.12208 |
| }, |
| { |
| "clip_ratio": 0.005281613674014807, |
| "epoch": 6.76, |
| "grad_norm": 2.0434200763702393, |
| "kl": 0.771484375, |
| "learning_rate": 2.418012691805191e-07, |
| "loss": -0.0005159445572644472, |
| "memory(GiB)": 18.17, |
| "step": 338, |
| "train_speed(iter/s)": 0.122388 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 202.0, |
| "completions/mean_length": 117.3984375, |
| "completions/min_length": 65.0, |
| "epoch": 6.78, |
| "grad_norm": 2.669919729232788, |
| "kl": 0.572265625, |
| "learning_rate": 2.390890129593771e-07, |
| "loss": -0.009503326378762722, |
| "memory(GiB)": 18.17, |
| "reward": 0.41273191571235657, |
| "reward_std": 0.023225258104503155, |
| "rewards/MCQ_Reward/mean": 0.41273191571235657, |
| "rewards/MCQ_Reward/std": 0.08152876608073711, |
| "step": 339, |
| "train_speed(iter/s)": 0.122302 |
| }, |
| { |
| "clip_ratio": 0.005108103854581714, |
| "epoch": 6.8, |
| "grad_norm": 2.5069973468780518, |
| "kl": 0.576171875, |
| "learning_rate": 2.3638726619474875e-07, |
| "loss": -0.009927002713084221, |
| "memory(GiB)": 18.17, |
| "step": 340, |
| "train_speed(iter/s)": 0.122605 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 205.5, |
| "completions/mean_length": 121.0703125, |
| "completions/min_length": 66.0, |
| "epoch": 6.82, |
| "grad_norm": 2.5319740772247314, |
| "kl": 0.59765625, |
| "learning_rate": 2.3369613771260005e-07, |
| "loss": 0.004871162120252848, |
| "memory(GiB)": 18.17, |
| "reward": 0.39162860810756683, |
| "reward_std": 0.022268068976700306, |
| "rewards/MCQ_Reward/mean": 0.39162860810756683, |
| "rewards/MCQ_Reward/std": 0.07392172142863274, |
| "step": 341, |
| "train_speed(iter/s)": 0.12225 |
| }, |
| { |
| "clip_ratio": 0.004840584937483072, |
| "epoch": 6.84, |
| "grad_norm": 2.547236204147339, |
| "kl": 0.60546875, |
| "learning_rate": 2.310157359111938e-07, |
| "loss": 0.004931057803332806, |
| "memory(GiB)": 18.17, |
| "step": 342, |
| "train_speed(iter/s)": 0.122534 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 201.5, |
| "completions/mean_length": 124.6328125, |
| "completions/min_length": 67.5, |
| "epoch": 6.86, |
| "grad_norm": 2.610426664352417, |
| "kl": 0.5419921875, |
| "learning_rate": 2.283461687567236e-07, |
| "loss": 0.012133005075156689, |
| "memory(GiB)": 18.17, |
| "reward": 0.38104377686977386, |
| "reward_std": 0.023476887494325638, |
| "rewards/MCQ_Reward/mean": 0.38104377686977386, |
| "rewards/MCQ_Reward/std": 0.13691367208957672, |
| "step": 343, |
| "train_speed(iter/s)": 0.122472 |
| }, |
| { |
| "clip_ratio": 0.005503881955519319, |
| "epoch": 6.88, |
| "grad_norm": 2.517308473587036, |
| "kl": 0.5458984375, |
| "learning_rate": 2.2568754377896515e-07, |
| "loss": 0.012206798419356346, |
| "memory(GiB)": 18.17, |
| "step": 344, |
| "train_speed(iter/s)": 0.122771 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 191.0, |
| "completions/mean_length": 122.8125, |
| "completions/min_length": 54.0, |
| "epoch": 6.9, |
| "grad_norm": 2.268815517425537, |
| "kl": 0.576171875, |
| "learning_rate": 2.2303996806694486e-07, |
| "loss": 0.005438795313239098, |
| "memory(GiB)": 18.17, |
| "reward": 0.41502565145492554, |
| "reward_std": 0.021418385207653046, |
| "rewards/MCQ_Reward/mean": 0.41502565145492554, |
| "rewards/MCQ_Reward/std": 0.09508999437093735, |
| "step": 345, |
| "train_speed(iter/s)": 0.122753 |
| }, |
| { |
| "clip_ratio": 0.005775286350399256, |
| "epoch": 6.92, |
| "grad_norm": 2.83811616897583, |
| "kl": 0.603515625, |
| "learning_rate": 2.2040354826462664e-07, |
| "loss": 0.005799311213195324, |
| "memory(GiB)": 18.17, |
| "step": 346, |
| "train_speed(iter/s)": 0.123049 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 164.5, |
| "completions/mean_length": 116.91796875, |
| "completions/min_length": 65.5, |
| "epoch": 6.9399999999999995, |
| "grad_norm": 2.334526777267456, |
| "kl": 0.564453125, |
| "learning_rate": 2.177783905666155e-07, |
| "loss": 0.0054929498583078384, |
| "memory(GiB)": 18.17, |
| "reward": 0.39654283225536346, |
| "reward_std": 0.022173049859702587, |
| "rewards/MCQ_Reward/mean": 0.39654283225536346, |
| "rewards/MCQ_Reward/std": 0.09505746513605118, |
| "step": 347, |
| "train_speed(iter/s)": 0.123026 |
| }, |
| { |
| "clip_ratio": 0.0045166281051933765, |
| "epoch": 6.96, |
| "grad_norm": 2.271827220916748, |
| "kl": 0.564453125, |
| "learning_rate": 2.151646007138806e-07, |
| "loss": 0.0055296882055699825, |
| "memory(GiB)": 18.17, |
| "step": 348, |
| "train_speed(iter/s)": 0.123296 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 219.0, |
| "completions/mean_length": 130.65625, |
| "completions/min_length": 77.5, |
| "epoch": 6.98, |
| "grad_norm": 2.0946249961853027, |
| "kl": 0.55859375, |
| "learning_rate": 2.125622839894964e-07, |
| "loss": 0.003636482171714306, |
| "memory(GiB)": 18.17, |
| "reward": 0.43836964666843414, |
| "reward_std": 0.021374424919486046, |
| "rewards/MCQ_Reward/mean": 0.43836964666843414, |
| "rewards/MCQ_Reward/std": 0.06100250408053398, |
| "step": 349, |
| "train_speed(iter/s)": 0.123225 |
| }, |
| { |
| "clip_ratio": 0.0046428050845861435, |
| "epoch": 7.0, |
| "grad_norm": 2.23724365234375, |
| "kl": 0.57421875, |
| "learning_rate": 2.0997154521440097e-07, |
| "loss": 0.004051330033689737, |
| "memory(GiB)": 18.17, |
| "step": 350, |
| "train_speed(iter/s)": 0.123516 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 188.0, |
| "completions/mean_length": 121.21484375, |
| "completions/min_length": 72.0, |
| "epoch": 7.02, |
| "grad_norm": 2.815627336502075, |
| "kl": 0.5703125, |
| "learning_rate": 2.0739248874317438e-07, |
| "loss": -0.019233888015151024, |
| "memory(GiB)": 18.17, |
| "reward": 0.4290418028831482, |
| "reward_std": 0.022210314869880676, |
| "rewards/MCQ_Reward/mean": 0.4290418028831482, |
| "rewards/MCQ_Reward/std": 0.06661852076649666, |
| "step": 351, |
| "train_speed(iter/s)": 0.123139 |
| }, |
| { |
| "clip_ratio": 0.00514651439152658, |
| "epoch": 7.04, |
| "grad_norm": 3.0636136531829834, |
| "kl": 0.576171875, |
| "learning_rate": 2.048252184598352e-07, |
| "loss": -0.01901531219482422, |
| "memory(GiB)": 18.17, |
| "step": 352, |
| "train_speed(iter/s)": 0.12342 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 169.0, |
| "completions/mean_length": 112.85546875, |
| "completions/min_length": 62.5, |
| "epoch": 7.06, |
| "grad_norm": 2.700939178466797, |
| "kl": 0.58203125, |
| "learning_rate": 2.0226983777365603e-07, |
| "loss": -0.007234710268676281, |
| "memory(GiB)": 18.17, |
| "reward": 0.43640220165252686, |
| "reward_std": 0.022726435214281082, |
| "rewards/MCQ_Reward/mean": 0.43640220165252686, |
| "rewards/MCQ_Reward/std": 0.08832718059420586, |
| "step": 353, |
| "train_speed(iter/s)": 0.123424 |
| }, |
| { |
| "clip_ratio": 0.00972440093755722, |
| "epoch": 7.08, |
| "grad_norm": 3.0179059505462646, |
| "kl": 0.564453125, |
| "learning_rate": 1.9972644961499853e-07, |
| "loss": -0.007274748291820288, |
| "memory(GiB)": 18.17, |
| "step": 354, |
| "train_speed(iter/s)": 0.123722 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 212.5, |
| "completions/mean_length": 115.6328125, |
| "completions/min_length": 68.0, |
| "epoch": 7.1, |
| "grad_norm": 2.484236240386963, |
| "kl": 0.619140625, |
| "learning_rate": 1.9719515643116674e-07, |
| "loss": 0.015900151804089546, |
| "memory(GiB)": 18.17, |
| "reward": 0.45114465057849884, |
| "reward_std": 0.024738659150898457, |
| "rewards/MCQ_Reward/mean": 0.45114465057849884, |
| "rewards/MCQ_Reward/std": 0.10900644585490227, |
| "step": 355, |
| "train_speed(iter/s)": 0.123607 |
| }, |
| { |
| "clip_ratio": 0.0064309455920010805, |
| "epoch": 7.12, |
| "grad_norm": 3.852499485015869, |
| "kl": 0.607421875, |
| "learning_rate": 1.9467606018228088e-07, |
| "loss": 0.01630295254290104, |
| "memory(GiB)": 18.17, |
| "step": 356, |
| "train_speed(iter/s)": 0.123891 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 195.0, |
| "completions/mean_length": 128.88671875, |
| "completions/min_length": 74.5, |
| "epoch": 7.14, |
| "grad_norm": 2.455781936645508, |
| "kl": 0.5478515625, |
| "learning_rate": 1.9216926233717084e-07, |
| "loss": -0.00730013195425272, |
| "memory(GiB)": 18.17, |
| "reward": 0.4758221060037613, |
| "reward_std": 0.024665928445756435, |
| "rewards/MCQ_Reward/mean": 0.4758221060037613, |
| "rewards/MCQ_Reward/std": 0.0809130035340786, |
| "step": 357, |
| "train_speed(iter/s)": 0.123852 |
| }, |
| { |
| "clip_ratio": 0.00344535568729043, |
| "epoch": 7.16, |
| "grad_norm": 2.2257754802703857, |
| "kl": 0.5576171875, |
| "learning_rate": 1.8967486386928817e-07, |
| "loss": -0.0074045369401574135, |
| "memory(GiB)": 18.17, |
| "step": 358, |
| "train_speed(iter/s)": 0.124151 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 217.0, |
| "completions/mean_length": 130.06640625, |
| "completions/min_length": 67.5, |
| "epoch": 7.18, |
| "grad_norm": 2.7154037952423096, |
| "kl": 0.51171875, |
| "learning_rate": 1.8719296525263923e-07, |
| "loss": 0.019313501194119453, |
| "memory(GiB)": 18.17, |
| "reward": 0.4561205357313156, |
| "reward_std": 0.023944508284330368, |
| "rewards/MCQ_Reward/mean": 0.4561205357313156, |
| "rewards/MCQ_Reward/std": 0.10000644996762276, |
| "step": 359, |
| "train_speed(iter/s)": 0.124074 |
| }, |
| { |
| "clip_ratio": 0.006082270760089159, |
| "epoch": 7.2, |
| "grad_norm": 2.114431381225586, |
| "kl": 0.5234375, |
| "learning_rate": 1.847236664577389e-07, |
| "loss": 0.01907144859433174, |
| "memory(GiB)": 18.17, |
| "step": 360, |
| "train_speed(iter/s)": 0.124368 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 223.5, |
| "completions/mean_length": 130.765625, |
| "completions/min_length": 79.0, |
| "epoch": 7.22, |
| "grad_norm": 2.2248895168304443, |
| "kl": 0.5390625, |
| "learning_rate": 1.8226706694758193e-07, |
| "loss": 0.012620393186807632, |
| "memory(GiB)": 18.17, |
| "reward": 0.44832468032836914, |
| "reward_std": 0.025768463499844074, |
| "rewards/MCQ_Reward/mean": 0.44832468032836914, |
| "rewards/MCQ_Reward/std": 0.09799568355083466, |
| "step": 361, |
| "train_speed(iter/s)": 0.123928 |
| }, |
| { |
| "clip_ratio": 0.006066091358661652, |
| "epoch": 7.24, |
| "grad_norm": 2.5757896900177, |
| "kl": 0.53515625, |
| "learning_rate": 1.7982326567363886e-07, |
| "loss": 0.013028541579842567, |
| "memory(GiB)": 18.17, |
| "step": 362, |
| "train_speed(iter/s)": 0.124219 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 194.5, |
| "completions/mean_length": 122.546875, |
| "completions/min_length": 50.5, |
| "epoch": 7.26, |
| "grad_norm": 2.2651302814483643, |
| "kl": 0.5322265625, |
| "learning_rate": 1.7739236107186857e-07, |
| "loss": 0.009481780230998993, |
| "memory(GiB)": 18.17, |
| "reward": 0.4318048655986786, |
| "reward_std": 0.022731643170118332, |
| "rewards/MCQ_Reward/mean": 0.4318048655986786, |
| "rewards/MCQ_Reward/std": 0.09833444282412529, |
| "step": 363, |
| "train_speed(iter/s)": 0.124163 |
| }, |
| { |
| "clip_ratio": 0.0038783656200394034, |
| "epoch": 7.28, |
| "grad_norm": 2.2316813468933105, |
| "kl": 0.5302734375, |
| "learning_rate": 1.7497445105875374e-07, |
| "loss": 0.009487325325608253, |
| "memory(GiB)": 18.17, |
| "step": 364, |
| "train_speed(iter/s)": 0.124456 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 236.5, |
| "completions/mean_length": 131.63671875, |
| "completions/min_length": 61.5, |
| "epoch": 7.3, |
| "grad_norm": 2.720024347305298, |
| "kl": 0.5517578125, |
| "learning_rate": 1.725696330273575e-07, |
| "loss": 0.0073198857717216015, |
| "memory(GiB)": 18.17, |
| "reward": 0.4407372921705246, |
| "reward_std": 0.019983571954071522, |
| "rewards/MCQ_Reward/mean": 0.4407372921705246, |
| "rewards/MCQ_Reward/std": 0.07775032892823219, |
| "step": 365, |
| "train_speed(iter/s)": 0.124298 |
| }, |
| { |
| "clip_ratio": 0.005759742809459567, |
| "epoch": 7.32, |
| "grad_norm": 2.4700775146484375, |
| "kl": 0.5556640625, |
| "learning_rate": 1.7017800384339924e-07, |
| "loss": 0.00751863420009613, |
| "memory(GiB)": 18.17, |
| "step": 366, |
| "train_speed(iter/s)": 0.124588 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 240.5, |
| "completions/mean_length": 122.73828125, |
| "completions/min_length": 64.5, |
| "epoch": 7.34, |
| "grad_norm": 2.3976547718048096, |
| "kl": 0.541015625, |
| "learning_rate": 1.6779965984135374e-07, |
| "loss": 0.015993405133485794, |
| "memory(GiB)": 18.17, |
| "reward": 0.41162461042404175, |
| "reward_std": 0.020391933619976044, |
| "rewards/MCQ_Reward/mean": 0.41162461042404175, |
| "rewards/MCQ_Reward/std": 0.0841926857829094, |
| "step": 367, |
| "train_speed(iter/s)": 0.124346 |
| }, |
| { |
| "clip_ratio": 0.005305928410962224, |
| "epoch": 7.36, |
| "grad_norm": 2.444512128829956, |
| "kl": 0.546875, |
| "learning_rate": 1.6543469682057104e-07, |
| "loss": 0.016359636560082436, |
| "memory(GiB)": 18.17, |
| "step": 368, |
| "train_speed(iter/s)": 0.124615 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 156.0, |
| "completions/mean_length": 113.90234375, |
| "completions/min_length": 68.5, |
| "epoch": 7.38, |
| "grad_norm": 3.490565299987793, |
| "kl": 0.57421875, |
| "learning_rate": 1.6308321004141607e-07, |
| "loss": -0.0010942098451778293, |
| "memory(GiB)": 18.17, |
| "reward": 0.38713136315345764, |
| "reward_std": 0.021422830410301685, |
| "rewards/MCQ_Reward/mean": 0.38713136315345764, |
| "rewards/MCQ_Reward/std": 0.10617586970329285, |
| "step": 369, |
| "train_speed(iter/s)": 0.124639 |
| }, |
| { |
| "clip_ratio": 0.005288022803142667, |
| "epoch": 7.4, |
| "grad_norm": 2.881525754928589, |
| "kl": 0.564453125, |
| "learning_rate": 1.6074529422143396e-07, |
| "loss": -0.0009173217695206404, |
| "memory(GiB)": 18.17, |
| "step": 370, |
| "train_speed(iter/s)": 0.124914 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 238.5, |
| "completions/mean_length": 139.4375, |
| "completions/min_length": 87.0, |
| "epoch": 7.42, |
| "grad_norm": 2.1569535732269287, |
| "kl": 0.49609375, |
| "learning_rate": 1.5842104353153285e-07, |
| "loss": 0.014979809522628784, |
| "memory(GiB)": 18.17, |
| "reward": 0.4273018389940262, |
| "reward_std": 0.02148488350212574, |
| "rewards/MCQ_Reward/mean": 0.4273018389940262, |
| "rewards/MCQ_Reward/std": 0.13347461819648743, |
| "step": 371, |
| "train_speed(iter/s)": 0.124503 |
| }, |
| { |
| "clip_ratio": 0.006136654410511255, |
| "epoch": 7.44, |
| "grad_norm": 2.3948974609375, |
| "kl": 0.486328125, |
| "learning_rate": 1.561105515921915e-07, |
| "loss": 0.015109008178114891, |
| "memory(GiB)": 18.17, |
| "step": 372, |
| "train_speed(iter/s)": 0.124788 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 176.5, |
| "completions/mean_length": 117.00390625, |
| "completions/min_length": 69.5, |
| "epoch": 7.46, |
| "grad_norm": 2.3135647773742676, |
| "kl": 0.669921875, |
| "learning_rate": 1.5381391146968863e-07, |
| "loss": 0.006555130705237389, |
| "memory(GiB)": 18.17, |
| "reward": 0.4488084018230438, |
| "reward_std": 0.02006101794540882, |
| "rewards/MCQ_Reward/mean": 0.4488084018230438, |
| "rewards/MCQ_Reward/std": 0.07920502312481403, |
| "step": 373, |
| "train_speed(iter/s)": 0.124722 |
| }, |
| { |
| "clip_ratio": 0.007013680646196008, |
| "epoch": 7.48, |
| "grad_norm": 2.962529420852661, |
| "kl": 0.642578125, |
| "learning_rate": 1.5153121567235333e-07, |
| "loss": 0.006604420021176338, |
| "memory(GiB)": 18.17, |
| "step": 374, |
| "train_speed(iter/s)": 0.125001 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 169.0, |
| "completions/mean_length": 107.60546875, |
| "completions/min_length": 53.5, |
| "epoch": 7.5, |
| "grad_norm": 2.731383800506592, |
| "kl": 0.576171875, |
| "learning_rate": 1.492625561468393e-07, |
| "loss": -0.005473949480801821, |
| "memory(GiB)": 18.17, |
| "reward": 0.41762372851371765, |
| "reward_std": 0.019964593462646008, |
| "rewards/MCQ_Reward/mean": 0.41762372851371765, |
| "rewards/MCQ_Reward/std": 0.08107879385352135, |
| "step": 375, |
| "train_speed(iter/s)": 0.124937 |
| }, |
| { |
| "clip_ratio": 0.004663396626710892, |
| "epoch": 7.52, |
| "grad_norm": 2.615187406539917, |
| "kl": 0.576171875, |
| "learning_rate": 1.4700802427442178e-07, |
| "loss": -0.005246948450803757, |
| "memory(GiB)": 18.17, |
| "step": 376, |
| "train_speed(iter/s)": 0.125201 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 176.5, |
| "completions/mean_length": 107.15625, |
| "completions/min_length": 50.5, |
| "epoch": 7.54, |
| "grad_norm": 2.796724557876587, |
| "kl": 0.640625, |
| "learning_rate": 1.4476771086731565e-07, |
| "loss": 0.01410718634724617, |
| "memory(GiB)": 18.17, |
| "reward": 0.4095290005207062, |
| "reward_std": 0.02420712448656559, |
| "rewards/MCQ_Reward/mean": 0.4095290005207062, |
| "rewards/MCQ_Reward/std": 0.07465272396802902, |
| "step": 377, |
| "train_speed(iter/s)": 0.125163 |
| }, |
| { |
| "clip_ratio": 0.006976983975619078, |
| "epoch": 7.5600000000000005, |
| "grad_norm": 2.945889711380005, |
| "kl": 0.66015625, |
| "learning_rate": 1.4254170616501827e-07, |
| "loss": 0.014726857654750347, |
| "memory(GiB)": 18.17, |
| "step": 378, |
| "train_speed(iter/s)": 0.125433 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 174.0, |
| "completions/mean_length": 118.50390625, |
| "completions/min_length": 63.5, |
| "epoch": 7.58, |
| "grad_norm": 2.9761271476745605, |
| "kl": 0.607421875, |
| "learning_rate": 1.4033009983067452e-07, |
| "loss": -0.004153972025960684, |
| "memory(GiB)": 18.17, |
| "reward": 0.42119112610816956, |
| "reward_std": 0.02067422866821289, |
| "rewards/MCQ_Reward/mean": 0.42119112610816956, |
| "rewards/MCQ_Reward/std": 0.0681285560131073, |
| "step": 379, |
| "train_speed(iter/s)": 0.125369 |
| }, |
| { |
| "clip_ratio": 0.0061764034908264875, |
| "epoch": 7.6, |
| "grad_norm": 3.6120944023132324, |
| "kl": 0.6171875, |
| "learning_rate": 1.381329809474649e-07, |
| "loss": -0.0035073161125183105, |
| "memory(GiB)": 18.17, |
| "step": 380, |
| "train_speed(iter/s)": 0.125649 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 194.0, |
| "completions/mean_length": 130.67578125, |
| "completions/min_length": 79.0, |
| "epoch": 7.62, |
| "grad_norm": 2.3507981300354004, |
| "kl": 0.5419921875, |
| "learning_rate": 1.3595043801501794e-07, |
| "loss": -0.0032176347449421883, |
| "memory(GiB)": 18.17, |
| "reward": 0.43415170907974243, |
| "reward_std": 0.021646766923367977, |
| "rewards/MCQ_Reward/mean": 0.43415170907974243, |
| "rewards/MCQ_Reward/std": 0.11485166102647781, |
| "step": 381, |
| "train_speed(iter/s)": 0.125308 |
| }, |
| { |
| "clip_ratio": 0.006046550814062357, |
| "epoch": 7.64, |
| "grad_norm": 2.5917809009552, |
| "kl": 0.541015625, |
| "learning_rate": 1.3378255894584462e-07, |
| "loss": -0.0032573172356933355, |
| "memory(GiB)": 18.17, |
| "step": 382, |
| "train_speed(iter/s)": 0.125575 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 591.0, |
| "completions/mean_length": 111.87109375, |
| "completions/min_length": 62.5, |
| "epoch": 7.66, |
| "grad_norm": 3.2898316383361816, |
| "kl": 0.84375, |
| "learning_rate": 1.3162943106179748e-07, |
| "loss": 0.05431316792964935, |
| "memory(GiB)": 25.14, |
| "reward": 0.4442131072282791, |
| "reward_std": 0.02893070410937071, |
| "rewards/MCQ_Reward/mean": 0.4442131072282791, |
| "rewards/MCQ_Reward/std": 0.0882490873336792, |
| "step": 383, |
| "train_speed(iter/s)": 0.124772 |
| }, |
| { |
| "clip_ratio": 0.005024469457566738, |
| "epoch": 7.68, |
| "grad_norm": 3.0035033226013184, |
| "kl": 0.82421875, |
| "learning_rate": 1.2949114109055414e-07, |
| "loss": 0.054804857820272446, |
| "memory(GiB)": 25.14, |
| "step": 384, |
| "train_speed(iter/s)": 0.125047 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 236.5, |
| "completions/mean_length": 125.80078125, |
| "completions/min_length": 67.0, |
| "epoch": 7.7, |
| "grad_norm": 2.8262860774993896, |
| "kl": 0.55078125, |
| "learning_rate": 1.2736777516212267e-07, |
| "loss": -0.006510823965072632, |
| "memory(GiB)": 25.14, |
| "reward": 0.40428027510643005, |
| "reward_std": 0.025332522578537464, |
| "rewards/MCQ_Reward/mean": 0.40428027510643005, |
| "rewards/MCQ_Reward/std": 0.10921913757920265, |
| "step": 385, |
| "train_speed(iter/s)": 0.124957 |
| }, |
| { |
| "clip_ratio": 0.005720158107578754, |
| "epoch": 7.72, |
| "grad_norm": 2.3165252208709717, |
| "kl": 0.54296875, |
| "learning_rate": 1.2525941880537304e-07, |
| "loss": -0.006398671306669712, |
| "memory(GiB)": 25.14, |
| "step": 386, |
| "train_speed(iter/s)": 0.125223 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 171.5, |
| "completions/mean_length": 115.546875, |
| "completions/min_length": 68.5, |
| "epoch": 7.74, |
| "grad_norm": 2.5941028594970703, |
| "kl": 0.650390625, |
| "learning_rate": 1.2316615694459186e-07, |
| "loss": 0.013789664953947067, |
| "memory(GiB)": 25.14, |
| "reward": 0.4454474151134491, |
| "reward_std": 0.02376528736203909, |
| "rewards/MCQ_Reward/mean": 0.4454474151134491, |
| "rewards/MCQ_Reward/std": 0.07124818488955498, |
| "step": 387, |
| "train_speed(iter/s)": 0.125174 |
| }, |
| { |
| "clip_ratio": 0.00573781062848866, |
| "epoch": 7.76, |
| "grad_norm": 2.886561393737793, |
| "kl": 0.634765625, |
| "learning_rate": 1.2108807389606158e-07, |
| "loss": 0.014278584159910679, |
| "memory(GiB)": 25.14, |
| "step": 388, |
| "train_speed(iter/s)": 0.125449 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 183.0, |
| "completions/mean_length": 121.00390625, |
| "completions/min_length": 57.5, |
| "epoch": 7.78, |
| "grad_norm": 2.2996103763580322, |
| "kl": 0.6171875, |
| "learning_rate": 1.1902525336466462e-07, |
| "loss": 0.012145346030592918, |
| "memory(GiB)": 25.14, |
| "reward": 0.42450854182243347, |
| "reward_std": 0.021244493313133717, |
| "rewards/MCQ_Reward/mean": 0.42450854182243347, |
| "rewards/MCQ_Reward/std": 0.09635130688548088, |
| "step": 389, |
| "train_speed(iter/s)": 0.125399 |
| }, |
| { |
| "clip_ratio": 0.005426776595413685, |
| "epoch": 7.8, |
| "grad_norm": 2.1788930892944336, |
| "kl": 0.62890625, |
| "learning_rate": 1.1697777844051104e-07, |
| "loss": 0.011829939670860767, |
| "memory(GiB)": 25.14, |
| "step": 390, |
| "train_speed(iter/s)": 0.125672 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 216.5, |
| "completions/mean_length": 129.03125, |
| "completions/min_length": 70.0, |
| "epoch": 7.82, |
| "grad_norm": 2.2412619590759277, |
| "kl": 0.53515625, |
| "learning_rate": 1.1494573159559212e-07, |
| "loss": 9.762030094861984e-05, |
| "memory(GiB)": 25.14, |
| "reward": 0.4155340790748596, |
| "reward_std": 0.020521354861557484, |
| "rewards/MCQ_Reward/mean": 0.4155340790748596, |
| "rewards/MCQ_Reward/std": 0.12795967236161232, |
| "step": 391, |
| "train_speed(iter/s)": 0.125325 |
| }, |
| { |
| "clip_ratio": 0.005442213034257293, |
| "epoch": 7.84, |
| "grad_norm": 2.445225954055786, |
| "kl": 0.54296875, |
| "learning_rate": 1.1292919468045875e-07, |
| "loss": 0.0006964541971683502, |
| "memory(GiB)": 25.14, |
| "step": 392, |
| "train_speed(iter/s)": 0.125594 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 180.5, |
| "completions/mean_length": 129.45703125, |
| "completions/min_length": 68.5, |
| "epoch": 7.86, |
| "grad_norm": 2.254128932952881, |
| "kl": 0.607421875, |
| "learning_rate": 1.1092824892092373e-07, |
| "loss": -0.010345934890210629, |
| "memory(GiB)": 25.14, |
| "reward": 0.40340456366539, |
| "reward_std": 0.022636689245700836, |
| "rewards/MCQ_Reward/mean": 0.40340456366539, |
| "rewards/MCQ_Reward/std": 0.09724823385477066, |
| "step": 393, |
| "train_speed(iter/s)": 0.125579 |
| }, |
| { |
| "clip_ratio": 0.004930965369567275, |
| "epoch": 7.88, |
| "grad_norm": 2.3455586433410645, |
| "kl": 0.623046875, |
| "learning_rate": 1.0894297491479043e-07, |
| "loss": -0.009814320132136345, |
| "memory(GiB)": 25.14, |
| "step": 394, |
| "train_speed(iter/s)": 0.125852 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 211.5, |
| "completions/mean_length": 122.03125, |
| "completions/min_length": 72.5, |
| "epoch": 7.9, |
| "grad_norm": 2.7601866722106934, |
| "kl": 0.54296875, |
| "learning_rate": 1.0697345262860635e-07, |
| "loss": 0.011853070929646492, |
| "memory(GiB)": 25.14, |
| "reward": 0.44544240832328796, |
| "reward_std": 0.02559925615787506, |
| "rewards/MCQ_Reward/mean": 0.44544240832328796, |
| "rewards/MCQ_Reward/std": 0.09495911747217178, |
| "step": 395, |
| "train_speed(iter/s)": 0.125762 |
| }, |
| { |
| "clip_ratio": 0.004873325582593679, |
| "epoch": 7.92, |
| "grad_norm": 3.1385254859924316, |
| "kl": 0.541015625, |
| "learning_rate": 1.0501976139444191e-07, |
| "loss": 0.01212891936302185, |
| "memory(GiB)": 25.14, |
| "step": 396, |
| "train_speed(iter/s)": 0.126021 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 186.0, |
| "completions/mean_length": 131.75, |
| "completions/min_length": 80.0, |
| "epoch": 7.9399999999999995, |
| "grad_norm": 2.280336380004883, |
| "kl": 0.59765625, |
| "learning_rate": 1.0308197990669537e-07, |
| "loss": -0.0006723229307681322, |
| "memory(GiB)": 25.14, |
| "reward": 0.3935137987136841, |
| "reward_std": 0.0229948153719306, |
| "rewards/MCQ_Reward/mean": 0.3935137987136841, |
| "rewards/MCQ_Reward/std": 0.09170003235340118, |
| "step": 397, |
| "train_speed(iter/s)": 0.125959 |
| }, |
| { |
| "clip_ratio": 0.009115117136389017, |
| "epoch": 7.96, |
| "grad_norm": 2.6576101779937744, |
| "kl": 0.623046875, |
| "learning_rate": 1.0116018621892236e-07, |
| "loss": -0.0008128315676003695, |
| "memory(GiB)": 25.14, |
| "step": 398, |
| "train_speed(iter/s)": 0.126231 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 195.5, |
| "completions/mean_length": 125.65625, |
| "completions/min_length": 67.0, |
| "epoch": 7.98, |
| "grad_norm": 2.7158310413360596, |
| "kl": 0.58203125, |
| "learning_rate": 9.92544577406923e-08, |
| "loss": 0.006697420962154865, |
| "memory(GiB)": 25.14, |
| "reward": 0.43207649886608124, |
| "reward_std": 0.02400553785264492, |
| "rewards/MCQ_Reward/mean": 0.43207649886608124, |
| "rewards/MCQ_Reward/std": 0.0867740847170353, |
| "step": 399, |
| "train_speed(iter/s)": 0.126178 |
| }, |
| { |
| "clip_ratio": 0.005927033722400665, |
| "epoch": 8.0, |
| "grad_norm": 2.416578769683838, |
| "kl": 0.580078125, |
| "learning_rate": 9.736487123447068e-08, |
| "loss": 0.006666385568678379, |
| "memory(GiB)": 25.14, |
| "step": 400, |
| "train_speed(iter/s)": 0.126428 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 210.0, |
| "completions/mean_length": 128.03515625, |
| "completions/min_length": 68.0, |
| "epoch": 8.02, |
| "grad_norm": 2.4625000953674316, |
| "kl": 0.55078125, |
| "learning_rate": 9.549150281252632e-08, |
| "loss": 0.019197747111320496, |
| "memory(GiB)": 25.14, |
| "reward": 0.41131871938705444, |
| "reward_std": 0.02179474849253893, |
| "rewards/MCQ_Reward/mean": 0.41131871938705444, |
| "rewards/MCQ_Reward/std": 0.0903569795191288, |
| "step": 401, |
| "train_speed(iter/s)": 0.12607 |
| }, |
| { |
| "clip_ratio": 0.004682507831603289, |
| "epoch": 8.04, |
| "grad_norm": 2.4578921794891357, |
| "kl": 0.556640625, |
| "learning_rate": 9.363442793386606e-08, |
| "loss": 0.019492177292704582, |
| "memory(GiB)": 25.14, |
| "step": 402, |
| "train_speed(iter/s)": 0.126333 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 179.5, |
| "completions/mean_length": 124.9453125, |
| "completions/min_length": 65.0, |
| "epoch": 8.06, |
| "grad_norm": 2.380934000015259, |
| "kl": 0.595703125, |
| "learning_rate": 9.179372140119524e-08, |
| "loss": 0.00032033398747444153, |
| "memory(GiB)": 25.14, |
| "reward": 0.45213624835014343, |
| "reward_std": 0.019670803099870682, |
| "rewards/MCQ_Reward/mean": 0.45213624835014343, |
| "rewards/MCQ_Reward/std": 0.05602107755839825, |
| "step": 403, |
| "train_speed(iter/s)": 0.126289 |
| }, |
| { |
| "clip_ratio": 0.005494384560734034, |
| "epoch": 8.08, |
| "grad_norm": 2.2825376987457275, |
| "kl": 0.59765625, |
| "learning_rate": 8.996945735790446e-08, |
| "loss": 0.00025699660181999207, |
| "memory(GiB)": 25.14, |
| "step": 404, |
| "train_speed(iter/s)": 0.126553 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 193.0, |
| "completions/mean_length": 113.30078125, |
| "completions/min_length": 66.0, |
| "epoch": 8.1, |
| "grad_norm": 2.4504525661468506, |
| "kl": 0.65234375, |
| "learning_rate": 8.816170928508365e-08, |
| "loss": 0.005521825514733791, |
| "memory(GiB)": 25.14, |
| "reward": 0.4200716018676758, |
| "reward_std": 0.02163711003959179, |
| "rewards/MCQ_Reward/mean": 0.4200716018676758, |
| "rewards/MCQ_Reward/std": 0.09177059680223465, |
| "step": 405, |
| "train_speed(iter/s)": 0.126487 |
| }, |
| { |
| "clip_ratio": 0.005122944712638855, |
| "epoch": 8.12, |
| "grad_norm": 2.5025854110717773, |
| "kl": 0.65234375, |
| "learning_rate": 8.637054999856147e-08, |
| "loss": 0.005893816705793142, |
| "memory(GiB)": 25.14, |
| "step": 406, |
| "train_speed(iter/s)": 0.126707 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 206.0, |
| "completions/mean_length": 131.84765625, |
| "completions/min_length": 84.0, |
| "epoch": 8.14, |
| "grad_norm": 2.2803900241851807, |
| "kl": 0.677734375, |
| "learning_rate": 8.459605164597267e-08, |
| "loss": 0.002506987191736698, |
| "memory(GiB)": 25.14, |
| "reward": 0.42351874709129333, |
| "reward_std": 0.019920101389288902, |
| "rewards/MCQ_Reward/mean": 0.42351874709129333, |
| "rewards/MCQ_Reward/std": 0.07087348401546478, |
| "step": 407, |
| "train_speed(iter/s)": 0.126629 |
| }, |
| { |
| "clip_ratio": 0.004146608873270452, |
| "epoch": 8.16, |
| "grad_norm": 2.197411060333252, |
| "kl": 0.693359375, |
| "learning_rate": 8.283828570385237e-08, |
| "loss": 0.0028184172697365284, |
| "memory(GiB)": 25.14, |
| "step": 408, |
| "train_speed(iter/s)": 0.126894 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 246.5, |
| "completions/mean_length": 126.35546875, |
| "completions/min_length": 55.0, |
| "epoch": 8.18, |
| "grad_norm": 3.133226156234741, |
| "kl": 0.54296875, |
| "learning_rate": 8.109732297475635e-08, |
| "loss": 0.003347148187458515, |
| "memory(GiB)": 25.14, |
| "reward": 0.4289032816886902, |
| "reward_std": 0.023678142577409744, |
| "rewards/MCQ_Reward/mean": 0.4289032816886902, |
| "rewards/MCQ_Reward/std": 0.08180082961916924, |
| "step": 409, |
| "train_speed(iter/s)": 0.126716 |
| }, |
| { |
| "clip_ratio": 0.004793429281562567, |
| "epoch": 8.2, |
| "grad_norm": 2.647909164428711, |
| "kl": 0.548828125, |
| "learning_rate": 7.937323358440934e-08, |
| "loss": 0.003219081088900566, |
| "memory(GiB)": 25.14, |
| "step": 410, |
| "train_speed(iter/s)": 0.126979 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 200.0, |
| "completions/mean_length": 120.65234375, |
| "completions/min_length": 66.0, |
| "epoch": 8.22, |
| "grad_norm": 2.844910144805908, |
| "kl": 1.08984375, |
| "learning_rate": 7.766608697888094e-08, |
| "loss": 0.00578346848487854, |
| "memory(GiB)": 25.14, |
| "reward": 0.40613003075122833, |
| "reward_std": 0.024234792217612267, |
| "rewards/MCQ_Reward/mean": 0.40613003075122833, |
| "rewards/MCQ_Reward/std": 0.10613492503762245, |
| "step": 411, |
| "train_speed(iter/s)": 0.126628 |
| }, |
| { |
| "clip_ratio": 0.008466396480798721, |
| "epoch": 8.24, |
| "grad_norm": 3.322730779647827, |
| "kl": 1.30859375, |
| "learning_rate": 7.597595192178702e-08, |
| "loss": 0.006200029980391264, |
| "memory(GiB)": 25.14, |
| "step": 412, |
| "train_speed(iter/s)": 0.126892 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 181.0, |
| "completions/mean_length": 120.59375, |
| "completions/min_length": 63.5, |
| "epoch": 8.26, |
| "grad_norm": 3.1121227741241455, |
| "kl": 0.57421875, |
| "learning_rate": 7.430289649152155e-08, |
| "loss": -0.005076010245829821, |
| "memory(GiB)": 25.14, |
| "reward": 0.4349597841501236, |
| "reward_std": 0.022311867214739323, |
| "rewards/MCQ_Reward/mean": 0.4349597841501236, |
| "rewards/MCQ_Reward/std": 0.0992676205933094, |
| "step": 413, |
| "train_speed(iter/s)": 0.126827 |
| }, |
| { |
| "clip_ratio": 0.005325015634298325, |
| "epoch": 8.28, |
| "grad_norm": 3.336932897567749, |
| "kl": 0.5859375, |
| "learning_rate": 7.264698807851327e-08, |
| "loss": -0.004951636306941509, |
| "memory(GiB)": 25.14, |
| "step": 414, |
| "train_speed(iter/s)": 0.127083 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 176.0, |
| "completions/mean_length": 122.34765625, |
| "completions/min_length": 80.0, |
| "epoch": 8.3, |
| "grad_norm": 2.32357120513916, |
| "kl": 0.576171875, |
| "learning_rate": 7.100829338251146e-08, |
| "loss": 0.010018033906817436, |
| "memory(GiB)": 25.14, |
| "reward": 0.46219733357429504, |
| "reward_std": 0.023064136505126953, |
| "rewards/MCQ_Reward/mean": 0.46219733357429504, |
| "rewards/MCQ_Reward/std": 0.10461203381419182, |
| "step": 415, |
| "train_speed(iter/s)": 0.127059 |
| }, |
| { |
| "clip_ratio": 0.004823329858481884, |
| "epoch": 8.32, |
| "grad_norm": 2.399235486984253, |
| "kl": 0.56640625, |
| "learning_rate": 6.938687840989971e-08, |
| "loss": 0.010338631458580494, |
| "memory(GiB)": 25.14, |
| "step": 416, |
| "train_speed(iter/s)": 0.127319 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 205.0, |
| "completions/mean_length": 126.6484375, |
| "completions/min_length": 59.5, |
| "epoch": 8.34, |
| "grad_norm": 2.3096046447753906, |
| "kl": 0.59765625, |
| "learning_rate": 6.778280847103667e-08, |
| "loss": 0.007643429096788168, |
| "memory(GiB)": 25.14, |
| "reward": 0.45115791261196136, |
| "reward_std": 0.026236201636493206, |
| "rewards/MCQ_Reward/mean": 0.45115791261196136, |
| "rewards/MCQ_Reward/std": 0.07101332768797874, |
| "step": 417, |
| "train_speed(iter/s)": 0.127229 |
| }, |
| { |
| "clip_ratio": 0.00613890727981925, |
| "epoch": 8.36, |
| "grad_norm": 2.6392662525177, |
| "kl": 0.599609375, |
| "learning_rate": 6.619614817762536e-08, |
| "loss": 0.00813712365925312, |
| "memory(GiB)": 25.14, |
| "step": 418, |
| "train_speed(iter/s)": 0.127474 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 203.5, |
| "completions/mean_length": 128.6484375, |
| "completions/min_length": 70.5, |
| "epoch": 8.38, |
| "grad_norm": 2.6424126625061035, |
| "kl": 0.5546875, |
| "learning_rate": 6.462696144011148e-08, |
| "loss": 0.01095396839082241, |
| "memory(GiB)": 25.14, |
| "reward": 0.43093007802963257, |
| "reward_std": 0.021352089941501617, |
| "rewards/MCQ_Reward/mean": 0.43093007802963257, |
| "rewards/MCQ_Reward/std": 0.09322765283286572, |
| "step": 419, |
| "train_speed(iter/s)": 0.127401 |
| }, |
| { |
| "clip_ratio": 0.005334047833457589, |
| "epoch": 8.4, |
| "grad_norm": 2.514528751373291, |
| "kl": 0.560546875, |
| "learning_rate": 6.307531146510753e-08, |
| "loss": 0.011139345355331898, |
| "memory(GiB)": 25.14, |
| "step": 420, |
| "train_speed(iter/s)": 0.127655 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 198.0, |
| "completions/mean_length": 121.234375, |
| "completions/min_length": 61.5, |
| "epoch": 8.42, |
| "grad_norm": 2.6931869983673096, |
| "kl": 0.576171875, |
| "learning_rate": 6.154126075284855e-08, |
| "loss": -0.004434285219758749, |
| "memory(GiB)": 25.14, |
| "reward": 0.47386451065540314, |
| "reward_std": 0.02479046955704689, |
| "rewards/MCQ_Reward/mean": 0.47386451065540314, |
| "rewards/MCQ_Reward/std": 0.08362133055925369, |
| "step": 421, |
| "train_speed(iter/s)": 0.127304 |
| }, |
| { |
| "clip_ratio": 0.004985473584383726, |
| "epoch": 8.44, |
| "grad_norm": 2.623483896255493, |
| "kl": 0.5859375, |
| "learning_rate": 6.002487109467347e-08, |
| "loss": -0.004044556524604559, |
| "memory(GiB)": 25.14, |
| "step": 422, |
| "train_speed(iter/s)": 0.12756 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 167.0, |
| "completions/mean_length": 120.3359375, |
| "completions/min_length": 57.0, |
| "epoch": 8.46, |
| "grad_norm": 2.4557580947875977, |
| "kl": 0.54296875, |
| "learning_rate": 5.8526203570536504e-08, |
| "loss": -0.0014804373495280743, |
| "memory(GiB)": 25.14, |
| "reward": 0.38437609374523163, |
| "reward_std": 0.019576413556933403, |
| "rewards/MCQ_Reward/mean": 0.38437609374523163, |
| "rewards/MCQ_Reward/std": 0.08220572769641876, |
| "step": 423, |
| "train_speed(iter/s)": 0.12751 |
| }, |
| { |
| "clip_ratio": 0.005047354847192764, |
| "epoch": 8.48, |
| "grad_norm": 2.414680004119873, |
| "kl": 0.548828125, |
| "learning_rate": 5.70453185465472e-08, |
| "loss": -0.0010703507578000426, |
| "memory(GiB)": 25.14, |
| "step": 424, |
| "train_speed(iter/s)": 0.127763 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 171.0, |
| "completions/mean_length": 109.29296875, |
| "completions/min_length": 59.0, |
| "epoch": 8.5, |
| "grad_norm": 2.3690483570098877, |
| "kl": 0.59375, |
| "learning_rate": 5.5582275672538316e-08, |
| "loss": 0.0056993430480360985, |
| "memory(GiB)": 25.14, |
| "reward": 0.404767170548439, |
| "reward_std": 0.024388392455875874, |
| "rewards/MCQ_Reward/mean": 0.404767170548439, |
| "rewards/MCQ_Reward/std": 0.09245007485151291, |
| "step": 425, |
| "train_speed(iter/s)": 0.127734 |
| }, |
| { |
| "clip_ratio": 0.004816505592316389, |
| "epoch": 8.52, |
| "grad_norm": 2.3456268310546875, |
| "kl": 0.59765625, |
| "learning_rate": 5.4137133879663287e-08, |
| "loss": 0.005467045586556196, |
| "memory(GiB)": 25.14, |
| "step": 426, |
| "train_speed(iter/s)": 0.127977 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 259.5, |
| "completions/mean_length": 131.4375, |
| "completions/min_length": 65.5, |
| "epoch": 8.54, |
| "grad_norm": 2.3816792964935303, |
| "kl": 0.55078125, |
| "learning_rate": 5.270995137802314e-08, |
| "loss": 0.0031818237621337175, |
| "memory(GiB)": 25.14, |
| "reward": 0.38306334614753723, |
| "reward_std": 0.02167375199496746, |
| "rewards/MCQ_Reward/mean": 0.38306334614753723, |
| "rewards/MCQ_Reward/std": 0.12913303077220917, |
| "step": 427, |
| "train_speed(iter/s)": 0.12777 |
| }, |
| { |
| "clip_ratio": 0.005708938697353005, |
| "epoch": 8.56, |
| "grad_norm": 2.7459070682525635, |
| "kl": 0.560546875, |
| "learning_rate": 5.1300785654320886e-08, |
| "loss": 0.0036508457269519567, |
| "memory(GiB)": 25.14, |
| "step": 428, |
| "train_speed(iter/s)": 0.128012 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 216.5, |
| "completions/mean_length": 141.1796875, |
| "completions/min_length": 63.5, |
| "epoch": 8.58, |
| "grad_norm": 2.546011447906494, |
| "kl": 0.560546875, |
| "learning_rate": 4.9909693469546097e-08, |
| "loss": -0.0037225554697215557, |
| "memory(GiB)": 25.14, |
| "reward": 0.4553868919610977, |
| "reward_std": 0.024206943809986115, |
| "rewards/MCQ_Reward/mean": 0.4553868919610977, |
| "rewards/MCQ_Reward/std": 0.10913475230336189, |
| "step": 429, |
| "train_speed(iter/s)": 0.127896 |
| }, |
| { |
| "clip_ratio": 0.005615573842078447, |
| "epoch": 8.6, |
| "grad_norm": 2.4503653049468994, |
| "kl": 0.552734375, |
| "learning_rate": 4.853673085668947e-08, |
| "loss": -0.0035459164064377546, |
| "memory(GiB)": 25.14, |
| "step": 430, |
| "train_speed(iter/s)": 0.128133 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 170.5, |
| "completions/mean_length": 121.25, |
| "completions/min_length": 68.0, |
| "epoch": 8.62, |
| "grad_norm": 2.6130316257476807, |
| "kl": 0.560546875, |
| "learning_rate": 4.718195311848455e-08, |
| "loss": 0.006583400070667267, |
| "memory(GiB)": 25.14, |
| "reward": 0.4170517176389694, |
| "reward_std": 0.022290964610874653, |
| "rewards/MCQ_Reward/mean": 0.4170517176389694, |
| "rewards/MCQ_Reward/std": 0.10183962434530258, |
| "step": 431, |
| "train_speed(iter/s)": 0.12785 |
| }, |
| { |
| "clip_ratio": 0.0055829116608947515, |
| "epoch": 8.64, |
| "grad_norm": 2.6913576126098633, |
| "kl": 0.572265625, |
| "learning_rate": 4.5845414825181394e-08, |
| "loss": 0.006918736733496189, |
| "memory(GiB)": 25.14, |
| "step": 432, |
| "train_speed(iter/s)": 0.128096 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 185.5, |
| "completions/mean_length": 113.8046875, |
| "completions/min_length": 74.0, |
| "epoch": 8.66, |
| "grad_norm": 2.4241960048675537, |
| "kl": 0.6201171875, |
| "learning_rate": 4.452716981234744e-08, |
| "loss": 0.011290742084383965, |
| "memory(GiB)": 25.14, |
| "reward": 0.4250094145536423, |
| "reward_std": 0.022951221093535423, |
| "rewards/MCQ_Reward/mean": 0.4250094145536423, |
| "rewards/MCQ_Reward/std": 0.10084276273846626, |
| "step": 433, |
| "train_speed(iter/s)": 0.128069 |
| }, |
| { |
| "clip_ratio": 0.005609560292214155, |
| "epoch": 8.68, |
| "grad_norm": 2.5790963172912598, |
| "kl": 0.650390625, |
| "learning_rate": 4.322727117869951e-08, |
| "loss": 0.011948860250413418, |
| "memory(GiB)": 25.14, |
| "step": 434, |
| "train_speed(iter/s)": 0.128291 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 196.0, |
| "completions/mean_length": 126.3515625, |
| "completions/min_length": 83.5, |
| "epoch": 8.7, |
| "grad_norm": 2.430708885192871, |
| "kl": 0.5390625, |
| "learning_rate": 4.19457712839652e-08, |
| "loss": -0.008761925622820854, |
| "memory(GiB)": 25.14, |
| "reward": 0.43507225811481476, |
| "reward_std": 0.024821095168590546, |
| "rewards/MCQ_Reward/mean": 0.43507225811481476, |
| "rewards/MCQ_Reward/std": 0.10436990112066269, |
| "step": 435, |
| "train_speed(iter/s)": 0.128196 |
| }, |
| { |
| "clip_ratio": 0.004881069879047573, |
| "epoch": 8.72, |
| "grad_norm": 2.439311981201172, |
| "kl": 0.5400390625, |
| "learning_rate": 4.068272174677334e-08, |
| "loss": -0.00834021344780922, |
| "memory(GiB)": 25.14, |
| "step": 436, |
| "train_speed(iter/s)": 0.128446 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 169.0, |
| "completions/mean_length": 118.14453125, |
| "completions/min_length": 67.5, |
| "epoch": 8.74, |
| "grad_norm": 2.607220411300659, |
| "kl": 0.619140625, |
| "learning_rate": 3.9438173442575e-08, |
| "loss": 0.005073768552392721, |
| "memory(GiB)": 25.14, |
| "reward": 0.4522544592618942, |
| "reward_std": 0.024327417835593224, |
| "rewards/MCQ_Reward/mean": 0.4522544592618942, |
| "rewards/MCQ_Reward/std": 0.08557374030351639, |
| "step": 437, |
| "train_speed(iter/s)": 0.128414 |
| }, |
| { |
| "clip_ratio": 0.005367731209844351, |
| "epoch": 8.76, |
| "grad_norm": 2.472538709640503, |
| "kl": 0.626953125, |
| "learning_rate": 3.821217650159453e-08, |
| "loss": 0.005441693589091301, |
| "memory(GiB)": 25.14, |
| "step": 438, |
| "train_speed(iter/s)": 0.128664 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 177.0, |
| "completions/mean_length": 117.36328125, |
| "completions/min_length": 65.5, |
| "epoch": 8.78, |
| "grad_norm": 2.8752048015594482, |
| "kl": 0.62109375, |
| "learning_rate": 3.700478030680987e-08, |
| "loss": 0.001543362159281969, |
| "memory(GiB)": 25.14, |
| "reward": 0.44734521210193634, |
| "reward_std": 0.02054190542548895, |
| "rewards/MCQ_Reward/mean": 0.44734521210193634, |
| "rewards/MCQ_Reward/std": 0.09018547832965851, |
| "step": 439, |
| "train_speed(iter/s)": 0.128624 |
| }, |
| { |
| "clip_ratio": 0.006753503577783704, |
| "epoch": 8.8, |
| "grad_norm": 2.822502374649048, |
| "kl": 0.625, |
| "learning_rate": 3.581603349196371e-08, |
| "loss": 0.0017494899220764637, |
| "memory(GiB)": 25.14, |
| "step": 440, |
| "train_speed(iter/s)": 0.128861 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 217.5, |
| "completions/mean_length": 117.40234375, |
| "completions/min_length": 62.0, |
| "epoch": 8.82, |
| "grad_norm": 2.5104751586914062, |
| "kl": 0.59375, |
| "learning_rate": 3.464598393960449e-08, |
| "loss": -0.004553473554551601, |
| "memory(GiB)": 25.14, |
| "reward": 0.39943838119506836, |
| "reward_std": 0.023083772510290146, |
| "rewards/MCQ_Reward/mean": 0.39943838119506836, |
| "rewards/MCQ_Reward/std": 0.08860309049487114, |
| "step": 441, |
| "train_speed(iter/s)": 0.128489 |
| }, |
| { |
| "clip_ratio": 0.00470179901458323, |
| "epoch": 8.84, |
| "grad_norm": 2.480741500854492, |
| "kl": 0.58984375, |
| "learning_rate": 3.349467877915746e-08, |
| "loss": -0.004542327020317316, |
| "memory(GiB)": 25.14, |
| "step": 442, |
| "train_speed(iter/s)": 0.128733 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 254.0, |
| "completions/mean_length": 127.69140625, |
| "completions/min_length": 50.0, |
| "epoch": 8.86, |
| "grad_norm": 2.399143934249878, |
| "kl": 0.607421875, |
| "learning_rate": 3.23621643850267e-08, |
| "loss": -0.004238632973283529, |
| "memory(GiB)": 25.14, |
| "reward": 0.40998475253582, |
| "reward_std": 0.02201936673372984, |
| "rewards/MCQ_Reward/mean": 0.40998475253582, |
| "rewards/MCQ_Reward/std": 0.0800128486007452, |
| "step": 443, |
| "train_speed(iter/s)": 0.128561 |
| }, |
| { |
| "clip_ratio": 0.006211797473952174, |
| "epoch": 8.88, |
| "grad_norm": 2.5745253562927246, |
| "kl": 0.603515625, |
| "learning_rate": 3.124848637472688e-08, |
| "loss": -0.003581822384148836, |
| "memory(GiB)": 25.14, |
| "step": 444, |
| "train_speed(iter/s)": 0.128809 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 187.5, |
| "completions/mean_length": 128.41015625, |
| "completions/min_length": 71.0, |
| "epoch": 8.9, |
| "grad_norm": 2.989118814468384, |
| "kl": 0.6640625, |
| "learning_rate": 3.015368960704584e-08, |
| "loss": 0.0020642182789742947, |
| "memory(GiB)": 25.14, |
| "reward": 0.45626600086688995, |
| "reward_std": 0.022524941712617874, |
| "rewards/MCQ_Reward/mean": 0.45626600086688995, |
| "rewards/MCQ_Reward/std": 0.08293722942471504, |
| "step": 445, |
| "train_speed(iter/s)": 0.128751 |
| }, |
| { |
| "clip_ratio": 0.0053639879915863276, |
| "epoch": 8.92, |
| "grad_norm": 2.226865291595459, |
| "kl": 0.65234375, |
| "learning_rate": 2.907781818023769e-08, |
| "loss": 0.0022344959434121847, |
| "memory(GiB)": 25.14, |
| "step": 446, |
| "train_speed(iter/s)": 0.128997 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 198.5, |
| "completions/mean_length": 114.81640625, |
| "completions/min_length": 69.5, |
| "epoch": 8.94, |
| "grad_norm": 2.5736968517303467, |
| "kl": 0.626953125, |
| "learning_rate": 2.8020915430246706e-08, |
| "loss": 0.00543589424341917, |
| "memory(GiB)": 25.14, |
| "reward": 0.4480299800634384, |
| "reward_std": 0.021618574857711792, |
| "rewards/MCQ_Reward/mean": 0.4480299800634384, |
| "rewards/MCQ_Reward/std": 0.08090543001890182, |
| "step": 447, |
| "train_speed(iter/s)": 0.128968 |
| }, |
| { |
| "clip_ratio": 0.005519783589988947, |
| "epoch": 8.96, |
| "grad_norm": 2.7313241958618164, |
| "kl": 0.62890625, |
| "learning_rate": 2.69830239289614e-08, |
| "loss": 0.005457316525280476, |
| "memory(GiB)": 25.14, |
| "step": 448, |
| "train_speed(iter/s)": 0.12921 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 163.0, |
| "completions/mean_length": 114.08203125, |
| "completions/min_length": 69.5, |
| "epoch": 8.98, |
| "grad_norm": 3.3176426887512207, |
| "kl": 0.658203125, |
| "learning_rate": 2.596418548250029e-08, |
| "loss": -0.006901263725012541, |
| "memory(GiB)": 25.14, |
| "reward": 0.4552987068891525, |
| "reward_std": 0.02576339803636074, |
| "rewards/MCQ_Reward/mean": 0.4552987068891525, |
| "rewards/MCQ_Reward/std": 0.09829828701913357, |
| "step": 449, |
| "train_speed(iter/s)": 0.129186 |
| }, |
| { |
| "clip_ratio": 0.005895850248634815, |
| "epoch": 9.0, |
| "grad_norm": 3.1435494422912598, |
| "kl": 0.65625, |
| "learning_rate": 2.4964441129527335e-08, |
| "loss": -0.006242312025278807, |
| "memory(GiB)": 25.14, |
| "step": 450, |
| "train_speed(iter/s)": 0.129418 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 173.5, |
| "completions/mean_length": 107.38671875, |
| "completions/min_length": 61.0, |
| "epoch": 9.02, |
| "grad_norm": 2.6646904945373535, |
| "kl": 0.60546875, |
| "learning_rate": 2.3983831139599286e-08, |
| "loss": 0.006207154132425785, |
| "memory(GiB)": 25.14, |
| "reward": 0.39446285367012024, |
| "reward_std": 0.022946057841181755, |
| "rewards/MCQ_Reward/mean": 0.39446285367012024, |
| "rewards/MCQ_Reward/std": 0.1063094437122345, |
| "step": 451, |
| "train_speed(iter/s)": 0.129116 |
| }, |
| { |
| "clip_ratio": 0.005521278129890561, |
| "epoch": 9.04, |
| "grad_norm": 2.453953504562378, |
| "kl": 0.619140625, |
| "learning_rate": 2.3022395011543682e-08, |
| "loss": 0.006389847490936518, |
| "memory(GiB)": 25.14, |
| "step": 452, |
| "train_speed(iter/s)": 0.129358 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 210.5, |
| "completions/mean_length": 128.57421875, |
| "completions/min_length": 55.0, |
| "epoch": 9.06, |
| "grad_norm": 2.812540054321289, |
| "kl": 0.580078125, |
| "learning_rate": 2.208017147186736e-08, |
| "loss": -0.005320190917700529, |
| "memory(GiB)": 25.14, |
| "reward": 0.41816772520542145, |
| "reward_std": 0.023720718920230865, |
| "rewards/MCQ_Reward/mean": 0.41816772520542145, |
| "rewards/MCQ_Reward/std": 0.11730682849884033, |
| "step": 453, |
| "train_speed(iter/s)": 0.129235 |
| }, |
| { |
| "clip_ratio": 0.005719892680644989, |
| "epoch": 9.08, |
| "grad_norm": 2.8398780822753906, |
| "kl": 0.578125, |
| "learning_rate": 2.1157198473197413e-08, |
| "loss": -0.004547153599560261, |
| "memory(GiB)": 25.14, |
| "step": 454, |
| "train_speed(iter/s)": 0.129473 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 209.5, |
| "completions/mean_length": 121.10546875, |
| "completions/min_length": 61.0, |
| "epoch": 9.1, |
| "grad_norm": 2.6457087993621826, |
| "kl": 0.623046875, |
| "learning_rate": 2.025351319275137e-08, |
| "loss": 0.006458953022956848, |
| "memory(GiB)": 25.14, |
| "reward": 0.4360807240009308, |
| "reward_std": 0.023424276150763035, |
| "rewards/MCQ_Reward/mean": 0.4360807240009308, |
| "rewards/MCQ_Reward/std": 0.08403830602765083, |
| "step": 455, |
| "train_speed(iter/s)": 0.129418 |
| }, |
| { |
| "clip_ratio": 0.007413617800921202, |
| "epoch": 9.12, |
| "grad_norm": 3.019871473312378, |
| "kl": 0.615234375, |
| "learning_rate": 1.936915203084055e-08, |
| "loss": 0.007484931964427233, |
| "memory(GiB)": 25.14, |
| "step": 456, |
| "train_speed(iter/s)": 0.129657 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 180.5, |
| "completions/mean_length": 115.48828125, |
| "completions/min_length": 62.0, |
| "epoch": 9.14, |
| "grad_norm": 2.869127035140991, |
| "kl": 0.5703125, |
| "learning_rate": 1.8504150609403856e-08, |
| "loss": 0.002277131425216794, |
| "memory(GiB)": 25.14, |
| "reward": 0.42605504393577576, |
| "reward_std": 0.02147796005010605, |
| "rewards/MCQ_Reward/mean": 0.42605504393577576, |
| "rewards/MCQ_Reward/std": 0.09400845319032669, |
| "step": 457, |
| "train_speed(iter/s)": 0.129623 |
| }, |
| { |
| "clip_ratio": 0.00495463190600276, |
| "epoch": 9.16, |
| "grad_norm": 2.7837038040161133, |
| "kl": 0.564453125, |
| "learning_rate": 1.7658543770572186e-08, |
| "loss": 0.0023261206224560738, |
| "memory(GiB)": 25.14, |
| "step": 458, |
| "train_speed(iter/s)": 0.129859 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 226.5, |
| "completions/mean_length": 131.125, |
| "completions/min_length": 63.0, |
| "epoch": 9.18, |
| "grad_norm": 2.4485437870025635, |
| "kl": 0.564453125, |
| "learning_rate": 1.683236557526574e-08, |
| "loss": -0.001264197751879692, |
| "memory(GiB)": 25.14, |
| "reward": 0.43159276247024536, |
| "reward_std": 0.02392040565609932, |
| "rewards/MCQ_Reward/mean": 0.43159276247024536, |
| "rewards/MCQ_Reward/std": 0.10159046202898026, |
| "step": 459, |
| "train_speed(iter/s)": 0.129693 |
| }, |
| { |
| "clip_ratio": 0.004053628304973245, |
| "epoch": 9.2, |
| "grad_norm": 2.3056235313415527, |
| "kl": 0.5625, |
| "learning_rate": 1.6025649301821875e-08, |
| "loss": -0.000987461768090725, |
| "memory(GiB)": 25.14, |
| "step": 460, |
| "train_speed(iter/s)": 0.129933 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 181.5, |
| "completions/mean_length": 113.18359375, |
| "completions/min_length": 65.5, |
| "epoch": 9.22, |
| "grad_norm": 2.3913767337799072, |
| "kl": 0.544921875, |
| "learning_rate": 1.5238427444654367e-08, |
| "loss": 0.012515128590166569, |
| "memory(GiB)": 25.14, |
| "reward": 0.4141518771648407, |
| "reward_std": 0.019386641681194305, |
| "rewards/MCQ_Reward/mean": 0.4141518771648407, |
| "rewards/MCQ_Reward/std": 0.09657716751098633, |
| "step": 461, |
| "train_speed(iter/s)": 0.129665 |
| }, |
| { |
| "clip_ratio": 0.005686681717634201, |
| "epoch": 9.24, |
| "grad_norm": 2.5303232669830322, |
| "kl": 0.544921875, |
| "learning_rate": 1.4470731712944883e-08, |
| "loss": 0.013128566555678844, |
| "memory(GiB)": 25.14, |
| "step": 462, |
| "train_speed(iter/s)": 0.129891 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 173.0, |
| "completions/mean_length": 113.08984375, |
| "completions/min_length": 68.0, |
| "epoch": 9.26, |
| "grad_norm": 2.9452006816864014, |
| "kl": 0.578125, |
| "learning_rate": 1.3722593029365459e-08, |
| "loss": 0.01786494255065918, |
| "memory(GiB)": 25.14, |
| "reward": 0.4347621351480484, |
| "reward_std": 0.023103663697838783, |
| "rewards/MCQ_Reward/mean": 0.4347621351480484, |
| "rewards/MCQ_Reward/std": 0.10107803344726562, |
| "step": 463, |
| "train_speed(iter/s)": 0.129821 |
| }, |
| { |
| "clip_ratio": 0.004837532993406057, |
| "epoch": 9.28, |
| "grad_norm": 3.270838499069214, |
| "kl": 0.576171875, |
| "learning_rate": 1.2994041528833267e-08, |
| "loss": 0.01855536922812462, |
| "memory(GiB)": 25.14, |
| "step": 464, |
| "train_speed(iter/s)": 0.130055 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 197.0, |
| "completions/mean_length": 130.0703125, |
| "completions/min_length": 61.0, |
| "epoch": 9.3, |
| "grad_norm": 2.5287396907806396, |
| "kl": 0.5703125, |
| "learning_rate": 1.2285106557296476e-08, |
| "loss": -0.009716257452964783, |
| "memory(GiB)": 25.14, |
| "reward": 0.4242394268512726, |
| "reward_std": 0.024817454628646374, |
| "rewards/MCQ_Reward/mean": 0.4242394268512726, |
| "rewards/MCQ_Reward/std": 0.11753027141094208, |
| "step": 465, |
| "train_speed(iter/s)": 0.129996 |
| }, |
| { |
| "clip_ratio": 0.0049513031262904406, |
| "epoch": 9.32, |
| "grad_norm": 2.6941351890563965, |
| "kl": 0.56640625, |
| "learning_rate": 1.1595816670552428e-08, |
| "loss": -0.009578550234436989, |
| "memory(GiB)": 25.14, |
| "step": 466, |
| "train_speed(iter/s)": 0.130232 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 176.0, |
| "completions/mean_length": 120.80859375, |
| "completions/min_length": 79.0, |
| "epoch": 9.34, |
| "grad_norm": 2.4061837196350098, |
| "kl": 0.580078125, |
| "learning_rate": 1.0926199633097154e-08, |
| "loss": 0.009803004562854767, |
| "memory(GiB)": 25.14, |
| "reward": 0.4236748516559601, |
| "reward_std": 0.020633171312510967, |
| "rewards/MCQ_Reward/mean": 0.4236748516559601, |
| "rewards/MCQ_Reward/std": 0.10525783523917198, |
| "step": 467, |
| "train_speed(iter/s)": 0.130202 |
| }, |
| { |
| "clip_ratio": 0.0038570521865040064, |
| "epoch": 9.36, |
| "grad_norm": 2.538754463195801, |
| "kl": 0.576171875, |
| "learning_rate": 1.0276282417007399e-08, |
| "loss": 0.010506462305784225, |
| "memory(GiB)": 25.14, |
| "step": 468, |
| "train_speed(iter/s)": 0.130419 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 163.0, |
| "completions/mean_length": 115.359375, |
| "completions/min_length": 72.5, |
| "epoch": 9.38, |
| "grad_norm": 2.767404317855835, |
| "kl": 0.58203125, |
| "learning_rate": 9.646091200853801e-09, |
| "loss": 0.002447181846946478, |
| "memory(GiB)": 25.14, |
| "reward": 0.4558543264865875, |
| "reward_std": 0.023351009003818035, |
| "rewards/MCQ_Reward/mean": 0.4558543264865875, |
| "rewards/MCQ_Reward/std": 0.10045822337269783, |
| "step": 469, |
| "train_speed(iter/s)": 0.130376 |
| }, |
| { |
| "clip_ratio": 0.003978088265284896, |
| "epoch": 9.4, |
| "grad_norm": 2.3947746753692627, |
| "kl": 0.58984375, |
| "learning_rate": 9.035651368646646e-09, |
| "loss": 0.0025905624497681856, |
| "memory(GiB)": 25.14, |
| "step": 470, |
| "train_speed(iter/s)": 0.130609 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 199.0, |
| "completions/mean_length": 120.05078125, |
| "completions/min_length": 61.0, |
| "epoch": 9.42, |
| "grad_norm": 2.2213082313537598, |
| "kl": 0.595703125, |
| "learning_rate": 8.44498750881345e-09, |
| "loss": 0.022836437448859215, |
| "memory(GiB)": 25.14, |
| "reward": 0.4252375066280365, |
| "reward_std": 0.02044745907187462, |
| "rewards/MCQ_Reward/mean": 0.4252375066280365, |
| "rewards/MCQ_Reward/std": 0.0874844454228878, |
| "step": 471, |
| "train_speed(iter/s)": 0.130308 |
| }, |
| { |
| "clip_ratio": 0.004947596346028149, |
| "epoch": 9.44, |
| "grad_norm": 2.374445676803589, |
| "kl": 0.599609375, |
| "learning_rate": 7.874123413208145e-09, |
| "loss": 0.02313510701060295, |
| "memory(GiB)": 25.14, |
| "step": 472, |
| "train_speed(iter/s)": 0.130541 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 202.5, |
| "completions/mean_length": 122.046875, |
| "completions/min_length": 59.0, |
| "epoch": 9.46, |
| "grad_norm": 2.6664299964904785, |
| "kl": 0.626953125, |
| "learning_rate": 7.323082076153508e-09, |
| "loss": 0.0047410172410309315, |
| "memory(GiB)": 25.14, |
| "reward": 0.42370498180389404, |
| "reward_std": 0.021436103619635105, |
| "rewards/MCQ_Reward/mean": 0.42370498180389404, |
| "rewards/MCQ_Reward/std": 0.11163535714149475, |
| "step": 473, |
| "train_speed(iter/s)": 0.130462 |
| }, |
| { |
| "clip_ratio": 0.005457588471472263, |
| "epoch": 9.48, |
| "grad_norm": 2.7726047039031982, |
| "kl": 0.626953125, |
| "learning_rate": 6.791885693514132e-09, |
| "loss": 0.005159153137356043, |
| "memory(GiB)": 25.14, |
| "step": 474, |
| "train_speed(iter/s)": 0.130692 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 244.5, |
| "completions/mean_length": 136.453125, |
| "completions/min_length": 83.0, |
| "epoch": 9.5, |
| "grad_norm": 2.2565746307373047, |
| "kl": 0.595703125, |
| "learning_rate": 6.280555661802856e-09, |
| "loss": 0.011247138492763042, |
| "memory(GiB)": 25.14, |
| "reward": 0.4296618103981018, |
| "reward_std": 0.021635888144373894, |
| "rewards/MCQ_Reward/mean": 0.4296618103981018, |
| "rewards/MCQ_Reward/std": 0.06789225153625011, |
| "step": 475, |
| "train_speed(iter/s)": 0.130512 |
| }, |
| { |
| "clip_ratio": 0.005767492577433586, |
| "epoch": 9.52, |
| "grad_norm": 2.250284433364868, |
| "kl": 0.6015625, |
| "learning_rate": 5.789112577318789e-09, |
| "loss": 0.011374367401003838, |
| "memory(GiB)": 25.14, |
| "step": 476, |
| "train_speed(iter/s)": 0.130746 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 185.5, |
| "completions/mean_length": 118.5703125, |
| "completions/min_length": 73.5, |
| "epoch": 9.54, |
| "grad_norm": 2.5178654193878174, |
| "kl": 0.728515625, |
| "learning_rate": 5.317576235317756e-09, |
| "loss": 0.007045174017548561, |
| "memory(GiB)": 25.14, |
| "reward": 0.44049952924251556, |
| "reward_std": 0.02334336470812559, |
| "rewards/MCQ_Reward/mean": 0.44049952924251556, |
| "rewards/MCQ_Reward/std": 0.0808117426931858, |
| "step": 477, |
| "train_speed(iter/s)": 0.130671 |
| }, |
| { |
| "clip_ratio": 0.004105736967176199, |
| "epoch": 9.56, |
| "grad_norm": 2.5065832138061523, |
| "kl": 0.6953125, |
| "learning_rate": 4.865965629214819e-09, |
| "loss": 0.007527303881943226, |
| "memory(GiB)": 25.14, |
| "step": 478, |
| "train_speed(iter/s)": 0.130887 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 191.0, |
| "completions/mean_length": 117.73046875, |
| "completions/min_length": 75.0, |
| "epoch": 9.58, |
| "grad_norm": 3.128554105758667, |
| "kl": 0.59765625, |
| "learning_rate": 4.434298949819448e-09, |
| "loss": -0.021542608737945557, |
| "memory(GiB)": 25.14, |
| "reward": 0.4070900082588196, |
| "reward_std": 0.023668975569307804, |
| "rewards/MCQ_Reward/mean": 0.4070900082588196, |
| "rewards/MCQ_Reward/std": 0.08471970073878765, |
| "step": 479, |
| "train_speed(iter/s)": 0.130803 |
| }, |
| { |
| "clip_ratio": 0.00539792119525373, |
| "epoch": 9.6, |
| "grad_norm": 3.067028045654297, |
| "kl": 0.59765625, |
| "learning_rate": 4.022593584602329e-09, |
| "loss": -0.02082860842347145, |
| "memory(GiB)": 25.14, |
| "step": 480, |
| "train_speed(iter/s)": 0.131034 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 251.0, |
| "completions/mean_length": 130.140625, |
| "completions/min_length": 54.0, |
| "epoch": 9.62, |
| "grad_norm": 2.8921902179718018, |
| "kl": 0.59375, |
| "learning_rate": 3.6308661169957565e-09, |
| "loss": -0.0016225441358983517, |
| "memory(GiB)": 25.14, |
| "reward": 0.42697805166244507, |
| "reward_std": 0.0217811968177557, |
| "rewards/MCQ_Reward/mean": 0.42697805166244507, |
| "rewards/MCQ_Reward/std": 0.0660354271531105, |
| "step": 481, |
| "train_speed(iter/s)": 0.130674 |
| }, |
| { |
| "clip_ratio": 0.007906233426183462, |
| "epoch": 9.64, |
| "grad_norm": 2.9274981021881104, |
| "kl": 0.595703125, |
| "learning_rate": 3.2591323257248894e-09, |
| "loss": -0.0016696015372872353, |
| "memory(GiB)": 25.14, |
| "step": 482, |
| "train_speed(iter/s)": 0.130879 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 222.0, |
| "completions/mean_length": 135.94921875, |
| "completions/min_length": 71.0, |
| "epoch": 9.66, |
| "grad_norm": 2.4433958530426025, |
| "kl": 0.5546875, |
| "learning_rate": 2.9074071841727054e-09, |
| "loss": 0.019563939422369003, |
| "memory(GiB)": 25.14, |
| "reward": 0.42691025137901306, |
| "reward_std": 0.020791654475033283, |
| "rewards/MCQ_Reward/mean": 0.42691025137901306, |
| "rewards/MCQ_Reward/std": 0.0828494131565094, |
| "step": 483, |
| "train_speed(iter/s)": 0.13078 |
| }, |
| { |
| "clip_ratio": 0.004861004883423448, |
| "epoch": 9.68, |
| "grad_norm": 2.2269864082336426, |
| "kl": 0.55859375, |
| "learning_rate": 2.5757048597765395e-09, |
| "loss": 0.019545655697584152, |
| "memory(GiB)": 25.14, |
| "step": 484, |
| "train_speed(iter/s)": 0.131008 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 199.0, |
| "completions/mean_length": 141.06640625, |
| "completions/min_length": 89.0, |
| "epoch": 9.7, |
| "grad_norm": 2.19620418548584, |
| "kl": 0.513671875, |
| "learning_rate": 2.2640387134577053e-09, |
| "loss": 0.010847845114767551, |
| "memory(GiB)": 25.14, |
| "reward": 0.42219071090221405, |
| "reward_std": 0.022757427766919136, |
| "rewards/MCQ_Reward/mean": 0.42219071090221405, |
| "rewards/MCQ_Reward/std": 0.0853536631911993, |
| "step": 485, |
| "train_speed(iter/s)": 0.130923 |
| }, |
| { |
| "clip_ratio": 0.006320674438029528, |
| "epoch": 9.72, |
| "grad_norm": 2.1190598011016846, |
| "kl": 0.5048828125, |
| "learning_rate": 1.9724212990830936e-09, |
| "loss": 0.010512834414839745, |
| "memory(GiB)": 25.14, |
| "step": 486, |
| "train_speed(iter/s)": 0.13115 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 180.5, |
| "completions/mean_length": 111.921875, |
| "completions/min_length": 63.5, |
| "epoch": 9.74, |
| "grad_norm": 2.5479891300201416, |
| "kl": 0.59765625, |
| "learning_rate": 1.7008643629596864e-09, |
| "loss": -0.008141995407640934, |
| "memory(GiB)": 25.14, |
| "reward": 0.41020119190216064, |
| "reward_std": 0.022871771827340126, |
| "rewards/MCQ_Reward/mean": 0.41020119190216064, |
| "rewards/MCQ_Reward/std": 0.10586465150117874, |
| "step": 487, |
| "train_speed(iter/s)": 0.131123 |
| }, |
| { |
| "clip_ratio": 0.004743925994262099, |
| "epoch": 9.76, |
| "grad_norm": 2.7629165649414062, |
| "kl": 0.591796875, |
| "learning_rate": 1.4493788433612708e-09, |
| "loss": -0.008076684549450874, |
| "memory(GiB)": 25.14, |
| "step": 488, |
| "train_speed(iter/s)": 0.131348 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 177.0, |
| "completions/mean_length": 116.10546875, |
| "completions/min_length": 67.0, |
| "epoch": 9.78, |
| "grad_norm": 2.770082950592041, |
| "kl": 0.576171875, |
| "learning_rate": 1.217974870087901e-09, |
| "loss": 0.010374639183282852, |
| "memory(GiB)": 25.14, |
| "reward": 0.47805055975914, |
| "reward_std": 0.023321266286075115, |
| "rewards/MCQ_Reward/mean": 0.47805055975914, |
| "rewards/MCQ_Reward/std": 0.1008174680173397, |
| "step": 489, |
| "train_speed(iter/s)": 0.131298 |
| }, |
| { |
| "clip_ratio": 0.005443725967779756, |
| "epoch": 9.8, |
| "grad_norm": 2.5658154487609863, |
| "kl": 0.583984375, |
| "learning_rate": 1.0066617640578368e-09, |
| "loss": 0.010389911010861397, |
| "memory(GiB)": 25.14, |
| "step": 490, |
| "train_speed(iter/s)": 0.131523 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 181.5, |
| "completions/mean_length": 128.69921875, |
| "completions/min_length": 71.5, |
| "epoch": 9.82, |
| "grad_norm": 2.3105576038360596, |
| "kl": 0.90625, |
| "learning_rate": 8.154480369321759e-10, |
| "loss": -0.004896960221230984, |
| "memory(GiB)": 25.14, |
| "reward": 0.43206796050071716, |
| "reward_std": 0.02110449317842722, |
| "rewards/MCQ_Reward/mean": 0.43206796050071716, |
| "rewards/MCQ_Reward/std": 0.10026764124631882, |
| "step": 491, |
| "train_speed(iter/s)": 0.13119 |
| }, |
| { |
| "clip_ratio": 0.004017886472865939, |
| "epoch": 9.84, |
| "grad_norm": 2.2543957233428955, |
| "kl": 0.892578125, |
| "learning_rate": 6.443413907720186e-10, |
| "loss": -0.004858216270804405, |
| "memory(GiB)": 25.14, |
| "step": 492, |
| "train_speed(iter/s)": 0.131415 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 221.5, |
| "completions/mean_length": 131.3203125, |
| "completions/min_length": 58.0, |
| "epoch": 9.86, |
| "grad_norm": 2.459817409515381, |
| "kl": 0.5390625, |
| "learning_rate": 4.933487177280482e-10, |
| "loss": 0.0025399066507816315, |
| "memory(GiB)": 25.14, |
| "reward": 0.47691330313682556, |
| "reward_std": 0.022764784283936024, |
| "rewards/MCQ_Reward/mean": 0.47691330313682556, |
| "rewards/MCQ_Reward/std": 0.09778410196304321, |
| "step": 493, |
| "train_speed(iter/s)": 0.131346 |
| }, |
| { |
| "clip_ratio": 0.004864038084633648, |
| "epoch": 9.88, |
| "grad_norm": 2.518949508666992, |
| "kl": 0.537109375, |
| "learning_rate": 3.6247609976319817e-10, |
| "loss": 0.0027223415672779083, |
| "memory(GiB)": 25.14, |
| "step": 494, |
| "train_speed(iter/s)": 0.131569 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 184.0, |
| "completions/mean_length": 113.7421875, |
| "completions/min_length": 57.5, |
| "epoch": 9.9, |
| "grad_norm": 2.7932207584381104, |
| "kl": 0.640625, |
| "learning_rate": 2.517288084074587e-10, |
| "loss": -0.008804459124803543, |
| "memory(GiB)": 25.14, |
| "reward": 0.45272429287433624, |
| "reward_std": 0.02382285613566637, |
| "rewards/MCQ_Reward/mean": 0.45272429287433624, |
| "rewards/MCQ_Reward/std": 0.08811983093619347, |
| "step": 495, |
| "train_speed(iter/s)": 0.13153 |
| }, |
| { |
| "clip_ratio": 0.005316317779943347, |
| "epoch": 9.92, |
| "grad_norm": 2.3468141555786133, |
| "kl": 0.634765625, |
| "learning_rate": 1.6111130454543597e-10, |
| "loss": -0.00884802732616663, |
| "memory(GiB)": 25.14, |
| "step": 496, |
| "train_speed(iter/s)": 0.131752 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 175.0, |
| "completions/mean_length": 111.8515625, |
| "completions/min_length": 57.5, |
| "epoch": 9.94, |
| "grad_norm": 2.973198413848877, |
| "kl": 0.642578125, |
| "learning_rate": 9.06272382371065e-11, |
| "loss": 0.002287194598466158, |
| "memory(GiB)": 25.14, |
| "reward": 0.4001469016075134, |
| "reward_std": 0.0235411636531353, |
| "rewards/MCQ_Reward/mean": 0.4001469016075134, |
| "rewards/MCQ_Reward/std": 0.07189228385686874, |
| "step": 497, |
| "train_speed(iter/s)": 0.131698 |
| }, |
| { |
| "clip_ratio": 0.0034996896283701062, |
| "epoch": 9.96, |
| "grad_norm": 3.0021812915802, |
| "kl": 0.6484375, |
| "learning_rate": 4.0279448570323946e-11, |
| "loss": 0.002919801976531744, |
| "memory(GiB)": 25.14, |
| "step": 498, |
| "train_speed(iter/s)": 0.131924 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 225.0, |
| "completions/mean_length": 135.265625, |
| "completions/min_length": 68.5, |
| "epoch": 9.98, |
| "grad_norm": 2.244234085083008, |
| "kl": 0.55078125, |
| "learning_rate": 1.0069963546743831e-11, |
| "loss": -0.0014414777979254723, |
| "memory(GiB)": 25.14, |
| "reward": 0.46473294496536255, |
| "reward_std": 0.02351410035043955, |
| "rewards/MCQ_Reward/mean": 0.46473294496536255, |
| "rewards/MCQ_Reward/std": 0.06907243467867374, |
| "step": 499, |
| "train_speed(iter/s)": 0.131777 |
| }, |
| { |
| "clip_ratio": 0.0020644072210416198, |
| "epoch": 10.0, |
| "grad_norm": 2.3687548637390137, |
| "kl": 0.55078125, |
| "learning_rate": 0.0, |
| "loss": -0.0014774189330637455, |
| "memory(GiB)": 25.14, |
| "step": 500, |
| "train_speed(iter/s)": 0.131993 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 500, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 10, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|