LLucass committed on Jun 7, 2025

Commit

28bad33

verified ·

1 Parent(s): 8ce4bf7

Training in progress, step 100, checkpoint

Browse files

Files changed (21) hide show

.gitattributes +1 -0
checkpoint-100/config.json +29 -0
checkpoint-100/generation_config.json +9 -0
checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
checkpoint-100/global_step100/mp_rank_00_model_states.pt +3 -0
checkpoint-100/latest +1 -0
checkpoint-100/model.safetensors +3 -0
checkpoint-100/rng_state_0.pth +3 -0
checkpoint-100/rng_state_1.pth +3 -0
checkpoint-100/rng_state_2.pth +3 -0
checkpoint-100/rng_state_3.pth +3 -0
checkpoint-100/scheduler.pt +3 -0
checkpoint-100/special_tokens_map.json +23 -0
checkpoint-100/tokenizer.json +3 -0
checkpoint-100/tokenizer_config.json +195 -0
checkpoint-100/trainer_state.json +2534 -0
checkpoint-100/training_args.bin +3 -0
checkpoint-100/zero_to_fp32.py +760 -0

.gitattributes CHANGED Viewed

@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoint-50/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoint-50/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text

checkpoint-100/config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "max_position_embeddings": 131072,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000,
+  "sliding_window": 4096,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_cache": false,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

checkpoint-100/generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 151646,
+  "do_sample": true,
+  "eos_token_id": 151643,
+  "temperature": 0.6,
+  "top_p": 0.95,
+  "transformers_version": "4.51.3"
+}

checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8e45b81e0570ac4dad65fef34f2d39d15813993edc39e0f26d0d87c0019eefd5
+size 5331274140

checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b661201b7ec521193a6f246c5301e534687d4b10af256ffe139737a755631035
+size 5331276572

checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:68258ff49f6be6fea13d948d614ca340b27ba9b87e71942514da3cc9923ad306
+size 5331276892

checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:62d2ac8100f40b9d2df4490a683471aee3c539db6b541be2403c5225129e7b67
+size 5331273884

checkpoint-100/global_step100/mp_rank_00_model_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b2ff44c14e457e2a07945817ed32a149ddb3fc81ca127c990b48d3caf7ebfa9
+size 3554267640

checkpoint-100/latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step100

checkpoint-100/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ef007667c12770de5da20819c3e1d762e1bca3fb66efb70b8bc2ab43749d46ec
+size 3554214752

checkpoint-100/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f0dcd1219e2c412ef0fd5c590b7d66a85991f28359265fe2d4f83803387fadf8
+size 14960

checkpoint-100/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bd64afd047ba34f9fc02eb451169eefe4271319044a6704e3cbd0d0e54e709d1
+size 14960

checkpoint-100/rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b148f4c9f4f33bba5e5283cc51321b00293b8a34f61458a192f7bded182f5936
+size 14960

checkpoint-100/rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1d30b837de0197e3b5d9d6df85728783fb526ccc9a45068a4db9e5d52e01d42d
+size 14960

checkpoint-100/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3b0d6ed3e119807e165d53a18d2ec22befd359c1465f6aeaa69d1d7eb1452246
+size 1064

checkpoint-100/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-100/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a
+size 11422959

checkpoint-100/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,195 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\\n'}}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}

checkpoint-100/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2534 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.11428571428571428,
+  "eval_steps": 500,
+  "global_step": 100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1734.0,
+      "completions/mean_length": 1702.03125,
+      "completions/mean_terminated_length": 993.6190795898438,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "epoch": 0.001142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2837817668914795,
+      "learning_rate": 0.0,
+      "loss": -0.0,
+      "num_tokens": 118418.0,
+      "reward": -0.09800112247467041,
+      "reward_std": 0.3028089702129364,
+      "rewards/cosine_scaled_reward/mean": -0.09800112992525101,
+      "rewards/cosine_scaled_reward/std": 0.37953105568885803,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1894.0,
+      "completions/mean_length": 1738.90625,
+      "completions/mean_terminated_length": 949.0,
+      "completions/min_length": 435.0,
+      "completions/min_terminated_length": 435.0,
+      "epoch": 0.002285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2421981245279312,
+      "learning_rate": 2e-08,
+      "loss": -0.0,
+      "num_tokens": 239748.0,
+      "reward": 0.020556632429361343,
+      "reward_std": 0.3545936942100525,
+      "rewards/cosine_scaled_reward/mean": 0.020556632429361343,
+      "rewards/cosine_scaled_reward/std": 0.4492928683757782,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.921875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 953.0,
+      "completions/mean_length": 1952.234375,
+      "completions/mean_terminated_length": 822.2000122070312,
+      "completions/min_length": 703.0,
+      "completions/min_terminated_length": 703.0,
+      "epoch": 0.0034285714285714284,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24851329624652863,
+      "learning_rate": 4e-08,
+      "loss": -0.0,
+      "num_tokens": 375163.0,
+      "reward": -0.22721199691295624,
+      "reward_std": 0.14563649892807007,
+      "rewards/cosine_scaled_reward/mean": -0.22721199691295624,
+      "rewards/cosine_scaled_reward/std": 0.1709199845790863,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.546875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1685.0,
+      "completions/mean_length": 1554.109375,
+      "completions/mean_terminated_length": 958.0344848632812,
+      "completions/min_length": 504.0,
+      "completions/min_terminated_length": 504.0,
+      "epoch": 0.004571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29272863268852234,
+      "learning_rate": 6e-08,
+      "loss": -0.0,
+      "num_tokens": 484434.0,
+      "reward": -0.17542189359664917,
+      "reward_std": 0.18219107389450073,
+      "rewards/cosine_scaled_reward/mean": -0.17542189359664917,
+      "rewards/cosine_scaled_reward/std": 0.27975013852119446,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.890625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1930.0,
+      "completions/mean_length": 1943.0625,
+      "completions/mean_terminated_length": 1088.571533203125,
+      "completions/min_length": 344.0,
+      "completions/min_terminated_length": 344.0,
+      "epoch": 0.005714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2773251533508301,
+      "learning_rate": 8e-08,
+      "loss": 0.0,
+      "num_tokens": 619606.0,
+      "reward": -0.2648562788963318,
+      "reward_std": 0.21638144552707672,
+      "rewards/cosine_scaled_reward/mean": -0.2648562788963318,
+      "rewards/cosine_scaled_reward/std": 0.23959198594093323,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1824.0,
+      "completions/mean_length": 1854.21875,
+      "completions/mean_terminated_length": 920.5454711914062,
+      "completions/min_length": 548.0,
+      "completions/min_terminated_length": 548.0,
+      "epoch": 0.006857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27399909496307373,
+      "learning_rate": 1e-07,
+      "loss": -0.0,
+      "num_tokens": 749924.0,
+      "reward": -0.19292885065078735,
+      "reward_std": 0.2666770815849304,
+      "rewards/cosine_scaled_reward/mean": -0.19292885065078735,
+      "rewards/cosine_scaled_reward/std": 0.295730322599411,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.890625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1589.0,
+      "completions/mean_length": 1940.5625,
+      "completions/mean_terminated_length": 1065.71435546875,
+      "completions/min_length": 773.0,
+      "completions/min_terminated_length": 773.0,
+      "epoch": 0.008,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23362359404563904,
+      "learning_rate": 1.2e-07,
+      "loss": 0.0,
+      "num_tokens": 884528.0,
+      "reward": -0.18198424577713013,
+      "reward_std": 0.18540163338184357,
+      "rewards/cosine_scaled_reward/mean": -0.18198424577713013,
+      "rewards/cosine_scaled_reward/std": 0.32407456636428833,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2048.0,
+      "completions/mean_length": 1708.5625,
+      "completions/mean_terminated_length": 1013.5238037109375,
+      "completions/min_length": 317.0,
+      "completions/min_terminated_length": 317.0,
+      "epoch": 0.009142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24677562713623047,
+      "learning_rate": 1.4e-07,
+      "loss": -0.0,
+      "num_tokens": 1004292.0,
+      "reward": -0.09573853015899658,
+      "reward_std": 0.22485454380512238,
+      "rewards/cosine_scaled_reward/mean": -0.09573852270841599,
+      "rewards/cosine_scaled_reward/std": 0.449250191450119,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1221.0,
+      "completions/mean_length": 1979.359375,
+      "completions/mean_terminated_length": 949.75,
+      "completions/min_length": 569.0,
+      "completions/min_terminated_length": 569.0,
+      "epoch": 0.010285714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26966309547424316,
+      "learning_rate": 1.6e-07,
+      "loss": 0.0,
+      "num_tokens": 1142427.0,
+      "reward": -0.19992578029632568,
+      "reward_std": 0.20190927386283875,
+      "rewards/cosine_scaled_reward/mean": -0.19992581009864807,
+      "rewards/cosine_scaled_reward/std": 0.23785534501075745,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1918.0,
+      "completions/mean_length": 1652.59375,
+      "completions/mean_terminated_length": 897.727294921875,
+      "completions/min_length": 286.0,
+      "completions/min_terminated_length": 286.0,
+      "epoch": 0.011428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3011312484741211,
+      "learning_rate": 1.8e-07,
+      "loss": 0.0,
+      "num_tokens": 1259025.0,
+      "reward": -0.11706389486789703,
+      "reward_std": 0.2934548258781433,
+      "rewards/cosine_scaled_reward/mean": -0.11706390231847763,
+      "rewards/cosine_scaled_reward/std": 0.3601698577404022,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.90625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1333.0,
+      "completions/mean_length": 1946.6875,
+      "completions/mean_terminated_length": 967.3333740234375,
+      "completions/min_length": 599.0,
+      "completions/min_terminated_length": 599.0,
+      "epoch": 0.012571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2451399564743042,
+      "learning_rate": 2e-07,
+      "loss": -0.0,
+      "num_tokens": 1395285.0,
+      "reward": -0.2866281270980835,
+      "reward_std": 0.12184012681245804,
+      "rewards/cosine_scaled_reward/mean": -0.2866281270980835,
+      "rewards/cosine_scaled_reward/std": 0.15141677856445312,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.546875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2032.0,
+      "completions/mean_length": 1659.28125,
+      "completions/mean_terminated_length": 1190.137939453125,
+      "completions/min_length": 535.0,
+      "completions/min_terminated_length": 535.0,
+      "epoch": 0.013714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2733561396598816,
+      "learning_rate": 2.1999999999999998e-07,
+      "loss": 0.0,
+      "num_tokens": 1512423.0,
+      "reward": -0.13816070556640625,
+      "reward_std": 0.2968980073928833,
+      "rewards/cosine_scaled_reward/mean": -0.13816070556640625,
+      "rewards/cosine_scaled_reward/std": 0.3597467839717865,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1770.0,
+      "completions/mean_length": 1807.796875,
+      "completions/mean_terminated_length": 1023.1333618164062,
+      "completions/min_length": 697.0,
+      "completions/min_terminated_length": 697.0,
+      "epoch": 0.014857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25238803029060364,
+      "learning_rate": 2.4e-07,
+      "loss": 0.0,
+      "num_tokens": 1639162.0,
+      "reward": -0.13488636910915375,
+      "reward_std": 0.2661236524581909,
+      "rewards/cosine_scaled_reward/mean": -0.13488635420799255,
+      "rewards/cosine_scaled_reward/std": 0.3444243371486664,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1866.0,
+      "completions/mean_length": 1846.921875,
+      "completions/mean_terminated_length": 1243.6875,
+      "completions/min_length": 698.0,
+      "completions/min_terminated_length": 698.0,
+      "epoch": 0.016,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2201598882675171,
+      "learning_rate": 2.6e-07,
+      "loss": -0.0,
+      "num_tokens": 1767973.0,
+      "reward": -0.20591925084590912,
+      "reward_std": 0.21505361795425415,
+      "rewards/cosine_scaled_reward/mean": -0.20591923594474792,
+      "rewards/cosine_scaled_reward/std": 0.323749840259552,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1713.0,
+      "completions/mean_length": 1710.421875,
+      "completions/mean_terminated_length": 847.7222290039062,
+      "completions/min_length": 450.0,
+      "completions/min_terminated_length": 450.0,
+      "epoch": 0.017142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2665213644504547,
+      "learning_rate": 2.8e-07,
+      "loss": 0.0,
+      "num_tokens": 1888360.0,
+      "reward": -0.0778750479221344,
+      "reward_std": 0.17502948641777039,
+      "rewards/cosine_scaled_reward/mean": -0.0778750628232956,
+      "rewards/cosine_scaled_reward/std": 0.47343766689300537,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.984375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 962.0,
+      "completions/mean_length": 2031.03125,
+      "completions/mean_terminated_length": 962.0,
+      "completions/min_length": 962.0,
+      "completions/min_terminated_length": 962.0,
+      "epoch": 0.018285714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23009927570819855,
+      "learning_rate": 3e-07,
+      "loss": -0.0,
+      "num_tokens": 2028786.0,
+      "reward": -0.2619968056678772,
+      "reward_std": 0.16954168677330017,
+      "rewards/cosine_scaled_reward/mean": -0.2619968056678772,
+      "rewards/cosine_scaled_reward/std": 0.18357795476913452,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1918.0,
+      "completions/mean_length": 1533.15625,
+      "completions/mean_terminated_length": 780.6923217773438,
+      "completions/min_length": 380.0,
+      "completions/min_terminated_length": 380.0,
+      "epoch": 0.019428571428571427,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3392995297908783,
+      "learning_rate": 3.2e-07,
+      "loss": -0.0,
+      "num_tokens": 2137428.0,
+      "reward": -0.11706461012363434,
+      "reward_std": 0.3096129894256592,
+      "rewards/cosine_scaled_reward/mean": -0.11706460267305374,
+      "rewards/cosine_scaled_reward/std": 0.3810974657535553,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.734375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1626.0,
+      "completions/mean_length": 1774.46875,
+      "completions/mean_terminated_length": 1018.2352905273438,
+      "completions/min_length": 516.0,
+      "completions/min_terminated_length": 516.0,
+      "epoch": 0.02057142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23254038393497467,
+      "learning_rate": 3.4000000000000003e-07,
+      "loss": 0.0,
+      "num_tokens": 2261370.0,
+      "reward": -0.18709540367126465,
+      "reward_std": 0.2795025110244751,
+      "rewards/cosine_scaled_reward/mean": -0.18709540367126465,
+      "rewards/cosine_scaled_reward/std": 0.3359416127204895,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1859.0,
+      "completions/mean_length": 1719.0,
+      "completions/mean_terminated_length": 995.2000122070312,
+      "completions/min_length": 577.0,
+      "completions/min_terminated_length": 577.0,
+      "epoch": 0.021714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.262045681476593,
+      "learning_rate": 3.6e-07,
+      "loss": -0.0,
+      "num_tokens": 2382642.0,
+      "reward": -0.02329203486442566,
+      "reward_std": 0.34684932231903076,
+      "rewards/cosine_scaled_reward/mean": -0.02329203486442566,
+      "rewards/cosine_scaled_reward/std": 0.47637447714805603,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1988.0,
+      "completions/mean_length": 1630.90625,
+      "completions/mean_terminated_length": 935.75,
+      "completions/min_length": 425.0,
+      "completions/min_terminated_length": 425.0,
+      "epoch": 0.022857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.250532329082489,
+      "learning_rate": 3.7999999999999996e-07,
+      "loss": 0.0,
+      "num_tokens": 2498372.0,
+      "reward": -0.06319350004196167,
+      "reward_std": 0.2394939512014389,
+      "rewards/cosine_scaled_reward/mean": -0.06319350004196167,
+      "rewards/cosine_scaled_reward/std": 0.3889789879322052,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1818.0,
+      "completions/mean_length": 1735.96875,
+      "completions/mean_terminated_length": 1140.272705078125,
+      "completions/min_length": 428.0,
+      "completions/min_terminated_length": 428.0,
+      "epoch": 0.024,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2773231565952301,
+      "learning_rate": 4e-07,
+      "loss": 0.0,
+      "num_tokens": 2620282.0,
+      "reward": -0.20884393155574799,
+      "reward_std": 0.20233216881752014,
+      "rewards/cosine_scaled_reward/mean": -0.20884393155574799,
+      "rewards/cosine_scaled_reward/std": 0.28432920575141907,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1790.0,
+      "completions/mean_length": 1342.953125,
+      "completions/mean_terminated_length": 919.9249877929688,
+      "completions/min_length": 286.0,
+      "completions/min_terminated_length": 286.0,
+      "epoch": 0.025142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.34627005457878113,
+      "learning_rate": 4.1999999999999995e-07,
+      "loss": 0.0,
+      "num_tokens": 2715247.0,
+      "reward": -0.09092864394187927,
+      "reward_std": 0.21042926609516144,
+      "rewards/cosine_scaled_reward/mean": -0.09092865139245987,
+      "rewards/cosine_scaled_reward/std": 0.43559205532073975,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.578125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2038.0,
+      "completions/mean_length": 1661.9375,
+      "completions/mean_terminated_length": 1132.888916015625,
+      "completions/min_length": 455.0,
+      "completions/min_terminated_length": 455.0,
+      "epoch": 0.026285714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2705242335796356,
+      "learning_rate": 4.3999999999999997e-07,
+      "loss": 0.0,
+      "num_tokens": 2832403.0,
+      "reward": -0.13339249789714813,
+      "reward_std": 0.2433384656906128,
+      "rewards/cosine_scaled_reward/mean": -0.13339248299598694,
+      "rewards/cosine_scaled_reward/std": 0.3815627098083496,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2020.0,
+      "completions/mean_length": 1802.296875,
+      "completions/mean_terminated_length": 1065.1875,
+      "completions/min_length": 572.0,
+      "completions/min_terminated_length": 572.0,
+      "epoch": 0.027428571428571427,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24961258471012115,
+      "learning_rate": 4.6e-07,
+      "loss": 0.0,
+      "num_tokens": 2958678.0,
+      "reward": -0.18733163177967072,
+      "reward_std": 0.2773033380508423,
+      "rewards/cosine_scaled_reward/mean": -0.1873316466808319,
+      "rewards/cosine_scaled_reward/std": 0.37051624059677124,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1848.0,
+      "completions/mean_length": 1731.53125,
+      "completions/mean_terminated_length": 982.0,
+      "completions/min_length": 406.0,
+      "completions/min_terminated_length": 406.0,
+      "epoch": 0.02857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2662124037742615,
+      "learning_rate": 4.8e-07,
+      "loss": 0.0,
+      "num_tokens": 3079792.0,
+      "reward": -0.12407588213682175,
+      "reward_std": 0.25581949949264526,
+      "rewards/cosine_scaled_reward/mean": -0.12407589703798294,
+      "rewards/cosine_scaled_reward/std": 0.39043793082237244,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2017.0,
+      "completions/mean_length": 1965.46875,
+      "completions/mean_terminated_length": 1567.8182373046875,
+      "completions/min_length": 1006.0,
+      "completions/min_terminated_length": 1006.0,
+      "epoch": 0.029714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23202598094940186,
+      "learning_rate": 5e-07,
+      "loss": 0.0,
+      "num_tokens": 3216214.0,
+      "reward": -0.0963105633854866,
+      "reward_std": 0.30887559056282043,
+      "rewards/cosine_scaled_reward/mean": -0.0963105633854866,
+      "rewards/cosine_scaled_reward/std": 0.39396020770072937,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2023.0,
+      "completions/mean_length": 1886.96875,
+      "completions/mean_terminated_length": 1111.0909423828125,
+      "completions/min_length": 498.0,
+      "completions/min_terminated_length": 498.0,
+      "epoch": 0.030857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2878379225730896,
+      "learning_rate": 5.2e-07,
+      "loss": -0.0,
+      "num_tokens": 3347268.0,
+      "reward": -0.1645491123199463,
+      "reward_std": 0.28629785776138306,
+      "rewards/cosine_scaled_reward/mean": -0.1645491123199463,
+      "rewards/cosine_scaled_reward/std": 0.35050687193870544,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1995.0,
+      "completions/mean_length": 1843.640625,
+      "completions/mean_terminated_length": 1230.5625,
+      "completions/min_length": 444.0,
+      "completions/min_terminated_length": 444.0,
+      "epoch": 0.032,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24996496737003326,
+      "learning_rate": 5.4e-07,
+      "loss": 0.0,
+      "num_tokens": 3475597.0,
+      "reward": -0.06605555862188339,
+      "reward_std": 0.2643629312515259,
+      "rewards/cosine_scaled_reward/mean": -0.06605555862188339,
+      "rewards/cosine_scaled_reward/std": 0.438128799200058,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2005.0,
+      "completions/mean_length": 2020.5,
+      "completions/mean_terminated_length": 1608.0,
+      "completions/min_length": 516.0,
+      "completions/min_terminated_length": 516.0,
+      "epoch": 0.03314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23316837847232819,
+      "learning_rate": 5.6e-07,
+      "loss": -0.0,
+      "num_tokens": 3615381.0,
+      "reward": -0.2015206664800644,
+      "reward_std": 0.15312039852142334,
+      "rewards/cosine_scaled_reward/mean": -0.2015206664800644,
+      "rewards/cosine_scaled_reward/std": 0.1648881882429123,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.796875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1839.0,
+      "completions/mean_length": 1826.046875,
+      "completions/mean_terminated_length": 955.3077392578125,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "epoch": 0.03428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2410832792520523,
+      "learning_rate": 5.8e-07,
+      "loss": -0.0,
+      "num_tokens": 3742784.0,
+      "reward": -0.17509159445762634,
+      "reward_std": 0.18994277715682983,
+      "rewards/cosine_scaled_reward/mean": -0.17509159445762634,
+      "rewards/cosine_scaled_reward/std": 0.22516494989395142,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1678.0,
+      "completions/mean_length": 1781.4375,
+      "completions/mean_terminated_length": 910.6666870117188,
+      "completions/min_length": 313.0,
+      "completions/min_terminated_length": 313.0,
+      "epoch": 0.03542857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2693414092063904,
+      "learning_rate": 6e-07,
+      "loss": 0.0,
+      "num_tokens": 3867292.0,
+      "reward": -0.24513831734657288,
+      "reward_std": 0.28315529227256775,
+      "rewards/cosine_scaled_reward/mean": -0.24513831734657288,
+      "rewards/cosine_scaled_reward/std": 0.3480584919452667,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1975.0,
+      "completions/mean_length": 1969.28125,
+      "completions/mean_terminated_length": 1488.2222900390625,
+      "completions/min_length": 1088.0,
+      "completions/min_terminated_length": 1088.0,
+      "epoch": 0.036571428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24202018976211548,
+      "learning_rate": 6.2e-07,
+      "loss": 0.0,
+      "num_tokens": 4003678.0,
+      "reward": -0.18968716263771057,
+      "reward_std": 0.28299200534820557,
+      "rewards/cosine_scaled_reward/mean": -0.18968716263771057,
+      "rewards/cosine_scaled_reward/std": 0.3119950294494629,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 2048.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 2048.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.037714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22288212180137634,
+      "learning_rate": 6.4e-07,
+      "loss": 0.0,
+      "num_tokens": 4145966.0,
+      "reward": -0.2955162525177002,
+      "reward_std": 0.17793573439121246,
+      "rewards/cosine_scaled_reward/mean": -0.2955162525177002,
+      "rewards/cosine_scaled_reward/std": 0.22786569595336914,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.546875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1809.0,
+      "completions/mean_length": 1589.640625,
+      "completions/mean_terminated_length": 1036.4482421875,
+      "completions/min_length": 515.0,
+      "completions/min_terminated_length": 515.0,
+      "epoch": 0.038857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31030499935150146,
+      "learning_rate": 6.6e-07,
+      "loss": 0.0,
+      "num_tokens": 4257255.0,
+      "reward": 0.008002171292901039,
+      "reward_std": 0.3413254916667938,
+      "rewards/cosine_scaled_reward/mean": 0.008002176880836487,
+      "rewards/cosine_scaled_reward/std": 0.4431404769420624,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.796875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1987.0,
+      "completions/mean_length": 1785.921875,
+      "completions/mean_terminated_length": 757.769287109375,
+      "completions/min_length": 385.0,
+      "completions/min_terminated_length": 385.0,
+      "epoch": 0.04,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3145958483219147,
+      "learning_rate": 6.800000000000001e-07,
+      "loss": -0.0,
+      "num_tokens": 4383050.0,
+      "reward": -0.16386553645133972,
+      "reward_std": 0.2818174958229065,
+      "rewards/cosine_scaled_reward/mean": -0.16386555135250092,
+      "rewards/cosine_scaled_reward/std": 0.3242056965827942,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.953125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1195.0,
+      "completions/mean_length": 2000.421875,
+      "completions/mean_terminated_length": 1033.0,
+      "completions/min_length": 863.0,
+      "completions/min_terminated_length": 863.0,
+      "epoch": 0.04114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25796815752983093,
+      "learning_rate": 7e-07,
+      "loss": 0.0,
+      "num_tokens": 4522189.0,
+      "reward": -0.2470606118440628,
+      "reward_std": 0.15509279072284698,
+      "rewards/cosine_scaled_reward/mean": -0.2470606118440628,
+      "rewards/cosine_scaled_reward/std": 0.16412879526615143,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.890625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2043.0,
+      "completions/mean_length": 1964.46875,
+      "completions/mean_terminated_length": 1284.2857666015625,
+      "completions/min_length": 931.0,
+      "completions/min_terminated_length": 931.0,
+      "epoch": 0.04228571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22452199459075928,
+      "learning_rate": 7.2e-07,
+      "loss": 0.0,
+      "num_tokens": 4658939.0,
+      "reward": -0.24706938862800598,
+      "reward_std": 0.18499845266342163,
+      "rewards/cosine_scaled_reward/mean": -0.24706941843032837,
+      "rewards/cosine_scaled_reward/std": 0.21092188358306885,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1840.0,
+      "completions/mean_length": 1925.234375,
+      "completions/mean_terminated_length": 1175.0,
+      "completions/min_length": 916.0,
+      "completions/min_terminated_length": 916.0,
+      "epoch": 0.04342857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23703666031360626,
+      "learning_rate": 7.4e-07,
+      "loss": -0.0,
+      "num_tokens": 4793866.0,
+      "reward": -0.11504355818033218,
+      "reward_std": 0.20660358667373657,
+      "rewards/cosine_scaled_reward/mean": -0.11504356563091278,
+      "rewards/cosine_scaled_reward/std": 0.3190351724624634,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1412.0,
+      "completions/mean_length": 1740.546875,
+      "completions/mean_terminated_length": 642.5,
+      "completions/min_length": 339.0,
+      "completions/min_terminated_length": 339.0,
+      "epoch": 0.044571428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23829001188278198,
+      "learning_rate": 7.599999999999999e-07,
+      "loss": 0.0,
+      "num_tokens": 4916045.0,
+      "reward": -0.12095541507005692,
+      "reward_std": 0.1958026885986328,
+      "rewards/cosine_scaled_reward/mean": -0.12095542997121811,
+      "rewards/cosine_scaled_reward/std": 0.340241402387619,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1918.0,
+      "completions/mean_length": 1713.203125,
+      "completions/mean_terminated_length": 920.26318359375,
+      "completions/min_length": 451.0,
+      "completions/min_terminated_length": 451.0,
+      "epoch": 0.045714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24145744740962982,
+      "learning_rate": 7.799999999999999e-07,
+      "loss": -0.0,
+      "num_tokens": 5035762.0,
+      "reward": -0.10936243832111359,
+      "reward_std": 0.14468500018119812,
+      "rewards/cosine_scaled_reward/mean": -0.10936242341995239,
+      "rewards/cosine_scaled_reward/std": 0.4288744330406189,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.796875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1801.0,
+      "completions/mean_length": 1909.71875,
+      "completions/mean_terminated_length": 1367.2308349609375,
+      "completions/min_length": 1138.0,
+      "completions/min_terminated_length": 1138.0,
+      "epoch": 0.046857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22317881882190704,
+      "learning_rate": 8e-07,
+      "loss": 0.0,
+      "num_tokens": 5169136.0,
+      "reward": -0.2058967649936676,
+      "reward_std": 0.2325170338153839,
+      "rewards/cosine_scaled_reward/mean": -0.20589673519134521,
+      "rewards/cosine_scaled_reward/std": 0.28897321224212646,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1752.0,
+      "completions/mean_length": 1727.71875,
+      "completions/mean_terminated_length": 583.857177734375,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "epoch": 0.048,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.44688937067985535,
+      "learning_rate": 8.199999999999999e-07,
+      "loss": 0.0,
+      "num_tokens": 5290070.0,
+      "reward": -0.2254919707775116,
+      "reward_std": 0.1687203049659729,
+      "rewards/cosine_scaled_reward/mean": -0.2254919707775116,
+      "rewards/cosine_scaled_reward/std": 0.18203677237033844,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.84375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1082.0,
+      "completions/mean_length": 1855.328125,
+      "completions/mean_terminated_length": 814.9000244140625,
+      "completions/min_length": 588.0,
+      "completions/min_terminated_length": 588.0,
+      "epoch": 0.04914285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2430828958749771,
+      "learning_rate": 8.399999999999999e-07,
+      "loss": 0.0,
+      "num_tokens": 5420427.0,
+      "reward": -0.09104865789413452,
+      "reward_std": 0.18217626214027405,
+      "rewards/cosine_scaled_reward/mean": -0.09104865789413452,
+      "rewards/cosine_scaled_reward/std": 0.3521345257759094,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1675.0,
+      "completions/mean_length": 1727.9375,
+      "completions/mean_terminated_length": 767.75,
+      "completions/min_length": 407.0,
+      "completions/min_terminated_length": 407.0,
+      "epoch": 0.05028571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.32065215706825256,
+      "learning_rate": 8.599999999999999e-07,
+      "loss": 0.0,
+      "num_tokens": 5541711.0,
+      "reward": -0.17701950669288635,
+      "reward_std": 0.2957555055618286,
+      "rewards/cosine_scaled_reward/mean": -0.17701953649520874,
+      "rewards/cosine_scaled_reward/std": 0.38460060954093933,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.953125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2032.0,
+      "completions/mean_length": 2013.9375,
+      "completions/mean_terminated_length": 1321.3333740234375,
+      "completions/min_length": 740.0,
+      "completions/min_terminated_length": 740.0,
+      "epoch": 0.05142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22363637387752533,
+      "learning_rate": 8.799999999999999e-07,
+      "loss": 0.0,
+      "num_tokens": 5682259.0,
+      "reward": -0.20341511070728302,
+      "reward_std": 0.23104795813560486,
+      "rewards/cosine_scaled_reward/mean": -0.20341511070728302,
+      "rewards/cosine_scaled_reward/std": 0.3092363774776459,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1224.0,
+      "completions/mean_length": 1909.0,
+      "completions/mean_terminated_length": 936.0,
+      "completions/min_length": 525.0,
+      "completions/min_terminated_length": 525.0,
+      "epoch": 0.052571428571428575,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26306217908859253,
+      "learning_rate": 9e-07,
+      "loss": 0.0,
+      "num_tokens": 5815603.0,
+      "reward": -0.26145532727241516,
+      "reward_std": 0.17108051478862762,
+      "rewards/cosine_scaled_reward/mean": -0.2614552974700928,
+      "rewards/cosine_scaled_reward/std": 0.18312901258468628,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1668.0,
+      "completions/mean_length": 1757.1875,
+      "completions/mean_terminated_length": 884.75,
+      "completions/min_length": 477.0,
+      "completions/min_terminated_length": 477.0,
+      "epoch": 0.053714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2856813371181488,
+      "learning_rate": 9.2e-07,
+      "loss": 0.0,
+      "num_tokens": 5938463.0,
+      "reward": -0.20879247784614563,
+      "reward_std": 0.23861759901046753,
+      "rewards/cosine_scaled_reward/mean": -0.20879246294498444,
+      "rewards/cosine_scaled_reward/std": 0.39607998728752136,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1708.0,
+      "completions/mean_length": 1756.5,
+      "completions/mean_terminated_length": 1011.5555419921875,
+      "completions/min_length": 487.0,
+      "completions/min_terminated_length": 487.0,
+      "epoch": 0.054857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27563413977622986,
+      "learning_rate": 9.399999999999999e-07,
+      "loss": -0.0,
+      "num_tokens": 6061423.0,
+      "reward": -0.16147920489311218,
+      "reward_std": 0.24055320024490356,
+      "rewards/cosine_scaled_reward/mean": -0.16147920489311218,
+      "rewards/cosine_scaled_reward/std": 0.3948959410190582,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.578125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1458.0,
+      "completions/mean_length": 1538.078125,
+      "completions/mean_terminated_length": 839.2963256835938,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "epoch": 0.056,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27617642283439636,
+      "learning_rate": 9.6e-07,
+      "loss": -0.0,
+      "num_tokens": 6169924.0,
+      "reward": -0.18436825275421143,
+      "reward_std": 0.27141550183296204,
+      "rewards/cosine_scaled_reward/mean": -0.18436823785305023,
+      "rewards/cosine_scaled_reward/std": 0.3920196294784546,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1938.0,
+      "completions/mean_length": 1749.0625,
+      "completions/mean_terminated_length": 772.5333862304688,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 0.05714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23394836485385895,
+      "learning_rate": 9.8e-07,
+      "loss": 0.0,
+      "num_tokens": 6292680.0,
+      "reward": -0.10770958662033081,
+      "reward_std": 0.22513547539710999,
+      "rewards/cosine_scaled_reward/mean": -0.10770957916975021,
+      "rewards/cosine_scaled_reward/std": 0.421062707901001,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2001.0,
+      "completions/mean_length": 1482.25,
+      "completions/mean_terminated_length": 841.0667114257812,
+      "completions/min_length": 359.0,
+      "completions/min_terminated_length": 359.0,
+      "epoch": 0.05828571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3268967568874359,
+      "learning_rate": 1e-06,
+      "loss": -0.0,
+      "num_tokens": 6397752.0,
+      "reward": -0.09745607525110245,
+      "reward_std": 0.25210899114608765,
+      "rewards/cosine_scaled_reward/mean": -0.09745605289936066,
+      "rewards/cosine_scaled_reward/std": 0.3351369798183441,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1579.0,
+      "completions/mean_length": 1743.953125,
+      "completions/mean_terminated_length": 750.7333984375,
+      "completions/min_length": 285.0,
+      "completions/min_terminated_length": 285.0,
+      "epoch": 0.05942857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2918722927570343,
+      "learning_rate": 9.999890338174275e-07,
+      "loss": -0.0,
+      "num_tokens": 6520717.0,
+      "reward": -0.1890830397605896,
+      "reward_std": 0.21916288137435913,
+      "rewards/cosine_scaled_reward/mean": -0.1890830546617508,
+      "rewards/cosine_scaled_reward/std": 0.32568052411079407,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.734375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1757.0,
+      "completions/mean_length": 1772.421875,
+      "completions/mean_terminated_length": 1010.5294189453125,
+      "completions/min_length": 520.0,
+      "completions/min_terminated_length": 520.0,
+      "epoch": 0.060571428571428575,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24523264169692993,
+      "learning_rate": 9.999561358041868e-07,
+      "loss": 0.0,
+      "num_tokens": 6644984.0,
+      "reward": -0.20969681441783905,
+      "reward_std": 0.1810423731803894,
+      "rewards/cosine_scaled_reward/mean": -0.20969681441783905,
+      "rewards/cosine_scaled_reward/std": 0.2371566891670227,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1961.0,
+      "completions/mean_length": 1838.859375,
+      "completions/mean_terminated_length": 1304.388916015625,
+      "completions/min_length": 422.0,
+      "completions/min_terminated_length": 422.0,
+      "epoch": 0.061714285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23284469544887543,
+      "learning_rate": 9.999013075636804e-07,
+      "loss": 0.0,
+      "num_tokens": 6773815.0,
+      "reward": -0.06641622632741928,
+      "reward_std": 0.30815836787223816,
+      "rewards/cosine_scaled_reward/mean": -0.06641621887683868,
+      "rewards/cosine_scaled_reward/std": 0.46219584345817566,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1803.0,
+      "completions/mean_length": 1750.125,
+      "completions/mean_terminated_length": 856.5,
+      "completions/min_length": 494.0,
+      "completions/min_terminated_length": 494.0,
+      "epoch": 0.06285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2651103734970093,
+      "learning_rate": 9.998245517681593e-07,
+      "loss": -0.0,
+      "num_tokens": 6896111.0,
+      "reward": -0.10750342905521393,
+      "reward_std": 0.2286185324192047,
+      "rewards/cosine_scaled_reward/mean": -0.10750342160463333,
+      "rewards/cosine_scaled_reward/std": 0.43372800946235657,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2037.0,
+      "completions/mean_length": 1840.078125,
+      "completions/mean_terminated_length": 1097.5,
+      "completions/min_length": 526.0,
+      "completions/min_terminated_length": 526.0,
+      "epoch": 0.064,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22967560589313507,
+      "learning_rate": 9.997258721585931e-07,
+      "loss": -0.0,
+      "num_tokens": 7024836.0,
+      "reward": -0.10045827925205231,
+      "reward_std": 0.2548004388809204,
+      "rewards/cosine_scaled_reward/mean": -0.10045827925205231,
+      "rewards/cosine_scaled_reward/std": 0.41444358229637146,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.90625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1810.0,
+      "completions/mean_length": 1991.1875,
+      "completions/mean_terminated_length": 1442.0,
+      "completions/min_length": 926.0,
+      "completions/min_terminated_length": 926.0,
+      "epoch": 0.06514285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.20479348301887512,
+      "learning_rate": 9.996052735444862e-07,
+      "loss": 0.0,
+      "num_tokens": 7163840.0,
+      "reward": -0.27901512384414673,
+      "reward_std": 0.2130473554134369,
+      "rewards/cosine_scaled_reward/mean": -0.27901512384414673,
+      "rewards/cosine_scaled_reward/std": 0.2583855092525482,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2023.0,
+      "completions/mean_length": 1617.421875,
+      "completions/mean_terminated_length": 1129.433349609375,
+      "completions/min_length": 417.0,
+      "completions/min_terminated_length": 417.0,
+      "epoch": 0.06628571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2690146267414093,
+      "learning_rate": 9.994627618036452e-07,
+      "loss": -0.0,
+      "num_tokens": 7277451.0,
+      "reward": -0.04198366403579712,
+      "reward_std": 0.4036104083061218,
+      "rewards/cosine_scaled_reward/mean": -0.04198366031050682,
+      "rewards/cosine_scaled_reward/std": 0.5008736252784729,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2022.0,
+      "completions/mean_length": 1736.09375,
+      "completions/mean_terminated_length": 997.368408203125,
+      "completions/min_length": 478.0,
+      "completions/min_terminated_length": 478.0,
+      "epoch": 0.06742857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2184475064277649,
+      "learning_rate": 9.992983438818915e-07,
+      "loss": -0.0,
+      "num_tokens": 7399025.0,
+      "reward": -0.1564982533454895,
+      "reward_std": 0.19560785591602325,
+      "rewards/cosine_scaled_reward/mean": -0.1564982533454895,
+      "rewards/cosine_scaled_reward/std": 0.3402426540851593,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1512.0,
+      "completions/mean_length": 1785.40625,
+      "completions/mean_terminated_length": 847.5714721679688,
+      "completions/min_length": 404.0,
+      "completions/min_terminated_length": 404.0,
+      "epoch": 0.06857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23538637161254883,
+      "learning_rate": 9.991120277927223e-07,
+      "loss": -0.0,
+      "num_tokens": 7524179.0,
+      "reward": -0.2697012424468994,
+      "reward_std": 0.17935499548912048,
+      "rewards/cosine_scaled_reward/mean": -0.2697012424468994,
+      "rewards/cosine_scaled_reward/std": 0.19757980108261108,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.84375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2046.0,
+      "completions/mean_length": 1884.484375,
+      "completions/mean_terminated_length": 1001.5,
+      "completions/min_length": 441.0,
+      "completions/min_terminated_length": 441.0,
+      "epoch": 0.06971428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.225452721118927,
+      "learning_rate": 9.989038226169207e-07,
+      "loss": 0.0,
+      "num_tokens": 7656306.0,
+      "reward": -0.1635127067565918,
+      "reward_std": 0.1931447982788086,
+      "rewards/cosine_scaled_reward/mean": -0.1635127067565918,
+      "rewards/cosine_scaled_reward/std": 0.23563610017299652,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1994.0,
+      "completions/mean_length": 1739.46875,
+      "completions/mean_terminated_length": 1060.7000732421875,
+      "completions/min_length": 499.0,
+      "completions/min_terminated_length": 499.0,
+      "epoch": 0.07085714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23771661520004272,
+      "learning_rate": 9.98673738502114e-07,
+      "loss": 0.0,
+      "num_tokens": 7777864.0,
+      "reward": -0.10127441585063934,
+      "reward_std": 0.2957979142665863,
+      "rewards/cosine_scaled_reward/mean": -0.10127442330121994,
+      "rewards/cosine_scaled_reward/std": 0.34053224325180054,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1965.0,
+      "completions/mean_length": 1522.953125,
+      "completions/mean_terminated_length": 1163.7105712890625,
+      "completions/min_length": 531.0,
+      "completions/min_terminated_length": 531.0,
+      "epoch": 0.072,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27804723381996155,
+      "learning_rate": 9.98421786662277e-07,
+      "loss": 0.0,
+      "num_tokens": 7885589.0,
+      "reward": -0.036153122782707214,
+      "reward_std": 0.3305097818374634,
+      "rewards/cosine_scaled_reward/mean": -0.03615312650799751,
+      "rewards/cosine_scaled_reward/std": 0.4355940818786621,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1558.0,
+      "completions/mean_length": 1760.390625,
+      "completions/mean_terminated_length": 1025.388916015625,
+      "completions/min_length": 414.0,
+      "completions/min_terminated_length": 414.0,
+      "epoch": 0.07314285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2333846092224121,
+      "learning_rate": 9.981479793771866e-07,
+      "loss": -0.0,
+      "num_tokens": 8009206.0,
+      "reward": -0.14333069324493408,
+      "reward_std": 0.28757935762405396,
+      "rewards/cosine_scaled_reward/mean": -0.14333069324493408,
+      "rewards/cosine_scaled_reward/std": 0.41007620096206665,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1532.0,
+      "completions/mean_length": 1651.515625,
+      "completions/mean_terminated_length": 638.2777709960938,
+      "completions/min_length": 327.0,
+      "completions/min_terminated_length": 327.0,
+      "epoch": 0.07428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26348626613616943,
+      "learning_rate": 9.97852329991824e-07,
+      "loss": 0.0,
+      "num_tokens": 8125607.0,
+      "reward": -0.2117859125137329,
+      "reward_std": 0.15534773468971252,
+      "rewards/cosine_scaled_reward/mean": -0.2117859125137329,
+      "rewards/cosine_scaled_reward/std": 0.37395453453063965,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.453125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1350.0,
+      "completions/mean_length": 1254.125,
+      "completions/mean_terminated_length": 596.3428344726562,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 0.07542857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33443817496299744,
+      "learning_rate": 9.975348529157229e-07,
+      "loss": 0.0,
+      "num_tokens": 8216103.0,
+      "reward": 0.028336994349956512,
+      "reward_std": 0.25119709968566895,
+      "rewards/cosine_scaled_reward/mean": 0.02833697199821472,
+      "rewards/cosine_scaled_reward/std": 0.4882389008998871,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.90625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1431.0,
+      "completions/mean_length": 1966.21875,
+      "completions/mean_terminated_length": 1175.666748046875,
+      "completions/min_length": 840.0,
+      "completions/min_terminated_length": 840.0,
+      "epoch": 0.07657142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2199370563030243,
+      "learning_rate": 9.971955636222684e-07,
+      "loss": -0.0,
+      "num_tokens": 8352677.0,
+      "reward": -0.28747493028640747,
+      "reward_std": 0.15530282258987427,
+      "rewards/cosine_scaled_reward/mean": -0.28747493028640747,
+      "rewards/cosine_scaled_reward/std": 0.16220521926879883,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2024.0,
+      "completions/mean_length": 1357.109375,
+      "completions/mean_terminated_length": 747.5,
+      "completions/min_length": 147.0,
+      "completions/min_terminated_length": 147.0,
+      "epoch": 0.07771428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3341590464115143,
+      "learning_rate": 9.968344786479415e-07,
+      "loss": -0.0,
+      "num_tokens": 8448788.0,
+      "reward": -0.06672946363687515,
+      "reward_std": 0.28790342807769775,
+      "rewards/cosine_scaled_reward/mean": -0.06672945618629456,
+      "rewards/cosine_scaled_reward/std": 0.35960128903388977,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1654.0,
+      "completions/mean_length": 1565.046875,
+      "completions/mean_terminated_length": 944.107177734375,
+      "completions/min_length": 378.0,
+      "completions/min_terminated_length": 378.0,
+      "epoch": 0.07885714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.35159721970558167,
+      "learning_rate": 9.964516155915151e-07,
+      "loss": -0.0,
+      "num_tokens": 8559295.0,
+      "reward": -0.27992868423461914,
+      "reward_std": 0.20264248549938202,
+      "rewards/cosine_scaled_reward/mean": -0.27992868423461914,
+      "rewards/cosine_scaled_reward/std": 0.23891927301883698,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 935.0,
+      "completions/mean_length": 1867.765625,
+      "completions/mean_terminated_length": 606.125,
+      "completions/min_length": 439.0,
+      "completions/min_terminated_length": 439.0,
+      "epoch": 0.08,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23989427089691162,
+      "learning_rate": 9.960469931131936e-07,
+      "loss": -0.0,
+      "num_tokens": 8690288.0,
+      "reward": -0.2498025894165039,
+      "reward_std": 0.15823513269424438,
+      "rewards/cosine_scaled_reward/mean": -0.2498025894165039,
+      "rewards/cosine_scaled_reward/std": 0.17978127300739288,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1908.0,
+      "completions/mean_length": 1669.125,
+      "completions/mean_terminated_length": 945.8182373046875,
+      "completions/min_length": 389.0,
+      "completions/min_terminated_length": 389.0,
+      "epoch": 0.08114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.335510790348053,
+      "learning_rate": 9.956206309337066e-07,
+      "loss": -0.0,
+      "num_tokens": 8807832.0,
+      "reward": -0.1673138290643692,
+      "reward_std": 0.2547321915626526,
+      "rewards/cosine_scaled_reward/mean": -0.1673138290643692,
+      "rewards/cosine_scaled_reward/std": 0.39353805780410767,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.640625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1957.0,
+      "completions/mean_length": 1632.59375,
+      "completions/mean_terminated_length": 892.0869750976562,
+      "completions/min_length": 431.0,
+      "completions/min_terminated_length": 431.0,
+      "epoch": 0.08228571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30721575021743774,
+      "learning_rate": 9.951725498333448e-07,
+      "loss": 0.0,
+      "num_tokens": 8922670.0,
+      "reward": -0.1493685096502304,
+      "reward_std": 0.23021411895751953,
+      "rewards/cosine_scaled_reward/mean": -0.1493685096502304,
+      "rewards/cosine_scaled_reward/std": 0.27729952335357666,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.953125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1852.0,
+      "completions/mean_length": 2020.59375,
+      "completions/mean_terminated_length": 1463.3333740234375,
+      "completions/min_length": 888.0,
+      "completions/min_terminated_length": 888.0,
+      "epoch": 0.08342857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.20856839418411255,
+      "learning_rate": 9.947027716509488e-07,
+      "loss": 0.0,
+      "num_tokens": 9062716.0,
+      "reward": -0.25696587562561035,
+      "reward_std": 0.19847074151039124,
+      "rewards/cosine_scaled_reward/mean": -0.25696590542793274,
+      "rewards/cosine_scaled_reward/std": 0.23918035626411438,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.84375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1957.0,
+      "completions/mean_length": 1926.984375,
+      "completions/mean_terminated_length": 1273.5,
+      "completions/min_length": 740.0,
+      "completions/min_terminated_length": 740.0,
+      "epoch": 0.08457142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23241353034973145,
+      "learning_rate": 9.942113192828444e-07,
+      "loss": -0.0,
+      "num_tokens": 9195971.0,
+      "reward": -0.12904082238674164,
+      "reward_std": 0.23554545640945435,
+      "rewards/cosine_scaled_reward/mean": -0.12904080748558044,
+      "rewards/cosine_scaled_reward/std": 0.4280695915222168,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1677.0,
+      "completions/mean_length": 1868.890625,
+      "completions/mean_terminated_length": 1092.75,
+      "completions/min_length": 662.0,
+      "completions/min_terminated_length": 662.0,
+      "epoch": 0.08571428571428572,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.19846303761005402,
+      "learning_rate": 9.93698216681727e-07,
+      "loss": -0.0,
+      "num_tokens": 9326540.0,
+      "reward": -0.03926669806241989,
+      "reward_std": 0.2044709324836731,
+      "rewards/cosine_scaled_reward/mean": -0.039266690611839294,
+      "rewards/cosine_scaled_reward/std": 0.49658530950546265,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1963.0,
+      "completions/mean_length": 1805.296875,
+      "completions/mean_terminated_length": 1077.1875,
+      "completions/min_length": 435.0,
+      "completions/min_terminated_length": 435.0,
+      "epoch": 0.08685714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23998627066612244,
+      "learning_rate": 9.931634888554935e-07,
+      "loss": 0.0,
+      "num_tokens": 9452479.0,
+      "reward": -0.23065510392189026,
+      "reward_std": 0.17413878440856934,
+      "rewards/cosine_scaled_reward/mean": -0.23065511882305145,
+      "rewards/cosine_scaled_reward/std": 0.21896763145923615,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1871.0,
+      "completions/mean_length": 1857.328125,
+      "completions/mean_terminated_length": 1285.3125,
+      "completions/min_length": 749.0,
+      "completions/min_terminated_length": 749.0,
+      "epoch": 0.088,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.20421437919139862,
+      "learning_rate": 9.926071618660237e-07,
+      "loss": 0.0,
+      "num_tokens": 9582924.0,
+      "reward": -0.17972718179225922,
+      "reward_std": 0.209285706281662,
+      "rewards/cosine_scaled_reward/mean": -0.17972716689109802,
+      "rewards/cosine_scaled_reward/std": 0.2716500163078308,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2001.0,
+      "completions/mean_length": 1883.921875,
+      "completions/mean_terminated_length": 1093.3636474609375,
+      "completions/min_length": 712.0,
+      "completions/min_terminated_length": 712.0,
+      "epoch": 0.08914285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2156875878572464,
+      "learning_rate": 9.9202926282791e-07,
+      "loss": -0.0,
+      "num_tokens": 9714215.0,
+      "reward": -0.14897406101226807,
+      "reward_std": 0.2451157122850418,
+      "rewards/cosine_scaled_reward/mean": -0.14897406101226807,
+      "rewards/cosine_scaled_reward/std": 0.38884180784225464,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.578125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1878.0,
+      "completions/mean_length": 1507.65625,
+      "completions/mean_terminated_length": 767.1851806640625,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 0.09028571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29943305253982544,
+      "learning_rate": 9.91429819907136e-07,
+      "loss": -0.0,
+      "num_tokens": 9820801.0,
+      "reward": -0.17114077508449554,
+      "reward_std": 0.23199111223220825,
+      "rewards/cosine_scaled_reward/mean": -0.17114077508449554,
+      "rewards/cosine_scaled_reward/std": 0.3217289447784424,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2007.0,
+      "completions/mean_length": 1976.125,
+      "completions/mean_terminated_length": 1536.888916015625,
+      "completions/min_length": 655.0,
+      "completions/min_terminated_length": 655.0,
+      "epoch": 0.09142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26230743527412415,
+      "learning_rate": 9.908088623197048e-07,
+      "loss": 0.0,
+      "num_tokens": 9957665.0,
+      "reward": -0.21115826070308685,
+      "reward_std": 0.2435196340084076,
+      "rewards/cosine_scaled_reward/mean": -0.21115827560424805,
+      "rewards/cosine_scaled_reward/std": 0.28258123993873596,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2042.0,
+      "completions/mean_length": 1779.28125,
+      "completions/mean_terminated_length": 901.4667358398438,
+      "completions/min_length": 320.0,
+      "completions/min_terminated_length": 320.0,
+      "epoch": 0.09257142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33359771966934204,
+      "learning_rate": 9.901664203302124e-07,
+      "loss": 0.0,
+      "num_tokens": 10082811.0,
+      "reward": -0.1508273482322693,
+      "reward_std": 0.2594776749610901,
+      "rewards/cosine_scaled_reward/mean": -0.1508273482322693,
+      "rewards/cosine_scaled_reward/std": 0.33812451362609863,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1831.0,
+      "completions/mean_length": 1711.609375,
+      "completions/mean_terminated_length": 851.9444580078125,
+      "completions/min_length": 432.0,
+      "completions/min_terminated_length": 432.0,
+      "epoch": 0.09371428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2805767059326172,
+      "learning_rate": 9.895025252503755e-07,
+      "loss": -0.0,
+      "num_tokens": 10202682.0,
+      "reward": -0.11850972473621368,
+      "reward_std": 0.2631937861442566,
+      "rewards/cosine_scaled_reward/mean": -0.11850972473621368,
+      "rewards/cosine_scaled_reward/std": 0.4419197142124176,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1925.0,
+      "completions/mean_length": 1749.984375,
+      "completions/mean_terminated_length": 1044.157958984375,
+      "completions/min_length": 493.0,
+      "completions/min_terminated_length": 493.0,
+      "epoch": 0.09485714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3109220266342163,
+      "learning_rate": 9.888172094375033e-07,
+      "loss": -0.0,
+      "num_tokens": 10325769.0,
+      "reward": -0.10190614312887192,
+      "reward_std": 0.2739119529724121,
+      "rewards/cosine_scaled_reward/mean": -0.10190614312887192,
+      "rewards/cosine_scaled_reward/std": 0.39238420128822327,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.796875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1756.0,
+      "completions/mean_length": 1800.390625,
+      "completions/mean_terminated_length": 829.0000610351562,
+      "completions/min_length": 420.0,
+      "completions/min_terminated_length": 420.0,
+      "epoch": 0.096,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23385629057884216,
+      "learning_rate": 9.881105062929221e-07,
+      "loss": 0.0,
+      "num_tokens": 10451690.0,
+      "reward": -0.21778321266174316,
+      "reward_std": 0.25428956747055054,
+      "rewards/cosine_scaled_reward/mean": -0.21778322756290436,
+      "rewards/cosine_scaled_reward/std": 0.30295974016189575,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1842.0,
+      "completions/mean_length": 1870.46875,
+      "completions/mean_terminated_length": 1337.875,
+      "completions/min_length": 867.0,
+      "completions/min_terminated_length": 867.0,
+      "epoch": 0.09714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.21526271104812622,
+      "learning_rate": 9.873824502603459e-07,
+      "loss": -0.0,
+      "num_tokens": 10581720.0,
+      "reward": -0.19906702637672424,
+      "reward_std": 0.23402772843837738,
+      "rewards/cosine_scaled_reward/mean": -0.19906699657440186,
+      "rewards/cosine_scaled_reward/std": 0.28999006748199463,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1369.0,
+      "completions/mean_length": 1734.875,
+      "completions/mean_terminated_length": 795.5,
+      "completions/min_length": 581.0,
+      "completions/min_terminated_length": 581.0,
+      "epoch": 0.09828571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24285966157913208,
+      "learning_rate": 9.866330768241983e-07,
+      "loss": 0.0,
+      "num_tokens": 10703608.0,
+      "reward": -0.16528445482254028,
+      "reward_std": 0.2592755854129791,
+      "rewards/cosine_scaled_reward/mean": -0.16528445482254028,
+      "rewards/cosine_scaled_reward/std": 0.37110546231269836,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1626.0,
+      "completions/mean_length": 1577.921875,
+      "completions/mean_terminated_length": 973.5357666015625,
+      "completions/min_length": 466.0,
+      "completions/min_terminated_length": 466.0,
+      "epoch": 0.09942857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30273520946502686,
+      "learning_rate": 9.85862422507884e-07,
+      "loss": -0.0,
+      "num_tokens": 10814715.0,
+      "reward": -0.20241931080818176,
+      "reward_std": 0.2693288326263428,
+      "rewards/cosine_scaled_reward/mean": -0.20241928100585938,
+      "rewards/cosine_scaled_reward/std": 0.33345305919647217,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1948.0,
+      "completions/mean_length": 1680.546875,
+      "completions/mean_terminated_length": 1068.125,
+      "completions/min_length": 408.0,
+      "completions/min_terminated_length": 408.0,
+      "epoch": 0.10057142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2649252116680145,
+      "learning_rate": 9.850705248720068e-07,
+      "loss": -0.0,
+      "num_tokens": 10932782.0,
+      "reward": -0.018871163949370384,
+      "reward_std": 0.3073042631149292,
+      "rewards/cosine_scaled_reward/mean": -0.018871165812015533,
+      "rewards/cosine_scaled_reward/std": 0.3826298415660858,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1754.0,
+      "completions/mean_length": 1683.703125,
+      "completions/mean_terminated_length": 1151.269287109375,
+      "completions/min_length": 667.0,
+      "completions/min_terminated_length": 667.0,
+      "epoch": 0.10171428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24950510263442993,
+      "learning_rate": 9.8425742251254e-07,
+      "loss": -0.0,
+      "num_tokens": 11051539.0,
+      "reward": -0.11818082630634308,
+      "reward_std": 0.2949528694152832,
+      "rewards/cosine_scaled_reward/mean": -0.11818082630634308,
+      "rewards/cosine_scaled_reward/std": 0.34418320655822754,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.546875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1958.0,
+      "completions/mean_length": 1558.546875,
+      "completions/mean_terminated_length": 967.8275756835938,
+      "completions/min_length": 377.0,
+      "completions/min_terminated_length": 377.0,
+      "epoch": 0.10285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.36593058705329895,
+      "learning_rate": 9.83423155058946e-07,
+      "loss": 0.0,
+      "num_tokens": 11161286.0,
+      "reward": -0.26082760095596313,
+      "reward_std": 0.1802712082862854,
+      "rewards/cosine_scaled_reward/mean": -0.26082760095596313,
+      "rewards/cosine_scaled_reward/std": 0.2037661075592041,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1505.0,
+      "completions/mean_length": 1827.9375,
+      "completions/mean_terminated_length": 1109.0667724609375,
+      "completions/min_length": 569.0,
+      "completions/min_terminated_length": 569.0,
+      "epoch": 0.104,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24167831242084503,
+      "learning_rate": 9.825677631722435e-07,
+      "loss": 0.0,
+      "num_tokens": 11288842.0,
+      "reward": -0.11456942558288574,
+      "reward_std": 0.26296502351760864,
+      "rewards/cosine_scaled_reward/mean": -0.11456942558288574,
+      "rewards/cosine_scaled_reward/std": 0.3274599611759186,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1931.0,
+      "completions/mean_length": 1581.546875,
+      "completions/mean_terminated_length": 899.8077392578125,
+      "completions/min_length": 454.0,
+      "completions/min_terminated_length": 454.0,
+      "epoch": 0.10514285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2570616602897644,
+      "learning_rate": 9.816912885430258e-07,
+      "loss": 0.0,
+      "num_tokens": 11400053.0,
+      "reward": -0.17942462861537933,
+      "reward_std": 0.2633644640445709,
+      "rewards/cosine_scaled_reward/mean": -0.17942462861537933,
+      "rewards/cosine_scaled_reward/std": 0.30215632915496826,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.96875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1562.0,
+      "completions/mean_length": 2022.328125,
+      "completions/mean_terminated_length": 1226.5,
+      "completions/min_length": 891.0,
+      "completions/min_terminated_length": 891.0,
+      "epoch": 0.10628571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25331902503967285,
+      "learning_rate": 9.807937738894303e-07,
+      "loss": 0.0,
+      "num_tokens": 11540826.0,
+      "reward": -0.26418450474739075,
+      "reward_std": 0.1380012035369873,
+      "rewards/cosine_scaled_reward/mean": -0.26418450474739075,
+      "rewards/cosine_scaled_reward/std": 0.17390060424804688,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1702.0,
+      "completions/mean_length": 1769.546875,
+      "completions/mean_terminated_length": 934.1875,
+      "completions/min_length": 574.0,
+      "completions/min_terminated_length": 574.0,
+      "epoch": 0.10742857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29503753781318665,
+      "learning_rate": 9.798752629550546e-07,
+      "loss": 0.0,
+      "num_tokens": 11663845.0,
+      "reward": -0.08299511671066284,
+      "reward_std": 0.18226617574691772,
+      "rewards/cosine_scaled_reward/mean": -0.08299513161182404,
+      "rewards/cosine_scaled_reward/std": 0.46436113119125366,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.96875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1300.0,
+      "completions/mean_length": 2021.5,
+      "completions/mean_terminated_length": 1200.0,
+      "completions/min_length": 1100.0,
+      "completions/min_terminated_length": 1100.0,
+      "epoch": 0.10857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.20416001975536346,
+      "learning_rate": 9.78935800506826e-07,
+      "loss": -0.0,
+      "num_tokens": 11803749.0,
+      "reward": -0.22345861792564392,
+      "reward_std": 0.18781372904777527,
+      "rewards/cosine_scaled_reward/mean": -0.22345861792564392,
+      "rewards/cosine_scaled_reward/std": 0.24531956017017365,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1440.0,
+      "completions/mean_length": 1582.890625,
+      "completions/mean_terminated_length": 903.1154174804688,
+      "completions/min_length": 519.0,
+      "completions/min_terminated_length": 519.0,
+      "epoch": 0.10971428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2593792974948883,
+      "learning_rate": 9.779754323328192e-07,
+      "loss": -0.0,
+      "num_tokens": 11916190.0,
+      "reward": 0.00020215287804603577,
+      "reward_std": 0.24673128128051758,
+      "rewards/cosine_scaled_reward/mean": 0.00020216405391693115,
+      "rewards/cosine_scaled_reward/std": 0.49432000517845154,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1972.0,
+      "completions/mean_length": 1748.859375,
+      "completions/mean_terminated_length": 1177.772705078125,
+      "completions/min_length": 646.0,
+      "completions/min_terminated_length": 646.0,
+      "epoch": 0.11085714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2480001151561737,
+      "learning_rate": 9.769942052400235e-07,
+      "loss": 0.0,
+      "num_tokens": 12038381.0,
+      "reward": -0.19425566494464874,
+      "reward_std": 0.21240204572677612,
+      "rewards/cosine_scaled_reward/mean": -0.19425567984580994,
+      "rewards/cosine_scaled_reward/std": 0.29181501269340515,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.578125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1984.0,
+      "completions/mean_length": 1632.171875,
+      "completions/mean_terminated_length": 1062.3333740234375,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "epoch": 0.112,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2797771692276001,
+      "learning_rate": 9.759921670520634e-07,
+      "loss": -0.0,
+      "num_tokens": 12153904.0,
+      "reward": -0.11104464530944824,
+      "reward_std": 0.2755987048149109,
+      "rewards/cosine_scaled_reward/mean": -0.11104465276002884,
+      "rewards/cosine_scaled_reward/std": 0.4012855887413025,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.734375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 847.0,
+      "completions/mean_length": 1651.078125,
+      "completions/mean_terminated_length": 553.7058715820312,
+      "completions/min_length": 390.0,
+      "completions/min_terminated_length": 390.0,
+      "epoch": 0.11314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3114299476146698,
+      "learning_rate": 9.749693666068663e-07,
+      "loss": -0.0,
+      "num_tokens": 12270741.0,
+      "reward": -0.1317199319601059,
+      "reward_std": 0.14237020909786224,
+      "rewards/cosine_scaled_reward/mean": -0.1317199319601059,
+      "rewards/cosine_scaled_reward/std": 0.3707720935344696,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.546875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2034.0,
+      "completions/mean_length": 1544.765625,
+      "completions/mean_terminated_length": 937.413818359375,
+      "completions/min_length": 457.0,
+      "completions/min_terminated_length": 457.0,
+      "epoch": 0.11428571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2654109001159668,
+      "learning_rate": 9.739258537542835e-07,
+      "loss": 0.0,
+      "num_tokens": 12379318.0,
+      "reward": -0.018167953938245773,
+      "reward_std": 0.29768484830856323,
+      "rewards/cosine_scaled_reward/mean": -0.01816795952618122,
+      "rewards/cosine_scaled_reward/std": 0.44200995564460754,
+      "step": 100
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 500,
+  "num_input_tokens_seen": 12379318,
+  "num_train_epochs": 1,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-100/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4639e85d2a55fd05c0491ef4a075a1d0d0059852d9fc8f59c4aaa80933edfcd5
+size 8824

checkpoint-100/zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,760 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts consolidated fp32 weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+# set to a truthy value to make the functions below print extra diagnostics
+debug = 0
+# load to cpu: passed as map_location to the torch.load calls below
+device = torch.device('cpu')
+def atoi(text):
+    return int(text) if text.isdigit() else text
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+def get_optim_files(checkpoint_dir):
+    """Return the sorted ``*_optim_states.pt`` files of ``checkpoint_dir``; raises ``FileNotFoundError`` if none exist."""
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+def get_model_state_files(checkpoint_dir):
+    """Return the sorted ``*_model_states.pt`` files of ``checkpoint_dir``; raises ``FileNotFoundError`` if none exist."""
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    model_files = get_model_state_files(ds_checkpoint_dir)
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a zero stage <= 2 checkpoint and store them in ``state_dict`` (in place).
+    Args:
+        - ``state_dict``: mapping that receives one entry per parameter name
+        - ``world_size``: number of partitions; also determines the alignment used by the sanity check
+        - ``fp32_flat_groups``: ``fp32_flat_groups[rank][group]`` is the flat fp32 tensor of one param group on one rank
+        - ``zero_model_states``: parsed model states; only the ``param_shapes`` of the first one are used
+    Raises:
+        - ``ValueError``: the aligned number of consumed elements of a group differs from the aligned group size
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    # for every param group concatenate the partitions of all ranks into one flat vector
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    # this total is only reported by the debug output below; the loop further down rebinds avail_numel per group
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    # walk each group's flat vector and slice the parameters out in the order of the group's shapes dict
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            # a shape without a callable numel() is treated as a plain sequence of dimensions
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # narrow() and view() don't copy: the entry shares storage with the merged flat vector
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        # round x up to the next multiple of align_to
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Register every trainable parameter in ``state_dict`` as a lazy ``GatheredTensor``.
+    Args:
+        - ``state_dict``: dict that is updated in place, one entry per parameter name
+        - ``world_size``: number of ranks the checkpoint was partitioned over
+        - ``fp32_flat_groups``: per rank, the list of flat fp32 tensors (one per group)
+        - ``zero_model_states``: model states; only ``zero_model_states[0].param_shapes`` is read
+    Raises:
+        - ``ValueError``: if the consumed number of elements differs from the available one
+    """
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in zero_model_states[0].param_shapes for k, v in d.items()}
+    group_numels = [flat_group.numel() for flat_group in fp32_flat_groups[0]]
+    avail_numel = sum(group_numels) * world_size
+    if debug:
+        for i in range(world_size):
+            # each rank holds a list of flat tensors, so report one shape per group
+            group_shapes = [flat_group.shape for flat_group in fp32_flat_groups[i]]
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={group_shapes}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    # cumulative start position of every group inside the concatenated per-rank flat space
+    flat_groups_offset = [0] + list(np.cumsum(group_numels))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # memory efficient tensor
+        state_dict[name] = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        offset += partitioned_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the consolidated state dict of a ZeRO-3 checkpoint.
+    Buffers come first, then (unless excluded) the frozen parameters, then the trainable
+    parameters; finally shared parameters are re-linked to the entry they alias.
+    Returns:
+        - an ``OrderedDict`` mapping names to the merged entries
+    """
+    rank0_states = zero_model_states[0]
+    # buffers
+    rank0_buffers = rank0_states.buffers
+    merged = OrderedDict(rank0_buffers)
+    if debug:
+        print(f"added {len(rank0_buffers)} buffers")
+    if exclude_frozen_parameters:
+        pass
+    else:
+        _zero3_merge_frozen_params(merged, world_size, zero_model_states)
+    _zero3_merge_trainable_params(merged, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in rank0_states.shared_params:
+        alias, source = pair[0], pair[1]
+        if source not in merged:
+            continue
+        merged[alias] = merged[source]
+    return merged
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    Entries that are the very same object (shared tensors) are converted once and the result
+    is reused for every name that refers to it.
+    Args:
+        - ``state_dict``: mapping of names to objects exposing ``contiguous()``, ``shape`` and ``dtype``
+        - ``return_empty_tensor``: if set, allocate ``torch.empty`` placeholders of matching
+          shape/dtype instead of calling ``contiguous()``
+    Returns:
+        - a plain ``dict`` with the same keys in the same order
+    """
+    first_name_by_id = {}
+    materialized = {}
+    for key, lazy in state_dict.items():
+        ident = id(lazy)
+        if ident not in first_name_by_id:
+            first_name_by_id[ident] = key
+            if not return_empty_tensor:
+                materialized[key] = lazy.contiguous()
+            else:
+                materialized[key] = torch.empty(lazy.shape, dtype=lazy.dtype)
+            continue
+        # shared tensors
+        materialized[key] = materialized[first_name_by_id[ident]]
+    return materialized
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
+          Convert a pseudo tensor to a torch tensor with ``.contiguous()``
+    Returns:
+        - pytorch ``state_dict``
+    Raises:
+        - ``ValueError``: ``tag`` is not given and there is no ``latest`` file in ``checkpoint_dir``
+        - ``FileNotFoundError``: the tag folder does not exist inside ``checkpoint_dir``
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it is no longer in use
+    """
+    if tag is None:
+        # fall back to the tag recorded in the 'latest' file
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if not os.path.isfile(latest_path):
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+        with open(latest_path, 'r') as fd:
+            tag = fd.read().strip()
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    lazy_state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    return lazy_state_dict if lazy_mode else to_torch_tensor(lazy_state_dict)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    The state dict is obtained in lazy mode and materialized one shard at a time; each shard is
+    dropped from memory right after it has been written.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB. ``None`` writes a single file
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Raises:
+        - ``ImportError``: a requested optional dependency (``safetensors`` / ``huggingface_hub``) is missing
+    """
+    use_sharding = max_shard_size is not None
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if use_sharding:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+    # Convert zero checkpoint to state_dict
+    lazy_state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                               tag,
+                                                               exclude_frozen_parameters,
+                                                               lazy_mode=True)
+    if safe_serialization:
+        weights_name = "model.safetensors"
+        index_name = "model.safetensors.index.json"
+    else:
+        weights_name = "pytorch_model.bin"
+        index_name = "pytorch_model.bin.index.json"
+    # Shard the model if it is too big.
+    if use_sharding:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: plan the split on empty placeholder tensors
+        placeholders = to_torch_tensor(lazy_state_dict, return_empty_tensor=True)
+        split_plan = split_torch_state_dict_into_shards(placeholders,
+                                                        filename_pattern=filename_pattern,
+                                                        max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        split_plan = StateDictSplit(is_sharded=False,
+                                    filename_to_tensors={weights_name: list(lazy_state_dict.keys())})
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    for shard_file, tensor_names in tqdm(split_plan.filename_to_tensors.items(), desc="Saving checkpoint shards"):
+        shard = to_torch_tensor({tensor_name: lazy_state_dict[tensor_name] for tensor_name in tensor_names})
+        shard_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard, shard_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard, shard_path)
+        # release the memory of current shard
+        for tensor_name in list(shard.keys()):
+            del lazy_state_dict[tensor_name]
+            del shard[tensor_name]
+        del shard
+        gc.collect()
+    # Save index if sharded
+    if not split_plan.is_sharded:
+        return
+    index = {
+        "metadata": split_plan.metadata,
+        "weight_map": split_plan.tensor_to_filename,
+    }
+    index_path = os.path.join(output_dir, index_name)
+    with open(index_path, "w", encoding="utf-8") as f:
+        f.write(json.dumps(index, indent=2, sort_keys=True) + "\n")
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model (non-strict, i.e. ``strict=False``)
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model``: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info("Extracting fp32 weights")
+    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info("Overwriting model with fp32 weights")
+    cpu_model = model.cpu()
+    cpu_model.load_state_dict(fp32_weights, strict=False)
+    return cpu_model
+# Command line entry point: convert the checkpoint in ``checkpoint_dir`` and write the
+# consolidated fp32 weights into ``output_dir``.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    # NOTE: adjacent string literals are concatenated verbatim, so every fragment of a
+    # multi-line help text must carry its own trailing space.
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files "
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    # module-level flag read by the merge helpers to enable verbose printing
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)