Intermediate checkpoint upload step=90 (generator_train)

Browse files

Files changed (12) hide show

.gitattributes +1 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/chat_template.jinja +54 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/config.json +57 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/generation_config.json +13 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/model.safetensors +3 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/optimizer.pt +3 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/rng_state.pth +3 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/scheduler.pt +3 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/tokenizer.json +3 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/tokenizer_config.json +32 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/trainer_state.json +1683 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -54,3 +54,4 @@ self_play_hf_l40s_full/round_001/answerer_train/checkpoint-120/tokenizer.json fi
 self_play_hf_l40s_full/round_001/answerer_train/final_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 self_play_hf_l40s_full/round_002/generator_train/checkpoint-60/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 self_play_hf_l40s_full/round_001/answerer_train/final_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 self_play_hf_l40s_full/round_002/generator_train/checkpoint-60/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/tokenizer.json filter=lfs diff=lfs merge=lfs -text

self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "float32",
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 896,
+  "initializer_range": 0.02,
+  "intermediate_size": 4864,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000.0,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.6.2",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "5.6.2"
+}

self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:795d4902fb523a56e1fdadec29b397f6d85823a6db98628452bb6706cf986a88
+size 1976163472

self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:646da4572e89e1f3601f400342c6ebefb01ba9e33a8d08181d56c6956a9d7e93
+size 3952509771

self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b5001a1413e890182ec459c1edbf745017684c89f471fbb0cafcaadbc1070a9f
+size 14645

self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9466ecaa07ea9f7aa3b70a452615802059d7a87e6220a892a7f18c7703f928c0
+size 1465

self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
+size 11421892

self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": true,
+  "local_files_only": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "truncation_side": "left",
+  "unk_token": null
+}

self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1683 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.75,
+  "eval_steps": 500,
+  "global_step": 90,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0006510416860692203,
+      "clip_ratio/high_mean": 0.0006510416860692203,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0006510416860692203,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.2909066677093506,
+      "epoch": 0.041666666666666664,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 3.419240951538086,
+      "kl": 0.0005526020540855825,
+      "learning_rate": 5e-06,
+      "loss": 0.12301453948020935,
+      "num_tokens": 25288.0,
+      "reward": -0.13312500715255737,
+      "reward_std": 0.1580703854560852,
+      "rewards/GeneratorRewardFunction/mean": -0.13312500715255737,
+      "rewards/GeneratorRewardFunction/std": 0.1580704003572464,
+      "step": 1,
+      "step_time": 12.38097839000011
+    },
+    {
+      "clip_ratio/high_max": 0.025390625,
+      "clip_ratio/high_mean": 0.025390625,
+      "clip_ratio/low_mean": 0.0481770820915699,
+      "clip_ratio/low_min": 0.0481770820915699,
+      "clip_ratio/region_mean": 0.0735677108168602,
+      "entropy": 2.2825264930725098,
+      "epoch": 0.08333333333333333,
+      "grad_norm": 3.043118476867676,
+      "kl": 0.017308469861745834,
+      "learning_rate": 4.958333333333334e-06,
+      "loss": 0.12409868091344833,
+      "step": 2,
+      "step_time": 0.22687196500010032
+    },
+    {
+      "clip_ratio/high_max": 0.068359375,
+      "clip_ratio/high_mean": 0.068359375,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.068359375,
+      "entropy": 1.7901748418807983,
+      "epoch": 0.125,
+      "grad_norm": 1.792228102684021,
+      "kl": 0.042044539004564285,
+      "learning_rate": 4.9166666666666665e-06,
+      "loss": -0.11261526495218277,
+      "step": 3,
+      "step_time": 0.22754332899967267
+    },
+    {
+      "clip_ratio/high_max": 0.1139322891831398,
+      "clip_ratio/high_mean": 0.1139322891831398,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.1139322891831398,
+      "entropy": 2.698014974594116,
+      "epoch": 0.16666666666666666,
+      "grad_norm": 2.728670835494995,
+      "kl": 0.11385751515626907,
+      "learning_rate": 4.875e-06,
+      "loss": -0.11856894940137863,
+      "step": 4,
+      "step_time": 0.22612155700016956
+    },
+    {
+      "clip_ratio/high_max": 0.0006510416860692203,
+      "clip_ratio/high_mean": 0.0006510416860692203,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.0013020833721384406,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.2396337985992432,
+      "epoch": 0.20833333333333334,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 3.7660365104675293,
+      "kl": 0.027865365147590637,
+      "learning_rate": 4.833333333333333e-06,
+      "loss": 0.07652663439512253,
+      "num_tokens": 50484.0,
+      "reward": -0.1990624964237213,
+      "reward_std": 0.22889748215675354,
+      "rewards/GeneratorRewardFunction/mean": -0.1990624964237213,
+      "rewards/GeneratorRewardFunction/std": 0.22889748215675354,
+      "step": 5,
+      "step_time": 12.267690716000288
+    },
+    {
+      "clip_ratio/high_max": 0.0065104165114462376,
+      "clip_ratio/high_mean": 0.0065104165114462376,
+      "clip_ratio/low_mean": 0.025390625,
+      "clip_ratio/low_min": 0.025390625,
+      "clip_ratio/region_mean": 0.0319010429084301,
+      "entropy": 1.3415206670761108,
+      "epoch": 0.25,
+      "grad_norm": 1.7361595630645752,
+      "kl": 0.03342495858669281,
+      "learning_rate": 4.791666666666668e-06,
+      "loss": 0.06931179016828537,
+      "step": 6,
+      "step_time": 0.22098127500021292
+    },
+    {
+      "clip_ratio/high_max": 0.01888020895421505,
+      "clip_ratio/high_mean": 0.01888020895421505,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.01888020895421505,
+      "entropy": 1.5484552383422852,
+      "epoch": 0.2916666666666667,
+      "grad_norm": 0.7583158612251282,
+      "kl": 0.0418914258480072,
+      "learning_rate": 4.75e-06,
+      "loss": -0.051571328192949295,
+      "step": 7,
+      "step_time": 0.22065312600034304
+    },
+    {
+      "clip_ratio/high_max": 0.1178385391831398,
+      "clip_ratio/high_mean": 0.1178385391831398,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.1178385391831398,
+      "entropy": 2.2375190258026123,
+      "epoch": 0.3333333333333333,
+      "grad_norm": 1.100609302520752,
+      "kl": 0.05581093952059746,
+      "learning_rate": 4.708333333333334e-06,
+      "loss": -0.09243377298116684,
+      "step": 8,
+      "step_time": 0.2210044349999407
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.8528798818588257,
+      "epoch": 0.375,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.863710880279541,
+      "kl": 0.13026608526706696,
+      "learning_rate": 4.666666666666667e-06,
+      "loss": -0.01838863641023636,
+      "num_tokens": 75740.0,
+      "reward": -0.1399218738079071,
+      "reward_std": 0.2057529091835022,
+      "rewards/GeneratorRewardFunction/mean": -0.1399218738079071,
+      "rewards/GeneratorRewardFunction/std": 0.20575293898582458,
+      "step": 9,
+      "step_time": 12.445934573000159
+    },
+    {
+      "clip_ratio/high_max": 0.008463541977107525,
+      "clip_ratio/high_mean": 0.008463541977107525,
+      "clip_ratio/low_mean": 0.0013020833721384406,
+      "clip_ratio/low_min": 0.0013020833721384406,
+      "clip_ratio/region_mean": 0.009765625,
+      "entropy": 1.8978748321533203,
+      "epoch": 0.4166666666666667,
+      "grad_norm": 2.6196653842926025,
+      "kl": 0.0548124723136425,
+      "learning_rate": 4.625000000000001e-06,
+      "loss": -0.02614506147801876,
+      "step": 10,
+      "step_time": 0.2266816760002257
+    },
+    {
+      "clip_ratio/high_max": 0.013671875,
+      "clip_ratio/high_mean": 0.013671875,
+      "clip_ratio/low_mean": 0.013020833022892475,
+      "clip_ratio/low_min": 0.013020833022892475,
+      "clip_ratio/region_mean": 0.02669270895421505,
+      "entropy": 1.9386297464370728,
+      "epoch": 0.4583333333333333,
+      "grad_norm": 3.514759063720703,
+      "kl": 0.0901331901550293,
+      "learning_rate": 4.583333333333333e-06,
+      "loss": 0.12482144683599472,
+      "step": 11,
+      "step_time": 0.22661321499981568
+    },
+    {
+      "clip_ratio/high_max": 0.04296875,
+      "clip_ratio/high_mean": 0.04296875,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.04296875,
+      "entropy": 2.489102602005005,
+      "epoch": 0.5,
+      "grad_norm": 1.1873637437820435,
+      "kl": 0.06463827937841415,
+      "learning_rate": 4.541666666666667e-06,
+      "loss": -0.07826828956604004,
+      "step": 12,
+      "step_time": 0.22605949299986605
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.2705869674682617,
+      "epoch": 0.5416666666666666,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5841100215911865,
+      "kl": 0.07928314059972763,
+      "learning_rate": 4.5e-06,
+      "loss": -0.05041670799255371,
+      "num_tokens": 100992.0,
+      "reward": -0.06781250238418579,
+      "reward_std": 0.02886570803821087,
+      "rewards/GeneratorRewardFunction/mean": -0.06781250238418579,
+      "rewards/GeneratorRewardFunction/std": 0.028865709900856018,
+      "step": 13,
+      "step_time": 12.360951734000082
+    },
+    {
+      "clip_ratio/high_max": 0.015625,
+      "clip_ratio/high_mean": 0.015625,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.01627604104578495,
+      "entropy": 1.7322059869766235,
+      "epoch": 0.5833333333333334,
+      "grad_norm": 2.0741848945617676,
+      "kl": 0.060624316334724426,
+      "learning_rate": 4.4583333333333336e-06,
+      "loss": -0.04468907043337822,
+      "step": 14,
+      "step_time": 0.21993768200036357
+    },
+    {
+      "clip_ratio/high_max": 0.01627604104578495,
+      "clip_ratio/high_mean": 0.01627604104578495,
+      "clip_ratio/low_mean": 0.0026041667442768812,
+      "clip_ratio/low_min": 0.0026041667442768812,
+      "clip_ratio/region_mean": 0.01888020895421505,
+      "entropy": 2.18737530708313,
+      "epoch": 0.625,
+      "grad_norm": 4.066506385803223,
+      "kl": 0.07642111927270889,
+      "learning_rate": 4.416666666666667e-06,
+      "loss": 0.1044829785823822,
+      "step": 15,
+      "step_time": 0.22013131699986843
+    },
+    {
+      "clip_ratio/high_max": 0.0442708320915699,
+      "clip_ratio/high_mean": 0.0442708320915699,
+      "clip_ratio/low_mean": 0.009765625,
+      "clip_ratio/low_min": 0.009765625,
+      "clip_ratio/region_mean": 0.0540364570915699,
+      "entropy": 2.622234582901001,
+      "epoch": 0.6666666666666666,
+      "grad_norm": 3.0252487659454346,
+      "kl": 0.10498789697885513,
+      "learning_rate": 4.3750000000000005e-06,
+      "loss": -0.003130403347313404,
+      "step": 16,
+      "step_time": 0.21912707499996031
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.0006510416860692203,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 0.9028136134147644,
+      "epoch": 0.7083333333333334,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 1.3516546487808228,
+      "kl": 0.08489418029785156,
+      "learning_rate": 4.333333333333334e-06,
+      "loss": 0.03129524365067482,
+      "num_tokens": 126380.0,
+      "reward": -0.18468749523162842,
+      "reward_std": 0.27848079800605774,
+      "rewards/GeneratorRewardFunction/mean": -0.18468749523162842,
+      "rewards/GeneratorRewardFunction/std": 0.27848079800605774,
+      "step": 17,
+      "step_time": 12.429075290000128
+    },
+    {
+      "clip_ratio/high_max": 0.001953125,
+      "clip_ratio/high_mean": 0.001953125,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 2.785872220993042,
+      "epoch": 0.75,
+      "grad_norm": 1.737163782119751,
+      "kl": 0.10512515157461166,
+      "learning_rate": 4.2916666666666665e-06,
+      "loss": -0.0846581980586052,
+      "step": 18,
+      "step_time": 0.2297946270000466
+    },
+    {
+      "clip_ratio/high_max": 0.01171875,
+      "clip_ratio/high_mean": 0.01171875,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.012369791977107525,
+      "entropy": 2.066610097885132,
+      "epoch": 0.7916666666666666,
+      "grad_norm": 2.454716920852661,
+      "kl": 0.10243833065032959,
+      "learning_rate": 4.25e-06,
+      "loss": 0.031895026564598083,
+      "step": 19,
+      "step_time": 0.23007028499978333
+    },
+    {
+      "clip_ratio/high_max": 0.013671875,
+      "clip_ratio/high_mean": 0.013671875,
+      "clip_ratio/low_mean": 0.0032552082557231188,
+      "clip_ratio/low_min": 0.0032552082557231188,
+      "clip_ratio/region_mean": 0.01692708395421505,
+      "entropy": 1.9649995565414429,
+      "epoch": 0.8333333333333334,
+      "grad_norm": 2.563176155090332,
+      "kl": 0.07682739198207855,
+      "learning_rate": 4.208333333333333e-06,
+      "loss": 0.022962143644690514,
+      "step": 20,
+      "step_time": 0.23112243600007787
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.3113908767700195,
+      "epoch": 0.875,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.672222375869751,
+      "kl": 0.09574653953313828,
+      "learning_rate": 4.166666666666667e-06,
+      "loss": -0.04630495235323906,
+      "num_tokens": 151916.0,
+      "reward": -0.18390625715255737,
+      "reward_std": 0.24478043615818024,
+      "rewards/GeneratorRewardFunction/mean": -0.18390625715255737,
+      "rewards/GeneratorRewardFunction/std": 0.24478045105934143,
+      "step": 21,
+      "step_time": 12.423363883000093
+    },
+    {
+      "clip_ratio/high_max": 0.0013020833721384406,
+      "clip_ratio/high_mean": 0.0013020833721384406,
+      "clip_ratio/low_mean": 0.00390625,
+      "clip_ratio/low_min": 0.00390625,
+      "clip_ratio/region_mean": 0.0052083334885537624,
+      "entropy": 1.7172327041625977,
+      "epoch": 0.9166666666666666,
+      "grad_norm": 2.3833675384521484,
+      "kl": 0.10857907682657242,
+      "learning_rate": 4.125e-06,
+      "loss": 0.08606918901205063,
+      "step": 22,
+      "step_time": 0.23097956299989164
+    },
+    {
+      "clip_ratio/high_max": 0.0071614584885537624,
+      "clip_ratio/high_mean": 0.0071614584885537624,
+      "clip_ratio/low_mean": 0.001953125,
+      "clip_ratio/low_min": 0.001953125,
+      "clip_ratio/region_mean": 0.009114583022892475,
+      "entropy": 2.305208444595337,
+      "epoch": 0.9583333333333334,
+      "grad_norm": 1.6843225955963135,
+      "kl": 0.14590007066726685,
+      "learning_rate": 4.083333333333334e-06,
+      "loss": -0.037479907274246216,
+      "step": 23,
+      "step_time": 0.2308707210004286
+    },
+    {
+      "clip_ratio/high_max": 0.014973958022892475,
+      "clip_ratio/high_mean": 0.014973958022892475,
+      "clip_ratio/low_mean": 0.0052083334885537624,
+      "clip_ratio/low_min": 0.0052083334885537624,
+      "clip_ratio/region_mean": 0.02018229104578495,
+      "entropy": 1.4579397439956665,
+      "epoch": 1.0,
+      "grad_norm": 2.526630163192749,
+      "kl": 0.10583696514368057,
+      "learning_rate": 4.041666666666667e-06,
+      "loss": -0.0003783749125432223,
+      "step": 24,
+      "step_time": 0.23123665799994342
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.00585675239563,
+      "epoch": 1.0416666666666667,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.705661952495575,
+      "kl": 0.10460541397333145,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": -0.030740460380911827,
+      "num_tokens": 177032.0,
+      "reward": -0.054375000298023224,
+      "reward_std": 0.012499997392296791,
+      "rewards/GeneratorRewardFunction/mean": -0.054375000298023224,
+      "rewards/GeneratorRewardFunction/std": 0.012500000186264515,
+      "step": 25,
+      "step_time": 12.342589669000063
+    },
+    {
+      "clip_ratio/high_max": 0.001953125,
+      "clip_ratio/high_mean": 0.001953125,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 1.3793340921401978,
+      "epoch": 1.0833333333333333,
+      "grad_norm": 1.3432904481887817,
+      "kl": 0.10281199216842651,
+      "learning_rate": 3.958333333333333e-06,
+      "loss": -0.0930887758731842,
+      "step": 26,
+      "step_time": 0.22512248300017745
+    },
+    {
+      "clip_ratio/high_max": 0.005859375,
+      "clip_ratio/high_mean": 0.005859375,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.005859375,
+      "entropy": 1.6825193166732788,
+      "epoch": 1.125,
+      "grad_norm": 2.242509603500366,
+      "kl": 0.10501086711883545,
+      "learning_rate": 3.916666666666667e-06,
+      "loss": 0.031402457505464554,
+      "step": 27,
+      "step_time": 0.22399900899972636
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0026041667442768812,
+      "clip_ratio/low_min": 0.0026041667442768812,
+      "clip_ratio/region_mean": 0.0026041667442768812,
+      "entropy": 1.8964576721191406,
+      "epoch": 1.1666666666666667,
+      "grad_norm": 2.502171039581299,
+      "kl": 0.12420836836099625,
+      "learning_rate": 3.875e-06,
+      "loss": 0.09433440119028091,
+      "step": 28,
+      "step_time": 0.22452458999987357
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.6724086999893188,
+      "epoch": 1.2083333333333333,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0264368057250977,
+      "kl": 0.15805059671401978,
+      "learning_rate": 3.833333333333334e-06,
+      "loss": -0.06202007457613945,
+      "num_tokens": 202536.0,
+      "reward": -0.125,
+      "reward_std": 0.20493900775909424,
+      "rewards/GeneratorRewardFunction/mean": -0.125,
+      "rewards/GeneratorRewardFunction/std": 0.20493900775909424,
+      "step": 29,
+      "step_time": 12.317649789999905
+    },
+    {
+      "clip_ratio/high_max": 0.0032552082557231188,
+      "clip_ratio/high_mean": 0.0032552082557231188,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0032552082557231188,
+      "entropy": 1.8093080520629883,
+      "epoch": 1.25,
+      "grad_norm": 1.1415506601333618,
+      "kl": 0.12885110080242157,
+      "learning_rate": 3.7916666666666666e-06,
+      "loss": -0.06245793402194977,
+      "step": 30,
+      "step_time": 0.22891983299996355
+    },
+    {
+      "clip_ratio/high_max": 0.0065104165114462376,
+      "clip_ratio/high_mean": 0.0065104165114462376,
+      "clip_ratio/low_mean": 0.0013020833721384406,
+      "clip_ratio/low_min": 0.0013020833721384406,
+      "clip_ratio/region_mean": 0.0078125,
+      "entropy": 2.2301156520843506,
+      "epoch": 1.2916666666666667,
+      "grad_norm": 2.2105777263641357,
+      "kl": 0.19130028784275055,
+      "learning_rate": 3.7500000000000005e-06,
+      "loss": 0.06395368278026581,
+      "step": 31,
+      "step_time": 0.22761336599978677
+    },
+    {
+      "clip_ratio/high_max": 0.0071614584885537624,
+      "clip_ratio/high_mean": 0.0071614584885537624,
+      "clip_ratio/low_mean": 0.005859375,
+      "clip_ratio/low_min": 0.005859375,
+      "clip_ratio/region_mean": 0.013020833022892475,
+      "entropy": 1.473127007484436,
+      "epoch": 1.3333333333333333,
+      "grad_norm": 2.4992551803588867,
+      "kl": 0.17668001353740692,
+      "learning_rate": 3.708333333333334e-06,
+      "loss": 0.06458758562803268,
+      "step": 32,
+      "step_time": 0.22745083499967222
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.6607534885406494,
+      "epoch": 1.375,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 2.440457582473755,
+      "kl": 0.16098029911518097,
+      "learning_rate": 3.6666666666666666e-06,
+      "loss": 0.08189795166254044,
+      "num_tokens": 227804.0,
+      "reward": -0.06499999761581421,
+      "reward_std": 0.03568379953503609,
+      "rewards/GeneratorRewardFunction/mean": -0.06499999761581421,
+      "rewards/GeneratorRewardFunction/std": 0.03568379580974579,
+      "step": 33,
+      "step_time": 12.333147503999953
+    },
+    {
+      "clip_ratio/high_max": 0.0026041667442768812,
+      "clip_ratio/high_mean": 0.0026041667442768812,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0026041667442768812,
+      "entropy": 2.67105770111084,
+      "epoch": 1.4166666666666667,
+      "grad_norm": 1.7682558298110962,
+      "kl": 0.15667910873889923,
+      "learning_rate": 3.625e-06,
+      "loss": -0.11175209283828735,
+      "step": 34,
+      "step_time": 0.2226674130001811
+    },
+    {
+      "clip_ratio/high_max": 0.011067708022892475,
+      "clip_ratio/high_mean": 0.011067708022892475,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.011067708022892475,
+      "entropy": 2.243622064590454,
+      "epoch": 1.4583333333333333,
+      "grad_norm": 1.3864362239837646,
+      "kl": 0.1985555738210678,
+      "learning_rate": 3.5833333333333335e-06,
+      "loss": -0.08945892006158829,
+      "step": 35,
+      "step_time": 0.2224721780003165
+    },
+    {
+      "clip_ratio/high_max": 0.01888020895421505,
+      "clip_ratio/high_mean": 0.01888020895421505,
+      "clip_ratio/low_mean": 0.00390625,
+      "clip_ratio/low_min": 0.00390625,
+      "clip_ratio/region_mean": 0.02278645895421505,
+      "entropy": 2.7333145141601562,
+      "epoch": 1.5,
+      "grad_norm": 3.525688886642456,
+      "kl": 0.1788162738084793,
+      "learning_rate": 3.5416666666666673e-06,
+      "loss": 0.12119659781455994,
+      "step": 36,
+      "step_time": 0.22364275199970507
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.323140859603882,
+      "epoch": 1.5416666666666665,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 1.3344190120697021,
+      "kl": 0.19980604946613312,
+      "learning_rate": 3.5e-06,
+      "loss": -0.08966616541147232,
+      "num_tokens": 253140.0,
+      "reward": -0.05218750238418579,
+      "reward_std": 0.006316314451396465,
+      "rewards/GeneratorRewardFunction/mean": -0.05218750238418579,
+      "rewards/GeneratorRewardFunction/std": 0.00631631538271904,
+      "step": 37,
+      "step_time": 12.417013897999823
+    },
+    {
+      "clip_ratio/high_max": 0.0013020833721384406,
+      "clip_ratio/high_mean": 0.0013020833721384406,
+      "clip_ratio/low_mean": 0.0013020833721384406,
+      "clip_ratio/low_min": 0.0013020833721384406,
+      "clip_ratio/region_mean": 0.0026041667442768812,
+      "entropy": 2.036484956741333,
+      "epoch": 1.5833333333333335,
+      "grad_norm": 2.302572250366211,
+      "kl": 0.17958642542362213,
+      "learning_rate": 3.4583333333333334e-06,
+      "loss": 0.05969787761569023,
+      "step": 38,
+      "step_time": 0.23123699799998576
+    },
+    {
+      "clip_ratio/high_max": 0.010416666977107525,
+      "clip_ratio/high_mean": 0.010416666977107525,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.010416666977107525,
+      "entropy": 2.5753612518310547,
+      "epoch": 1.625,
+      "grad_norm": 2.368976354598999,
+      "kl": 0.1637336164712906,
+      "learning_rate": 3.416666666666667e-06,
+      "loss": -0.0014251094544306397,
+      "step": 39,
+      "step_time": 0.23036246999981813
+    },
+    {
+      "clip_ratio/high_max": 0.02213541604578495,
+      "clip_ratio/high_mean": 0.02213541604578495,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.02278645895421505,
+      "entropy": 2.702845573425293,
+      "epoch": 1.6666666666666665,
+      "grad_norm": 2.3713412284851074,
+      "kl": 0.1807432919740677,
+      "learning_rate": 3.3750000000000003e-06,
+      "loss": 0.034005194902420044,
+      "step": 40,
+      "step_time": 0.23214888699976655
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.8684877157211304,
+      "epoch": 1.7083333333333335,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 4.169064521789551,
+      "kl": 0.15921029448509216,
+      "learning_rate": 3.3333333333333333e-06,
+      "loss": 0.24258767068386078,
+      "num_tokens": 278460.0,
+      "reward": -0.07437500357627869,
+      "reward_std": 0.07449552416801453,
+      "rewards/GeneratorRewardFunction/mean": -0.07437500357627869,
+      "rewards/GeneratorRewardFunction/std": 0.07449552416801453,
+      "step": 41,
+      "step_time": 12.523336845000358
+    },
+    {
+      "clip_ratio/high_max": 0.001953125,
+      "clip_ratio/high_mean": 0.001953125,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 1.8914880752563477,
+      "epoch": 1.75,
+      "grad_norm": 2.1851789951324463,
+      "kl": 0.16675592958927155,
+      "learning_rate": 3.2916666666666668e-06,
+      "loss": 0.002008717739954591,
+      "step": 42,
+      "step_time": 0.22089864800000214
+    },
+    {
+      "clip_ratio/high_max": 0.0052083334885537624,
+      "clip_ratio/high_mean": 0.0052083334885537624,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0052083334885537624,
+      "entropy": 2.0609495639801025,
+      "epoch": 1.7916666666666665,
+      "grad_norm": 1.6196449995040894,
+      "kl": 0.1755555123090744,
+      "learning_rate": 3.2500000000000002e-06,
+      "loss": -0.11282453685998917,
+      "step": 43,
+      "step_time": 0.22081435700010843
+    },
+    {
+      "clip_ratio/high_max": 0.021484375,
+      "clip_ratio/high_mean": 0.021484375,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.021484375,
+      "entropy": 1.6751976013183594,
+      "epoch": 1.8333333333333335,
+      "grad_norm": 1.4664008617401123,
+      "kl": 0.1903287172317505,
+      "learning_rate": 3.2083333333333337e-06,
+      "loss": -0.12895803153514862,
+      "step": 44,
+      "step_time": 0.22197585999992953
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.793825626373291,
+      "epoch": 1.875,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 2.486213445663452,
+      "kl": 0.1978270262479782,
+      "learning_rate": 3.1666666666666667e-06,
+      "loss": 0.02904059924185276,
+      "num_tokens": 303832.0,
+      "reward": -0.06187500059604645,
+      "reward_std": 0.018427787348628044,
+      "rewards/GeneratorRewardFunction/mean": -0.06187500059604645,
+      "rewards/GeneratorRewardFunction/std": 0.018427787348628044,
+      "step": 45,
+      "step_time": 12.442938505999791
+    },
+    {
+      "clip_ratio/high_max": 0.0032552082557231188,
+      "clip_ratio/high_mean": 0.0032552082557231188,
+      "clip_ratio/low_mean": 0.0013020833721384406,
+      "clip_ratio/low_min": 0.0013020833721384406,
+      "clip_ratio/region_mean": 0.0045572915114462376,
+      "entropy": 1.8483558893203735,
+      "epoch": 1.9166666666666665,
+      "grad_norm": 1.7908422946929932,
+      "kl": 0.18597811460494995,
+      "learning_rate": 3.125e-06,
+      "loss": -0.035078153014183044,
+      "step": 46,
+      "step_time": 0.23107473800018852
+    },
+    {
+      "clip_ratio/high_max": 0.0026041667442768812,
+      "clip_ratio/high_mean": 0.0026041667442768812,
+      "clip_ratio/low_mean": 0.001953125,
+      "clip_ratio/low_min": 0.001953125,
+      "clip_ratio/region_mean": 0.0045572915114462376,
+      "entropy": 2.0933265686035156,
+      "epoch": 1.9583333333333335,
+      "grad_norm": 2.3887951374053955,
+      "kl": 0.24478764832019806,
+      "learning_rate": 3.0833333333333336e-06,
+      "loss": 0.03933021053671837,
+      "step": 47,
+      "step_time": 0.2313891750000039
+    },
+    {
+      "clip_ratio/high_max": 0.010416666977107525,
+      "clip_ratio/high_mean": 0.010416666977107525,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.011067708022892475,
+      "entropy": 1.6857815980911255,
+      "epoch": 2.0,
+      "grad_norm": 1.1888960599899292,
+      "kl": 0.2191638946533203,
+      "learning_rate": 3.0416666666666666e-06,
+      "loss": -0.03249615058302879,
+      "step": 48,
+      "step_time": 0.23116914000001998
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.1281673908233643,
+      "epoch": 2.0416666666666665,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 3.3418045043945312,
+      "kl": 0.2323862910270691,
+      "learning_rate": 3e-06,
+      "loss": 0.12015211582183838,
+      "num_tokens": 329212.0,
+      "reward": -0.09343749284744263,
+      "reward_std": 0.14919470250606537,
+      "rewards/GeneratorRewardFunction/mean": -0.09343749284744263,
+      "rewards/GeneratorRewardFunction/std": 0.14919471740722656,
+      "step": 49,
+      "step_time": 12.334705416999896
+    },
+    {
+      "clip_ratio/high_max": 0.0026041667442768812,
+      "clip_ratio/high_mean": 0.0026041667442768812,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0026041667442768812,
+      "entropy": 2.0504140853881836,
+      "epoch": 2.0833333333333335,
+      "grad_norm": 1.1310595273971558,
+      "kl": 0.21666662395000458,
+      "learning_rate": 2.9583333333333335e-06,
+      "loss": -0.06185801699757576,
+      "step": 50,
+      "step_time": 0.22278345899985652
+    },
+    {
+      "clip_ratio/high_max": 0.0032552082557231188,
+      "clip_ratio/high_mean": 0.0032552082557231188,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0032552082557231188,
+      "entropy": 2.090566635131836,
+      "epoch": 2.125,
+      "grad_norm": 0.8605477213859558,
+      "kl": 0.19241023063659668,
+      "learning_rate": 2.916666666666667e-06,
+      "loss": -0.05228700116276741,
+      "step": 51,
+      "step_time": 0.22478994999983115
+    },
+    {
+      "clip_ratio/high_max": 0.01888020895421505,
+      "clip_ratio/high_mean": 0.01888020895421505,
+      "clip_ratio/low_mean": 0.0026041667442768812,
+      "clip_ratio/low_min": 0.0026041667442768812,
+      "clip_ratio/region_mean": 0.021484375,
+      "entropy": 1.8061343431472778,
+      "epoch": 2.1666666666666665,
+      "grad_norm": 2.3467957973480225,
+      "kl": 0.20559708774089813,
+      "learning_rate": 2.875e-06,
+      "loss": -0.004945727530866861,
+      "step": 52,
+      "step_time": 0.2233186100002058
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.0986216068267822,
+      "epoch": 2.2083333333333335,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.6935897469520569,
+      "kl": 0.2905842065811157,
+      "learning_rate": 2.8333333333333335e-06,
+      "loss": -0.030335674062371254,
+      "num_tokens": 354388.0,
+      "reward": -0.05218750238418579,
+      "reward_std": 0.008750000968575478,
+      "rewards/GeneratorRewardFunction/mean": -0.05218750238418579,
+      "rewards/GeneratorRewardFunction/std": 0.008750000037252903,
+      "step": 53,
+      "step_time": 12.331060373999662
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.8389402627944946,
+      "epoch": 2.25,
+      "grad_norm": 0.056280333548784256,
+      "kl": 0.3166055381298065,
+      "learning_rate": 2.791666666666667e-06,
+      "loss": 0.0007915138266980648,
+      "step": 54,
+      "step_time": 0.22823307299995577
+    },
+    {
+      "clip_ratio/high_max": 0.0026041667442768812,
+      "clip_ratio/high_mean": 0.0026041667442768812,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0026041667442768812,
+      "entropy": 2.0672857761383057,
+      "epoch": 2.2916666666666665,
+      "grad_norm": 0.6982938051223755,
+      "kl": 0.255536288022995,
+      "learning_rate": 2.7500000000000004e-06,
+      "loss": -0.030553629621863365,
+      "step": 55,
+      "step_time": 0.2274559459997363
+    },
+    {
+      "clip_ratio/high_max": 0.014322916977107525,
+      "clip_ratio/high_mean": 0.014322916977107525,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.014973958022892475,
+      "entropy": 1.800961971282959,
+      "epoch": 2.3333333333333335,
+      "grad_norm": 1.8801226615905762,
+      "kl": 0.20345354080200195,
+      "learning_rate": 2.7083333333333334e-06,
+      "loss": 0.06233185529708862,
+      "step": 56,
+      "step_time": 0.2291451330002019
+    },
+    {
+      "clip_ratio/high_max": 0.0006510416860692203,
+      "clip_ratio/high_mean": 0.0006510416860692203,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0006510416860692203,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.1910431385040283,
+      "epoch": 2.375,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.6606095433235168,
+      "kl": 0.2666972875595093,
+      "learning_rate": 2.666666666666667e-06,
+      "loss": -0.029419520869851112,
+      "num_tokens": 379740.0,
+      "reward": -0.050312504172325134,
+      "reward_std": 0.0012500007869675756,
+      "rewards/GeneratorRewardFunction/mean": -0.050312504172325134,
+      "rewards/GeneratorRewardFunction/std": 0.001249999739229679,
+      "step": 57,
+      "step_time": 12.390588525000112
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.6509644985198975,
+      "epoch": 2.4166666666666665,
+      "grad_norm": 0.7344714403152466,
+      "kl": 0.27095064520835876,
+      "learning_rate": 2.6250000000000003e-06,
+      "loss": -0.02949327416718006,
+      "step": 58,
+      "step_time": 0.22986735099993894
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.0006510416860692203,
+      "entropy": 1.9508165121078491,
+      "epoch": 2.4583333333333335,
+      "grad_norm": 2.0245089530944824,
+      "kl": 0.24057161808013916,
+      "learning_rate": 2.5833333333333337e-06,
+      "loss": 0.09111762046813965,
+      "step": 59,
+      "step_time": 0.22998812399964663
+    },
+    {
+      "clip_ratio/high_max": 0.0071614584885537624,
+      "clip_ratio/high_mean": 0.0071614584885537624,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0071614584885537624,
+      "entropy": 2.048118829727173,
+      "epoch": 2.5,
+      "grad_norm": 0.641863226890564,
+      "kl": 0.30453336238861084,
+      "learning_rate": 2.5416666666666668e-06,
+      "loss": -0.02960226871073246,
+      "step": 60,
+      "step_time": 0.2306224280000606
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.2969672679901123,
+      "epoch": 2.5416666666666665,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.7382878661155701,
+      "kl": 0.296942800283432,
+      "learning_rate": 2.5e-06,
+      "loss": -0.030259618535637856,
+      "num_tokens": 405216.0,
+      "reward": -0.05218750238418579,
+      "reward_std": 0.006574888247996569,
+      "rewards/GeneratorRewardFunction/mean": -0.05218750238418579,
+      "rewards/GeneratorRewardFunction/std": 0.006574889644980431,
+      "step": 61,
+      "step_time": 12.311173853000128
+    },
+    {
+      "clip_ratio/high_max": 0.0006510416860692203,
+      "clip_ratio/high_mean": 0.0006510416860692203,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0006510416860692203,
+      "entropy": 2.1786930561065674,
+      "epoch": 2.5833333333333335,
+      "grad_norm": 2.3152403831481934,
+      "kl": 0.3079628050327301,
+      "learning_rate": 2.4583333333333332e-06,
+      "loss": 0.06318899989128113,
+      "step": 62,
+      "step_time": 0.22889465099979134
+    },
+    {
+      "clip_ratio/high_max": 0.0071614584885537624,
+      "clip_ratio/high_mean": 0.0071614584885537624,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0071614584885537624,
+      "entropy": 2.3085124492645264,
+      "epoch": 2.625,
+      "grad_norm": 1.1132763624191284,
+      "kl": 0.30134913325309753,
+      "learning_rate": 2.4166666666666667e-06,
+      "loss": -0.06140845641493797,
+      "step": 63,
+      "step_time": 0.22866916800012405
+    },
+    {
+      "clip_ratio/high_max": 0.0071614584885537624,
+      "clip_ratio/high_mean": 0.0071614584885537624,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.0078125,
+      "entropy": 2.3601491451263428,
+      "epoch": 2.6666666666666665,
+      "grad_norm": 2.493839740753174,
+      "kl": 0.3362214267253876,
+      "learning_rate": 2.375e-06,
+      "loss": 0.030866017565131187,
+      "step": 64,
+      "step_time": 0.2289339420003671
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.1568443775177,
+      "epoch": 2.7083333333333335,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3307244777679443,
+      "kl": 0.34504207968711853,
+      "learning_rate": 2.3333333333333336e-06,
+      "loss": -0.09026249498128891,
+      "num_tokens": 430464.0,
+      "reward": -0.051875002682209015,
+      "reward_std": 0.006291529163718224,
+      "rewards/GeneratorRewardFunction/mean": -0.051875002682209015,
+      "rewards/GeneratorRewardFunction/std": 0.006291529163718224,
+      "step": 65,
+      "step_time": 12.406595935000041
+    },
+    {
+      "clip_ratio/high_max": 0.0013020833721384406,
+      "clip_ratio/high_mean": 0.0013020833721384406,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0013020833721384406,
+      "entropy": 2.1828343868255615,
+      "epoch": 2.75,
+      "grad_norm": 0.7081091403961182,
+      "kl": 0.29117047786712646,
+      "learning_rate": 2.2916666666666666e-06,
+      "loss": -0.029572701081633568,
+      "step": 66,
+      "step_time": 0.21885032299996965
+    },
+    {
+      "clip_ratio/high_max": 0.0032552082557231188,
+      "clip_ratio/high_mean": 0.0032552082557231188,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0032552082557231188,
+      "entropy": 2.5054445266723633,
+      "epoch": 2.7916666666666665,
+      "grad_norm": 2.2795095443725586,
+      "kl": 0.2908760607242584,
+      "learning_rate": 2.25e-06,
+      "loss": 0.06366726756095886,
+      "step": 67,
+      "step_time": 0.21955299000001105
+    },
+    {
+      "clip_ratio/high_max": 0.0071614584885537624,
+      "clip_ratio/high_mean": 0.0071614584885537624,
+      "clip_ratio/low_mean": 0.0013020833721384406,
+      "clip_ratio/low_min": 0.0013020833721384406,
+      "clip_ratio/region_mean": 0.008463541977107525,
+      "entropy": 2.5866119861602783,
+      "epoch": 2.8333333333333335,
+      "grad_norm": 2.5023953914642334,
+      "kl": 0.32891178131103516,
+      "learning_rate": 2.2083333333333335e-06,
+      "loss": 0.06120677292346954,
+      "step": 68,
+      "step_time": 0.2204980289998275
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.6892755031585693,
+      "epoch": 2.875,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 2.4671924114227295,
+      "kl": 0.4024137556552887,
+      "learning_rate": 2.166666666666667e-06,
+      "loss": 0.0010057635372504592,
+      "num_tokens": 455748.0,
+      "reward": -0.050312504172325134,
+      "reward_std": 0.0012500007869675756,
+      "rewards/GeneratorRewardFunction/mean": -0.050312504172325134,
+      "rewards/GeneratorRewardFunction/std": 0.001249999739229679,
+      "step": 69,
+      "step_time": 12.337975863999873
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.2920382022857666,
+      "epoch": 2.9166666666666665,
+      "grad_norm": 0.050838783383369446,
+      "kl": 0.3717615604400635,
+      "learning_rate": 2.125e-06,
+      "loss": 0.0009294038754887879,
+      "step": 70,
+      "step_time": 0.22696916699987923
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.742776870727539,
+      "epoch": 2.9583333333333335,
+      "grad_norm": 0.05694516375660896,
+      "kl": 0.32230833172798157,
+      "learning_rate": 2.0833333333333334e-06,
+      "loss": 0.0008057708037085831,
+      "step": 71,
+      "step_time": 0.226955056999941
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.6485788822174072,
+      "epoch": 3.0,
+      "grad_norm": 0.04752276465296745,
+      "kl": 0.3216642141342163,
+      "learning_rate": 2.041666666666667e-06,
+      "loss": 0.000804160488769412,
+      "step": 72,
+      "step_time": 0.22688675500012323
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 3.031399965286255,
+      "epoch": 3.0416666666666665,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.04752670228481293,
+      "kl": 0.4015932083129883,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 0.0010039829649031162,
+      "num_tokens": 481324.0,
+      "reward": -0.05000000074505806,
+      "reward_std": 0.0,
+      "rewards/GeneratorRewardFunction/mean": -0.05000000074505806,
+      "rewards/GeneratorRewardFunction/std": 0.0,
+      "step": 73,
+      "step_time": 12.331203256000208
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.7322070598602295,
+      "epoch": 3.0833333333333335,
+      "grad_norm": 0.0566370002925396,
+      "kl": 0.4283183515071869,
+      "learning_rate": 1.9583333333333334e-06,
+      "loss": 0.0010707959299907088,
+      "step": 74,
+      "step_time": 0.23062532400035707
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.501298189163208,
+      "epoch": 3.125,
+      "grad_norm": 0.10160665214061737,
+      "kl": 0.4141906201839447,
+      "learning_rate": 1.916666666666667e-06,
+      "loss": 0.0010354764526709914,
+      "step": 75,
+      "step_time": 0.23127166999984183
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.740558385848999,
+      "epoch": 3.1666666666666665,
+      "grad_norm": 0.05294595658779144,
+      "kl": 0.3685241937637329,
+      "learning_rate": 1.8750000000000003e-06,
+      "loss": 0.0009213103912770748,
+      "step": 76,
+      "step_time": 0.23059906500020588
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.565075159072876,
+      "epoch": 3.2083333333333335,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 2.2318193912506104,
+      "kl": 0.39917126297950745,
+      "learning_rate": 1.8333333333333333e-06,
+      "loss": 0.06347710639238358,
+      "num_tokens": 506544.0,
+      "reward": -0.1251562535762787,
+      "reward_std": 0.20536647737026215,
+      "rewards/GeneratorRewardFunction/mean": -0.1251562535762787,
+      "rewards/GeneratorRewardFunction/std": 0.20536647737026215,
+      "step": 77,
+      "step_time": 12.451436804999958
+    },
+    {
+      "clip_ratio/high_max": 0.0013020833721384406,
+      "clip_ratio/high_mean": 0.0013020833721384406,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0013020833721384406,
+      "entropy": 2.5695462226867676,
+      "epoch": 3.25,
+      "grad_norm": 1.0660792589187622,
+      "kl": 0.43529757857322693,
+      "learning_rate": 1.7916666666666667e-06,
+      "loss": -0.06140017509460449,
+      "step": 78,
+      "step_time": 0.223440125000252
+    },
+    {
+      "clip_ratio/high_max": 0.0006510416860692203,
+      "clip_ratio/high_mean": 0.0006510416860692203,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0006510416860692203,
+      "entropy": 2.771979570388794,
+      "epoch": 3.2916666666666665,
+      "grad_norm": 2.520599365234375,
+      "kl": 0.3895336091518402,
+      "learning_rate": 1.75e-06,
+      "loss": 0.06315714865922928,
+      "step": 79,
+      "step_time": 0.22384726399968713
+    },
+    {
+      "clip_ratio/high_max": 0.0032552082557231188,
+      "clip_ratio/high_mean": 0.0032552082557231188,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0032552082557231188,
+      "entropy": 2.956615447998047,
+      "epoch": 3.3333333333333335,
+      "grad_norm": 1.1839433908462524,
+      "kl": 0.42561575770378113,
+      "learning_rate": 1.7083333333333334e-06,
+      "loss": -0.061606038361787796,
+      "step": 80,
+      "step_time": 0.22456051800008936
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.8945653438568115,
+      "epoch": 3.375,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 1.9572227001190186,
+      "kl": 0.3704819977283478,
+      "learning_rate": 1.6666666666666667e-06,
+      "loss": 0.0934426486492157,
+      "num_tokens": 531820.0,
+      "reward": -0.050937503576278687,
+      "reward_std": 0.003749999450519681,
+      "rewards/GeneratorRewardFunction/mean": -0.050937503576278687,
+      "rewards/GeneratorRewardFunction/std": 0.0037499992176890373,
+      "step": 81,
+      "step_time": 12.363581040999634
+    },
+    {
+      "clip_ratio/high_max": 0.0013020833721384406,
+      "clip_ratio/high_mean": 0.0013020833721384406,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0013020833721384406,
+      "entropy": 2.7152912616729736,
+      "epoch": 3.4166666666666665,
+      "grad_norm": 0.7033668756484985,
+      "kl": 0.41299042105674744,
+      "learning_rate": 1.6250000000000001e-06,
+      "loss": -0.029737064614892006,
+      "step": 82,
+      "step_time": 0.22236188099986975
+    },
+    {
+      "clip_ratio/high_max": 0.001953125,
+      "clip_ratio/high_mean": 0.001953125,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 2.211116313934326,
+      "epoch": 3.4583333333333335,
+      "grad_norm": 0.657794177532196,
+      "kl": 0.3911992013454437,
+      "learning_rate": 1.5833333333333333e-06,
+      "loss": -0.029873592779040337,
+      "step": 83,
+      "step_time": 0.22224327899994023
+    },
+    {
+      "clip_ratio/high_max": 0.009765625,
+      "clip_ratio/high_mean": 0.009765625,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.009765625,
+      "entropy": 2.4470269680023193,
+      "epoch": 3.5,
+      "grad_norm": 0.6856485605239868,
+      "kl": 0.41081511974334717,
+      "learning_rate": 1.5416666666666668e-06,
+      "loss": -0.029741326346993446,
+      "step": 84,
+      "step_time": 0.2234124139999949
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.9690322875976562,
+      "epoch": 3.5416666666666665,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.06701389700174332,
+      "kl": 0.4637357294559479,
+      "learning_rate": 1.5e-06,
+      "loss": 0.0011593393282964826,
+      "num_tokens": 556888.0,
+      "reward": -0.08843749761581421,
+      "reward_std": 0.1497967392206192,
+      "rewards/GeneratorRewardFunction/mean": -0.08843749761581421,
+      "rewards/GeneratorRewardFunction/std": 0.1497967392206192,
+      "step": 85,
+      "step_time": 12.362771884000267
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.651390552520752,
+      "epoch": 3.5833333333333335,
+      "grad_norm": 2.273343086242676,
+      "kl": 0.30297160148620605,
+      "learning_rate": 1.4583333333333335e-06,
+      "loss": 0.06558748334646225,
+      "step": 86,
+      "step_time": 0.22204756400014958
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.563995599746704,
+      "epoch": 3.625,
+      "grad_norm": 0.8811482191085815,
+      "kl": 0.34387049078941345,
+      "learning_rate": 1.4166666666666667e-06,
+      "loss": -0.03156151995062828,
+      "step": 87,
+      "step_time": 0.22124292899979991
+    },
+    {
+      "clip_ratio/high_max": 0.0026041667442768812,
+      "clip_ratio/high_mean": 0.0026041667442768812,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0026041667442768812,
+      "entropy": 2.9350931644439697,
+      "epoch": 3.6666666666666665,
+      "grad_norm": 0.7562029957771301,
+      "kl": 0.43041226267814636,
+      "learning_rate": 1.3750000000000002e-06,
+      "loss": -0.03143274411559105,
+      "step": 88,
+      "step_time": 0.22161645599999247
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 3.0377445220947266,
+      "epoch": 3.7083333333333335,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 2.229877233505249,
+      "kl": 0.4358494281768799,
+      "learning_rate": 1.3333333333333334e-06,
+      "loss": 0.06356880068778992,
+      "num_tokens": 582232.0,
+      "reward": -0.09531249850988388,
+      "reward_std": 0.15116733312606812,
+      "rewards/GeneratorRewardFunction/mean": -0.09531249850988388,
+      "rewards/GeneratorRewardFunction/std": 0.15116733312606812,
+      "step": 89,
+      "step_time": 12.385518950000005
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 3.346175193786621,
+      "epoch": 3.75,
+      "grad_norm": 2.875476360321045,
+      "kl": 0.29328086972236633,
+      "learning_rate": 1.2916666666666669e-06,
+      "loss": 0.06316754966974258,
+      "step": 90,
+      "step_time": 0.2198658039997099
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 120,
+  "num_input_tokens_seen": 582232,
+  "num_train_epochs": 5,
+  "save_steps": 30,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

self_play_hf_l40s_full/round_002/generator_train/checkpoint-90/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c88c3b33b27b27c1cc7746fe61baacbbd09e1573fd273a1e9122cb89b0d1870e
+size 7249