Intermediate checkpoint upload step=120 (generator_train)

Browse files

Files changed (12) hide show

.gitattributes +1 -0
self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/chat_template.jinja +54 -0
self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/config.json +57 -0
self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/generation_config.json +13 -0
self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/model.safetensors +3 -0
self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/optimizer.pt +3 -0
self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/rng_state.pth +3 -0
self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/scheduler.pt +3 -0
self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/tokenizer.json +3 -0
self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/tokenizer_config.json +32 -0
self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/trainer_state.json +2224 -0
self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -85,3 +85,4 @@ self_play_hf_l40s_full/round_004/answerer_train/final_model/tokenizer.json filte
 self_play_hf_l40s_full/round_005/generator_train/checkpoint-30/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 self_play_hf_l40s_full/round_005/generator_train/checkpoint-60/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 self_play_hf_l40s_full/round_005/generator_train/checkpoint-90/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 self_play_hf_l40s_full/round_005/generator_train/checkpoint-30/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 self_play_hf_l40s_full/round_005/generator_train/checkpoint-60/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 self_play_hf_l40s_full/round_005/generator_train/checkpoint-90/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/tokenizer.json filter=lfs diff=lfs merge=lfs -text

self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "float32",
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 896,
+  "initializer_range": 0.02,
+  "intermediate_size": 4864,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000.0,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.6.2",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "5.6.2"
+}

self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3f2d531b41a4f6103c97f60295c77faba854dc8907a530ec20d48c472e5c9621
+size 1976163472

self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e7790d25b84db93c693c5ef59cf606f44af6b783a82fb135a00dbe3dd5553ee5
+size 3952509771

self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ddf233ab1c5767c7a618809cb16cda8677bbdb0ed7511223c707ea5be3318fab
+size 14645

self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:efad52b65f455156135da53c9927013d723b01aa46bbf7e57c3c8480f44fb55f
+size 1465

self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
+size 11421892

self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": true,
+  "local_files_only": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "truncation_side": "left",
+  "unk_token": null
+}

self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2224 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 5.0,
+  "eval_steps": 500,
+  "global_step": 120,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.218832492828369,
+      "epoch": 0.041666666666666664,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 2.3453547954559326,
+      "kl": 0.0003995061560999602,
+      "learning_rate": 5e-06,
+      "loss": 0.062480177730321884,
+      "num_tokens": 25296.0,
+      "reward": -0.16249999403953552,
+      "reward_std": 0.24186772108078003,
+      "rewards/GeneratorRewardFunction/mean": -0.16249999403953552,
+      "rewards/GeneratorRewardFunction/std": 0.24186773598194122,
+      "step": 1,
+      "step_time": 12.200799825999638
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0325520820915699,
+      "clip_ratio/low_min": 0.0325520820915699,
+      "clip_ratio/region_mean": 0.0325520820915699,
+      "entropy": 4.258089542388916,
+      "epoch": 0.08333333333333333,
+      "grad_norm": 1.9494712352752686,
+      "kl": 0.01056413259357214,
+      "learning_rate": 4.958333333333334e-06,
+      "loss": 0.10853039473295212,
+      "step": 2,
+      "step_time": 0.22676954300004581
+    },
+    {
+      "clip_ratio/high_max": 0.0618489570915699,
+      "clip_ratio/high_mean": 0.0618489570915699,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0618489570915699,
+      "entropy": 3.910761594772339,
+      "epoch": 0.125,
+      "grad_norm": 1.5771934986114502,
+      "kl": 0.03891553357243538,
+      "learning_rate": 4.9166666666666665e-06,
+      "loss": -0.08321236819028854,
+      "step": 3,
+      "step_time": 0.2259805369994865
+    },
+    {
+      "clip_ratio/high_max": 0.0755208358168602,
+      "clip_ratio/high_mean": 0.0755208358168602,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0755208358168602,
+      "entropy": 4.357999324798584,
+      "epoch": 0.16666666666666666,
+      "grad_norm": 1.4966953992843628,
+      "kl": 0.03107132576406002,
+      "learning_rate": 4.875e-06,
+      "loss": -0.08200164884328842,
+      "step": 4,
+      "step_time": 0.22625164300006873
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 3.2972638607025146,
+      "epoch": 0.20833333333333334,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.012044131755828857,
+      "kl": 0.021904518827795982,
+      "learning_rate": 4.833333333333333e-06,
+      "loss": 5.476129081216641e-05,
+      "num_tokens": 50928.0,
+      "reward": -0.05000000074505806,
+      "reward_std": 0.0,
+      "rewards/GeneratorRewardFunction/mean": -0.05000000074505806,
+      "rewards/GeneratorRewardFunction/std": 0.0,
+      "step": 5,
+      "step_time": 12.42558103400006
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.389660358428955,
+      "epoch": 0.25,
+      "grad_norm": 0.01742626540362835,
+      "kl": 0.034423865377902985,
+      "learning_rate": 4.791666666666668e-06,
+      "loss": 8.605967741459608e-05,
+      "step": 6,
+      "step_time": 0.2294949209999686
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.004185199737549,
+      "epoch": 0.2916666666666667,
+      "grad_norm": 0.018903816118836403,
+      "kl": 0.0423373244702816,
+      "learning_rate": 4.75e-06,
+      "loss": 0.000105843304481823,
+      "step": 7,
+      "step_time": 0.22864662399933877
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.172105312347412,
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.02683727815747261,
+      "kl": 0.05335487797856331,
+      "learning_rate": 4.708333333333334e-06,
+      "loss": 0.00013338720600586385,
+      "step": 8,
+      "step_time": 0.22813066999970033
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 3.9359867572784424,
+      "epoch": 0.375,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.7581679821014404,
+      "kl": 0.05953861400485039,
+      "learning_rate": 4.666666666666667e-06,
+      "loss": -0.031090745702385902,
+      "num_tokens": 76192.0,
+      "reward": -0.08749999850988388,
+      "reward_std": 0.14999999105930328,
+      "rewards/GeneratorRewardFunction/mean": -0.08749999850988388,
+      "rewards/GeneratorRewardFunction/std": 0.15000000596046448,
+      "step": 9,
+      "step_time": 12.354528401000607
+    },
+    {
+      "clip_ratio/high_max": 0.0006510416860692203,
+      "clip_ratio/high_mean": 0.0006510416860692203,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0006510416860692203,
+      "entropy": 3.8035316467285156,
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.7192441821098328,
+      "kl": 0.05113422870635986,
+      "learning_rate": 4.625000000000001e-06,
+      "loss": -0.031105250120162964,
+      "step": 10,
+      "step_time": 0.22121003899974312
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.490121841430664,
+      "epoch": 0.4583333333333333,
+      "grad_norm": 0.020974429324269295,
+      "kl": 0.06544887274503708,
+      "learning_rate": 4.583333333333333e-06,
+      "loss": 0.00016362218593712896,
+      "step": 11,
+      "step_time": 0.22179323100044712
+    },
+    {
+      "clip_ratio/high_max": 0.01692708395421505,
+      "clip_ratio/high_mean": 0.01692708395421505,
+      "clip_ratio/low_mean": 0.010416666977107525,
+      "clip_ratio/low_min": 0.010416666977107525,
+      "clip_ratio/region_mean": 0.02734375,
+      "entropy": 4.094717025756836,
+      "epoch": 0.5,
+      "grad_norm": 2.440980911254883,
+      "kl": 0.056731849908828735,
+      "learning_rate": 4.541666666666667e-06,
+      "loss": 0.06417856365442276,
+      "step": 12,
+      "step_time": 0.22233516199958103
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.305630207061768,
+      "epoch": 0.5416666666666666,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.7685584425926208,
+      "kl": 0.06827350705862045,
+      "learning_rate": 4.5e-06,
+      "loss": -0.03110312856733799,
+      "num_tokens": 101896.0,
+      "reward": -0.05656249821186066,
+      "reward_std": 0.026250001043081284,
+      "rewards/GeneratorRewardFunction/mean": -0.05656249821186066,
+      "rewards/GeneratorRewardFunction/std": 0.026250001043081284,
+      "step": 13,
+      "step_time": 12.351281250999818
+    },
+    {
+      "clip_ratio/high_max": 0.0006510416860692203,
+      "clip_ratio/high_mean": 0.0006510416860692203,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0006510416860692203,
+      "entropy": 4.384155750274658,
+      "epoch": 0.5833333333333334,
+      "grad_norm": 1.074666976928711,
+      "kl": 0.09433463960886002,
+      "learning_rate": 4.4583333333333336e-06,
+      "loss": -0.06210761144757271,
+      "step": 14,
+      "step_time": 0.228220768999563
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0013020833721384406,
+      "clip_ratio/low_min": 0.0013020833721384406,
+      "clip_ratio/region_mean": 0.0013020833721384406,
+      "entropy": 4.467245578765869,
+      "epoch": 0.625,
+      "grad_norm": 2.4682021141052246,
+      "kl": 0.0888032540678978,
+      "learning_rate": 4.416666666666667e-06,
+      "loss": 0.09433958679437637,
+      "step": 15,
+      "step_time": 0.22796953400029452
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.379356384277344,
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.05159206688404083,
+      "kl": 0.09723836183547974,
+      "learning_rate": 4.3750000000000005e-06,
+      "loss": 0.0002430959139019251,
+      "step": 16,
+      "step_time": 0.22781476000000112
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.611701965332031,
+      "epoch": 0.7083333333333334,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.027016103267669678,
+      "kl": 0.09803465753793716,
+      "learning_rate": 4.333333333333334e-06,
+      "loss": 0.00024508664500899613,
+      "num_tokens": 127124.0,
+      "reward": -0.05000000074505806,
+      "reward_std": 0.0,
+      "rewards/GeneratorRewardFunction/mean": -0.05000000074505806,
+      "rewards/GeneratorRewardFunction/std": 0.0,
+      "step": 17,
+      "step_time": 12.34623024100074
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.3698506355285645,
+      "epoch": 0.75,
+      "grad_norm": 0.027759015560150146,
+      "kl": 0.10794633626937866,
+      "learning_rate": 4.2916666666666665e-06,
+      "loss": 0.00026986582088284194,
+      "step": 18,
+      "step_time": 0.22221811200051889
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.589417934417725,
+      "epoch": 0.7916666666666666,
+      "grad_norm": 0.033771730959415436,
+      "kl": 0.1163717731833458,
+      "learning_rate": 4.25e-06,
+      "loss": 0.0002909294271375984,
+      "step": 19,
+      "step_time": 0.22278399300012097
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.381205081939697,
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.034088168293237686,
+      "kl": 0.12577062845230103,
+      "learning_rate": 4.208333333333333e-06,
+      "loss": 0.0003144265792798251,
+      "step": 20,
+      "step_time": 0.2227981229998477
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.56951904296875,
+      "epoch": 0.875,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.030865877866744995,
+      "kl": 0.1258154958486557,
+      "learning_rate": 4.166666666666667e-06,
+      "loss": 0.00031453874544240534,
+      "num_tokens": 152420.0,
+      "reward": -0.06875000149011612,
+      "reward_std": 0.07500000298023224,
+      "rewards/GeneratorRewardFunction/mean": -0.06875000149011612,
+      "rewards/GeneratorRewardFunction/std": 0.07500000298023224,
+      "step": 21,
+      "step_time": 12.431868073999794
+    },
+    {
+      "clip_ratio/high_max": 0.0006510416860692203,
+      "clip_ratio/high_mean": 0.0006510416860692203,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0006510416860692203,
+      "entropy": 4.256842136383057,
+      "epoch": 0.9166666666666666,
+      "grad_norm": 1.0559728145599365,
+      "kl": 0.11962169408798218,
+      "learning_rate": 4.125e-06,
+      "loss": -0.06200842559337616,
+      "step": 22,
+      "step_time": 0.22747174799951608
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.441510200500488,
+      "epoch": 0.9583333333333334,
+      "grad_norm": 0.8300934433937073,
+      "kl": 0.12788282334804535,
+      "learning_rate": 4.083333333333334e-06,
+      "loss": -0.030932163819670677,
+      "step": 23,
+      "step_time": 0.2274885289998565
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0026041667442768812,
+      "clip_ratio/low_min": 0.0026041667442768812,
+      "clip_ratio/region_mean": 0.0026041667442768812,
+      "entropy": 4.461675643920898,
+      "epoch": 1.0,
+      "grad_norm": 2.326770067214966,
+      "kl": 0.1365349292755127,
+      "learning_rate": 4.041666666666667e-06,
+      "loss": 0.09416845440864563,
+      "step": 24,
+      "step_time": 0.22709047199987253
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.375850200653076,
+      "epoch": 1.0416666666666667,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.031697701662778854,
+      "kl": 0.1358719766139984,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 0.0003396799729671329,
+      "num_tokens": 177592.0,
+      "reward": -0.05000000074505806,
+      "reward_std": 0.0,
+      "rewards/GeneratorRewardFunction/mean": -0.05000000074505806,
+      "rewards/GeneratorRewardFunction/std": 0.0,
+      "step": 25,
+      "step_time": 12.350898434000555
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.472981929779053,
+      "epoch": 1.0833333333333333,
+      "grad_norm": 0.033567290753126144,
+      "kl": 0.1449340432882309,
+      "learning_rate": 3.958333333333333e-06,
+      "loss": 0.0003623350930865854,
+      "step": 26,
+      "step_time": 0.22230296500038094
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.471365451812744,
+      "epoch": 1.125,
+      "grad_norm": 0.033745091408491135,
+      "kl": 0.14426572620868683,
+      "learning_rate": 3.916666666666667e-06,
+      "loss": 0.0003606643294915557,
+      "step": 27,
+      "step_time": 0.22167697199984104
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.190944671630859,
+      "epoch": 1.1666666666666667,
+      "grad_norm": 0.03441477566957474,
+      "kl": 0.14298784732818604,
+      "learning_rate": 3.875e-06,
+      "loss": 0.0003574696311261505,
+      "step": 28,
+      "step_time": 0.22162332900006732
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.321583271026611,
+      "epoch": 1.2083333333333333,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 2.1965482234954834,
+      "kl": 0.13321949541568756,
+      "learning_rate": 3.833333333333334e-06,
+      "loss": 0.09402058273553848,
+      "num_tokens": 203376.0,
+      "reward": -0.06875000149011612,
+      "reward_std": 0.07499999552965164,
+      "rewards/GeneratorRewardFunction/mean": -0.06875000149011612,
+      "rewards/GeneratorRewardFunction/std": 0.07500000298023224,
+      "step": 29,
+      "step_time": 12.401389794999886
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.206346035003662,
+      "epoch": 1.25,
+      "grad_norm": 0.8052829504013062,
+      "kl": 0.14546459913253784,
+      "learning_rate": 3.7916666666666666e-06,
+      "loss": -0.03093709610402584,
+      "step": 30,
+      "step_time": 0.22923127000012755
+    },
+    {
+      "clip_ratio/high_max": 0.0032552082557231188,
+      "clip_ratio/high_mean": 0.0032552082557231188,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0032552082557231188,
+      "entropy": 4.350301742553711,
+      "epoch": 1.2916666666666667,
+      "grad_norm": 1.0959073305130005,
+      "kl": 0.19109351933002472,
+      "learning_rate": 3.7500000000000005e-06,
+      "loss": -0.061980169266462326,
+      "step": 31,
+      "step_time": 0.22754814400013856
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.322090148925781,
+      "epoch": 1.3333333333333333,
+      "grad_norm": 0.031234100461006165,
+      "kl": 0.12836213409900665,
+      "learning_rate": 3.708333333333334e-06,
+      "loss": 0.00032090532477013767,
+      "step": 32,
+      "step_time": 0.22721318900039478
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.353842258453369,
+      "epoch": 1.375,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.04642889276146889,
+      "kl": 0.15312014520168304,
+      "learning_rate": 3.6666666666666666e-06,
+      "loss": 0.00038280035369098186,
+      "num_tokens": 228780.0,
+      "reward": -0.05000000074505806,
+      "reward_std": 0.0,
+      "rewards/GeneratorRewardFunction/mean": -0.05000000074505806,
+      "rewards/GeneratorRewardFunction/std": 0.0,
+      "step": 33,
+      "step_time": 12.362105676000283
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.097261905670166,
+      "epoch": 1.4166666666666667,
+      "grad_norm": 0.047277797013521194,
+      "kl": 0.1807316094636917,
+      "learning_rate": 3.625e-06,
+      "loss": 0.00045182902249507606,
+      "step": 34,
+      "step_time": 0.2276034909991722
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.232495307922363,
+      "epoch": 1.4583333333333333,
+      "grad_norm": 0.03665163367986679,
+      "kl": 0.16979527473449707,
+      "learning_rate": 3.5833333333333335e-06,
+      "loss": 0.0004244882147759199,
+      "step": 35,
+      "step_time": 0.22647975700056122
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.373459339141846,
+      "epoch": 1.5,
+      "grad_norm": 0.03509606420993805,
+      "kl": 0.1546647995710373,
+      "learning_rate": 3.5416666666666673e-06,
+      "loss": 0.0003866619954351336,
+      "step": 36,
+      "step_time": 0.22700092600007338
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.0846028327941895,
+      "epoch": 1.5416666666666665,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 2.4683332443237305,
+      "kl": 0.16653424501419067,
+      "learning_rate": 3.5e-06,
+      "loss": 0.05828670784831047,
+      "num_tokens": 254116.0,
+      "reward": -0.05015625059604645,
+      "reward_std": 0.0006249994039535522,
+      "rewards/GeneratorRewardFunction/mean": -0.05015625059604645,
+      "rewards/GeneratorRewardFunction/std": 0.0006249994039535522,
+      "step": 37,
+      "step_time": 12.278016894999382
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.041675567626953,
+      "epoch": 1.5833333333333335,
+      "grad_norm": 0.036362167447805405,
+      "kl": 0.17584657669067383,
+      "learning_rate": 3.4583333333333334e-06,
+      "loss": 0.00043961641495116055,
+      "step": 38,
+      "step_time": 0.22066016499957186
+    },
+    {
+      "clip_ratio/high_max": 0.0013020833721384406,
+      "clip_ratio/high_mean": 0.0013020833721384406,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0013020833721384406,
+      "entropy": 4.3244171142578125,
+      "epoch": 1.625,
+      "grad_norm": 0.6458258032798767,
+      "kl": 0.1514502316713333,
+      "learning_rate": 3.416666666666667e-06,
+      "loss": -0.02857072651386261,
+      "step": 39,
+      "step_time": 0.2196550920007212
+    },
+    {
+      "clip_ratio/high_max": 0.0013020833721384406,
+      "clip_ratio/high_mean": 0.0013020833721384406,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0013020833721384406,
+      "entropy": 4.209409236907959,
+      "epoch": 1.6666666666666665,
+      "grad_norm": 0.7317789196968079,
+      "kl": 0.13871996104717255,
+      "learning_rate": 3.3750000000000003e-06,
+      "loss": -0.028499560430645943,
+      "step": 40,
+      "step_time": 0.2200379099995189
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.327061653137207,
+      "epoch": 1.7083333333333335,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.7506322860717773,
+      "kl": 0.16559262573719025,
+      "learning_rate": 3.3333333333333333e-06,
+      "loss": -0.03082560934126377,
+      "num_tokens": 279536.0,
+      "reward": -0.08749999850988388,
+      "reward_std": 0.14999999105930328,
+      "rewards/GeneratorRewardFunction/mean": -0.08749999850988388,
+      "rewards/GeneratorRewardFunction/std": 0.15000000596046448,
+      "step": 41,
+      "step_time": 12.311622474000615
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.28145170211792,
+      "epoch": 1.75,
+      "grad_norm": 0.037420228123664856,
+      "kl": 0.1730494350194931,
+      "learning_rate": 3.2916666666666668e-06,
+      "loss": 0.00043262357939966023,
+      "step": 42,
+      "step_time": 0.22799636899981124
+    },
+    {
+      "clip_ratio/high_max": 0.0026041667442768812,
+      "clip_ratio/high_mean": 0.0026041667442768812,
+      "clip_ratio/low_mean": 0.0026041667442768812,
+      "clip_ratio/low_min": 0.0026041667442768812,
+      "clip_ratio/region_mean": 0.0052083334885537624,
+      "entropy": 3.382018804550171,
+      "epoch": 1.7916666666666665,
+      "grad_norm": 2.7098262310028076,
+      "kl": 0.14611031115055084,
+      "learning_rate": 3.2500000000000002e-06,
+      "loss": 0.06306206434965134,
+      "step": 43,
+      "step_time": 0.22749470799953997
+    },
+    {
+      "clip_ratio/high_max": 0.008463541977107525,
+      "clip_ratio/high_mean": 0.008463541977107525,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.008463541977107525,
+      "entropy": 4.229821681976318,
+      "epoch": 1.8333333333333335,
+      "grad_norm": 0.6993237733840942,
+      "kl": 0.16464383900165558,
+      "learning_rate": 3.2083333333333337e-06,
+      "loss": -0.030900651589035988,
+      "step": 44,
+      "step_time": 0.2268984759994055
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.33951997756958,
+      "epoch": 1.875,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.032941851764917374,
+      "kl": 0.18063347041606903,
+      "learning_rate": 3.1666666666666667e-06,
+      "loss": 0.00045158364810049534,
+      "num_tokens": 304840.0,
+      "reward": -0.05000000074505806,
+      "reward_std": 0.0,
+      "rewards/GeneratorRewardFunction/mean": -0.05000000074505806,
+      "rewards/GeneratorRewardFunction/std": 0.0,
+      "step": 45,
+      "step_time": 12.408172645000377
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.255344867706299,
+      "epoch": 1.9166666666666665,
+      "grad_norm": 0.038530610501766205,
+      "kl": 0.1718256026506424,
+      "learning_rate": 3.125e-06,
+      "loss": 0.00042956401011906564,
+      "step": 46,
+      "step_time": 0.2266177340006834
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 3.8836581707000732,
+      "epoch": 1.9583333333333335,
+      "grad_norm": 0.025322094559669495,
+      "kl": 0.13393373787403107,
+      "learning_rate": 3.0833333333333336e-06,
+      "loss": 0.0003348343598190695,
+      "step": 47,
+      "step_time": 0.22698829199998727
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.329832077026367,
+      "epoch": 2.0,
+      "grad_norm": 0.1431460827589035,
+      "kl": 0.22837483882904053,
+      "learning_rate": 3.0416666666666666e-06,
+      "loss": 0.0005709370598196983,
+      "step": 48,
+      "step_time": 0.22729420899941033
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.286876678466797,
+      "epoch": 2.0416666666666665,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.031947534531354904,
+      "kl": 0.1614980697631836,
+      "learning_rate": 3e-06,
+      "loss": 0.0004037451872136444,
+      "num_tokens": 330064.0,
+      "reward": -0.05000000074505806,
+      "reward_std": 0.0,
+      "rewards/GeneratorRewardFunction/mean": -0.05000000074505806,
+      "rewards/GeneratorRewardFunction/std": 0.0,
+      "step": 49,
+      "step_time": 12.335740126000019
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.031215667724609,
+      "epoch": 2.0833333333333335,
+      "grad_norm": 0.030324924737215042,
+      "kl": 0.16199041903018951,
+      "learning_rate": 2.9583333333333335e-06,
+      "loss": 0.00040497604641132057,
+      "step": 50,
+      "step_time": 0.2283358570002747
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.352718353271484,
+      "epoch": 2.125,
+      "grad_norm": 0.03301743045449257,
+      "kl": 0.17951743304729462,
+      "learning_rate": 2.916666666666667e-06,
+      "loss": 0.00044879360939376056,
+      "step": 51,
+      "step_time": 0.22797506900042208
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.1786041259765625,
+      "epoch": 2.1666666666666665,
+      "grad_norm": 0.03034944273531437,
+      "kl": 0.16076649725437164,
+      "learning_rate": 2.875e-06,
+      "loss": 0.0004019162443000823,
+      "step": 52,
+      "step_time": 0.2283364969998729
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.120919704437256,
+      "epoch": 2.2083333333333335,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.030451813712716103,
+      "kl": 0.16769714653491974,
+      "learning_rate": 2.8333333333333335e-06,
+      "loss": 0.00041924286051653326,
+      "num_tokens": 355500.0,
+      "reward": -0.05000000074505806,
+      "reward_std": 0.0,
+      "rewards/GeneratorRewardFunction/mean": -0.05000000074505806,
+      "rewards/GeneratorRewardFunction/std": 0.0,
+      "step": 53,
+      "step_time": 12.406069011
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.143009185791016,
+      "epoch": 2.25,
+      "grad_norm": 0.04242139682173729,
+      "kl": 0.18013274669647217,
+      "learning_rate": 2.791666666666667e-06,
+      "loss": 0.00045033186324872077,
+      "step": 54,
+      "step_time": 0.22930021999945893
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 3.9695751667022705,
+      "epoch": 2.2916666666666665,
+      "grad_norm": 0.026388173922896385,
+      "kl": 0.14907608926296234,
+      "learning_rate": 2.7500000000000004e-06,
+      "loss": 0.00037269017775543034,
+      "step": 55,
+      "step_time": 0.22874155800036533
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.342067718505859,
+      "epoch": 2.3333333333333335,
+      "grad_norm": 0.03221583738923073,
+      "kl": 0.17097871005535126,
+      "learning_rate": 2.7083333333333334e-06,
+      "loss": 0.00042744679376482964,
+      "step": 56,
+      "step_time": 0.22889051099991775
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.179519176483154,
+      "epoch": 2.375,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.028109125792980194,
+      "kl": 0.17036812007427216,
+      "learning_rate": 2.666666666666667e-06,
+      "loss": 0.0004259202687535435,
+      "num_tokens": 381024.0,
+      "reward": -0.05000000074505806,
+      "reward_std": 0.0,
+      "rewards/GeneratorRewardFunction/mean": -0.05000000074505806,
+      "rewards/GeneratorRewardFunction/std": 0.0,
+      "step": 57,
+      "step_time": 12.308869638000033
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.261288166046143,
+      "epoch": 2.4166666666666665,
+      "grad_norm": 0.030122917145490646,
+      "kl": 0.1687396764755249,
+      "learning_rate": 2.6250000000000003e-06,
+      "loss": 0.0004218492249492556,
+      "step": 58,
+      "step_time": 0.23021780399994896
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.327264308929443,
+      "epoch": 2.4583333333333335,
+      "grad_norm": 0.030730031430721283,
+      "kl": 0.17366604506969452,
+      "learning_rate": 2.5833333333333337e-06,
+      "loss": 0.0004341650346759707,
+      "step": 59,
+      "step_time": 0.23006772000007913
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.266103267669678,
+      "epoch": 2.5,
+      "grad_norm": 0.03138311952352524,
+      "kl": 0.16585773229599,
+      "learning_rate": 2.5416666666666668e-06,
+      "loss": 0.00041464436799287796,
+      "step": 60,
+      "step_time": 0.2302518249998684
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.149628162384033,
+      "epoch": 2.5416666666666665,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.031717631965875626,
+      "kl": 0.1798960417509079,
+      "learning_rate": 2.5e-06,
+      "loss": 0.00044974012416787446,
+      "num_tokens": 406568.0,
+      "reward": -0.08749999850988388,
+      "reward_std": 0.15000000596046448,
+      "rewards/GeneratorRewardFunction/mean": -0.08749999850988388,
+      "rewards/GeneratorRewardFunction/std": 0.15000000596046448,
+      "step": 61,
+      "step_time": 12.3971151249998
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.3893141746521,
+      "epoch": 2.5833333333333335,
+      "grad_norm": 0.7632278203964233,
+      "kl": 0.1566535085439682,
+      "learning_rate": 2.4583333333333332e-06,
+      "loss": -0.030905282124876976,
+      "step": 62,
+      "step_time": 0.23158665199935058
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.1660566329956055,
+      "epoch": 2.625,
+      "grad_norm": 2.1362805366516113,
+      "kl": 0.1582614928483963,
+      "learning_rate": 2.4166666666666667e-06,
+      "loss": 0.09412431716918945,
+      "step": 63,
+      "step_time": 0.2294769879999876
+    },
+    {
+      "clip_ratio/high_max": 0.0032552082557231188,
+      "clip_ratio/high_mean": 0.0032552082557231188,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0032552082557231188,
+      "entropy": 4.3899383544921875,
+      "epoch": 2.6666666666666665,
+      "grad_norm": 1.1546765565872192,
+      "kl": 0.15236173570156097,
+      "learning_rate": 2.375e-06,
+      "loss": -0.06209668517112732,
+      "step": 64,
+      "step_time": 0.22880948400052148
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.150634288787842,
+      "epoch": 2.7083333333333335,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.030919430777430534,
+      "kl": 0.1692151576280594,
+      "learning_rate": 2.3333333333333336e-06,
+      "loss": 0.0004230378835927695,
+      "num_tokens": 431868.0,
+      "reward": -0.05000000074505806,
+      "reward_std": 0.0,
+      "rewards/GeneratorRewardFunction/mean": -0.05000000074505806,
+      "rewards/GeneratorRewardFunction/std": 0.0,
+      "step": 65,
+      "step_time": 12.471727858999657
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.398933410644531,
+      "epoch": 2.75,
+      "grad_norm": 0.03198188170790672,
+      "kl": 0.16747929155826569,
+      "learning_rate": 2.2916666666666666e-06,
+      "loss": 0.00041869821143336594,
+      "step": 66,
+      "step_time": 0.22940661600023304
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.332639217376709,
+      "epoch": 2.7916666666666665,
+      "grad_norm": 0.026377171277999878,
+      "kl": 0.15790392458438873,
+      "learning_rate": 2.25e-06,
+      "loss": 0.00039475978701375425,
+      "step": 67,
+      "step_time": 0.2274791459994958
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.401764869689941,
+      "epoch": 2.8333333333333335,
+      "grad_norm": 0.03296651318669319,
+      "kl": 0.18048568069934845,
+      "learning_rate": 2.2083333333333335e-06,
+      "loss": 0.000451214233180508,
+      "step": 68,
+      "step_time": 0.22802397799932805
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.272307395935059,
+      "epoch": 2.875,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.03396793082356453,
+      "kl": 0.15817908942699432,
+      "learning_rate": 2.166666666666667e-06,
+      "loss": 0.00039544771425426006,
+      "num_tokens": 457260.0,
+      "reward": -0.06875000149011612,
+      "reward_std": 0.07499999552965164,
+      "rewards/GeneratorRewardFunction/mean": -0.06875000149011612,
+      "rewards/GeneratorRewardFunction/std": 0.07500000298023224,
+      "step": 69,
+      "step_time": 12.304982241999824
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.329678058624268,
+      "epoch": 2.9166666666666665,
+      "grad_norm": 2.354091167449951,
+      "kl": 0.15900522470474243,
+      "learning_rate": 2.125e-06,
+      "loss": 0.06253280490636826,
+      "step": 70,
+      "step_time": 0.2274878809994334
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.420726776123047,
+      "epoch": 2.9583333333333335,
+      "grad_norm": 1.074094533920288,
+      "kl": 0.14555977284908295,
+      "learning_rate": 2.0833333333333334e-06,
+      "loss": -0.06215827539563179,
+      "step": 71,
+      "step_time": 0.22734129800028313
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.37536096572876,
+      "epoch": 3.0,
+      "grad_norm": 0.03002353385090828,
+      "kl": 0.16313163936138153,
+      "learning_rate": 2.041666666666667e-06,
+      "loss": 0.0004078290658071637,
+      "step": 72,
+      "step_time": 0.2267978280005991
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.281651020050049,
+      "epoch": 3.0416666666666665,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.029297830536961555,
+      "kl": 0.17000456154346466,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 0.0004250114143360406,
+      "num_tokens": 482596.0,
+      "reward": -0.08749999850988388,
+      "reward_std": 0.14999999105930328,
+      "rewards/GeneratorRewardFunction/mean": -0.08749999850988388,
+      "rewards/GeneratorRewardFunction/std": 0.15000000596046448,
+      "step": 73,
+      "step_time": 12.327367548999973
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.1541523933410645,
+      "epoch": 3.0833333333333335,
+      "grad_norm": 2.465907573699951,
+      "kl": 0.14764432609081268,
+      "learning_rate": 1.9583333333333334e-06,
+      "loss": 0.09413985162973404,
+      "step": 74,
+      "step_time": 0.22862437100047828
+    },
+    {
+      "clip_ratio/high_max": 0.001953125,
+      "clip_ratio/high_mean": 0.001953125,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 4.466418266296387,
+      "epoch": 3.125,
+      "grad_norm": 1.0353459119796753,
+      "kl": 0.1653010994195938,
+      "learning_rate": 1.916666666666667e-06,
+      "loss": -0.062276482582092285,
+      "step": 75,
+      "step_time": 0.22827705399959086
+    },
+    {
+      "clip_ratio/high_max": 0.0013020833721384406,
+      "clip_ratio/high_mean": 0.0013020833721384406,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0013020833721384406,
+      "entropy": 4.261594772338867,
+      "epoch": 3.1666666666666665,
+      "grad_norm": 0.7295825481414795,
+      "kl": 0.14723657071590424,
+      "learning_rate": 1.8750000000000003e-06,
+      "loss": -0.03112857975065708,
+      "step": 76,
+      "step_time": 0.22811511000054452
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.484732627868652,
+      "epoch": 3.2083333333333335,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.027748463675379753,
+      "kl": 0.1574263721704483,
+      "learning_rate": 1.8333333333333333e-06,
+      "loss": 0.0003935658896807581,
+      "num_tokens": 507932.0,
+      "reward": -0.08749999850988388,
+      "reward_std": 0.15000000596046448,
+      "rewards/GeneratorRewardFunction/mean": -0.08749999850988388,
+      "rewards/GeneratorRewardFunction/std": 0.15000000596046448,
+      "step": 77,
+      "step_time": 12.387434866000149
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.202768802642822,
+      "epoch": 3.25,
+      "grad_norm": 0.7808005809783936,
+      "kl": 0.16236911714076996,
+      "learning_rate": 1.7916666666666667e-06,
+      "loss": -0.0308381374925375,
+      "step": 78,
+      "step_time": 0.22989690699978382
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.211698532104492,
+      "epoch": 3.2916666666666665,
+      "grad_norm": 2.246155261993408,
+      "kl": 0.17786841094493866,
+      "learning_rate": 1.75e-06,
+      "loss": 0.09421990066766739,
+      "step": 79,
+      "step_time": 0.2285108489995764
+    },
+    {
+      "clip_ratio/high_max": 0.0032552082557231188,
+      "clip_ratio/high_mean": 0.0032552082557231188,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0032552082557231188,
+      "entropy": 4.602865219116211,
+      "epoch": 3.3333333333333335,
+      "grad_norm": 1.0171711444854736,
+      "kl": 0.1722370833158493,
+      "learning_rate": 1.7083333333333334e-06,
+      "loss": -0.062155067920684814,
+      "step": 80,
+      "step_time": 0.2289368479996483
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.363098621368408,
+      "epoch": 3.375,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.027760757133364677,
+      "kl": 0.16952906548976898,
+      "learning_rate": 1.6666666666666667e-06,
+      "loss": 0.00042382263927720487,
+      "num_tokens": 533560.0,
+      "reward": -0.05000000074505806,
+      "reward_std": 0.0,
+      "rewards/GeneratorRewardFunction/mean": -0.05000000074505806,
+      "rewards/GeneratorRewardFunction/std": 0.0,
+      "step": 81,
+      "step_time": 12.377727779999987
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.349791049957275,
+      "epoch": 3.4166666666666665,
+      "grad_norm": 0.027347413823008537,
+      "kl": 0.15509888529777527,
+      "learning_rate": 1.6250000000000001e-06,
+      "loss": 0.0003877471899613738,
+      "step": 82,
+      "step_time": 0.23147920399969735
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.442678451538086,
+      "epoch": 3.4583333333333335,
+      "grad_norm": 0.035067103803157806,
+      "kl": 0.16633033752441406,
+      "learning_rate": 1.5833333333333333e-06,
+      "loss": 0.0004158258670940995,
+      "step": 83,
+      "step_time": 0.2302408289997402
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.387204170227051,
+      "epoch": 3.5,
+      "grad_norm": 0.02926628850400448,
+      "kl": 0.16001783311367035,
+      "learning_rate": 1.5416666666666668e-06,
+      "loss": 0.0004000445769634098,
+      "step": 84,
+      "step_time": 0.23049437400004535
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 3.68717885017395,
+      "epoch": 3.5416666666666665,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 1.21665358543396,
+      "kl": 0.14173372089862823,
+      "learning_rate": 1.5e-06,
+      "loss": 0.06187007203698158,
+      "num_tokens": 558860.0,
+      "reward": -0.05078125,
+      "reward_std": 0.0031250000465661287,
+      "rewards/GeneratorRewardFunction/mean": -0.05078125,
+      "rewards/GeneratorRewardFunction/std": 0.0031250000465661287,
+      "step": 85,
+      "step_time": 12.32021258199984
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.593854904174805,
+      "epoch": 3.5833333333333335,
+      "grad_norm": 0.031670551747083664,
+      "kl": 0.1700318455696106,
+      "learning_rate": 1.4583333333333335e-06,
+      "loss": 0.00042507960461080074,
+      "step": 86,
+      "step_time": 0.22247004299970286
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.506083965301514,
+      "epoch": 3.625,
+      "grad_norm": 0.03483761474490166,
+      "kl": 0.1663081794977188,
+      "learning_rate": 1.4166666666666667e-06,
+      "loss": 0.00041577042429707944,
+      "step": 87,
+      "step_time": 0.22216213699994114
+    },
+    {
+      "clip_ratio/high_max": 0.001953125,
+      "clip_ratio/high_mean": 0.001953125,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 4.289217472076416,
+      "epoch": 3.6666666666666665,
+      "grad_norm": 1.0936129093170166,
+      "kl": 0.1807820200920105,
+      "learning_rate": 1.3750000000000002e-06,
+      "loss": -0.06104663014411926,
+      "step": 88,
+      "step_time": 0.22188600200024666
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.46470832824707,
+      "epoch": 3.7083333333333335,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0287722647190094,
+      "kl": 0.1612616628408432,
+      "learning_rate": 1.3333333333333334e-06,
+      "loss": 0.0004031541757285595,
+      "num_tokens": 584528.0,
+      "reward": -0.05000000074505806,
+      "reward_std": 0.0,
+      "rewards/GeneratorRewardFunction/mean": -0.05000000074505806,
+      "rewards/GeneratorRewardFunction/std": 0.0,
+      "step": 89,
+      "step_time": 12.365053849000105
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.535579204559326,
+      "epoch": 3.75,
+      "grad_norm": 0.05196619778871536,
+      "kl": 0.18497739732265472,
+      "learning_rate": 1.2916666666666669e-06,
+      "loss": 0.00046244345139712095,
+      "step": 90,
+      "step_time": 0.22889404699981242
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.383248805999756,
+      "epoch": 3.7916666666666665,
+      "grad_norm": 0.028284456580877304,
+      "kl": 0.16009828448295593,
+      "learning_rate": 1.25e-06,
+      "loss": 0.0004002457426395267,
+      "step": 91,
+      "step_time": 0.2279681170002732
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.586085796356201,
+      "epoch": 3.8333333333333335,
+      "grad_norm": 0.026829512789845467,
+      "kl": 0.15121014416217804,
+      "learning_rate": 1.2083333333333333e-06,
+      "loss": 0.00037802537553943694,
+      "step": 92,
+      "step_time": 0.22847567699955107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.395538330078125,
+      "epoch": 3.875,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 1.033445119857788,
+      "kl": 0.1492132991552353,
+      "learning_rate": 1.1666666666666668e-06,
+      "loss": -0.06210946664214134,
+      "num_tokens": 609680.0,
+      "reward": -0.09460937976837158,
+      "reward_std": 0.1784375011920929,
+      "rewards/GeneratorRewardFunction/mean": -0.09460937976837158,
+      "rewards/GeneratorRewardFunction/std": 0.1784375011920929,
+      "step": 93,
+      "step_time": 12.347689829000046
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.484641075134277,
+      "epoch": 3.9166666666666665,
+      "grad_norm": 0.04464663937687874,
+      "kl": 0.18426960706710815,
+      "learning_rate": 1.125e-06,
+      "loss": 0.000460673967609182,
+      "step": 94,
+      "step_time": 0.22694099799991818
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 3.422642946243286,
+      "epoch": 3.9583333333333335,
+      "grad_norm": 0.5639758110046387,
+      "kl": 0.17420244216918945,
+      "learning_rate": 1.0833333333333335e-06,
+      "loss": 0.09412115812301636,
+      "step": 95,
+      "step_time": 0.22602716800065537
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.437925338745117,
+      "epoch": 4.0,
+      "grad_norm": 0.770872950553894,
+      "kl": 0.16676191985607147,
+      "learning_rate": 1.0416666666666667e-06,
+      "loss": -0.03097396530210972,
+      "step": 96,
+      "step_time": 0.22649503800039383
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.315637111663818,
+      "epoch": 4.041666666666667,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.8566199541091919,
+      "kl": 0.1636391580104828,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": -0.03090417943894863,
+      "num_tokens": 635024.0,
+      "reward": -0.08749999850988388,
+      "reward_std": 0.14999999105930328,
+      "rewards/GeneratorRewardFunction/mean": -0.08749999850988388,
+      "rewards/GeneratorRewardFunction/std": 0.15000000596046448,
+      "step": 97,
+      "step_time": 12.357226628999342
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.506310939788818,
+      "epoch": 4.083333333333333,
+      "grad_norm": 0.7631968855857849,
+      "kl": 0.16027097404003143,
+      "learning_rate": 9.583333333333334e-07,
+      "loss": -0.030869366601109505,
+      "step": 98,
+      "step_time": 0.2284578409999085
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.485410213470459,
+      "epoch": 4.125,
+      "grad_norm": 0.031147433444857597,
+      "kl": 0.1742296814918518,
+      "learning_rate": 9.166666666666666e-07,
+      "loss": 0.00043557421304285526,
+      "step": 99,
+      "step_time": 0.22820654599945556
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.353515148162842,
+      "epoch": 4.166666666666667,
+      "grad_norm": 2.4380831718444824,
+      "kl": 0.17426633834838867,
+      "learning_rate": 8.75e-07,
+      "loss": 0.06307206302881241,
+      "step": 100,
+      "step_time": 0.22843013999954564
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.318760871887207,
+      "epoch": 4.208333333333333,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.028121337294578552,
+      "kl": 0.15765920281410217,
+      "learning_rate": 8.333333333333333e-07,
+      "loss": 0.0003941480244975537,
+      "num_tokens": 660652.0,
+      "reward": -0.05000000074505806,
+      "reward_std": 0.0,
+      "rewards/GeneratorRewardFunction/mean": -0.05000000074505806,
+      "rewards/GeneratorRewardFunction/std": 0.0,
+      "step": 101,
+      "step_time": 12.370995348000179
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.430075168609619,
+      "epoch": 4.25,
+      "grad_norm": 0.03579903766512871,
+      "kl": 0.17973928153514862,
+      "learning_rate": 7.916666666666667e-07,
+      "loss": 0.00044934815377928317,
+      "step": 102,
+      "step_time": 0.2293741889998273
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.247735023498535,
+      "epoch": 4.291666666666667,
+      "grad_norm": 0.0289804358035326,
+      "kl": 0.1657833307981491,
+      "learning_rate": 7.5e-07,
+      "loss": 0.00041445824899710715,
+      "step": 103,
+      "step_time": 0.2288563800002521
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.115577220916748,
+      "epoch": 4.333333333333333,
+      "grad_norm": 0.028330236673355103,
+      "kl": 0.14796315133571625,
+      "learning_rate": 7.083333333333334e-07,
+      "loss": 0.0003699079097714275,
+      "step": 104,
+      "step_time": 0.23080896100054815
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.270058631896973,
+      "epoch": 4.375,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.02760324627161026,
+      "kl": 0.15488965809345245,
+      "learning_rate": 6.666666666666667e-07,
+      "loss": 0.0003872241359204054,
+      "num_tokens": 685708.0,
+      "reward": -0.05000000074505806,
+      "reward_std": 0.0,
+      "rewards/GeneratorRewardFunction/mean": -0.05000000074505806,
+      "rewards/GeneratorRewardFunction/std": 0.0,
+      "step": 105,
+      "step_time": 12.344494807000046
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.546361923217773,
+      "epoch": 4.416666666666667,
+      "grad_norm": 0.03014361299574375,
+      "kl": 0.17117412388324738,
+      "learning_rate": 6.25e-07,
+      "loss": 0.0004279353597667068,
+      "step": 106,
+      "step_time": 0.22083965500041813
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.564030647277832,
+      "epoch": 4.458333333333333,
+      "grad_norm": 0.03440464660525322,
+      "kl": 0.17559456825256348,
+      "learning_rate": 5.833333333333334e-07,
+      "loss": 0.0004389864334370941,
+      "step": 107,
+      "step_time": 0.22036629600006563
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.176433086395264,
+      "epoch": 4.5,
+      "grad_norm": 0.028006279841065407,
+      "kl": 0.163270965218544,
+      "learning_rate": 5.416666666666667e-07,
+      "loss": 0.00040817740955390036,
+      "step": 108,
+      "step_time": 0.22036177600057272
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.552209854125977,
+      "epoch": 4.541666666666667,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.030375948175787926,
+      "kl": 0.16741327941417694,
+      "learning_rate": 5.000000000000001e-07,
+      "loss": 0.00041853313450701535,
+      "num_tokens": 711060.0,
+      "reward": -0.05000000074505806,
+      "reward_std": 0.0,
+      "rewards/GeneratorRewardFunction/mean": -0.05000000074505806,
+      "rewards/GeneratorRewardFunction/std": 0.0,
+      "step": 109,
+      "step_time": 12.369933012000729
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.339206695556641,
+      "epoch": 4.583333333333333,
+      "grad_norm": 0.10336848348379135,
+      "kl": 0.21190087497234344,
+      "learning_rate": 4.583333333333333e-07,
+      "loss": 0.0005297521711327136,
+      "step": 110,
+      "step_time": 0.2273971739996341
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.6490559577941895,
+      "epoch": 4.625,
+      "grad_norm": 0.03464744985103607,
+      "kl": 0.17225484549999237,
+      "learning_rate": 4.1666666666666667e-07,
+      "loss": 0.00043063712655566633,
+      "step": 111,
+      "step_time": 0.2279215849994216
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.271548748016357,
+      "epoch": 4.666666666666667,
+      "grad_norm": 0.030309390276670456,
+      "kl": 0.14847837388515472,
+      "learning_rate": 3.75e-07,
+      "loss": 0.0003711959288921207,
+      "step": 112,
+      "step_time": 0.22734109300017735
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.5982232093811035,
+      "epoch": 4.708333333333333,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.029322689399123192,
+      "kl": 0.1586071103811264,
+      "learning_rate": 3.3333333333333335e-07,
+      "loss": 0.0003965177165810019,
+      "num_tokens": 736748.0,
+      "reward": -0.05000000074505806,
+      "reward_std": 0.0,
+      "rewards/GeneratorRewardFunction/mean": -0.05000000074505806,
+      "rewards/GeneratorRewardFunction/std": 0.0,
+      "step": 113,
+      "step_time": 12.333232826999847
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.43204927444458,
+      "epoch": 4.75,
+      "grad_norm": 0.0323464497923851,
+      "kl": 0.1504228711128235,
+      "learning_rate": 2.916666666666667e-07,
+      "loss": 0.00037605719990096986,
+      "step": 114,
+      "step_time": 0.22894687000007252
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.273895263671875,
+      "epoch": 4.791666666666667,
+      "grad_norm": 0.035788681358098984,
+      "kl": 0.17889364063739777,
+      "learning_rate": 2.5000000000000004e-07,
+      "loss": 0.00044723405153490603,
+      "step": 115,
+      "step_time": 0.22825912600001175
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.463144302368164,
+      "epoch": 4.833333333333333,
+      "grad_norm": 0.029977256432175636,
+      "kl": 0.16924364864826202,
+      "learning_rate": 2.0833333333333333e-07,
+      "loss": 0.0004231091297697276,
+      "step": 116,
+      "step_time": 0.22959902599995985
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 4.496586322784424,
+      "epoch": 4.875,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0810508206486702,
+      "kl": 0.18324460089206696,
+      "learning_rate": 1.6666666666666668e-07,
+      "loss": 0.0004581115208566189,
+      "num_tokens": 762100.0,
+      "reward": -0.05000000074505806,
+      "reward_std": 0.0,
+      "rewards/GeneratorRewardFunction/mean": -0.05000000074505806,
+      "rewards/GeneratorRewardFunction/std": 0.0,
+      "step": 117,
+      "step_time": 12.307060693000494
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.538492202758789,
+      "epoch": 4.916666666666667,
+      "grad_norm": 0.02771252952516079,
+      "kl": 0.1558750718832016,
+      "learning_rate": 1.2500000000000002e-07,
+      "loss": 0.0003896876296494156,
+      "step": 118,
+      "step_time": 0.22174717100006092
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.4321112632751465,
+      "epoch": 4.958333333333333,
+      "grad_norm": 0.027858059853315353,
+      "kl": 0.14329802989959717,
+      "learning_rate": 8.333333333333334e-08,
+      "loss": 0.00035824510268867016,
+      "step": 119,
+      "step_time": 0.2216543990007267
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.336979389190674,
+      "epoch": 5.0,
+      "grad_norm": 0.029509786516427994,
+      "kl": 0.167790487408638,
+      "learning_rate": 4.166666666666667e-08,
+      "loss": 0.00041947621502913535,
+      "step": 120,
+      "step_time": 0.2207649100000708
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 120,
+  "num_input_tokens_seen": 762100,
+  "num_train_epochs": 5,
+  "save_steps": 30,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

self_play_hf_l40s_full/round_005/generator_train/checkpoint-120/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1afbfa894ee2b20e77f8fc01bf94ad1916f3036851a51f84658b8d193a779a9b
+size 7249