Intermediate checkpoint upload step=120 (answerer_train)

Browse files

Files changed (12) hide show

.gitattributes +1 -0
self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/chat_template.jinja +54 -0
self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/config.json +57 -0
self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/generation_config.json +13 -0
self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/model.safetensors +3 -0
self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/optimizer.pt +3 -0
self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/rng_state.pth +3 -0
self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/scheduler.pt +3 -0
self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/tokenizer.json +3 -0
self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/tokenizer_config.json +32 -0
self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/trainer_state.json +2224 -0
self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -80,3 +80,4 @@ self_play_hf_l40s_full/round_004/generator_train/final_model/tokenizer.json filt
 self_play_hf_l40s_full/round_004/answerer_train/checkpoint-30/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 self_play_hf_l40s_full/round_004/answerer_train/checkpoint-60/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 self_play_hf_l40s_full/round_004/answerer_train/checkpoint-90/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 self_play_hf_l40s_full/round_004/answerer_train/checkpoint-30/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 self_play_hf_l40s_full/round_004/answerer_train/checkpoint-60/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 self_play_hf_l40s_full/round_004/answerer_train/checkpoint-90/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/tokenizer.json filter=lfs diff=lfs merge=lfs -text

self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "float32",
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 896,
+  "initializer_range": 0.02,
+  "intermediate_size": 4864,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000.0,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.6.2",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "5.6.2"
+}

self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:48e200a9bce551e2816c8ae9aa3a9439d68e1400fec13b52a770efb923f55060
+size 1976163472

self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:08fb5a16e7147ef26f7d4ccda154b4b022bf29fe0273ec63345061e26db03ec7
+size 3952509771

self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:09d387fab291cf55ab5d51fbc665b5b617f39b612148cbfb2c8bec740f61e07b
+size 14645

self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7278566d3e343c7c551e937aa2102c1aa1c1b0a72103eeaeaa73c012e20ea6df
+size 1465

self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
+size 11421892

self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": true,
+  "local_files_only": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "truncation_side": "left",
+  "unk_token": null
+}

self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2224 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 120,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.808485507965088,
+      "epoch": 0.025,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 1.511763095855713,
+      "kl": 0.0,
+      "learning_rate": 3e-06,
+      "loss": -0.062454625964164734,
+      "num_tokens": 10016.0,
+      "reward": 0.796283483505249,
+      "reward_std": 0.15437306463718414,
+      "rewards/AnswererRewardFunction/mean": 0.796283483505249,
+      "rewards/AnswererRewardFunction/std": 0.15437307953834534,
+      "step": 1,
+      "step_time": 7.744606889000352
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.061605215072632,
+      "epoch": 0.05,
+      "grad_norm": 0.030779404565691948,
+      "kl": 0.023394376039505005,
+      "learning_rate": 2.9750000000000003e-06,
+      "loss": 5.848593355040066e-05,
+      "step": 2,
+      "step_time": 0.11778782400051568
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.029296875,
+      "clip_ratio/low_min": 0.029296875,
+      "clip_ratio/region_mean": 0.029296875,
+      "entropy": 2.177046537399292,
+      "epoch": 0.075,
+      "grad_norm": 3.1303863525390625,
+      "kl": 0.08087262511253357,
+      "learning_rate": 2.9499999999999997e-06,
+      "loss": 0.09753571450710297,
+      "step": 3,
+      "step_time": 0.1114002790000086
+    },
+    {
+      "clip_ratio/high_max": 0.0361328125,
+      "clip_ratio/high_mean": 0.0361328125,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0361328125,
+      "entropy": 2.0383293628692627,
+      "epoch": 0.1,
+      "grad_norm": 0.9711113572120667,
+      "kl": 0.13760070502758026,
+      "learning_rate": 2.925e-06,
+      "loss": -0.030087757855653763,
+      "step": 4,
+      "step_time": 0.11656204800056003
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.2891623973846436,
+      "epoch": 0.125,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 3.584872007369995,
+      "kl": 0.0656374841928482,
+      "learning_rate": 2.9e-06,
+      "loss": -0.00024243921507149935,
+      "num_tokens": 19780.0,
+      "reward": 0.7218090295791626,
+      "reward_std": 0.14923875033855438,
+      "rewards/AnswererRewardFunction/mean": 0.7218090295791626,
+      "rewards/AnswererRewardFunction/std": 0.14923876523971558,
+      "step": 5,
+      "step_time": 7.795673902999624
+    },
+    {
+      "clip_ratio/high_max": 0.0078125,
+      "clip_ratio/high_mean": 0.0078125,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0078125,
+      "entropy": 2.2073974609375,
+      "epoch": 0.15,
+      "grad_norm": 0.9116904139518738,
+      "kl": 0.08468662202358246,
+      "learning_rate": 2.875e-06,
+      "loss": -0.031071916222572327,
+      "step": 6,
+      "step_time": 0.1147218090000024
+    },
+    {
+      "clip_ratio/high_max": 0.0458984375,
+      "clip_ratio/high_mean": 0.0458984375,
+      "clip_ratio/low_mean": 0.0048828125,
+      "clip_ratio/low_min": 0.0048828125,
+      "clip_ratio/region_mean": 0.05078125,
+      "entropy": 1.828927755355835,
+      "epoch": 0.175,
+      "grad_norm": 2.9821267127990723,
+      "kl": 0.10931183397769928,
+      "learning_rate": 2.85e-06,
+      "loss": -0.052874594926834106,
+      "step": 7,
+      "step_time": 0.11118124299991905
+    },
+    {
+      "clip_ratio/high_max": 0.060546875,
+      "clip_ratio/high_mean": 0.060546875,
+      "clip_ratio/low_mean": 0.0693359375,
+      "clip_ratio/low_min": 0.0693359375,
+      "clip_ratio/region_mean": 0.1298828125,
+      "entropy": 2.239431619644165,
+      "epoch": 0.2,
+      "grad_norm": 3.761061668395996,
+      "kl": 0.11145439743995667,
+      "learning_rate": 2.825e-06,
+      "loss": 0.08703643828630447,
+      "step": 8,
+      "step_time": 0.11017163199994684
+    },
+    {
+      "clip_ratio/high_max": 0.001953125,
+      "clip_ratio/high_mean": 0.001953125,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.001953125,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.2865660190582275,
+      "epoch": 0.225,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 1.6357184648513794,
+      "kl": 0.10884629935026169,
+      "learning_rate": 2.8000000000000003e-06,
+      "loss": 0.03159352019429207,
+      "num_tokens": 29592.0,
+      "reward": 0.6900650858879089,
+      "reward_std": 0.1505999118089676,
+      "rewards/AnswererRewardFunction/mean": 0.6900650858879089,
+      "rewards/AnswererRewardFunction/std": 0.1505999118089676,
+      "step": 9,
+      "step_time": 7.738351545999649
+    },
+    {
+      "clip_ratio/high_max": 0.001953125,
+      "clip_ratio/high_mean": 0.001953125,
+      "clip_ratio/low_mean": 0.0078125,
+      "clip_ratio/low_min": 0.0078125,
+      "clip_ratio/region_mean": 0.009765625,
+      "entropy": 2.2734012603759766,
+      "epoch": 0.25,
+      "grad_norm": 4.225009918212891,
+      "kl": 0.14073708653450012,
+      "learning_rate": 2.775e-06,
+      "loss": 0.1410004049539566,
+      "step": 10,
+      "step_time": 0.1489122450002469
+    },
+    {
+      "clip_ratio/high_max": 0.0263671875,
+      "clip_ratio/high_mean": 0.0263671875,
+      "clip_ratio/low_mean": 0.015625,
+      "clip_ratio/low_min": 0.015625,
+      "clip_ratio/region_mean": 0.0419921875,
+      "entropy": 2.2513952255249023,
+      "epoch": 0.275,
+      "grad_norm": 3.980247735977173,
+      "kl": 0.12798824906349182,
+      "learning_rate": 2.75e-06,
+      "loss": -0.13872209191322327,
+      "step": 11,
+      "step_time": 0.11282693799967092
+    },
+    {
+      "clip_ratio/high_max": 0.0244140625,
+      "clip_ratio/high_mean": 0.0244140625,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0244140625,
+      "entropy": 2.080702781677246,
+      "epoch": 0.3,
+      "grad_norm": 0.9219491481781006,
+      "kl": 0.12932462990283966,
+      "learning_rate": 2.725e-06,
+      "loss": -0.0311975609511137,
+      "step": 12,
+      "step_time": 0.11239939000006416
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.9584801197052002,
+      "epoch": 0.325,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.03335069492459297,
+      "kl": 0.1676618754863739,
+      "learning_rate": 2.7e-06,
+      "loss": 0.0004191546468064189,
+      "num_tokens": 39564.0,
+      "reward": 0.8116456270217896,
+      "reward_std": 0.11149715632200241,
+      "rewards/AnswererRewardFunction/mean": 0.8116456270217896,
+      "rewards/AnswererRewardFunction/std": 0.11149716377258301,
+      "step": 13,
+      "step_time": 7.745207232000212
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.2673592567443848,
+      "epoch": 0.35,
+      "grad_norm": 0.04296092316508293,
+      "kl": 0.1360304206609726,
+      "learning_rate": 2.6750000000000002e-06,
+      "loss": 0.0003400760469958186,
+      "step": 14,
+      "step_time": 0.11098732899972674
+    },
+    {
+      "clip_ratio/high_max": 0.0029296875,
+      "clip_ratio/high_mean": 0.0029296875,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0029296875,
+      "entropy": 2.088756799697876,
+      "epoch": 0.375,
+      "grad_norm": 0.9206492900848389,
+      "kl": 0.1489439606666565,
+      "learning_rate": 2.65e-06,
+      "loss": -0.031076664105057716,
+      "step": 15,
+      "step_time": 0.11077636499976506
+    },
+    {
+      "clip_ratio/high_max": 0.0263671875,
+      "clip_ratio/high_mean": 0.0263671875,
+      "clip_ratio/low_mean": 0.0078125,
+      "clip_ratio/low_min": 0.0078125,
+      "clip_ratio/region_mean": 0.0341796875,
+      "entropy": 2.237659454345703,
+      "epoch": 0.4,
+      "grad_norm": 3.1504709720611572,
+      "kl": 0.16802266240119934,
+      "learning_rate": 2.6250000000000003e-06,
+      "loss": 0.03298211842775345,
+      "step": 16,
+      "step_time": 0.11010160100067878
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.0111589431762695,
+      "epoch": 0.425,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.06525206565856934,
+      "kl": 0.15062233805656433,
+      "learning_rate": 2.6e-06,
+      "loss": 0.0003765558358281851,
+      "num_tokens": 49924.0,
+      "reward": 0.9323133230209351,
+      "reward_std": 0.0,
+      "rewards/AnswererRewardFunction/mean": 0.9323133230209351,
+      "rewards/AnswererRewardFunction/std": 0.0,
+      "step": 17,
+      "step_time": 7.76931892399989
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.9240071773529053,
+      "epoch": 0.45,
+      "grad_norm": 0.03619404137134552,
+      "kl": 0.1399194598197937,
+      "learning_rate": 2.575e-06,
+      "loss": 0.0003497986472211778,
+      "step": 18,
+      "step_time": 0.11182963699957327
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.7755590677261353,
+      "epoch": 0.475,
+      "grad_norm": 0.046630166471004486,
+      "kl": 0.1555626392364502,
+      "learning_rate": 2.55e-06,
+      "loss": 0.00038890657015144825,
+      "step": 19,
+      "step_time": 0.11152786100046796
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.1256823539733887,
+      "epoch": 0.5,
+      "grad_norm": 0.04450196027755737,
+      "kl": 0.1300331950187683,
+      "learning_rate": 2.525e-06,
+      "loss": 0.0003250829759053886,
+      "step": 20,
+      "step_time": 0.11261919400021725
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.0368661880493164,
+      "epoch": 0.525,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.03705684095621109,
+      "kl": 0.16164252161979675,
+      "learning_rate": 2.5e-06,
+      "loss": 0.00040410630754195154,
+      "num_tokens": 60044.0,
+      "reward": 0.819347620010376,
+      "reward_std": 0.11957820504903793,
+      "rewards/AnswererRewardFunction/mean": 0.819347620010376,
+      "rewards/AnswererRewardFunction/std": 0.11957821249961853,
+      "step": 21,
+      "step_time": 7.867706271000316
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.8853070735931396,
+      "epoch": 0.55,
+      "grad_norm": 0.055804409086704254,
+      "kl": 0.16001088917255402,
+      "learning_rate": 2.475e-06,
+      "loss": 0.00040002723108045757,
+      "step": 22,
+      "step_time": 0.11128929000005883
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.9584823846817017,
+      "epoch": 0.575,
+      "grad_norm": 0.04979805648326874,
+      "kl": 0.16607147455215454,
+      "learning_rate": 2.45e-06,
+      "loss": 0.00041517868521623313,
+      "step": 23,
+      "step_time": 0.11279742299939244
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.9190866947174072,
+      "epoch": 0.6,
+      "grad_norm": 0.03331170231103897,
+      "kl": 0.1263861209154129,
+      "learning_rate": 2.425e-06,
+      "loss": 0.00031596526969224215,
+      "step": 24,
+      "step_time": 0.11192457400011335
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.9539425373077393,
+      "epoch": 0.625,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 1.0814502239227295,
+      "kl": 0.17089709639549255,
+      "learning_rate": 2.4000000000000003e-06,
+      "loss": 0.03165344148874283,
+      "num_tokens": 70048.0,
+      "reward": 0.8298832774162292,
+      "reward_std": 0.12205864489078522,
+      "rewards/AnswererRewardFunction/mean": 0.8298832774162292,
+      "rewards/AnswererRewardFunction/std": 0.12205864489078522,
+      "step": 25,
+      "step_time": 7.7907283719996485
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.001953125,
+      "clip_ratio/low_min": 0.001953125,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 1.988601565361023,
+      "epoch": 0.65,
+      "grad_norm": 1.3814729452133179,
+      "kl": 0.18201372027397156,
+      "learning_rate": 2.375e-06,
+      "loss": 0.06283903121948242,
+      "step": 26,
+      "step_time": 0.11003638100009994
+    },
+    {
+      "clip_ratio/high_max": 0.001953125,
+      "clip_ratio/high_mean": 0.001953125,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 2.075544834136963,
+      "epoch": 0.675,
+      "grad_norm": 3.0482325553894043,
+      "kl": 0.18647681176662445,
+      "learning_rate": 2.35e-06,
+      "loss": -0.09349231421947479,
+      "step": 27,
+      "step_time": 0.11002873100005672
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.125493049621582,
+      "epoch": 0.7,
+      "grad_norm": 0.04627307131886482,
+      "kl": 0.19537895917892456,
+      "learning_rate": 2.325e-06,
+      "loss": 0.000488447374664247,
+      "step": 28,
+      "step_time": 0.1143881190000684
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.03523588180542,
+      "epoch": 0.725,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 1.3853849172592163,
+      "kl": 0.1463131308555603,
+      "learning_rate": 2.3000000000000004e-06,
+      "loss": -0.06208648160099983,
+      "num_tokens": 80268.0,
+      "reward": 0.8684389591217041,
+      "reward_std": 0.09924228489398956,
+      "rewards/AnswererRewardFunction/mean": 0.8684389591217041,
+      "rewards/AnswererRewardFunction/std": 0.09924228489398956,
+      "step": 29,
+      "step_time": 7.750608561000263
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.0841426849365234,
+      "epoch": 0.75,
+      "grad_norm": 2.837942361831665,
+      "kl": 0.1490478813648224,
+      "learning_rate": 2.275e-06,
+      "loss": 0.09484589099884033,
+      "step": 30,
+      "step_time": 0.11101365800004714
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.026797294616699,
+      "epoch": 0.775,
+      "grad_norm": 0.045485783368349075,
+      "kl": 0.15276005864143372,
+      "learning_rate": 2.25e-06,
+      "loss": 0.0003819001140072942,
+      "step": 31,
+      "step_time": 0.11208413100030157
+    },
+    {
+      "clip_ratio/high_max": 0.0048828125,
+      "clip_ratio/high_mean": 0.0048828125,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0048828125,
+      "entropy": 2.1123099327087402,
+      "epoch": 0.8,
+      "grad_norm": 0.9081559777259827,
+      "kl": 0.13755789399147034,
+      "learning_rate": 2.2250000000000003e-06,
+      "loss": -0.030523225665092468,
+      "step": 32,
+      "step_time": 0.11418771500029834
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.6823270320892334,
+      "epoch": 0.825,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 2.9273855686187744,
+      "kl": 0.14870840311050415,
+      "learning_rate": 2.1999999999999997e-06,
+      "loss": -0.06208054721355438,
+      "num_tokens": 90568.0,
+      "reward": 0.8832219839096069,
+      "reward_std": 0.10554318875074387,
+      "rewards/AnswererRewardFunction/mean": 0.8832219839096069,
+      "rewards/AnswererRewardFunction/std": 0.10554319620132446,
+      "step": 33,
+      "step_time": 7.824478043999989
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.322800397872925,
+      "epoch": 0.85,
+      "grad_norm": 1.030830979347229,
+      "kl": 0.1581311970949173,
+      "learning_rate": 2.175e-06,
+      "loss": 0.031573764979839325,
+      "step": 34,
+      "step_time": 0.1109909240003617
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.001953125,
+      "clip_ratio/low_min": 0.001953125,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 2.079770088195801,
+      "epoch": 0.875,
+      "grad_norm": 1.0320844650268555,
+      "kl": 0.17237909138202667,
+      "learning_rate": 2.15e-06,
+      "loss": 0.03202669695019722,
+      "step": 35,
+      "step_time": 0.1104884630003653
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.9677340984344482,
+      "epoch": 0.9,
+      "grad_norm": 0.05142974480986595,
+      "kl": 0.18628746271133423,
+      "learning_rate": 2.125e-06,
+      "loss": 0.000465718621853739,
+      "step": 36,
+      "step_time": 0.11126610900009837
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.691863775253296,
+      "epoch": 0.925,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.8771184086799622,
+      "kl": 0.15660348534584045,
+      "learning_rate": 2.1e-06,
+      "loss": 0.03148450702428818,
+      "num_tokens": 100596.0,
+      "reward": 0.8247425556182861,
+      "reward_std": 0.13356582820415497,
+      "rewards/AnswererRewardFunction/mean": 0.8247425556182861,
+      "rewards/AnswererRewardFunction/std": 0.13356582820415497,
+      "step": 37,
+      "step_time": 7.7617874139996275
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.001953125,
+      "clip_ratio/low_min": 0.001953125,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 2.1614623069763184,
+      "epoch": 0.95,
+      "grad_norm": 0.9001947641372681,
+      "kl": 0.23196256160736084,
+      "learning_rate": 2.075e-06,
+      "loss": 0.03180602565407753,
+      "step": 38,
+      "step_time": 0.1102340079996793
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0009765625,
+      "clip_ratio/low_min": 0.0009765625,
+      "clip_ratio/region_mean": 0.0009765625,
+      "entropy": 1.8785492181777954,
+      "epoch": 0.975,
+      "grad_norm": 1.0380373001098633,
+      "kl": 0.1671696901321411,
+      "learning_rate": 2.0500000000000003e-06,
+      "loss": 0.03167015686631203,
+      "step": 39,
+      "step_time": 0.11017617600009544
+    },
+    {
+      "clip_ratio/high_max": 0.009765625,
+      "clip_ratio/high_mean": 0.009765625,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.009765625,
+      "entropy": 1.7612617015838623,
+      "epoch": 1.0,
+      "grad_norm": 2.548452138900757,
+      "kl": 0.20340929925441742,
+      "learning_rate": 2.025e-06,
+      "loss": -0.0916656106710434,
+      "step": 40,
+      "step_time": 0.11005827400003909
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.935651183128357,
+      "epoch": 1.025,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.9923874139785767,
+      "kl": 0.22261759638786316,
+      "learning_rate": 2e-06,
+      "loss": -0.0307498499751091,
+      "num_tokens": 110180.0,
+      "reward": 0.7413970232009888,
+      "reward_std": 0.10247324407100677,
+      "rewards/AnswererRewardFunction/mean": 0.7413970232009888,
+      "rewards/AnswererRewardFunction/std": 0.10247325897216797,
+      "step": 41,
+      "step_time": 7.812748901999839
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0029296875,
+      "clip_ratio/low_min": 0.0029296875,
+      "clip_ratio/region_mean": 0.0029296875,
+      "entropy": 1.7066082954406738,
+      "epoch": 1.05,
+      "grad_norm": 2.7330000400543213,
+      "kl": 0.1961289346218109,
+      "learning_rate": 1.975e-06,
+      "loss": 0.06310413777828217,
+      "step": 42,
+      "step_time": 0.10875788399971498
+    },
+    {
+      "clip_ratio/high_max": 0.001953125,
+      "clip_ratio/high_mean": 0.001953125,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 1.829827070236206,
+      "epoch": 1.075,
+      "grad_norm": 1.2887893915176392,
+      "kl": 0.1938922256231308,
+      "learning_rate": 1.95e-06,
+      "loss": -0.061667174100875854,
+      "step": 43,
+      "step_time": 0.10980056400057947
+    },
+    {
+      "clip_ratio/high_max": 0.0107421875,
+      "clip_ratio/high_mean": 0.0107421875,
+      "clip_ratio/low_mean": 0.01171875,
+      "clip_ratio/low_min": 0.01171875,
+      "clip_ratio/region_mean": 0.0224609375,
+      "entropy": 2.006434440612793,
+      "epoch": 1.1,
+      "grad_norm": 2.9883673191070557,
+      "kl": 0.20987190306186676,
+      "learning_rate": 1.925e-06,
+      "loss": 0.03190411627292633,
+      "step": 44,
+      "step_time": 0.10804392199952417
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.8556946516036987,
+      "epoch": 1.125,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0585976205766201,
+      "kl": 0.17927253246307373,
+      "learning_rate": 1.9e-06,
+      "loss": 0.00044818135211244226,
+      "num_tokens": 120220.0,
+      "reward": 0.8883568644523621,
+      "reward_std": 0.08731486648321152,
+      "rewards/AnswererRewardFunction/mean": 0.8883568644523621,
+      "rewards/AnswererRewardFunction/std": 0.08731484413146973,
+      "step": 45,
+      "step_time": 7.789618280000468
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.771440863609314,
+      "epoch": 1.15,
+      "grad_norm": 0.05603954941034317,
+      "kl": 0.1786380112171173,
+      "learning_rate": 1.875e-06,
+      "loss": 0.00044659501872956753,
+      "step": 46,
+      "step_time": 0.11135138499957975
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.9898463487625122,
+      "epoch": 1.175,
+      "grad_norm": 0.08282279223203659,
+      "kl": 0.21690496802330017,
+      "learning_rate": 1.85e-06,
+      "loss": 0.0005422623944468796,
+      "step": 47,
+      "step_time": 0.11093376600001648
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.7627424001693726,
+      "epoch": 1.2,
+      "grad_norm": 0.06751802563667297,
+      "kl": 0.16698461771011353,
+      "learning_rate": 1.8249999999999999e-06,
+      "loss": 0.00041746150236576796,
+      "step": 48,
+      "step_time": 0.11264505899998767
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.8894991874694824,
+      "epoch": 1.225,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.03931179642677307,
+      "kl": 0.16673244535923004,
+      "learning_rate": 1.8e-06,
+      "loss": 0.0004168311133980751,
+      "num_tokens": 130268.0,
+      "reward": 0.88603675365448,
+      "reward_std": 0.10359743237495422,
+      "rewards/AnswererRewardFunction/mean": 0.88603675365448,
+      "rewards/AnswererRewardFunction/std": 0.10359743237495422,
+      "step": 49,
+      "step_time": 7.824710242999572
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.8960050344467163,
+      "epoch": 1.25,
+      "grad_norm": 2.838279962539673,
+      "kl": 0.18674209713935852,
+      "learning_rate": 1.7750000000000002e-06,
+      "loss": 0.0941702127456665,
+      "step": 50,
+      "step_time": 0.11219863099995564
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.9281809329986572,
+      "epoch": 1.275,
+      "grad_norm": 0.046354807913303375,
+      "kl": 0.171075239777565,
+      "learning_rate": 1.7500000000000002e-06,
+      "loss": 0.0004276880936231464,
+      "step": 51,
+      "step_time": 0.1119742769997174
+    },
+    {
+      "clip_ratio/high_max": 0.01171875,
+      "clip_ratio/high_mean": 0.01171875,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.01171875,
+      "entropy": 1.631166696548462,
+      "epoch": 1.3,
+      "grad_norm": 1.6717921495437622,
+      "kl": 0.16682168841362,
+      "learning_rate": 1.725e-06,
+      "loss": -0.09329331666231155,
+      "step": 52,
+      "step_time": 0.11258080000061454
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.8790663480758667,
+      "epoch": 1.325,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.06409338861703873,
+      "kl": 0.24286241829395294,
+      "learning_rate": 1.7e-06,
+      "loss": 0.0006071560783311725,
+      "num_tokens": 140416.0,
+      "reward": 0.8848026990890503,
+      "reward_std": 0.08498957008123398,
+      "rewards/AnswererRewardFunction/mean": 0.8848026990890503,
+      "rewards/AnswererRewardFunction/std": 0.08498956263065338,
+      "step": 53,
+      "step_time": 7.8205216610003845
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.8914015293121338,
+      "epoch": 1.35,
+      "grad_norm": 0.07145795971155167,
+      "kl": 0.21129751205444336,
+      "learning_rate": 1.675e-06,
+      "loss": 0.000528243777807802,
+      "step": 54,
+      "step_time": 0.1118343839998488
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.757217288017273,
+      "epoch": 1.375,
+      "grad_norm": 0.038478150963783264,
+      "kl": 0.14117839932441711,
+      "learning_rate": 1.65e-06,
+      "loss": 0.0003529459936544299,
+      "step": 55,
+      "step_time": 0.11172699199960334
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.9119102954864502,
+      "epoch": 1.4,
+      "grad_norm": 0.0431695431470871,
+      "kl": 0.17614459991455078,
+      "learning_rate": 1.625e-06,
+      "loss": 0.00044036147301085293,
+      "step": 56,
+      "step_time": 0.11101481700006843
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.9821627140045166,
+      "epoch": 1.425,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 3.133037805557251,
+      "kl": 0.17368242144584656,
+      "learning_rate": 1.6e-06,
+      "loss": 0.09411265701055527,
+      "num_tokens": 150768.0,
+      "reward": 0.9159495830535889,
+      "reward_std": 0.06545510143041611,
+      "rewards/AnswererRewardFunction/mean": 0.9159495830535889,
+      "rewards/AnswererRewardFunction/std": 0.0654551088809967,
+      "step": 57,
+      "step_time": 7.76614765100021
+    },
+    {
+      "clip_ratio/high_max": 0.001953125,
+      "clip_ratio/high_mean": 0.001953125,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 1.7750331163406372,
+      "epoch": 1.45,
+      "grad_norm": 0.953976571559906,
+      "kl": 0.19966614246368408,
+      "learning_rate": 1.5750000000000002e-06,
+      "loss": -0.03101617470383644,
+      "step": 58,
+      "step_time": 0.1098387490001187
+    },
+    {
+      "clip_ratio/high_max": 0.0009765625,
+      "clip_ratio/high_mean": 0.0009765625,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0009765625,
+      "entropy": 1.8459761142730713,
+      "epoch": 1.475,
+      "grad_norm": 0.8789482116699219,
+      "kl": 0.1772395670413971,
+      "learning_rate": 1.5500000000000002e-06,
+      "loss": -0.030849514529109,
+      "step": 59,
+      "step_time": 0.10980367899992416
+    },
+    {
+      "clip_ratio/high_max": 0.0029296875,
+      "clip_ratio/high_mean": 0.0029296875,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0029296875,
+      "entropy": 1.8890777826309204,
+      "epoch": 1.5,
+      "grad_norm": 0.938976526260376,
+      "kl": 0.17906132340431213,
+      "learning_rate": 1.525e-06,
+      "loss": -0.030837498605251312,
+      "step": 60,
+      "step_time": 0.11071035700024368
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.8429818153381348,
+      "epoch": 1.525,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.04960624501109123,
+      "kl": 0.2106022834777832,
+      "learning_rate": 1.5e-06,
+      "loss": 0.0005265056970529258,
+      "num_tokens": 160864.0,
+      "reward": 0.8755199909210205,
+      "reward_std": 0.10159505903720856,
+      "rewards/AnswererRewardFunction/mean": 0.8755199909210205,
+      "rewards/AnswererRewardFunction/std": 0.10159505903720856,
+      "step": 61,
+      "step_time": 7.777418299000601
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.6882359981536865,
+      "epoch": 1.55,
+      "grad_norm": 0.06488100439310074,
+      "kl": 0.18747568130493164,
+      "learning_rate": 1.4749999999999999e-06,
+      "loss": 0.00046868916251696646,
+      "step": 62,
+      "step_time": 0.1100167260001399
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.9549317359924316,
+      "epoch": 1.575,
+      "grad_norm": 0.06376447528600693,
+      "kl": 0.19842228293418884,
+      "learning_rate": 1.45e-06,
+      "loss": 0.0004960556980222464,
+      "step": 63,
+      "step_time": 0.10958483499962313
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.7855771780014038,
+      "epoch": 1.6,
+      "grad_norm": 0.047496143728494644,
+      "kl": 0.19172421097755432,
+      "learning_rate": 1.425e-06,
+      "loss": 0.00047931051813066006,
+      "step": 64,
+      "step_time": 0.11118459000044822
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.8505133390426636,
+      "epoch": 1.625,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.050711821764707565,
+      "kl": 0.17194393277168274,
+      "learning_rate": 1.4000000000000001e-06,
+      "loss": 0.0004298597923479974,
+      "num_tokens": 171008.0,
+      "reward": 0.819347620010376,
+      "reward_std": 0.11957820504903793,
+      "rewards/AnswererRewardFunction/mean": 0.819347620010376,
+      "rewards/AnswererRewardFunction/std": 0.11957821249961853,
+      "step": 65,
+      "step_time": 7.7349191860002975
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.8674921989440918,
+      "epoch": 1.65,
+      "grad_norm": 0.06205311790108681,
+      "kl": 0.22628569602966309,
+      "learning_rate": 1.375e-06,
+      "loss": 0.0005657142028212547,
+      "step": 66,
+      "step_time": 0.11190354400059732
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.7050056457519531,
+      "epoch": 1.675,
+      "grad_norm": 0.040369097143411636,
+      "kl": 0.1742599755525589,
+      "learning_rate": 1.35e-06,
+      "loss": 0.0004356499412097037,
+      "step": 67,
+      "step_time": 0.1103675430003932
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.6987855434417725,
+      "epoch": 1.7,
+      "grad_norm": 0.0360160693526268,
+      "kl": 0.1615583896636963,
+      "learning_rate": 1.325e-06,
+      "loss": 0.00040389594505541027,
+      "step": 68,
+      "step_time": 0.10992557299960026
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.701324701309204,
+      "epoch": 1.725,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.04009808227419853,
+      "kl": 0.18351677060127258,
+      "learning_rate": 1.3e-06,
+      "loss": 0.0004587919102050364,
+      "num_tokens": 181088.0,
+      "reward": 0.8702797293663025,
+      "reward_std": 0.11096905916929245,
+      "rewards/AnswererRewardFunction/mean": 0.8702797293663025,
+      "rewards/AnswererRewardFunction/std": 0.11096906661987305,
+      "step": 69,
+      "step_time": 7.762835034000091
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.7760753631591797,
+      "epoch": 1.75,
+      "grad_norm": 0.0532107912003994,
+      "kl": 0.19955360889434814,
+      "learning_rate": 1.275e-06,
+      "loss": 0.0004988840082660317,
+      "step": 70,
+      "step_time": 0.11100815299960232
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.8080956935882568,
+      "epoch": 1.775,
+      "grad_norm": 0.04277520254254341,
+      "kl": 0.18510785698890686,
+      "learning_rate": 1.25e-06,
+      "loss": 0.00046276964712888,
+      "step": 71,
+      "step_time": 0.1123151739993773
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.7996044158935547,
+      "epoch": 1.8,
+      "grad_norm": 0.05546512082219124,
+      "kl": 0.18127581477165222,
+      "learning_rate": 1.225e-06,
+      "loss": 0.00045318951015360653,
+      "step": 72,
+      "step_time": 0.11235005500020634
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.673140048980713,
+      "epoch": 1.825,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.05601564049720764,
+      "kl": 0.17777645587921143,
+      "learning_rate": 1.2000000000000002e-06,
+      "loss": 0.0004444411606527865,
+      "num_tokens": 191124.0,
+      "reward": 0.9533497095108032,
+      "reward_std": 0.02976372465491295,
+      "rewards/AnswererRewardFunction/mean": 0.9533497095108032,
+      "rewards/AnswererRewardFunction/std": 0.029763715341687202,
+      "step": 73,
+      "step_time": 7.778424793999875
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.648258924484253,
+      "epoch": 1.85,
+      "grad_norm": 0.04714483022689819,
+      "kl": 0.18713414669036865,
+      "learning_rate": 1.175e-06,
+      "loss": 0.0004678354016505182,
+      "step": 74,
+      "step_time": 0.11056567400009953
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.5942027568817139,
+      "epoch": 1.875,
+      "grad_norm": 0.07229740172624588,
+      "kl": 0.17846925556659698,
+      "learning_rate": 1.1500000000000002e-06,
+      "loss": 0.00044617310049943626,
+      "step": 75,
+      "step_time": 0.10988077099955262
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.8294134140014648,
+      "epoch": 1.9,
+      "grad_norm": 0.07372913509607315,
+      "kl": 0.19231390953063965,
+      "learning_rate": 1.125e-06,
+      "loss": 0.0004807847726624459,
+      "step": 76,
+      "step_time": 0.11050818300009269
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.8716906309127808,
+      "epoch": 1.925,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.04397920146584511,
+      "kl": 0.20210802555084229,
+      "learning_rate": 1.0999999999999998e-06,
+      "loss": 0.0005052700289525092,
+      "num_tokens": 201192.0,
+      "reward": 0.8848026990890503,
+      "reward_std": 0.08498957008123398,
+      "rewards/AnswererRewardFunction/mean": 0.8848026990890503,
+      "rewards/AnswererRewardFunction/std": 0.08498956263065338,
+      "step": 77,
+      "step_time": 7.766447917999358
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.8421611785888672,
+      "epoch": 1.95,
+      "grad_norm": 0.04258396103978157,
+      "kl": 0.16801506280899048,
+      "learning_rate": 1.075e-06,
+      "loss": 0.0004200376570224762,
+      "step": 78,
+      "step_time": 0.11128133199963486
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.0212643146514893,
+      "epoch": 1.975,
+      "grad_norm": 0.037206318229436874,
+      "kl": 0.17133170366287231,
+      "learning_rate": 1.05e-06,
+      "loss": 0.00042832925100810826,
+      "step": 79,
+      "step_time": 0.11063859800015052
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.9713196754455566,
+      "epoch": 2.0,
+      "grad_norm": 0.06273569911718369,
+      "kl": 0.20357024669647217,
+      "learning_rate": 1.0250000000000001e-06,
+      "loss": 0.0005089255864731967,
+      "step": 80,
+      "step_time": 0.11058984600003896
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.7217357158660889,
+      "epoch": 2.025,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 1.6169368028640747,
+      "kl": 0.16916689276695251,
+      "learning_rate": 1e-06,
+      "loss": 0.054513730108737946,
+      "num_tokens": 211296.0,
+      "reward": 0.8427923917770386,
+      "reward_std": 0.11980737000703812,
+      "rewards/AnswererRewardFunction/mean": 0.8427923917770386,
+      "rewards/AnswererRewardFunction/std": 0.11980737000703812,
+      "step": 81,
+      "step_time": 7.74370976299997
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0009765625,
+      "clip_ratio/low_min": 0.0009765625,
+      "clip_ratio/region_mean": 0.0009765625,
+      "entropy": 1.7494382858276367,
+      "epoch": 2.05,
+      "grad_norm": 1.5589256286621094,
+      "kl": 0.18270084261894226,
+      "learning_rate": 9.75e-07,
+      "loss": 0.05478590354323387,
+      "step": 82,
+      "step_time": 0.11115284799961955
+    },
+    {
+      "clip_ratio/high_max": 0.001953125,
+      "clip_ratio/high_mean": 0.001953125,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 1.879669427871704,
+      "epoch": 2.075,
+      "grad_norm": 1.6640474796295166,
+      "kl": 0.17780490219593048,
+      "learning_rate": 9.5e-07,
+      "loss": -0.053716279566287994,
+      "step": 83,
+      "step_time": 0.1114861039995958
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.063556432723999,
+      "epoch": 2.1,
+      "grad_norm": 1.6819794178009033,
+      "kl": 0.2023072987794876,
+      "learning_rate": 9.25e-07,
+      "loss": -0.05334728956222534,
+      "step": 84,
+      "step_time": 0.11098489600044559
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.6534061431884766,
+      "epoch": 2.125,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.04503380134701729,
+      "kl": 0.15787094831466675,
+      "learning_rate": 9e-07,
+      "loss": 0.00039467739406973124,
+      "num_tokens": 221124.0,
+      "reward": 0.8965563178062439,
+      "reward_std": 0.11728676408529282,
+      "rewards/AnswererRewardFunction/mean": 0.8965563178062439,
+      "rewards/AnswererRewardFunction/std": 0.11728675663471222,
+      "step": 85,
+      "step_time": 7.803010329999779
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.734236240386963,
+      "epoch": 2.15,
+      "grad_norm": 0.053943030536174774,
+      "kl": 0.1871510148048401,
+      "learning_rate": 8.750000000000001e-07,
+      "loss": 0.00046787751489318907,
+      "step": 86,
+      "step_time": 0.11089296099999046
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.8206863403320312,
+      "epoch": 2.175,
+      "grad_norm": 0.0429137758910656,
+      "kl": 0.18653716146945953,
+      "learning_rate": 8.5e-07,
+      "loss": 0.0004663429281208664,
+      "step": 87,
+      "step_time": 0.109580606000236
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.8033851385116577,
+      "epoch": 2.2,
+      "grad_norm": 0.032541193068027496,
+      "kl": 0.1465151309967041,
+      "learning_rate": 8.25e-07,
+      "loss": 0.0003662878298200667,
+      "step": 88,
+      "step_time": 0.10985735099984595
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.7311502695083618,
+      "epoch": 2.225,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.04454684257507324,
+      "kl": 0.18079979717731476,
+      "learning_rate": 8e-07,
+      "loss": 0.00045199948363006115,
+      "num_tokens": 230912.0,
+      "reward": 0.8309686183929443,
+      "reward_std": 0.14328104257583618,
+      "rewards/AnswererRewardFunction/mean": 0.8309686183929443,
+      "rewards/AnswererRewardFunction/std": 0.14328105747699738,
+      "step": 89,
+      "step_time": 7.7929221280001
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.0398142337799072,
+      "epoch": 2.25,
+      "grad_norm": 0.04331798851490021,
+      "kl": 0.19130492210388184,
+      "learning_rate": 7.750000000000001e-07,
+      "loss": 0.0004782622854690999,
+      "step": 90,
+      "step_time": 0.1110038219994749
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.8197107315063477,
+      "epoch": 2.275,
+      "grad_norm": 0.0389009565114975,
+      "kl": 0.1594439446926117,
+      "learning_rate": 7.5e-07,
+      "loss": 0.00039860987453721464,
+      "step": 91,
+      "step_time": 0.11234117200001492
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.6061317920684814,
+      "epoch": 2.3,
+      "grad_norm": 0.035475604236125946,
+      "kl": 0.16071225702762604,
+      "learning_rate": 7.25e-07,
+      "loss": 0.0004017806495539844,
+      "step": 92,
+      "step_time": 0.11047902500013151
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.0506911277770996,
+      "epoch": 2.325,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.05084988474845886,
+      "kl": 0.19656753540039062,
+      "learning_rate": 7.000000000000001e-07,
+      "loss": 0.0004914188175462186,
+      "num_tokens": 241172.0,
+      "reward": 0.8668582439422607,
+      "reward_std": 0.1170896589756012,
+      "rewards/AnswererRewardFunction/mean": 0.8668582439422607,
+      "rewards/AnswererRewardFunction/std": 0.1170896664261818,
+      "step": 93,
+      "step_time": 7.777627965000647
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.7454960346221924,
+      "epoch": 2.35,
+      "grad_norm": 0.0445859469473362,
+      "kl": 0.18581417202949524,
+      "learning_rate": 6.75e-07,
+      "loss": 0.0004645354056265205,
+      "step": 94,
+      "step_time": 0.11115384400000039
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.9964624643325806,
+      "epoch": 2.375,
+      "grad_norm": 0.03978120535612106,
+      "kl": 0.1595752239227295,
+      "learning_rate": 6.5e-07,
+      "loss": 0.00039893804932944477,
+      "step": 95,
+      "step_time": 0.11099709100017208
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.9201608896255493,
+      "epoch": 2.4,
+      "grad_norm": 0.04184085130691528,
+      "kl": 0.16675138473510742,
+      "learning_rate": 6.25e-07,
+      "loss": 0.00041687843622639775,
+      "step": 96,
+      "step_time": 0.11661479099984717
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.7013036012649536,
+      "epoch": 2.425,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.037217915058135986,
+      "kl": 0.15892338752746582,
+      "learning_rate": 6.000000000000001e-07,
+      "loss": 0.0003973084385506809,
+      "num_tokens": 251112.0,
+      "reward": 0.8227691054344177,
+      "reward_std": 0.11510815471410751,
+      "rewards/AnswererRewardFunction/mean": 0.8227691054344177,
+      "rewards/AnswererRewardFunction/std": 0.11510813981294632,
+      "step": 97,
+      "step_time": 7.6884349259999
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.912070393562317,
+      "epoch": 2.45,
+      "grad_norm": 0.07283549755811691,
+      "kl": 0.1705121099948883,
+      "learning_rate": 5.750000000000001e-07,
+      "loss": 0.00042628025403246284,
+      "step": 98,
+      "step_time": 0.11253656700046122
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.7756853103637695,
+      "epoch": 2.475,
+      "grad_norm": 0.05578094348311424,
+      "kl": 0.1911633312702179,
+      "learning_rate": 5.499999999999999e-07,
+      "loss": 0.00047790829557925463,
+      "step": 99,
+      "step_time": 0.11162257700016198
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.8756659030914307,
+      "epoch": 2.5,
+      "grad_norm": 0.05396004766225815,
+      "kl": 0.18036732077598572,
+      "learning_rate": 5.25e-07,
+      "loss": 0.0004509182763285935,
+      "step": 100,
+      "step_time": 0.11048615199979395
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.766379952430725,
+      "epoch": 2.525,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.8942388296127319,
+      "kl": 0.16869878768920898,
+      "learning_rate": 5e-07,
+      "loss": -0.03075210005044937,
+      "num_tokens": 261108.0,
+      "reward": 0.8209283351898193,
+      "reward_std": 0.10294758528470993,
+      "rewards/AnswererRewardFunction/mean": 0.8209283351898193,
+      "rewards/AnswererRewardFunction/std": 0.10294758528470993,
+      "step": 101,
+      "step_time": 7.760543512999902
+    },
+    {
+      "clip_ratio/high_max": 0.0009765625,
+      "clip_ratio/high_mean": 0.0009765625,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0009765625,
+      "entropy": 1.7685692310333252,
+      "epoch": 2.55,
+      "grad_norm": 0.9231233596801758,
+      "kl": 0.14601315557956696,
+      "learning_rate": 4.75e-07,
+      "loss": -0.030733846127986908,
+      "step": 102,
+      "step_time": 0.11171397100042668
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.8422847986221313,
+      "epoch": 2.575,
+      "grad_norm": 2.5536715984344482,
+      "kl": 0.1598617136478424,
+      "learning_rate": 4.5e-07,
+      "loss": 0.09431644529104233,
+      "step": 103,
+      "step_time": 0.11270964300001651
+    },
+    {
+      "clip_ratio/high_max": 0.001953125,
+      "clip_ratio/high_mean": 0.001953125,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 1.8202838897705078,
+      "epoch": 2.6,
+      "grad_norm": 0.9551231861114502,
+      "kl": 0.15085619688034058,
+      "learning_rate": 4.25e-07,
+      "loss": -0.031034285202622414,
+      "step": 104,
+      "step_time": 0.11204244799955632
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.7124009132385254,
+      "epoch": 2.625,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.038327667862176895,
+      "kl": 0.16573426127433777,
+      "learning_rate": 4e-07,
+      "loss": 0.00041433563455939293,
+      "num_tokens": 271148.0,
+      "reward": 0.8883568644523621,
+      "reward_std": 0.08731486648321152,
+      "rewards/AnswererRewardFunction/mean": 0.8883568644523621,
+      "rewards/AnswererRewardFunction/std": 0.08731484413146973,
+      "step": 105,
+      "step_time": 7.726755570999558
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.7165004014968872,
+      "epoch": 2.65,
+      "grad_norm": 0.042729344218969345,
+      "kl": 0.1634857952594757,
+      "learning_rate": 3.75e-07,
+      "loss": 0.0004087144916411489,
+      "step": 106,
+      "step_time": 0.11138214499987953
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.5836355686187744,
+      "epoch": 2.675,
+      "grad_norm": 0.03519313782453537,
+      "kl": 0.1412312090396881,
+      "learning_rate": 3.5000000000000004e-07,
+      "loss": 0.00035307800862938166,
+      "step": 107,
+      "step_time": 0.11289172600027086
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.856842279434204,
+      "epoch": 2.7,
+      "grad_norm": 0.037977833300828934,
+      "kl": 0.15611428022384644,
+      "learning_rate": 3.25e-07,
+      "loss": 0.00039028568426147103,
+      "step": 108,
+      "step_time": 0.11108121899997059
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.7743322849273682,
+      "epoch": 2.725,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.038768816739320755,
+      "kl": 0.16674529016017914,
+      "learning_rate": 3.0000000000000004e-07,
+      "loss": 0.0004168632149230689,
+      "num_tokens": 281344.0,
+      "reward": 0.9358674883842468,
+      "reward_std": 0.0063578663393855095,
+      "rewards/AnswererRewardFunction/mean": 0.9358674883842468,
+      "rewards/AnswererRewardFunction/std": 0.006357857491821051,
+      "step": 109,
+      "step_time": 7.73822644899974
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.5768123865127563,
+      "epoch": 2.75,
+      "grad_norm": 0.03679906204342842,
+      "kl": 0.1504622995853424,
+      "learning_rate": 2.7499999999999996e-07,
+      "loss": 0.0003761557163670659,
+      "step": 110,
+      "step_time": 0.11385159400015255
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.6607877016067505,
+      "epoch": 2.775,
+      "grad_norm": 0.03713918849825859,
+      "kl": 0.16259214282035828,
+      "learning_rate": 2.5e-07,
+      "loss": 0.0004064803651999682,
+      "step": 111,
+      "step_time": 0.1118541819996608
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.8108391761779785,
+      "epoch": 2.8,
+      "grad_norm": 0.05037199705839157,
+      "kl": 0.1863050013780594,
+      "learning_rate": 2.25e-07,
+      "loss": 0.0004657625104300678,
+      "step": 112,
+      "step_time": 0.11186235300010594
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.7566189765930176,
+      "epoch": 2.825,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.036518849432468414,
+      "kl": 0.1620350182056427,
+      "learning_rate": 2e-07,
+      "loss": 0.00040508751408196986,
+      "num_tokens": 291560.0,
+      "reward": 0.8848026990890503,
+      "reward_std": 0.08498957008123398,
+      "rewards/AnswererRewardFunction/mean": 0.8848026990890503,
+      "rewards/AnswererRewardFunction/std": 0.08498956263065338,
+      "step": 113,
+      "step_time": 7.7582261870002185
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.7600432634353638,
+      "epoch": 2.85,
+      "grad_norm": 0.05634801462292671,
+      "kl": 0.17256814241409302,
+      "learning_rate": 1.7500000000000002e-07,
+      "loss": 0.00043142036884091794,
+      "step": 114,
+      "step_time": 0.11197551500026748
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.874969720840454,
+      "epoch": 2.875,
+      "grad_norm": 0.03801681101322174,
+      "kl": 0.15792644023895264,
+      "learning_rate": 1.5000000000000002e-07,
+      "loss": 0.00039481610292568803,
+      "step": 115,
+      "step_time": 0.11017055599950254
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.9454420804977417,
+      "epoch": 2.9,
+      "grad_norm": 0.03727322444319725,
+      "kl": 0.1802506446838379,
+      "learning_rate": 1.25e-07,
+      "loss": 0.0004506265977397561,
+      "step": 116,
+      "step_time": 0.1150881510002364
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.8203485012054443,
+      "epoch": 2.925,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.044991906732320786,
+      "kl": 0.17885687947273254,
+      "learning_rate": 1e-07,
+      "loss": 0.00044714222894981503,
+      "num_tokens": 301788.0,
+      "reward": 0.9323133230209351,
+      "reward_std": 0.0,
+      "rewards/AnswererRewardFunction/mean": 0.9323133230209351,
+      "rewards/AnswererRewardFunction/std": 0.0,
+      "step": 117,
+      "step_time": 7.785143383000104
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.0509285926818848,
+      "epoch": 2.95,
+      "grad_norm": 0.05878576263785362,
+      "kl": 0.1711733639240265,
+      "learning_rate": 7.500000000000001e-08,
+      "loss": 0.0004279334098100662,
+      "step": 118,
+      "step_time": 0.11581213099998422
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.8962795734405518,
+      "epoch": 2.975,
+      "grad_norm": 0.07008994370698929,
+      "kl": 0.17706063389778137,
+      "learning_rate": 5e-08,
+      "loss": 0.00044265156611800194,
+      "step": 119,
+      "step_time": 0.11132281099980901
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.0024328231811523,
+      "epoch": 3.0,
+      "grad_norm": 0.06333177536725998,
+      "kl": 0.18442605435848236,
+      "learning_rate": 2.5e-08,
+      "loss": 0.0004610651230905205,
+      "step": 120,
+      "step_time": 0.11119805700036522
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 120,
+  "num_input_tokens_seen": 301788,
+  "num_train_epochs": 3,
+  "save_steps": 30,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

self_play_hf_l40s_full/round_004/answerer_train/checkpoint-120/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2d1ca30827c82daaee74796a91c1d74f1cbf0768a07df4d6cf7d927229c515d1
+size 7249