Intermediate checkpoint upload step=30 (generator_train)

Browse files

Files changed (12) hide show

.gitattributes +1 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/chat_template.jinja +54 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/config.json +57 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/generation_config.json +13 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/model.safetensors +3 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/optimizer.pt +3 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/rng_state.pth +3 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/scheduler.pt +3 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/tokenizer.json +3 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/tokenizer_config.json +32 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/trainer_state.json +588 -0
self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -52,3 +52,4 @@ self_play_hf_l40s_full/round_001/answerer_train/checkpoint-60/tokenizer.json fil
 self_play_hf_l40s_full/round_001/answerer_train/checkpoint-90/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 self_play_hf_l40s_full/round_001/answerer_train/checkpoint-120/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 self_play_hf_l40s_full/round_001/answerer_train/final_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 self_play_hf_l40s_full/round_001/answerer_train/checkpoint-90/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 self_play_hf_l40s_full/round_001/answerer_train/checkpoint-120/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 self_play_hf_l40s_full/round_001/answerer_train/final_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/tokenizer.json filter=lfs diff=lfs merge=lfs -text

self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "float32",
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 896,
+  "initializer_range": 0.02,
+  "intermediate_size": 4864,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000.0,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.6.2",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "5.6.2"
+}

self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dc9a740f1a034be33c4e3a014ca997f92f6098069fc86f713179c9abe2514b1c
+size 1976163472

self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5ab91e64f30d78c332c7ae32c010306a035e3792a8a2aadb46366869e6d6c117
+size 3952509771

self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:41dae3129454fcbd2a668a22abc971454e5e6c939e1a5867ff90c2b866586678
+size 14645

self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e1d413e3e3c82c1b74871c9cb9e959551000a6f2cc680c7750a2f6acda5c43d5
+size 1465

self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
+size 11421892

self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": true,
+  "local_files_only": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "truncation_side": "left",
+  "unk_token": null
+}

self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/trainer_state.json ADDED Viewed

	@@ -0,0 +1,588 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.25,
+  "eval_steps": 500,
+  "global_step": 30,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0006510416860692203,
+      "clip_ratio/high_mean": 0.0006510416860692203,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0006510416860692203,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.2909066677093506,
+      "epoch": 0.041666666666666664,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 3.419240951538086,
+      "kl": 0.0005526020540855825,
+      "learning_rate": 5e-06,
+      "loss": 0.12301453948020935,
+      "num_tokens": 25288.0,
+      "reward": -0.13312500715255737,
+      "reward_std": 0.1580703854560852,
+      "rewards/GeneratorRewardFunction/mean": -0.13312500715255737,
+      "rewards/GeneratorRewardFunction/std": 0.1580704003572464,
+      "step": 1,
+      "step_time": 12.38097839000011
+    },
+    {
+      "clip_ratio/high_max": 0.025390625,
+      "clip_ratio/high_mean": 0.025390625,
+      "clip_ratio/low_mean": 0.0481770820915699,
+      "clip_ratio/low_min": 0.0481770820915699,
+      "clip_ratio/region_mean": 0.0735677108168602,
+      "entropy": 2.2825264930725098,
+      "epoch": 0.08333333333333333,
+      "grad_norm": 3.043118476867676,
+      "kl": 0.017308469861745834,
+      "learning_rate": 4.958333333333334e-06,
+      "loss": 0.12409868091344833,
+      "step": 2,
+      "step_time": 0.22687196500010032
+    },
+    {
+      "clip_ratio/high_max": 0.068359375,
+      "clip_ratio/high_mean": 0.068359375,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.068359375,
+      "entropy": 1.7901748418807983,
+      "epoch": 0.125,
+      "grad_norm": 1.792228102684021,
+      "kl": 0.042044539004564285,
+      "learning_rate": 4.9166666666666665e-06,
+      "loss": -0.11261526495218277,
+      "step": 3,
+      "step_time": 0.22754332899967267
+    },
+    {
+      "clip_ratio/high_max": 0.1139322891831398,
+      "clip_ratio/high_mean": 0.1139322891831398,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.1139322891831398,
+      "entropy": 2.698014974594116,
+      "epoch": 0.16666666666666666,
+      "grad_norm": 2.728670835494995,
+      "kl": 0.11385751515626907,
+      "learning_rate": 4.875e-06,
+      "loss": -0.11856894940137863,
+      "step": 4,
+      "step_time": 0.22612155700016956
+    },
+    {
+      "clip_ratio/high_max": 0.0006510416860692203,
+      "clip_ratio/high_mean": 0.0006510416860692203,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.0013020833721384406,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.2396337985992432,
+      "epoch": 0.20833333333333334,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 3.7660365104675293,
+      "kl": 0.027865365147590637,
+      "learning_rate": 4.833333333333333e-06,
+      "loss": 0.07652663439512253,
+      "num_tokens": 50484.0,
+      "reward": -0.1990624964237213,
+      "reward_std": 0.22889748215675354,
+      "rewards/GeneratorRewardFunction/mean": -0.1990624964237213,
+      "rewards/GeneratorRewardFunction/std": 0.22889748215675354,
+      "step": 5,
+      "step_time": 12.267690716000288
+    },
+    {
+      "clip_ratio/high_max": 0.0065104165114462376,
+      "clip_ratio/high_mean": 0.0065104165114462376,
+      "clip_ratio/low_mean": 0.025390625,
+      "clip_ratio/low_min": 0.025390625,
+      "clip_ratio/region_mean": 0.0319010429084301,
+      "entropy": 1.3415206670761108,
+      "epoch": 0.25,
+      "grad_norm": 1.7361595630645752,
+      "kl": 0.03342495858669281,
+      "learning_rate": 4.791666666666668e-06,
+      "loss": 0.06931179016828537,
+      "step": 6,
+      "step_time": 0.22098127500021292
+    },
+    {
+      "clip_ratio/high_max": 0.01888020895421505,
+      "clip_ratio/high_mean": 0.01888020895421505,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.01888020895421505,
+      "entropy": 1.5484552383422852,
+      "epoch": 0.2916666666666667,
+      "grad_norm": 0.7583158612251282,
+      "kl": 0.0418914258480072,
+      "learning_rate": 4.75e-06,
+      "loss": -0.051571328192949295,
+      "step": 7,
+      "step_time": 0.22065312600034304
+    },
+    {
+      "clip_ratio/high_max": 0.1178385391831398,
+      "clip_ratio/high_mean": 0.1178385391831398,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.1178385391831398,
+      "entropy": 2.2375190258026123,
+      "epoch": 0.3333333333333333,
+      "grad_norm": 1.100609302520752,
+      "kl": 0.05581093952059746,
+      "learning_rate": 4.708333333333334e-06,
+      "loss": -0.09243377298116684,
+      "step": 8,
+      "step_time": 0.2210044349999407
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.8528798818588257,
+      "epoch": 0.375,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.863710880279541,
+      "kl": 0.13026608526706696,
+      "learning_rate": 4.666666666666667e-06,
+      "loss": -0.01838863641023636,
+      "num_tokens": 75740.0,
+      "reward": -0.1399218738079071,
+      "reward_std": 0.2057529091835022,
+      "rewards/GeneratorRewardFunction/mean": -0.1399218738079071,
+      "rewards/GeneratorRewardFunction/std": 0.20575293898582458,
+      "step": 9,
+      "step_time": 12.445934573000159
+    },
+    {
+      "clip_ratio/high_max": 0.008463541977107525,
+      "clip_ratio/high_mean": 0.008463541977107525,
+      "clip_ratio/low_mean": 0.0013020833721384406,
+      "clip_ratio/low_min": 0.0013020833721384406,
+      "clip_ratio/region_mean": 0.009765625,
+      "entropy": 1.8978748321533203,
+      "epoch": 0.4166666666666667,
+      "grad_norm": 2.6196653842926025,
+      "kl": 0.0548124723136425,
+      "learning_rate": 4.625000000000001e-06,
+      "loss": -0.02614506147801876,
+      "step": 10,
+      "step_time": 0.2266816760002257
+    },
+    {
+      "clip_ratio/high_max": 0.013671875,
+      "clip_ratio/high_mean": 0.013671875,
+      "clip_ratio/low_mean": 0.013020833022892475,
+      "clip_ratio/low_min": 0.013020833022892475,
+      "clip_ratio/region_mean": 0.02669270895421505,
+      "entropy": 1.9386297464370728,
+      "epoch": 0.4583333333333333,
+      "grad_norm": 3.514759063720703,
+      "kl": 0.0901331901550293,
+      "learning_rate": 4.583333333333333e-06,
+      "loss": 0.12482144683599472,
+      "step": 11,
+      "step_time": 0.22661321499981568
+    },
+    {
+      "clip_ratio/high_max": 0.04296875,
+      "clip_ratio/high_mean": 0.04296875,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.04296875,
+      "entropy": 2.489102602005005,
+      "epoch": 0.5,
+      "grad_norm": 1.1873637437820435,
+      "kl": 0.06463827937841415,
+      "learning_rate": 4.541666666666667e-06,
+      "loss": -0.07826828956604004,
+      "step": 12,
+      "step_time": 0.22605949299986605
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.2705869674682617,
+      "epoch": 0.5416666666666666,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5841100215911865,
+      "kl": 0.07928314059972763,
+      "learning_rate": 4.5e-06,
+      "loss": -0.05041670799255371,
+      "num_tokens": 100992.0,
+      "reward": -0.06781250238418579,
+      "reward_std": 0.02886570803821087,
+      "rewards/GeneratorRewardFunction/mean": -0.06781250238418579,
+      "rewards/GeneratorRewardFunction/std": 0.028865709900856018,
+      "step": 13,
+      "step_time": 12.360951734000082
+    },
+    {
+      "clip_ratio/high_max": 0.015625,
+      "clip_ratio/high_mean": 0.015625,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.01627604104578495,
+      "entropy": 1.7322059869766235,
+      "epoch": 0.5833333333333334,
+      "grad_norm": 2.0741848945617676,
+      "kl": 0.060624316334724426,
+      "learning_rate": 4.4583333333333336e-06,
+      "loss": -0.04468907043337822,
+      "step": 14,
+      "step_time": 0.21993768200036357
+    },
+    {
+      "clip_ratio/high_max": 0.01627604104578495,
+      "clip_ratio/high_mean": 0.01627604104578495,
+      "clip_ratio/low_mean": 0.0026041667442768812,
+      "clip_ratio/low_min": 0.0026041667442768812,
+      "clip_ratio/region_mean": 0.01888020895421505,
+      "entropy": 2.18737530708313,
+      "epoch": 0.625,
+      "grad_norm": 4.066506385803223,
+      "kl": 0.07642111927270889,
+      "learning_rate": 4.416666666666667e-06,
+      "loss": 0.1044829785823822,
+      "step": 15,
+      "step_time": 0.22013131699986843
+    },
+    {
+      "clip_ratio/high_max": 0.0442708320915699,
+      "clip_ratio/high_mean": 0.0442708320915699,
+      "clip_ratio/low_mean": 0.009765625,
+      "clip_ratio/low_min": 0.009765625,
+      "clip_ratio/region_mean": 0.0540364570915699,
+      "entropy": 2.622234582901001,
+      "epoch": 0.6666666666666666,
+      "grad_norm": 3.0252487659454346,
+      "kl": 0.10498789697885513,
+      "learning_rate": 4.3750000000000005e-06,
+      "loss": -0.003130403347313404,
+      "step": 16,
+      "step_time": 0.21912707499996031
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.0006510416860692203,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 0.9028136134147644,
+      "epoch": 0.7083333333333334,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 1.3516546487808228,
+      "kl": 0.08489418029785156,
+      "learning_rate": 4.333333333333334e-06,
+      "loss": 0.03129524365067482,
+      "num_tokens": 126380.0,
+      "reward": -0.18468749523162842,
+      "reward_std": 0.27848079800605774,
+      "rewards/GeneratorRewardFunction/mean": -0.18468749523162842,
+      "rewards/GeneratorRewardFunction/std": 0.27848079800605774,
+      "step": 17,
+      "step_time": 12.429075290000128
+    },
+    {
+      "clip_ratio/high_max": 0.001953125,
+      "clip_ratio/high_mean": 0.001953125,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 2.785872220993042,
+      "epoch": 0.75,
+      "grad_norm": 1.737163782119751,
+      "kl": 0.10512515157461166,
+      "learning_rate": 4.2916666666666665e-06,
+      "loss": -0.0846581980586052,
+      "step": 18,
+      "step_time": 0.2297946270000466
+    },
+    {
+      "clip_ratio/high_max": 0.01171875,
+      "clip_ratio/high_mean": 0.01171875,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.012369791977107525,
+      "entropy": 2.066610097885132,
+      "epoch": 0.7916666666666666,
+      "grad_norm": 2.454716920852661,
+      "kl": 0.10243833065032959,
+      "learning_rate": 4.25e-06,
+      "loss": 0.031895026564598083,
+      "step": 19,
+      "step_time": 0.23007028499978333
+    },
+    {
+      "clip_ratio/high_max": 0.013671875,
+      "clip_ratio/high_mean": 0.013671875,
+      "clip_ratio/low_mean": 0.0032552082557231188,
+      "clip_ratio/low_min": 0.0032552082557231188,
+      "clip_ratio/region_mean": 0.01692708395421505,
+      "entropy": 1.9649995565414429,
+      "epoch": 0.8333333333333334,
+      "grad_norm": 2.563176155090332,
+      "kl": 0.07682739198207855,
+      "learning_rate": 4.208333333333333e-06,
+      "loss": 0.022962143644690514,
+      "step": 20,
+      "step_time": 0.23112243600007787
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.3113908767700195,
+      "epoch": 0.875,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.672222375869751,
+      "kl": 0.09574653953313828,
+      "learning_rate": 4.166666666666667e-06,
+      "loss": -0.04630495235323906,
+      "num_tokens": 151916.0,
+      "reward": -0.18390625715255737,
+      "reward_std": 0.24478043615818024,
+      "rewards/GeneratorRewardFunction/mean": -0.18390625715255737,
+      "rewards/GeneratorRewardFunction/std": 0.24478045105934143,
+      "step": 21,
+      "step_time": 12.423363883000093
+    },
+    {
+      "clip_ratio/high_max": 0.0013020833721384406,
+      "clip_ratio/high_mean": 0.0013020833721384406,
+      "clip_ratio/low_mean": 0.00390625,
+      "clip_ratio/low_min": 0.00390625,
+      "clip_ratio/region_mean": 0.0052083334885537624,
+      "entropy": 1.7172327041625977,
+      "epoch": 0.9166666666666666,
+      "grad_norm": 2.3833675384521484,
+      "kl": 0.10857907682657242,
+      "learning_rate": 4.125e-06,
+      "loss": 0.08606918901205063,
+      "step": 22,
+      "step_time": 0.23097956299989164
+    },
+    {
+      "clip_ratio/high_max": 0.0071614584885537624,
+      "clip_ratio/high_mean": 0.0071614584885537624,
+      "clip_ratio/low_mean": 0.001953125,
+      "clip_ratio/low_min": 0.001953125,
+      "clip_ratio/region_mean": 0.009114583022892475,
+      "entropy": 2.305208444595337,
+      "epoch": 0.9583333333333334,
+      "grad_norm": 1.6843225955963135,
+      "kl": 0.14590007066726685,
+      "learning_rate": 4.083333333333334e-06,
+      "loss": -0.037479907274246216,
+      "step": 23,
+      "step_time": 0.2308707210004286
+    },
+    {
+      "clip_ratio/high_max": 0.014973958022892475,
+      "clip_ratio/high_mean": 0.014973958022892475,
+      "clip_ratio/low_mean": 0.0052083334885537624,
+      "clip_ratio/low_min": 0.0052083334885537624,
+      "clip_ratio/region_mean": 0.02018229104578495,
+      "entropy": 1.4579397439956665,
+      "epoch": 1.0,
+      "grad_norm": 2.526630163192749,
+      "kl": 0.10583696514368057,
+      "learning_rate": 4.041666666666667e-06,
+      "loss": -0.0003783749125432223,
+      "step": 24,
+      "step_time": 0.23123665799994342
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.00585675239563,
+      "epoch": 1.0416666666666667,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.705661952495575,
+      "kl": 0.10460541397333145,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": -0.030740460380911827,
+      "num_tokens": 177032.0,
+      "reward": -0.054375000298023224,
+      "reward_std": 0.012499997392296791,
+      "rewards/GeneratorRewardFunction/mean": -0.054375000298023224,
+      "rewards/GeneratorRewardFunction/std": 0.012500000186264515,
+      "step": 25,
+      "step_time": 12.342589669000063
+    },
+    {
+      "clip_ratio/high_max": 0.001953125,
+      "clip_ratio/high_mean": 0.001953125,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 1.3793340921401978,
+      "epoch": 1.0833333333333333,
+      "grad_norm": 1.3432904481887817,
+      "kl": 0.10281199216842651,
+      "learning_rate": 3.958333333333333e-06,
+      "loss": -0.0930887758731842,
+      "step": 26,
+      "step_time": 0.22512248300017745
+    },
+    {
+      "clip_ratio/high_max": 0.005859375,
+      "clip_ratio/high_mean": 0.005859375,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.005859375,
+      "entropy": 1.6825193166732788,
+      "epoch": 1.125,
+      "grad_norm": 2.242509603500366,
+      "kl": 0.10501086711883545,
+      "learning_rate": 3.916666666666667e-06,
+      "loss": 0.031402457505464554,
+      "step": 27,
+      "step_time": 0.22399900899972636
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0026041667442768812,
+      "clip_ratio/low_min": 0.0026041667442768812,
+      "clip_ratio/region_mean": 0.0026041667442768812,
+      "entropy": 1.8964576721191406,
+      "epoch": 1.1666666666666667,
+      "grad_norm": 2.502171039581299,
+      "kl": 0.12420836836099625,
+      "learning_rate": 3.875e-06,
+      "loss": 0.09433440119028091,
+      "step": 28,
+      "step_time": 0.22452458999987357
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.6724086999893188,
+      "epoch": 1.2083333333333333,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0264368057250977,
+      "kl": 0.15805059671401978,
+      "learning_rate": 3.833333333333334e-06,
+      "loss": -0.06202007457613945,
+      "num_tokens": 202536.0,
+      "reward": -0.125,
+      "reward_std": 0.20493900775909424,
+      "rewards/GeneratorRewardFunction/mean": -0.125,
+      "rewards/GeneratorRewardFunction/std": 0.20493900775909424,
+      "step": 29,
+      "step_time": 12.317649789999905
+    },
+    {
+      "clip_ratio/high_max": 0.0032552082557231188,
+      "clip_ratio/high_mean": 0.0032552082557231188,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0032552082557231188,
+      "entropy": 1.8093080520629883,
+      "epoch": 1.25,
+      "grad_norm": 1.1415506601333618,
+      "kl": 0.12885110080242157,
+      "learning_rate": 3.7916666666666666e-06,
+      "loss": -0.06245793402194977,
+      "step": 30,
+      "step_time": 0.22891983299996355
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 120,
+  "num_input_tokens_seen": 202536,
+  "num_train_epochs": 5,
+  "save_steps": 30,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

self_play_hf_l40s_full/round_002/generator_train/checkpoint-30/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c88c3b33b27b27c1cc7746fe61baacbbd09e1573fd273a1e9122cb89b0d1870e
+size 7249