Siddeshwar1625 committed 13 days ago

Commit

4fe6c17

verified ·

1 Parent(s): 756a9d5

Upload generator checkpoints for round 001

Browse files

Files changed (31) hide show

.gitattributes +3 -0
self_play_hf_a10g_train/round_001/generator_train/README.md +67 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/chat_template.jinja +54 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/config.json +57 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/generation_config.json +13 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/model.safetensors +3 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/optimizer.pt +3 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/rng_state.pth +3 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/scheduler.pt +3 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/tokenizer.json +3 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/tokenizer_config.json +32 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/trainer_state.json +764 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/training_args.bin +3 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/chat_template.jinja +54 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/config.json +57 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/generation_config.json +13 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/model.safetensors +3 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/optimizer.pt +3 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/rng_state.pth +3 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/scheduler.pt +3 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/tokenizer.json +3 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/tokenizer_config.json +32 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/trainer_state.json +953 -0
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/training_args.bin +3 -0
self_play_hf_a10g_train/round_001/generator_train/final_model/chat_template.jinja +54 -0
self_play_hf_a10g_train/round_001/generator_train/final_model/config.json +57 -0
self_play_hf_a10g_train/round_001/generator_train/final_model/generation_config.json +13 -0
self_play_hf_a10g_train/round_001/generator_train/final_model/model.safetensors +3 -0
self_play_hf_a10g_train/round_001/generator_train/final_model/tokenizer.json +3 -0
self_play_hf_a10g_train/round_001/generator_train/final_model/tokenizer_config.json +32 -0
self_play_hf_a10g_train/round_001/generator_train/final_model/training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+self_play_hf_a10g_train/round_001/generator_train/final_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text

self_play_hf_a10g_train/round_001/generator_train/README.md ADDED Viewed

	@@ -0,0 +1,67 @@

+---
+base_model: Qwen/Qwen2.5-0.5B-Instruct
+library_name: transformers
+model_name: generator_train
+tags:
+- generated_from_trainer
+- grpo
+- trl
+licence: license
+---
+# Model Card for generator_train
+This model is a fine-tuned version of [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="self_play_hf_a10g_train/round_001/generator_train/final_model", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/siddeshwar2004-international-institute-of-information-te/osint-self-play-train/runs/w4yxkqbv)
+This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+### Framework versions
+- TRL: 1.2.0
+- Transformers: 5.6.2
+- Pytorch: 2.11.0
+- Datasets: 4.8.4
+- Tokenizers: 0.22.2
+## Citations
+Cite GRPO as:
+```bibtex
+@article{shao2024deepseekmath,
+    title        = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+    author       = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+    year         = 2024,
+    eprint       = {arXiv:2402.03300},
+}
+```
+Cite TRL as:
+```bibtex
+@software{vonwerra2020trl,
+  title   = {{TRL: Transformers Reinforcement Learning}},
+  author  = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
+  license = {Apache-2.0},
+  url     = {https://github.com/huggingface/trl},
+  year    = {2020}
+}
+```

self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "float32",
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 896,
+  "initializer_range": 0.02,
+  "intermediate_size": 4864,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000.0,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.6.2",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "5.6.2"
+}

self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f104793ef80b632081adc349f9a54bede0112ea26d13ce2c2a8312cf61dbbfae
+size 1976163472

self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2d04b529c8641e602da72a33b67693f5b64694dac6252d75dad9f985ff685e6b
+size 3952509771

self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b5725c96886e63d92a4804b9e1b509a94ed72b0ac9da7f6955ce8a319b258d37
+size 14645

self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:44b7842b78ac3e944fa674f4961bef93e595fe53f0c4495643408e144ea2a074
+size 1465

self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
+size 11421892

self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": false,
+  "local_files_only": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "truncation_side": "left",
+  "unk_token": null
+}

self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/trainer_state.json ADDED Viewed

	@@ -0,0 +1,764 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.6666666666666665,
+  "eval_steps": 500,
+  "global_step": 40,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.001953125,
+      "clip_ratio/high_mean": 0.001953125,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.001953125,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.9362258911132812,
+      "epoch": 0.041666666666666664,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 3.343350648880005,
+      "kl": 0.0005498419050127268,
+      "learning_rate": 5e-06,
+      "loss": 0.13995857536792755,
+      "num_tokens": 25244.0,
+      "reward": -0.4352343678474426,
+      "reward_std": 0.306624174118042,
+      "rewards/GeneratorRewardFunction/mean": -0.4352343678474426,
+      "rewards/GeneratorRewardFunction/std": 0.306624174118042,
+      "step": 1,
+      "step_time": 12.520110770000002
+    },
+    {
+      "clip_ratio/high_max": 0.0045572915114462376,
+      "clip_ratio/high_mean": 0.0045572915114462376,
+      "clip_ratio/low_mean": 0.0013020833721384406,
+      "clip_ratio/low_min": 0.0013020833721384406,
+      "clip_ratio/region_mean": 0.005859375,
+      "entropy": 1.2710224390029907,
+      "epoch": 0.08333333333333333,
+      "grad_norm": 2.8392181396484375,
+      "kl": 0.001467077643610537,
+      "learning_rate": 4.9000000000000005e-06,
+      "loss": -0.06676606088876724,
+      "step": 2,
+      "step_time": 0.22065511100001345
+    },
+    {
+      "clip_ratio/high_max": 0.013671875,
+      "clip_ratio/high_mean": 0.013671875,
+      "clip_ratio/low_mean": 0.014322916977107525,
+      "clip_ratio/low_min": 0.014322916977107525,
+      "clip_ratio/region_mean": 0.02799479104578495,
+      "entropy": 1.871756911277771,
+      "epoch": 0.125,
+      "grad_norm": 2.4508721828460693,
+      "kl": 0.004840313456952572,
+      "learning_rate": 4.800000000000001e-06,
+      "loss": 0.010330882854759693,
+      "step": 3,
+      "step_time": 0.21905603899998027
+    },
+    {
+      "clip_ratio/high_max": 0.01822916604578495,
+      "clip_ratio/high_mean": 0.01822916604578495,
+      "clip_ratio/low_mean": 0.010416666977107525,
+      "clip_ratio/low_min": 0.010416666977107525,
+      "clip_ratio/region_mean": 0.02864583395421505,
+      "entropy": 1.1871482133865356,
+      "epoch": 0.16666666666666666,
+      "grad_norm": 1.5818687677383423,
+      "kl": 0.005701068323105574,
+      "learning_rate": 4.7e-06,
+      "loss": -0.08211664110422134,
+      "step": 4,
+      "step_time": 0.21982950500000698
+    },
+    {
+      "clip_ratio/high_max": 0.0006510416860692203,
+      "clip_ratio/high_mean": 0.0006510416860692203,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.0013020833721384406,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.0733331441879272,
+      "epoch": 0.20833333333333334,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 3.4188177585601807,
+      "kl": 0.018397731706500053,
+      "learning_rate": 4.600000000000001e-06,
+      "loss": 0.14518485963344574,
+      "num_tokens": 50440.0,
+      "reward": -0.2228125035762787,
+      "reward_std": 0.274566113948822,
+      "rewards/GeneratorRewardFunction/mean": -0.2228125035762787,
+      "rewards/GeneratorRewardFunction/std": 0.2745661437511444,
+      "step": 5,
+      "step_time": 12.121955963999994
+    },
+    {
+      "clip_ratio/high_max": 0.0052083334885537624,
+      "clip_ratio/high_mean": 0.0052083334885537624,
+      "clip_ratio/low_mean": 0.001953125,
+      "clip_ratio/low_min": 0.001953125,
+      "clip_ratio/region_mean": 0.0071614584885537624,
+      "entropy": 1.327884316444397,
+      "epoch": 0.25,
+      "grad_norm": 2.6934618949890137,
+      "kl": 0.01829482428729534,
+      "learning_rate": 4.5e-06,
+      "loss": -0.037107061594724655,
+      "step": 6,
+      "step_time": 0.22655425900001092
+    },
+    {
+      "clip_ratio/high_max": 0.0065104165114462376,
+      "clip_ratio/high_mean": 0.0065104165114462376,
+      "clip_ratio/low_mean": 0.0052083334885537624,
+      "clip_ratio/low_min": 0.0052083334885537624,
+      "clip_ratio/region_mean": 0.01171875,
+      "entropy": 1.2937031984329224,
+      "epoch": 0.2916666666666667,
+      "grad_norm": 2.8983969688415527,
+      "kl": 0.021993428468704224,
+      "learning_rate": 4.4e-06,
+      "loss": 0.013194209896028042,
+      "step": 7,
+      "step_time": 0.22578505299998142
+    },
+    {
+      "clip_ratio/high_max": 0.029296875,
+      "clip_ratio/high_mean": 0.029296875,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.029296875,
+      "entropy": 0.7918106913566589,
+      "epoch": 0.3333333333333333,
+      "grad_norm": 1.268328309059143,
+      "kl": 0.03330208733677864,
+      "learning_rate": 4.3e-06,
+      "loss": -0.12033451348543167,
+      "step": 8,
+      "step_time": 0.2253496229999996
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.0043877363204956,
+      "epoch": 0.375,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 1.3302299976348877,
+      "kl": 0.02948068641126156,
+      "learning_rate": 4.2000000000000004e-06,
+      "loss": -0.07878098636865616,
+      "num_tokens": 75884.0,
+      "reward": -0.12250000238418579,
+      "reward_std": 0.21038061380386353,
+      "rewards/GeneratorRewardFunction/mean": -0.12250000238418579,
+      "rewards/GeneratorRewardFunction/std": 0.21038061380386353,
+      "step": 9,
+      "step_time": 11.949294435000013
+    },
+    {
+      "clip_ratio/high_max": 0.010416666977107525,
+      "clip_ratio/high_mean": 0.010416666977107525,
+      "clip_ratio/low_mean": 0.0032552082557231188,
+      "clip_ratio/low_min": 0.0032552082557231188,
+      "clip_ratio/region_mean": 0.013671875,
+      "entropy": 1.1474007368087769,
+      "epoch": 0.4166666666666667,
+      "grad_norm": 2.1202971935272217,
+      "kl": 0.03631855919957161,
+      "learning_rate": 4.1e-06,
+      "loss": 0.03980601206421852,
+      "step": 10,
+      "step_time": 0.22497136300000875
+    },
+    {
+      "clip_ratio/high_max": 0.008463541977107525,
+      "clip_ratio/high_mean": 0.008463541977107525,
+      "clip_ratio/low_mean": 0.00390625,
+      "clip_ratio/low_min": 0.00390625,
+      "clip_ratio/region_mean": 0.012369791977107525,
+      "entropy": 0.9981658458709717,
+      "epoch": 0.4583333333333333,
+      "grad_norm": 2.094111680984497,
+      "kl": 0.049915943294763565,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 0.06331142038106918,
+      "step": 11,
+      "step_time": 0.22518257699999822
+    },
+    {
+      "clip_ratio/high_max": 0.0403645820915699,
+      "clip_ratio/high_mean": 0.0403645820915699,
+      "clip_ratio/low_mean": 0.0065104165114462376,
+      "clip_ratio/low_min": 0.0065104165114462376,
+      "clip_ratio/region_mean": 0.046875,
+      "entropy": 1.3572144508361816,
+      "epoch": 0.5,
+      "grad_norm": 2.6413395404815674,
+      "kl": 0.053241848945617676,
+      "learning_rate": 3.900000000000001e-06,
+      "loss": -0.022430941462516785,
+      "step": 12,
+      "step_time": 0.2254562319999991
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.001953125,
+      "clip_ratio/low_min": 0.001953125,
+      "clip_ratio/region_mean": 0.001953125,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.629288673400879,
+      "epoch": 0.5416666666666666,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 2.107161283493042,
+      "kl": 0.0636429563164711,
+      "learning_rate": 3.8000000000000005e-06,
+      "loss": 0.11612584441900253,
+      "num_tokens": 101112.0,
+      "reward": -0.11937499791383743,
+      "reward_std": 0.20747588574886322,
+      "rewards/GeneratorRewardFunction/mean": -0.11937499791383743,
+      "rewards/GeneratorRewardFunction/std": 0.20747590065002441,
+      "step": 13,
+      "step_time": 12.014855219999987
+    },
+    {
+      "clip_ratio/high_max": 0.0032552082557231188,
+      "clip_ratio/high_mean": 0.0032552082557231188,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.00390625,
+      "entropy": 1.4905215501785278,
+      "epoch": 0.5833333333333334,
+      "grad_norm": 1.75613272190094,
+      "kl": 0.060588542371988297,
+      "learning_rate": 3.7e-06,
+      "loss": 0.0006357845850288868,
+      "step": 14,
+      "step_time": 0.21892069699998729
+    },
+    {
+      "clip_ratio/high_max": 0.0071614584885537624,
+      "clip_ratio/high_mean": 0.0071614584885537624,
+      "clip_ratio/low_mean": 0.009114583022892475,
+      "clip_ratio/low_min": 0.009114583022892475,
+      "clip_ratio/region_mean": 0.01627604104578495,
+      "entropy": 1.2682157754898071,
+      "epoch": 0.625,
+      "grad_norm": 2.94674015045166,
+      "kl": 0.08301883935928345,
+      "learning_rate": 3.6000000000000003e-06,
+      "loss": -0.0923055037856102,
+      "step": 15,
+      "step_time": 0.21824655300000018
+    },
+    {
+      "clip_ratio/high_max": 0.008463541977107525,
+      "clip_ratio/high_mean": 0.008463541977107525,
+      "clip_ratio/low_mean": 0.01627604104578495,
+      "clip_ratio/low_min": 0.01627604104578495,
+      "clip_ratio/region_mean": 0.02473958395421505,
+      "entropy": 0.9931669235229492,
+      "epoch": 0.6666666666666666,
+      "grad_norm": 2.1109514236450195,
+      "kl": 0.09274417906999588,
+      "learning_rate": 3.5e-06,
+      "loss": -0.0218100156635046,
+      "step": 16,
+      "step_time": 0.21831970400000955
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.376638412475586,
+      "epoch": 0.7083333333333334,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.625627279281616,
+      "kl": 0.07451707124710083,
+      "learning_rate": 3.4000000000000005e-06,
+      "loss": 0.02886168472468853,
+      "num_tokens": 126436.0,
+      "reward": -0.09125000238418579,
+      "reward_std": 0.1655445545911789,
+      "rewards/GeneratorRewardFunction/mean": -0.09125000238418579,
+      "rewards/GeneratorRewardFunction/std": 0.1655445545911789,
+      "step": 17,
+      "step_time": 12.074089973000014
+    },
+    {
+      "clip_ratio/high_max": 0.0006510416860692203,
+      "clip_ratio/high_mean": 0.0006510416860692203,
+      "clip_ratio/low_mean": 0.001953125,
+      "clip_ratio/low_min": 0.001953125,
+      "clip_ratio/region_mean": 0.0026041667442768812,
+      "entropy": 0.8447733521461487,
+      "epoch": 0.75,
+      "grad_norm": 2.2611021995544434,
+      "kl": 0.08117184042930603,
+      "learning_rate": 3.3000000000000006e-06,
+      "loss": -0.003659568028524518,
+      "step": 18,
+      "step_time": 0.22418350800001008
+    },
+    {
+      "clip_ratio/high_max": 0.0065104165114462376,
+      "clip_ratio/high_mean": 0.0065104165114462376,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.0071614584885537624,
+      "entropy": 0.9943304061889648,
+      "epoch": 0.7916666666666666,
+      "grad_norm": 1.5852197408676147,
+      "kl": 0.08660884946584702,
+      "learning_rate": 3.2000000000000003e-06,
+      "loss": -0.10765092819929123,
+      "step": 19,
+      "step_time": 0.224853281999998
+    },
+    {
+      "clip_ratio/high_max": 0.008463541977107525,
+      "clip_ratio/high_mean": 0.008463541977107525,
+      "clip_ratio/low_mean": 0.02213541604578495,
+      "clip_ratio/low_min": 0.02213541604578495,
+      "clip_ratio/region_mean": 0.03059895895421505,
+      "entropy": 1.2907896041870117,
+      "epoch": 0.8333333333333334,
+      "grad_norm": 2.97239089012146,
+      "kl": 0.08734595775604248,
+      "learning_rate": 3.1000000000000004e-06,
+      "loss": 0.08410018682479858,
+      "step": 20,
+      "step_time": 0.22594529400001306
+    },
+    {
+      "clip_ratio/high_max": 0.0026041667442768812,
+      "clip_ratio/high_mean": 0.0026041667442768812,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0026041667442768812,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.3083521127700806,
+      "epoch": 0.875,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 2.6607460975646973,
+      "kl": 0.08000912517309189,
+      "learning_rate": 3e-06,
+      "loss": -0.11818201094865799,
+      "num_tokens": 151596.0,
+      "reward": -0.22624999284744263,
+      "reward_std": 0.2758048474788666,
+      "rewards/GeneratorRewardFunction/mean": -0.22624999284744263,
+      "rewards/GeneratorRewardFunction/std": 0.27580487728118896,
+      "step": 21,
+      "step_time": 11.957008280999986
+    },
+    {
+      "clip_ratio/high_max": 0.0013020833721384406,
+      "clip_ratio/high_mean": 0.0013020833721384406,
+      "clip_ratio/low_mean": 0.0013020833721384406,
+      "clip_ratio/low_min": 0.0013020833721384406,
+      "clip_ratio/region_mean": 0.0026041667442768812,
+      "entropy": 1.2017608880996704,
+      "epoch": 0.9166666666666666,
+      "grad_norm": 0.8747857809066772,
+      "kl": 0.11337386816740036,
+      "learning_rate": 2.9e-06,
+      "loss": -0.003786882385611534,
+      "step": 22,
+      "step_time": 0.221026849999987
+    },
+    {
+      "clip_ratio/high_max": 0.0013020833721384406,
+      "clip_ratio/high_mean": 0.0013020833721384406,
+      "clip_ratio/low_mean": 0.001953125,
+      "clip_ratio/low_min": 0.001953125,
+      "clip_ratio/region_mean": 0.0032552082557231188,
+      "entropy": 1.107405662536621,
+      "epoch": 0.9583333333333334,
+      "grad_norm": 2.115562915802002,
+      "kl": 0.09759091585874557,
+      "learning_rate": 2.8000000000000003e-06,
+      "loss": 0.08772162348031998,
+      "step": 23,
+      "step_time": 0.22019210400000588
+    },
+    {
+      "clip_ratio/high_max": 0.005859375,
+      "clip_ratio/high_mean": 0.005859375,
+      "clip_ratio/low_mean": 0.00390625,
+      "clip_ratio/low_min": 0.00390625,
+      "clip_ratio/region_mean": 0.009765625,
+      "entropy": 0.7833542823791504,
+      "epoch": 1.0,
+      "grad_norm": 1.686574101448059,
+      "kl": 0.10800782591104507,
+      "learning_rate": 2.7000000000000004e-06,
+      "loss": 0.03493640199303627,
+      "step": 24,
+      "step_time": 0.22060076099998582
+    },
+    {
+      "clip_ratio/high_max": 0.0013020833721384406,
+      "clip_ratio/high_mean": 0.0013020833721384406,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0013020833721384406,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.0690312385559082,
+      "epoch": 1.0416666666666667,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.565182685852051,
+      "kl": 0.12982706725597382,
+      "learning_rate": 2.6e-06,
+      "loss": 0.12511824071407318,
+      "num_tokens": 177212.0,
+      "reward": -0.12406250089406967,
+      "reward_std": 0.19458690285682678,
+      "rewards/GeneratorRewardFunction/mean": -0.12406250089406967,
+      "rewards/GeneratorRewardFunction/std": 0.19458691775798798,
+      "step": 25,
+      "step_time": 12.125360838999995
+    },
+    {
+      "clip_ratio/high_max": 0.00390625,
+      "clip_ratio/high_mean": 0.00390625,
+      "clip_ratio/low_mean": 0.0013020833721384406,
+      "clip_ratio/low_min": 0.0013020833721384406,
+      "clip_ratio/region_mean": 0.0052083334885537624,
+      "entropy": 0.8722183108329773,
+      "epoch": 1.0833333333333333,
+      "grad_norm": 2.401808261871338,
+      "kl": 0.12285982817411423,
+      "learning_rate": 2.5e-06,
+      "loss": -0.13922104239463806,
+      "step": 26,
+      "step_time": 0.22810240500001555
+    },
+    {
+      "clip_ratio/high_max": 0.0052083334885537624,
+      "clip_ratio/high_mean": 0.0052083334885537624,
+      "clip_ratio/low_mean": 0.005859375,
+      "clip_ratio/low_min": 0.005859375,
+      "clip_ratio/region_mean": 0.011067708022892475,
+      "entropy": 1.3027639389038086,
+      "epoch": 1.125,
+      "grad_norm": 1.7678114175796509,
+      "kl": 0.10112806409597397,
+      "learning_rate": 2.4000000000000003e-06,
+      "loss": -0.0586722195148468,
+      "step": 27,
+      "step_time": 0.22764332500003093
+    },
+    {
+      "clip_ratio/high_max": 0.0065104165114462376,
+      "clip_ratio/high_mean": 0.0065104165114462376,
+      "clip_ratio/low_mean": 0.0071614584885537624,
+      "clip_ratio/low_min": 0.0071614584885537624,
+      "clip_ratio/region_mean": 0.013671875,
+      "entropy": 0.9790509343147278,
+      "epoch": 1.1666666666666667,
+      "grad_norm": 1.9319959878921509,
+      "kl": 0.11484679579734802,
+      "learning_rate": 2.3000000000000004e-06,
+      "loss": 0.07509768754243851,
+      "step": 28,
+      "step_time": 0.22808939600002986
+    },
+    {
+      "clip_ratio/high_max": 0.0006510416860692203,
+      "clip_ratio/high_mean": 0.0006510416860692203,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0006510416860692203,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 0.9768911004066467,
+      "epoch": 1.2083333333333333,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.2773685455322266,
+      "kl": 0.10255210846662521,
+      "learning_rate": 2.2e-06,
+      "loss": -0.01104909647256136,
+      "num_tokens": 202200.0,
+      "reward": -0.13343749940395355,
+      "reward_std": 0.2115633636713028,
+      "rewards/GeneratorRewardFunction/mean": -0.13343749940395355,
+      "rewards/GeneratorRewardFunction/std": 0.211563378572464,
+      "step": 29,
+      "step_time": 11.981290445000013
+    },
+    {
+      "clip_ratio/high_max": 0.0013020833721384406,
+      "clip_ratio/high_mean": 0.0013020833721384406,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 0.9071128368377686,
+      "epoch": 1.25,
+      "grad_norm": 2.377110004425049,
+      "kl": 0.11365322023630142,
+      "learning_rate": 2.1000000000000002e-06,
+      "loss": 0.07630521804094315,
+      "step": 30,
+      "step_time": 0.22243629600001213
+    },
+    {
+      "clip_ratio/high_max": 0.0065104165114462376,
+      "clip_ratio/high_mean": 0.0065104165114462376,
+      "clip_ratio/low_mean": 0.0026041667442768812,
+      "clip_ratio/low_min": 0.0026041667442768812,
+      "clip_ratio/region_mean": 0.009114583022892475,
+      "entropy": 1.3066421747207642,
+      "epoch": 1.2916666666666667,
+      "grad_norm": 2.6143717765808105,
+      "kl": 0.09395217150449753,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": -0.023749127984046936,
+      "step": 31,
+      "step_time": 0.22260347799999636
+    },
+    {
+      "clip_ratio/high_max": 0.005859375,
+      "clip_ratio/high_mean": 0.005859375,
+      "clip_ratio/low_mean": 0.001953125,
+      "clip_ratio/low_min": 0.001953125,
+      "clip_ratio/region_mean": 0.0078125,
+      "entropy": 1.2596086263656616,
+      "epoch": 1.3333333333333333,
+      "grad_norm": 1.4453171491622925,
+      "kl": 0.09631065279245377,
+      "learning_rate": 1.9000000000000002e-06,
+      "loss": -0.04094076156616211,
+      "step": 32,
+      "step_time": 0.2226373689999832
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 0.9096196293830872,
+      "epoch": 1.375,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.4287660121917725,
+      "kl": 0.15610061585903168,
+      "learning_rate": 1.8000000000000001e-06,
+      "loss": 0.060271572321653366,
+      "num_tokens": 227556.0,
+      "reward": -0.13343749940395355,
+      "reward_std": 0.19603331387043,
+      "rewards/GeneratorRewardFunction/mean": -0.13343749940395355,
+      "rewards/GeneratorRewardFunction/std": 0.19603331387043,
+      "step": 33,
+      "step_time": 12.116691536000019
+    },
+    {
+      "clip_ratio/high_max": 0.0032552082557231188,
+      "clip_ratio/high_mean": 0.0032552082557231188,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.00390625,
+      "entropy": 1.2461239099502563,
+      "epoch": 1.4166666666666667,
+      "grad_norm": 2.0638527870178223,
+      "kl": 0.11391329020261765,
+      "learning_rate": 1.7000000000000002e-06,
+      "loss": -0.012484799139201641,
+      "step": 34,
+      "step_time": 0.22859811600000057
+    },
+    {
+      "clip_ratio/high_max": 0.0065104165114462376,
+      "clip_ratio/high_mean": 0.0065104165114462376,
+      "clip_ratio/low_mean": 0.0013020833721384406,
+      "clip_ratio/low_min": 0.0013020833721384406,
+      "clip_ratio/region_mean": 0.0078125,
+      "entropy": 0.9673511385917664,
+      "epoch": 1.4583333333333333,
+      "grad_norm": 2.4296762943267822,
+      "kl": 0.1084410771727562,
+      "learning_rate": 1.6000000000000001e-06,
+      "loss": -0.05158400535583496,
+      "step": 35,
+      "step_time": 0.23024993899997526
+    },
+    {
+      "clip_ratio/high_max": 0.00390625,
+      "clip_ratio/high_mean": 0.00390625,
+      "clip_ratio/low_mean": 0.0032552082557231188,
+      "clip_ratio/low_min": 0.0032552082557231188,
+      "clip_ratio/region_mean": 0.0071614584885537624,
+      "entropy": 0.983039915561676,
+      "epoch": 1.5,
+      "grad_norm": 1.957944631576538,
+      "kl": 0.13104547560214996,
+      "learning_rate": 1.5e-06,
+      "loss": 0.004632837139070034,
+      "step": 36,
+      "step_time": 0.2287184080000202
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.426942229270935,
+      "epoch": 1.5416666666666665,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 1.2855005264282227,
+      "kl": 0.14382179081439972,
+      "learning_rate": 1.4000000000000001e-06,
+      "loss": -0.0295367781072855,
+      "num_tokens": 252640.0,
+      "reward": -0.07735294103622437,
+      "reward_std": 0.3284520208835602,
+      "rewards/GeneratorRewardFunction/mean": -0.07735294103622437,
+      "rewards/GeneratorRewardFunction/std": 0.32845205068588257,
+      "step": 37,
+      "step_time": 13.249790346999987
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0013020833721384406,
+      "clip_ratio/low_min": 0.0013020833721384406,
+      "clip_ratio/region_mean": 0.0013020833721384406,
+      "entropy": 1.1062594652175903,
+      "epoch": 1.5833333333333335,
+      "grad_norm": 1.2463343143463135,
+      "kl": 0.14914868772029877,
+      "learning_rate": 1.3e-06,
+      "loss": 0.06158822774887085,
+      "step": 38,
+      "step_time": 0.22136241700002302
+    },
+    {
+      "clip_ratio/high_max": 0.0013020833721384406,
+      "clip_ratio/high_mean": 0.0013020833721384406,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 1.3345317840576172,
+      "epoch": 1.625,
+      "grad_norm": 2.388456106185913,
+      "kl": 0.1212289109826088,
+      "learning_rate": 1.2000000000000002e-06,
+      "loss": 0.013628202490508556,
+      "step": 39,
+      "step_time": 0.22244895000000042
+    },
+    {
+      "clip_ratio/high_max": 0.0026041667442768812,
+      "clip_ratio/high_mean": 0.0026041667442768812,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.0032552082557231188,
+      "entropy": 0.9387586712837219,
+      "epoch": 1.6666666666666665,
+      "grad_norm": 2.4696860313415527,
+      "kl": 0.1326405256986618,
+      "learning_rate": 1.1e-06,
+      "loss": -0.0440821647644043,
+      "step": 40,
+      "step_time": 0.22124087500003498
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 50,
+  "num_input_tokens_seen": 252640,
+  "num_train_epochs": 3,
+  "save_steps": 10,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:31ec66b64f432daf7616434296713e432d134face96e308f2ebc175e2e26f025
+size 7249

self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "float32",
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 896,
+  "initializer_range": 0.02,
+  "intermediate_size": 4864,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000.0,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.6.2",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "5.6.2"
+}

self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7f1bb3fea31b76835f54fffde7e1eeacafdd13f1ca40601af302caf5d8275af4
+size 1976163472

self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eeb016f10d80583c7030f8924276f5af074ee87a36483989ee09deeb02394767
+size 3952509771

self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7876773bbbd765b5b540a43519e4809b559e65cdfa3f4e9508024dc7702f2f6e
+size 14645

self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3cf220ca534359fdb729d8e74ed3b0c609c54e6d591e1d2478f5521fc51fba05
+size 1465

self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
+size 11421892

self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": false,
+  "local_files_only": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "truncation_side": "left",
+  "unk_token": null
+}

self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/trainer_state.json ADDED Viewed

	@@ -0,0 +1,953 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0833333333333335,
+  "eval_steps": 500,
+  "global_step": 50,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.001953125,
+      "clip_ratio/high_mean": 0.001953125,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.001953125,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.9362258911132812,
+      "epoch": 0.041666666666666664,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 3.343350648880005,
+      "kl": 0.0005498419050127268,
+      "learning_rate": 5e-06,
+      "loss": 0.13995857536792755,
+      "num_tokens": 25244.0,
+      "reward": -0.4352343678474426,
+      "reward_std": 0.306624174118042,
+      "rewards/GeneratorRewardFunction/mean": -0.4352343678474426,
+      "rewards/GeneratorRewardFunction/std": 0.306624174118042,
+      "step": 1,
+      "step_time": 12.520110770000002
+    },
+    {
+      "clip_ratio/high_max": 0.0045572915114462376,
+      "clip_ratio/high_mean": 0.0045572915114462376,
+      "clip_ratio/low_mean": 0.0013020833721384406,
+      "clip_ratio/low_min": 0.0013020833721384406,
+      "clip_ratio/region_mean": 0.005859375,
+      "entropy": 1.2710224390029907,
+      "epoch": 0.08333333333333333,
+      "grad_norm": 2.8392181396484375,
+      "kl": 0.001467077643610537,
+      "learning_rate": 4.9000000000000005e-06,
+      "loss": -0.06676606088876724,
+      "step": 2,
+      "step_time": 0.22065511100001345
+    },
+    {
+      "clip_ratio/high_max": 0.013671875,
+      "clip_ratio/high_mean": 0.013671875,
+      "clip_ratio/low_mean": 0.014322916977107525,
+      "clip_ratio/low_min": 0.014322916977107525,
+      "clip_ratio/region_mean": 0.02799479104578495,
+      "entropy": 1.871756911277771,
+      "epoch": 0.125,
+      "grad_norm": 2.4508721828460693,
+      "kl": 0.004840313456952572,
+      "learning_rate": 4.800000000000001e-06,
+      "loss": 0.010330882854759693,
+      "step": 3,
+      "step_time": 0.21905603899998027
+    },
+    {
+      "clip_ratio/high_max": 0.01822916604578495,
+      "clip_ratio/high_mean": 0.01822916604578495,
+      "clip_ratio/low_mean": 0.010416666977107525,
+      "clip_ratio/low_min": 0.010416666977107525,
+      "clip_ratio/region_mean": 0.02864583395421505,
+      "entropy": 1.1871482133865356,
+      "epoch": 0.16666666666666666,
+      "grad_norm": 1.5818687677383423,
+      "kl": 0.005701068323105574,
+      "learning_rate": 4.7e-06,
+      "loss": -0.08211664110422134,
+      "step": 4,
+      "step_time": 0.21982950500000698
+    },
+    {
+      "clip_ratio/high_max": 0.0006510416860692203,
+      "clip_ratio/high_mean": 0.0006510416860692203,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.0013020833721384406,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.0733331441879272,
+      "epoch": 0.20833333333333334,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 3.4188177585601807,
+      "kl": 0.018397731706500053,
+      "learning_rate": 4.600000000000001e-06,
+      "loss": 0.14518485963344574,
+      "num_tokens": 50440.0,
+      "reward": -0.2228125035762787,
+      "reward_std": 0.274566113948822,
+      "rewards/GeneratorRewardFunction/mean": -0.2228125035762787,
+      "rewards/GeneratorRewardFunction/std": 0.2745661437511444,
+      "step": 5,
+      "step_time": 12.121955963999994
+    },
+    {
+      "clip_ratio/high_max": 0.0052083334885537624,
+      "clip_ratio/high_mean": 0.0052083334885537624,
+      "clip_ratio/low_mean": 0.001953125,
+      "clip_ratio/low_min": 0.001953125,
+      "clip_ratio/region_mean": 0.0071614584885537624,
+      "entropy": 1.327884316444397,
+      "epoch": 0.25,
+      "grad_norm": 2.6934618949890137,
+      "kl": 0.01829482428729534,
+      "learning_rate": 4.5e-06,
+      "loss": -0.037107061594724655,
+      "step": 6,
+      "step_time": 0.22655425900001092
+    },
+    {
+      "clip_ratio/high_max": 0.0065104165114462376,
+      "clip_ratio/high_mean": 0.0065104165114462376,
+      "clip_ratio/low_mean": 0.0052083334885537624,
+      "clip_ratio/low_min": 0.0052083334885537624,
+      "clip_ratio/region_mean": 0.01171875,
+      "entropy": 1.2937031984329224,
+      "epoch": 0.2916666666666667,
+      "grad_norm": 2.8983969688415527,
+      "kl": 0.021993428468704224,
+      "learning_rate": 4.4e-06,
+      "loss": 0.013194209896028042,
+      "step": 7,
+      "step_time": 0.22578505299998142
+    },
+    {
+      "clip_ratio/high_max": 0.029296875,
+      "clip_ratio/high_mean": 0.029296875,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.029296875,
+      "entropy": 0.7918106913566589,
+      "epoch": 0.3333333333333333,
+      "grad_norm": 1.268328309059143,
+      "kl": 0.03330208733677864,
+      "learning_rate": 4.3e-06,
+      "loss": -0.12033451348543167,
+      "step": 8,
+      "step_time": 0.2253496229999996
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.0043877363204956,
+      "epoch": 0.375,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 1.3302299976348877,
+      "kl": 0.02948068641126156,
+      "learning_rate": 4.2000000000000004e-06,
+      "loss": -0.07878098636865616,
+      "num_tokens": 75884.0,
+      "reward": -0.12250000238418579,
+      "reward_std": 0.21038061380386353,
+      "rewards/GeneratorRewardFunction/mean": -0.12250000238418579,
+      "rewards/GeneratorRewardFunction/std": 0.21038061380386353,
+      "step": 9,
+      "step_time": 11.949294435000013
+    },
+    {
+      "clip_ratio/high_max": 0.010416666977107525,
+      "clip_ratio/high_mean": 0.010416666977107525,
+      "clip_ratio/low_mean": 0.0032552082557231188,
+      "clip_ratio/low_min": 0.0032552082557231188,
+      "clip_ratio/region_mean": 0.013671875,
+      "entropy": 1.1474007368087769,
+      "epoch": 0.4166666666666667,
+      "grad_norm": 2.1202971935272217,
+      "kl": 0.03631855919957161,
+      "learning_rate": 4.1e-06,
+      "loss": 0.03980601206421852,
+      "step": 10,
+      "step_time": 0.22497136300000875
+    },
+    {
+      "clip_ratio/high_max": 0.008463541977107525,
+      "clip_ratio/high_mean": 0.008463541977107525,
+      "clip_ratio/low_mean": 0.00390625,
+      "clip_ratio/low_min": 0.00390625,
+      "clip_ratio/region_mean": 0.012369791977107525,
+      "entropy": 0.9981658458709717,
+      "epoch": 0.4583333333333333,
+      "grad_norm": 2.094111680984497,
+      "kl": 0.049915943294763565,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 0.06331142038106918,
+      "step": 11,
+      "step_time": 0.22518257699999822
+    },
+    {
+      "clip_ratio/high_max": 0.0403645820915699,
+      "clip_ratio/high_mean": 0.0403645820915699,
+      "clip_ratio/low_mean": 0.0065104165114462376,
+      "clip_ratio/low_min": 0.0065104165114462376,
+      "clip_ratio/region_mean": 0.046875,
+      "entropy": 1.3572144508361816,
+      "epoch": 0.5,
+      "grad_norm": 2.6413395404815674,
+      "kl": 0.053241848945617676,
+      "learning_rate": 3.900000000000001e-06,
+      "loss": -0.022430941462516785,
+      "step": 12,
+      "step_time": 0.2254562319999991
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.001953125,
+      "clip_ratio/low_min": 0.001953125,
+      "clip_ratio/region_mean": 0.001953125,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.629288673400879,
+      "epoch": 0.5416666666666666,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 2.107161283493042,
+      "kl": 0.0636429563164711,
+      "learning_rate": 3.8000000000000005e-06,
+      "loss": 0.11612584441900253,
+      "num_tokens": 101112.0,
+      "reward": -0.11937499791383743,
+      "reward_std": 0.20747588574886322,
+      "rewards/GeneratorRewardFunction/mean": -0.11937499791383743,
+      "rewards/GeneratorRewardFunction/std": 0.20747590065002441,
+      "step": 13,
+      "step_time": 12.014855219999987
+    },
+    {
+      "clip_ratio/high_max": 0.0032552082557231188,
+      "clip_ratio/high_mean": 0.0032552082557231188,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.00390625,
+      "entropy": 1.4905215501785278,
+      "epoch": 0.5833333333333334,
+      "grad_norm": 1.75613272190094,
+      "kl": 0.060588542371988297,
+      "learning_rate": 3.7e-06,
+      "loss": 0.0006357845850288868,
+      "step": 14,
+      "step_time": 0.21892069699998729
+    },
+    {
+      "clip_ratio/high_max": 0.0071614584885537624,
+      "clip_ratio/high_mean": 0.0071614584885537624,
+      "clip_ratio/low_mean": 0.009114583022892475,
+      "clip_ratio/low_min": 0.009114583022892475,
+      "clip_ratio/region_mean": 0.01627604104578495,
+      "entropy": 1.2682157754898071,
+      "epoch": 0.625,
+      "grad_norm": 2.94674015045166,
+      "kl": 0.08301883935928345,
+      "learning_rate": 3.6000000000000003e-06,
+      "loss": -0.0923055037856102,
+      "step": 15,
+      "step_time": 0.21824655300000018
+    },
+    {
+      "clip_ratio/high_max": 0.008463541977107525,
+      "clip_ratio/high_mean": 0.008463541977107525,
+      "clip_ratio/low_mean": 0.01627604104578495,
+      "clip_ratio/low_min": 0.01627604104578495,
+      "clip_ratio/region_mean": 0.02473958395421505,
+      "entropy": 0.9931669235229492,
+      "epoch": 0.6666666666666666,
+      "grad_norm": 2.1109514236450195,
+      "kl": 0.09274417906999588,
+      "learning_rate": 3.5e-06,
+      "loss": -0.0218100156635046,
+      "step": 16,
+      "step_time": 0.21831970400000955
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.376638412475586,
+      "epoch": 0.7083333333333334,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.625627279281616,
+      "kl": 0.07451707124710083,
+      "learning_rate": 3.4000000000000005e-06,
+      "loss": 0.02886168472468853,
+      "num_tokens": 126436.0,
+      "reward": -0.09125000238418579,
+      "reward_std": 0.1655445545911789,
+      "rewards/GeneratorRewardFunction/mean": -0.09125000238418579,
+      "rewards/GeneratorRewardFunction/std": 0.1655445545911789,
+      "step": 17,
+      "step_time": 12.074089973000014
+    },
+    {
+      "clip_ratio/high_max": 0.0006510416860692203,
+      "clip_ratio/high_mean": 0.0006510416860692203,
+      "clip_ratio/low_mean": 0.001953125,
+      "clip_ratio/low_min": 0.001953125,
+      "clip_ratio/region_mean": 0.0026041667442768812,
+      "entropy": 0.8447733521461487,
+      "epoch": 0.75,
+      "grad_norm": 2.2611021995544434,
+      "kl": 0.08117184042930603,
+      "learning_rate": 3.3000000000000006e-06,
+      "loss": -0.003659568028524518,
+      "step": 18,
+      "step_time": 0.22418350800001008
+    },
+    {
+      "clip_ratio/high_max": 0.0065104165114462376,
+      "clip_ratio/high_mean": 0.0065104165114462376,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.0071614584885537624,
+      "entropy": 0.9943304061889648,
+      "epoch": 0.7916666666666666,
+      "grad_norm": 1.5852197408676147,
+      "kl": 0.08660884946584702,
+      "learning_rate": 3.2000000000000003e-06,
+      "loss": -0.10765092819929123,
+      "step": 19,
+      "step_time": 0.224853281999998
+    },
+    {
+      "clip_ratio/high_max": 0.008463541977107525,
+      "clip_ratio/high_mean": 0.008463541977107525,
+      "clip_ratio/low_mean": 0.02213541604578495,
+      "clip_ratio/low_min": 0.02213541604578495,
+      "clip_ratio/region_mean": 0.03059895895421505,
+      "entropy": 1.2907896041870117,
+      "epoch": 0.8333333333333334,
+      "grad_norm": 2.97239089012146,
+      "kl": 0.08734595775604248,
+      "learning_rate": 3.1000000000000004e-06,
+      "loss": 0.08410018682479858,
+      "step": 20,
+      "step_time": 0.22594529400001306
+    },
+    {
+      "clip_ratio/high_max": 0.0026041667442768812,
+      "clip_ratio/high_mean": 0.0026041667442768812,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0026041667442768812,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.3083521127700806,
+      "epoch": 0.875,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 2.6607460975646973,
+      "kl": 0.08000912517309189,
+      "learning_rate": 3e-06,
+      "loss": -0.11818201094865799,
+      "num_tokens": 151596.0,
+      "reward": -0.22624999284744263,
+      "reward_std": 0.2758048474788666,
+      "rewards/GeneratorRewardFunction/mean": -0.22624999284744263,
+      "rewards/GeneratorRewardFunction/std": 0.27580487728118896,
+      "step": 21,
+      "step_time": 11.957008280999986
+    },
+    {
+      "clip_ratio/high_max": 0.0013020833721384406,
+      "clip_ratio/high_mean": 0.0013020833721384406,
+      "clip_ratio/low_mean": 0.0013020833721384406,
+      "clip_ratio/low_min": 0.0013020833721384406,
+      "clip_ratio/region_mean": 0.0026041667442768812,
+      "entropy": 1.2017608880996704,
+      "epoch": 0.9166666666666666,
+      "grad_norm": 0.8747857809066772,
+      "kl": 0.11337386816740036,
+      "learning_rate": 2.9e-06,
+      "loss": -0.003786882385611534,
+      "step": 22,
+      "step_time": 0.221026849999987
+    },
+    {
+      "clip_ratio/high_max": 0.0013020833721384406,
+      "clip_ratio/high_mean": 0.0013020833721384406,
+      "clip_ratio/low_mean": 0.001953125,
+      "clip_ratio/low_min": 0.001953125,
+      "clip_ratio/region_mean": 0.0032552082557231188,
+      "entropy": 1.107405662536621,
+      "epoch": 0.9583333333333334,
+      "grad_norm": 2.115562915802002,
+      "kl": 0.09759091585874557,
+      "learning_rate": 2.8000000000000003e-06,
+      "loss": 0.08772162348031998,
+      "step": 23,
+      "step_time": 0.22019210400000588
+    },
+    {
+      "clip_ratio/high_max": 0.005859375,
+      "clip_ratio/high_mean": 0.005859375,
+      "clip_ratio/low_mean": 0.00390625,
+      "clip_ratio/low_min": 0.00390625,
+      "clip_ratio/region_mean": 0.009765625,
+      "entropy": 0.7833542823791504,
+      "epoch": 1.0,
+      "grad_norm": 1.686574101448059,
+      "kl": 0.10800782591104507,
+      "learning_rate": 2.7000000000000004e-06,
+      "loss": 0.03493640199303627,
+      "step": 24,
+      "step_time": 0.22060076099998582
+    },
+    {
+      "clip_ratio/high_max": 0.0013020833721384406,
+      "clip_ratio/high_mean": 0.0013020833721384406,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0013020833721384406,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.0690312385559082,
+      "epoch": 1.0416666666666667,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.565182685852051,
+      "kl": 0.12982706725597382,
+      "learning_rate": 2.6e-06,
+      "loss": 0.12511824071407318,
+      "num_tokens": 177212.0,
+      "reward": -0.12406250089406967,
+      "reward_std": 0.19458690285682678,
+      "rewards/GeneratorRewardFunction/mean": -0.12406250089406967,
+      "rewards/GeneratorRewardFunction/std": 0.19458691775798798,
+      "step": 25,
+      "step_time": 12.125360838999995
+    },
+    {
+      "clip_ratio/high_max": 0.00390625,
+      "clip_ratio/high_mean": 0.00390625,
+      "clip_ratio/low_mean": 0.0013020833721384406,
+      "clip_ratio/low_min": 0.0013020833721384406,
+      "clip_ratio/region_mean": 0.0052083334885537624,
+      "entropy": 0.8722183108329773,
+      "epoch": 1.0833333333333333,
+      "grad_norm": 2.401808261871338,
+      "kl": 0.12285982817411423,
+      "learning_rate": 2.5e-06,
+      "loss": -0.13922104239463806,
+      "step": 26,
+      "step_time": 0.22810240500001555
+    },
+    {
+      "clip_ratio/high_max": 0.0052083334885537624,
+      "clip_ratio/high_mean": 0.0052083334885537624,
+      "clip_ratio/low_mean": 0.005859375,
+      "clip_ratio/low_min": 0.005859375,
+      "clip_ratio/region_mean": 0.011067708022892475,
+      "entropy": 1.3027639389038086,
+      "epoch": 1.125,
+      "grad_norm": 1.7678114175796509,
+      "kl": 0.10112806409597397,
+      "learning_rate": 2.4000000000000003e-06,
+      "loss": -0.0586722195148468,
+      "step": 27,
+      "step_time": 0.22764332500003093
+    },
+    {
+      "clip_ratio/high_max": 0.0065104165114462376,
+      "clip_ratio/high_mean": 0.0065104165114462376,
+      "clip_ratio/low_mean": 0.0071614584885537624,
+      "clip_ratio/low_min": 0.0071614584885537624,
+      "clip_ratio/region_mean": 0.013671875,
+      "entropy": 0.9790509343147278,
+      "epoch": 1.1666666666666667,
+      "grad_norm": 1.9319959878921509,
+      "kl": 0.11484679579734802,
+      "learning_rate": 2.3000000000000004e-06,
+      "loss": 0.07509768754243851,
+      "step": 28,
+      "step_time": 0.22808939600002986
+    },
+    {
+      "clip_ratio/high_max": 0.0006510416860692203,
+      "clip_ratio/high_mean": 0.0006510416860692203,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0006510416860692203,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 0.9768911004066467,
+      "epoch": 1.2083333333333333,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.2773685455322266,
+      "kl": 0.10255210846662521,
+      "learning_rate": 2.2e-06,
+      "loss": -0.01104909647256136,
+      "num_tokens": 202200.0,
+      "reward": -0.13343749940395355,
+      "reward_std": 0.2115633636713028,
+      "rewards/GeneratorRewardFunction/mean": -0.13343749940395355,
+      "rewards/GeneratorRewardFunction/std": 0.211563378572464,
+      "step": 29,
+      "step_time": 11.981290445000013
+    },
+    {
+      "clip_ratio/high_max": 0.0013020833721384406,
+      "clip_ratio/high_mean": 0.0013020833721384406,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 0.9071128368377686,
+      "epoch": 1.25,
+      "grad_norm": 2.377110004425049,
+      "kl": 0.11365322023630142,
+      "learning_rate": 2.1000000000000002e-06,
+      "loss": 0.07630521804094315,
+      "step": 30,
+      "step_time": 0.22243629600001213
+    },
+    {
+      "clip_ratio/high_max": 0.0065104165114462376,
+      "clip_ratio/high_mean": 0.0065104165114462376,
+      "clip_ratio/low_mean": 0.0026041667442768812,
+      "clip_ratio/low_min": 0.0026041667442768812,
+      "clip_ratio/region_mean": 0.009114583022892475,
+      "entropy": 1.3066421747207642,
+      "epoch": 1.2916666666666667,
+      "grad_norm": 2.6143717765808105,
+      "kl": 0.09395217150449753,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": -0.023749127984046936,
+      "step": 31,
+      "step_time": 0.22260347799999636
+    },
+    {
+      "clip_ratio/high_max": 0.005859375,
+      "clip_ratio/high_mean": 0.005859375,
+      "clip_ratio/low_mean": 0.001953125,
+      "clip_ratio/low_min": 0.001953125,
+      "clip_ratio/region_mean": 0.0078125,
+      "entropy": 1.2596086263656616,
+      "epoch": 1.3333333333333333,
+      "grad_norm": 1.4453171491622925,
+      "kl": 0.09631065279245377,
+      "learning_rate": 1.9000000000000002e-06,
+      "loss": -0.04094076156616211,
+      "step": 32,
+      "step_time": 0.2226373689999832
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 0.9096196293830872,
+      "epoch": 1.375,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.4287660121917725,
+      "kl": 0.15610061585903168,
+      "learning_rate": 1.8000000000000001e-06,
+      "loss": 0.060271572321653366,
+      "num_tokens": 227556.0,
+      "reward": -0.13343749940395355,
+      "reward_std": 0.19603331387043,
+      "rewards/GeneratorRewardFunction/mean": -0.13343749940395355,
+      "rewards/GeneratorRewardFunction/std": 0.19603331387043,
+      "step": 33,
+      "step_time": 12.116691536000019
+    },
+    {
+      "clip_ratio/high_max": 0.0032552082557231188,
+      "clip_ratio/high_mean": 0.0032552082557231188,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.00390625,
+      "entropy": 1.2461239099502563,
+      "epoch": 1.4166666666666667,
+      "grad_norm": 2.0638527870178223,
+      "kl": 0.11391329020261765,
+      "learning_rate": 1.7000000000000002e-06,
+      "loss": -0.012484799139201641,
+      "step": 34,
+      "step_time": 0.22859811600000057
+    },
+    {
+      "clip_ratio/high_max": 0.0065104165114462376,
+      "clip_ratio/high_mean": 0.0065104165114462376,
+      "clip_ratio/low_mean": 0.0013020833721384406,
+      "clip_ratio/low_min": 0.0013020833721384406,
+      "clip_ratio/region_mean": 0.0078125,
+      "entropy": 0.9673511385917664,
+      "epoch": 1.4583333333333333,
+      "grad_norm": 2.4296762943267822,
+      "kl": 0.1084410771727562,
+      "learning_rate": 1.6000000000000001e-06,
+      "loss": -0.05158400535583496,
+      "step": 35,
+      "step_time": 0.23024993899997526
+    },
+    {
+      "clip_ratio/high_max": 0.00390625,
+      "clip_ratio/high_mean": 0.00390625,
+      "clip_ratio/low_mean": 0.0032552082557231188,
+      "clip_ratio/low_min": 0.0032552082557231188,
+      "clip_ratio/region_mean": 0.0071614584885537624,
+      "entropy": 0.983039915561676,
+      "epoch": 1.5,
+      "grad_norm": 1.957944631576538,
+      "kl": 0.13104547560214996,
+      "learning_rate": 1.5e-06,
+      "loss": 0.004632837139070034,
+      "step": 36,
+      "step_time": 0.2287184080000202
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.426942229270935,
+      "epoch": 1.5416666666666665,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 1.2855005264282227,
+      "kl": 0.14382179081439972,
+      "learning_rate": 1.4000000000000001e-06,
+      "loss": -0.0295367781072855,
+      "num_tokens": 252640.0,
+      "reward": -0.07735294103622437,
+      "reward_std": 0.3284520208835602,
+      "rewards/GeneratorRewardFunction/mean": -0.07735294103622437,
+      "rewards/GeneratorRewardFunction/std": 0.32845205068588257,
+      "step": 37,
+      "step_time": 13.249790346999987
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0013020833721384406,
+      "clip_ratio/low_min": 0.0013020833721384406,
+      "clip_ratio/region_mean": 0.0013020833721384406,
+      "entropy": 1.1062594652175903,
+      "epoch": 1.5833333333333335,
+      "grad_norm": 1.2463343143463135,
+      "kl": 0.14914868772029877,
+      "learning_rate": 1.3e-06,
+      "loss": 0.06158822774887085,
+      "step": 38,
+      "step_time": 0.22136241700002302
+    },
+    {
+      "clip_ratio/high_max": 0.0013020833721384406,
+      "clip_ratio/high_mean": 0.0013020833721384406,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 1.3345317840576172,
+      "epoch": 1.625,
+      "grad_norm": 2.388456106185913,
+      "kl": 0.1212289109826088,
+      "learning_rate": 1.2000000000000002e-06,
+      "loss": 0.013628202490508556,
+      "step": 39,
+      "step_time": 0.22244895000000042
+    },
+    {
+      "clip_ratio/high_max": 0.0026041667442768812,
+      "clip_ratio/high_mean": 0.0026041667442768812,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.0032552082557231188,
+      "entropy": 0.9387586712837219,
+      "epoch": 1.6666666666666665,
+      "grad_norm": 2.4696860313415527,
+      "kl": 0.1326405256986618,
+      "learning_rate": 1.1e-06,
+      "loss": -0.0440821647644043,
+      "step": 40,
+      "step_time": 0.22124087500003498
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.2471710443496704,
+      "epoch": 1.7083333333333335,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 2.290759801864624,
+      "kl": 0.12401092797517776,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 0.05721667408943176,
+      "num_tokens": 277896.0,
+      "reward": -0.11687500029802322,
+      "reward_std": 0.1675596982240677,
+      "rewards/GeneratorRewardFunction/mean": -0.11687500029802322,
+      "rewards/GeneratorRewardFunction/std": 0.1675596833229065,
+      "step": 41,
+      "step_time": 12.131979827999999
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.2584272623062134,
+      "epoch": 1.75,
+      "grad_norm": 0.04703531414270401,
+      "kl": 0.11258962005376816,
+      "learning_rate": 9.000000000000001e-07,
+      "loss": 0.001895441091619432,
+      "step": 42,
+      "step_time": 0.2273904870000365
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.1508691310882568,
+      "epoch": 1.7916666666666665,
+      "grad_norm": 1.4680578708648682,
+      "kl": 0.1344568133354187,
+      "learning_rate": 8.000000000000001e-07,
+      "loss": -0.10274048894643784,
+      "step": 43,
+      "step_time": 0.22648300899999185
+    },
+    {
+      "clip_ratio/high_max": 0.0006510416860692203,
+      "clip_ratio/high_mean": 0.0006510416860692203,
+      "clip_ratio/low_mean": 0.0013020833721384406,
+      "clip_ratio/low_min": 0.0013020833721384406,
+      "clip_ratio/region_mean": 0.001953125,
+      "entropy": 1.0707014799118042,
+      "epoch": 1.8333333333333335,
+      "grad_norm": 2.05830717086792,
+      "kl": 0.14121703803539276,
+      "learning_rate": 7.000000000000001e-07,
+      "loss": 0.045286085456609726,
+      "step": 44,
+      "step_time": 0.22805016099999875
+    },
+    {
+      "clip_ratio/high_max": 0.0013020833721384406,
+      "clip_ratio/high_mean": 0.0013020833721384406,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0013020833721384406,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.4754749536514282,
+      "epoch": 1.875,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 2.129523277282715,
+      "kl": 0.15608109533786774,
+      "learning_rate": 6.000000000000001e-07,
+      "loss": 0.06181947514414787,
+      "num_tokens": 303192.0,
+      "reward": -0.04625000059604645,
+      "reward_std": 0.013964240439236164,
+      "rewards/GeneratorRewardFunction/mean": -0.04625000059604645,
+      "rewards/GeneratorRewardFunction/std": 0.013964240439236164,
+      "step": 45,
+      "step_time": 12.076911616000018
+    },
+    {
+      "clip_ratio/high_max": 0.0026041667442768812,
+      "clip_ratio/high_mean": 0.0026041667442768812,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0026041667442768812,
+      "entropy": 1.0166983604431152,
+      "epoch": 1.9166666666666665,
+      "grad_norm": 2.7428877353668213,
+      "kl": 0.16378425061702728,
+      "learning_rate": 5.000000000000001e-07,
+      "loss": -0.17563432455062866,
+      "step": 46,
+      "step_time": 0.2192206379999675
+    },
+    {
+      "clip_ratio/high_max": 0.0006510416860692203,
+      "clip_ratio/high_mean": 0.0006510416860692203,
+      "clip_ratio/low_mean": 0.001953125,
+      "clip_ratio/low_min": 0.001953125,
+      "clip_ratio/region_mean": 0.0026041667442768812,
+      "entropy": 1.2661925554275513,
+      "epoch": 1.9583333333333335,
+      "grad_norm": 2.3470659255981445,
+      "kl": 0.13440369069576263,
+      "learning_rate": 4.0000000000000003e-07,
+      "loss": 0.08357550948858261,
+      "step": 47,
+      "step_time": 0.21930876000004673
+    },
+    {
+      "clip_ratio/high_max": 0.0032552082557231188,
+      "clip_ratio/high_mean": 0.0032552082557231188,
+      "clip_ratio/low_mean": 0.0006510416860692203,
+      "clip_ratio/low_min": 0.0006510416860692203,
+      "clip_ratio/region_mean": 0.00390625,
+      "entropy": 0.8933156132698059,
+      "epoch": 2.0,
+      "grad_norm": 1.1481467485427856,
+      "kl": 0.17724938690662384,
+      "learning_rate": 3.0000000000000004e-07,
+      "loss": 0.031690943986177444,
+      "step": 48,
+      "step_time": 0.21845309399998314
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 384.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 384.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.071245551109314,
+      "epoch": 2.0416666666666665,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 1.4133681058883667,
+      "kl": 0.16416768729686737,
+      "learning_rate": 2.0000000000000002e-07,
+      "loss": -0.07582488656044006,
+      "num_tokens": 328804.0,
+      "reward": -0.05125000327825546,
+      "reward_std": 0.015329709276556969,
+      "rewards/GeneratorRewardFunction/mean": -0.05125000327825546,
+      "rewards/GeneratorRewardFunction/std": 0.015329709276556969,
+      "step": 49,
+      "step_time": 12.023586194000018
+    },
+    {
+      "clip_ratio/high_max": 0.0032552082557231188,
+      "clip_ratio/high_mean": 0.0032552082557231188,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0032552082557231188,
+      "entropy": 1.2276629209518433,
+      "epoch": 2.0833333333333335,
+      "grad_norm": 2.9151129722595215,
+      "kl": 0.150254487991333,
+      "learning_rate": 1.0000000000000001e-07,
+      "loss": -0.024596773087978363,
+      "step": 50,
+      "step_time": 0.22950664599994752
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 50,
+  "num_input_tokens_seen": 328804,
+  "num_train_epochs": 3,
+  "save_steps": 10,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:31ec66b64f432daf7616434296713e432d134face96e308f2ebc175e2e26f025
+size 7249

self_play_hf_a10g_train/round_001/generator_train/final_model/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

self_play_hf_a10g_train/round_001/generator_train/final_model/config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "float32",
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 896,
+  "initializer_range": 0.02,
+  "intermediate_size": 4864,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000.0,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.6.2",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

self_play_hf_a10g_train/round_001/generator_train/final_model/generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "5.6.2"
+}

self_play_hf_a10g_train/round_001/generator_train/final_model/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7f1bb3fea31b76835f54fffde7e1eeacafdd13f1ca40601af302caf5d8275af4
+size 1976163472

self_play_hf_a10g_train/round_001/generator_train/final_model/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
+size 11421892

self_play_hf_a10g_train/round_001/generator_train/final_model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": false,
+  "local_files_only": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "truncation_side": "left",
+  "unk_token": null
+}

self_play_hf_a10g_train/round_001/generator_train/final_model/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:31ec66b64f432daf7616434296713e432d134face96e308f2ebc175e2e26f025
+size 7249