Siddeshwar1625 committed
Commit 6cb9f23 · verified · 1 parent: f413b50

Delete self_play_hf_a10g_train
Files changed (30)
  1. self_play_hf_a10g_train/round_001/generator_train/README.md +0 -67
  2. self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/chat_template.jinja +0 -54
  3. self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/config.json +0 -57
  4. self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/generation_config.json +0 -13
  5. self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/model.safetensors +0 -3
  6. self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/optimizer.pt +0 -3
  7. self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/rng_state.pth +0 -3
  8. self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/scheduler.pt +0 -3
  9. self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/tokenizer.json +0 -3
  10. self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/tokenizer_config.json +0 -32
  11. self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/trainer_state.json +0 -764
  12. self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/training_args.bin +0 -3
  13. self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/chat_template.jinja +0 -54
  14. self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/config.json +0 -57
  15. self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/generation_config.json +0 -13
  16. self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/model.safetensors +0 -3
  17. self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/optimizer.pt +0 -3
  18. self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/rng_state.pth +0 -3
  19. self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/scheduler.pt +0 -3
  20. self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/tokenizer.json +0 -3
  21. self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/tokenizer_config.json +0 -32
  22. self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/trainer_state.json +0 -953
  23. self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/training_args.bin +0 -3
  24. self_play_hf_a10g_train/round_001/generator_train/final_model/chat_template.jinja +0 -54
  25. self_play_hf_a10g_train/round_001/generator_train/final_model/config.json +0 -57
  26. self_play_hf_a10g_train/round_001/generator_train/final_model/generation_config.json +0 -13
  27. self_play_hf_a10g_train/round_001/generator_train/final_model/model.safetensors +0 -3
  28. self_play_hf_a10g_train/round_001/generator_train/final_model/tokenizer.json +0 -3
  29. self_play_hf_a10g_train/round_001/generator_train/final_model/tokenizer_config.json +0 -32
  30. self_play_hf_a10g_train/round_001/generator_train/final_model/training_args.bin +0 -3
self_play_hf_a10g_train/round_001/generator_train/README.md DELETED
@@ -1,67 +0,0 @@
- ---
- base_model: Qwen/Qwen2.5-0.5B-Instruct
- library_name: transformers
- model_name: generator_train
- tags:
- - generated_from_trainer
- - grpo
- - trl
- licence: license
- ---
-
- # Model Card for generator_train
-
- This model is a fine-tuned version of [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct).
- It has been trained using [TRL](https://github.com/huggingface/trl).
-
- ## Quick start
-
- ```python
- from transformers import pipeline
-
- question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
- generator = pipeline("text-generation", model="None", device="cuda")
- output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
- print(output["generated_text"])
- ```
-
- ## Training procedure
-
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/siddeshwar2004-international-institute-of-information-te/osint-self-play-train/runs/d6lveb1e)
-
-
-
- This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
-
- ### Framework versions
-
- - TRL: 1.2.0
- - Transformers: 5.6.2
- - Pytorch: 2.11.0
- - Datasets: 4.8.4
- - Tokenizers: 0.22.2
-
- ## Citations
-
- Cite GRPO as:
-
- ```bibtex
- @article{shao2024deepseekmath,
- title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
- author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
- year = 2024,
- eprint = {arXiv:2402.03300},
- }
- ```
-
- Cite TRL as:
-
- ```bibtex
- @software{vonwerra2020trl,
- title = {{TRL: Transformers Reinforcement Learning}},
- author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
- license = {Apache-2.0},
- url = {https://github.com/huggingface/trl},
- year = {2020}
- }
- ```
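The deleted model card notes the model was trained with GRPO. The core of GRPO is sampling a group of completions per prompt and standardizing each completion's reward within its group to get an advantage. The sketch below is our own illustration of that group-relative step only, not TRL's implementation; the function name and the 1e-4 standard-deviation floor are arbitrary choices here.

```python
def grpo_advantages(rewards, group_size, eps=1e-4):
    """Group-relative advantages: standardize rewards within each group
    of `group_size` completions sampled for the same prompt."""
    advantages = []
    for start in range(0, len(rewards), group_size):
        group = rewards[start:start + group_size]
        mean = sum(group) / len(group)
        var = sum((r - mean) ** 2 for r in group) / len(group)
        std = var ** 0.5
        # eps keeps the division finite when every reward in a group is
        # identical (the "frac_reward_zero_std" case in the trainer logs)
        advantages.extend((r - mean) / (std + eps) for r in group)
    return advantages
```

A group with zero reward spread yields zero advantage for all its members, which is why the trainer state below tracks `frac_reward_zero_std`.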
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/chat_template.jinja DELETED
@@ -1,54 +0,0 @@
- {%- if tools %}
- {{- '<|im_start|>system\n' }}
- {%- if messages[0]['role'] == 'system' %}
- {{- messages[0]['content'] }}
- {%- else %}
- {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
- {%- endif %}
- {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
- {%- for tool in tools %}
- {{- "\n" }}
- {{- tool | tojson }}
- {%- endfor %}
- {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
- {%- else %}
- {%- if messages[0]['role'] == 'system' %}
- {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
- {%- else %}
- {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
- {%- endif %}
- {%- endif %}
- {%- for message in messages %}
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
- {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
- {%- elif message.role == "assistant" %}
- {{- '<|im_start|>' + message.role }}
- {%- if message.content %}
- {{- '\n' + message.content }}
- {%- endif %}
- {%- for tool_call in message.tool_calls %}
- {%- if tool_call.function is defined %}
- {%- set tool_call = tool_call.function %}
- {%- endif %}
- {{- '\n<tool_call>\n{"name": "' }}
- {{- tool_call.name }}
- {{- '", "arguments": ' }}
- {{- tool_call.arguments | tojson }}
- {{- '}\n</tool_call>' }}
- {%- endfor %}
- {{- '<|im_end|>\n' }}
- {%- elif message.role == "tool" %}
- {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
- {{- '<|im_start|>user' }}
- {%- endif %}
- {{- '\n<tool_response>\n' }}
- {{- message.content }}
- {{- '\n</tool_response>' }}
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
- {{- '<|im_end|>\n' }}
- {%- endif %}
- {%- endif %}
- {%- endfor %}
- {%- if add_generation_prompt %}
- {{- '<|im_start|>assistant\n' }}
- {%- endif %}
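The deleted template is Qwen's ChatML layout. As a readable cross-check of what it renders, here is a minimal Python re-rendering of the no-tools path only (default or explicit system message, then plain user/assistant turns); tool calls and tool responses are omitted, and the function name is our own.

```python
def render_chatml(messages, add_generation_prompt=True):
    """Simplified re-rendering of the no-tools branch of the Qwen chat
    template above. Each turn becomes <|im_start|>role\ncontent<|im_end|>."""
    if messages and messages[0]["role"] == "system":
        system = messages[0]["content"]
        messages = messages[1:]
    else:
        system = ("You are Qwen, created by Alibaba Cloud. "
                  "You are a helpful assistant.")
    out = f"<|im_start|>system\n{system}<|im_end|>\n"
    for m in messages:
        out += f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n"
    if add_generation_prompt:
        # open an assistant turn for the model to complete
        out += "<|im_start|>assistant\n"
    return out
```

In practice one would call `tokenizer.apply_chat_template(...)`, which executes the Jinja template directly; this sketch is only to make the output format concrete.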
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/config.json DELETED
@@ -1,57 +0,0 @@
- {
- "architectures": [
- "Qwen2ForCausalLM"
- ],
- "attention_dropout": 0.0,
- "bos_token_id": null,
- "dtype": "float32",
- "eos_token_id": 151645,
- "hidden_act": "silu",
- "hidden_size": 896,
- "initializer_range": 0.02,
- "intermediate_size": 4864,
- "layer_types": [
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention"
- ],
- "max_position_embeddings": 32768,
- "max_window_layers": 21,
- "model_type": "qwen2",
- "num_attention_heads": 14,
- "num_hidden_layers": 24,
- "num_key_value_heads": 2,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_parameters": {
- "rope_theta": 1000000.0,
- "rope_type": "default"
- },
- "sliding_window": null,
- "tie_word_embeddings": true,
- "transformers_version": "5.6.2",
- "use_cache": false,
- "use_sliding_window": false,
- "vocab_size": 151936
- }
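The shape fields in this config imply the base model's roughly 0.5B parameter count. The tally below is a sketch under an assumption about the standard qwen2 layout (Q/K/V projections with biases, bias-free o_proj, SwiGLU MLP with gate/up/down, two RMSNorms per layer, tied input/output embeddings); every number is taken from the config above.

```python
# Shape hyper-parameters from the deleted config.json
hidden, inter, layers = 896, 4864, 24
heads, kv_heads, vocab = 14, 2, 151936

head_dim = hidden // heads        # 64
kv_dim = kv_heads * head_dim      # 128 (grouped-query attention)

embed = vocab * hidden            # lm_head shares this (tie_word_embeddings)
attn = (hidden * hidden + hidden)           # q_proj (+ bias)
attn += 2 * (hidden * kv_dim + kv_dim)      # k_proj, v_proj (+ biases)
attn += hidden * hidden                     # o_proj (no bias)
mlp = 3 * hidden * inter                    # gate_proj + up_proj + down_proj
norms = 2 * hidden                          # two RMSNorms per layer

total = embed + layers * (attn + mlp + norms) + hidden  # + final norm
print(total)
```

At `"dtype": "float32"` this comes to `total * 4` ≈ 1.98 GB, which matches the `model.safetensors` size recorded later in this diff (1,976,163,472 bytes) up to the safetensors header.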
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/generation_config.json DELETED
@@ -1,13 +0,0 @@
- {
- "do_sample": true,
- "eos_token_id": [
- 151645,
- 151643
- ],
- "pad_token_id": 151643,
- "repetition_penalty": 1.1,
- "temperature": 0.7,
- "top_k": 20,
- "top_p": 0.8,
- "transformers_version": "5.6.2"
- }
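This generation config pins sampled decoding with temperature 0.7, top_k 20, and top_p 0.8. As an illustration of how those three knobs compose (a from-scratch sketch of the usual order of operations, not the transformers implementation), the function below builds the filtered, renormalized distribution a sampler would draw from.

```python
import math

def filter_logits(logits, temperature=0.7, top_k=20, top_p=0.8):
    """Return {token_index: probability} after temperature scaling,
    top-k filtering, then top-p (nucleus) filtering."""
    scaled = [l / temperature for l in logits]
    m = max(scaled)                          # stable softmax
    exps = [math.exp(s - m) for s in scaled]
    z = sum(exps)
    probs = sorted(((i, e / z) for i, e in enumerate(exps)),
                   key=lambda p: p[1], reverse=True)
    probs = probs[:top_k]                    # keep the k likeliest tokens
    kept, mass = [], 0.0
    for i, p in probs:                       # smallest prefix reaching top_p
        kept.append((i, p))
        mass += p
        if mass >= top_p:
            break
    z = sum(p for _, p in kept)              # renormalize survivors
    return {i: p / z for i, p in kept}
```

Lower temperature sharpens the distribution before filtering, so with these settings the model samples from a small, high-confidence nucleus.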
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:0827989c90bcaad483dee84b62c1ba69fdf377e659087667ae2a28e1992a2fc6
- size 1976163472
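This three-line stub is a Git LFS pointer file: the repository stores only the spec version, a SHA-256 object id, and the byte size, while the actual ~1.9 GB weights live in LFS storage. A minimal parser for the pointer format (key-value pairs separated by a single space):

```python
def parse_lfs_pointer(text):
    """Parse a Git LFS pointer file into a dict of its fields."""
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    fields["size"] = int(fields["size"])     # size is always bytes
    return fields

pointer = """\
version https://git-lfs.github.com/spec/v1
oid sha256:0827989c90bcaad483dee84b62c1ba69fdf377e659087667ae2a28e1992a2fc6
size 1976163472
"""
```

The same format appears for every binary in this commit (optimizer.pt, rng_state.pth, scheduler.pt, tokenizer.json), which is why each shows as a 3-line deletion.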
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/optimizer.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:ceec2f7113291001d1654c27484e896c530e9cfd6710e3ffcfdcc4fb0eeee677
- size 3952509771
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/rng_state.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b5725c96886e63d92a4804b9e1b509a94ed72b0ac9da7f6955ce8a319b258d37
- size 14645
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/scheduler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:44b7842b78ac3e944fa674f4961bef93e595fe53f0c4495643408e144ea2a074
- size 1465
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/tokenizer.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
- size 11421892
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/tokenizer_config.json DELETED
@@ -1,32 +0,0 @@
- {
- "add_prefix_space": false,
- "backend": "tokenizers",
- "bos_token": null,
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|im_end|>",
- "errors": "replace",
- "extra_special_tokens": [
- "<|im_start|>",
- "<|im_end|>",
- "<|object_ref_start|>",
- "<|object_ref_end|>",
- "<|box_start|>",
- "<|box_end|>",
- "<|quad_start|>",
- "<|quad_end|>",
- "<|vision_start|>",
- "<|vision_end|>",
- "<|vision_pad|>",
- "<|image_pad|>",
- "<|video_pad|>"
- ],
- "is_local": false,
- "local_files_only": false,
- "model_max_length": 131072,
- "pad_token": "<|endoftext|>",
- "padding_side": "left",
- "split_special_tokens": false,
- "tokenizer_class": "Qwen2Tokenizer",
- "truncation_side": "left",
- "unk_token": null
- }
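This tokenizer config sets `"padding_side": "left"`, which matters for decoder-only generation: padding on the left keeps every sequence's most recent tokens adjacent to the position being generated. A minimal sketch of what left padding does to a batch (pad id 151643, `<|endoftext|>`, per the config.json above):

```python
def pad_left(batch, pad_id=151643):
    """Left-pad variable-length token-id sequences to a common length,
    as padding_side='left' does when batching prompts for generation."""
    width = max(len(seq) for seq in batch)
    return [[pad_id] * (width - len(seq)) + list(seq) for seq in batch]
```

With right padding instead, shorter prompts would end in pad tokens and the model would be asked to continue from padding, which is why generation pipelines conventionally pad on the left.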
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/trainer_state.json DELETED
@@ -1,764 +0,0 @@
- {
- "best_global_step": null,
- "best_metric": null,
- "best_model_checkpoint": null,
- "epoch": 1.6666666666666665,
- "eval_steps": 500,
- "global_step": 40,
- "is_hyper_param_search": false,
- "is_local_process_zero": true,
- "is_world_process_zero": true,
- "log_history": [
- {
- "clip_ratio/high_max": 0.001953125,
- "clip_ratio/high_mean": 0.001953125,
- "clip_ratio/low_mean": 0.0,
- "clip_ratio/low_min": 0.0,
- "clip_ratio/region_mean": 0.001953125,
- "completions/clipped_ratio": 1.0,
- "completions/max_length": 384.0,
- "completions/max_terminated_length": 0.0,
- "completions/mean_length": 384.0,
- "completions/mean_terminated_length": 0.0,
- "completions/min_length": 384.0,
- "completions/min_terminated_length": 0.0,
- "entropy": 1.9362258911132812,
- "epoch": 0.041666666666666664,
- "frac_reward_zero_std": 0.0,
- "grad_norm": 3.3426833152770996,
- "kl": 0.0005498419050127268,
- "learning_rate": 5e-06,
- "loss": 0.13995857536792755,
- "num_tokens": 25244.0,
- "reward": -0.4352343678474426,
- "reward_std": 0.306624174118042,
- "rewards/GeneratorRewardFunction/mean": -0.4352343678474426,
- "rewards/GeneratorRewardFunction/std": 0.306624174118042,
- "step": 1,
- "step_time": 12.578062469000088
- },
- {
- "clip_ratio/high_max": 0.00390625,
- "clip_ratio/high_mean": 0.00390625,
- "clip_ratio/low_mean": 0.0026041667442768812,
- "clip_ratio/low_min": 0.0026041667442768812,
- "clip_ratio/region_mean": 0.0065104165114462376,
- "entropy": 1.2686206102371216,
- "epoch": 0.08333333333333333,
- "grad_norm": 2.8547239303588867,
- "kl": 0.001546451705507934,
- "learning_rate": 4.9000000000000005e-06,
- "loss": -0.06681232899427414,
- "step": 2,
- "step_time": 0.22036709600001814
- },
- {
- "clip_ratio/high_max": 0.012369791977107525,
- "clip_ratio/high_mean": 0.012369791977107525,
- "clip_ratio/low_mean": 0.015625,
- "clip_ratio/low_min": 0.015625,
- "clip_ratio/region_mean": 0.02799479104578495,
- "entropy": 1.8668650388717651,
- "epoch": 0.125,
- "grad_norm": 2.4686105251312256,
- "kl": 0.005345983896404505,
- "learning_rate": 4.800000000000001e-06,
- "loss": 0.010777520947158337,
- "step": 3,
- "step_time": 0.2199646679999887
- },
- {
- "clip_ratio/high_max": 0.02083333395421505,
- "clip_ratio/high_mean": 0.02083333395421505,
- "clip_ratio/low_mean": 0.010416666977107525,
- "clip_ratio/low_min": 0.010416666977107525,
- "clip_ratio/region_mean": 0.03125,
- "entropy": 1.1842881441116333,
- "epoch": 0.16666666666666666,
- "grad_norm": 1.569398045539856,
- "kl": 0.0072342646308243275,
- "learning_rate": 4.7e-06,
- "loss": -0.08198019117116928,
- "step": 4,
- "step_time": 0.2201611520000597
- },
- {
- "clip_ratio/high_max": 0.001953125,
- "clip_ratio/high_mean": 0.001953125,
- "clip_ratio/low_mean": 0.0006510416860692203,
- "clip_ratio/low_min": 0.0006510416860692203,
- "clip_ratio/region_mean": 0.0026041667442768812,
- "completions/clipped_ratio": 1.0,
- "completions/max_length": 384.0,
- "completions/max_terminated_length": 0.0,
- "completions/mean_length": 384.0,
- "completions/mean_terminated_length": 0.0,
- "completions/min_length": 384.0,
- "completions/min_terminated_length": 0.0,
- "entropy": 1.3128995895385742,
- "epoch": 0.20833333333333334,
- "frac_reward_zero_std": 0.0,
- "grad_norm": 2.202421188354492,
- "kl": 0.0129135362803936,
- "learning_rate": 4.600000000000001e-06,
- "loss": -0.0841849148273468,
- "num_tokens": 50440.0,
- "reward": -0.3341406285762787,
- "reward_std": 0.3155691623687744,
- "rewards/GeneratorRewardFunction/mean": -0.3341406285762787,
- "rewards/GeneratorRewardFunction/std": 0.3155691623687744,
- "step": 5,
- "step_time": 12.076087126999937
- },
- {
- "clip_ratio/high_max": 0.0026041667442768812,
- "clip_ratio/high_mean": 0.0026041667442768812,
- "clip_ratio/low_mean": 0.0052083334885537624,
- "clip_ratio/low_min": 0.0052083334885537624,
- "clip_ratio/region_mean": 0.0078125,
- "entropy": 1.3001914024353027,
- "epoch": 0.25,
- "grad_norm": 2.854139804840088,
- "kl": 0.01436698716133833,
- "learning_rate": 4.5e-06,
- "loss": 0.02869725041091442,
- "step": 6,
- "step_time": 0.22715311399997518
- },
- {
- "clip_ratio/high_max": 0.0071614584885537624,
- "clip_ratio/high_mean": 0.0071614584885537624,
- "clip_ratio/low_mean": 0.0071614584885537624,
- "clip_ratio/low_min": 0.0071614584885537624,
- "clip_ratio/region_mean": 0.014322916977107525,
- "entropy": 1.0331100225448608,
- "epoch": 0.2916666666666667,
- "grad_norm": 1.9297211170196533,
- "kl": 0.01791433058679104,
- "learning_rate": 4.4e-06,
- "loss": -0.028683962300419807,
- "step": 7,
- "step_time": 0.22586069900000894
- },
- {
- "clip_ratio/high_max": 0.029296875,
- "clip_ratio/high_mean": 0.029296875,
- "clip_ratio/low_mean": 0.01171875,
- "clip_ratio/low_min": 0.01171875,
- "clip_ratio/region_mean": 0.041015625,
- "entropy": 1.1462408304214478,
- "epoch": 0.3333333333333333,
- "grad_norm": 2.57124924659729,
- "kl": 0.0388585664331913,
- "learning_rate": 4.3e-06,
- "loss": 0.08592668920755386,
- "step": 8,
- "step_time": 0.22552726200001416
- },
- {
- "clip_ratio/high_max": 0.0,
- "clip_ratio/high_mean": 0.0,
- "clip_ratio/low_mean": 0.0,
- "clip_ratio/low_min": 0.0,
- "clip_ratio/region_mean": 0.0,
- "completions/clipped_ratio": 1.0,
- "completions/max_length": 384.0,
- "completions/max_terminated_length": 0.0,
- "completions/mean_length": 384.0,
- "completions/mean_terminated_length": 0.0,
- "completions/min_length": 384.0,
- "completions/min_terminated_length": 0.0,
- "entropy": 0.999983549118042,
- "epoch": 0.375,
- "frac_reward_zero_std": 0.25,
- "grad_norm": 2.0052192211151123,
- "kl": 0.030047910287976265,
- "learning_rate": 4.2000000000000004e-06,
- "loss": -0.05663062259554863,
- "num_tokens": 75884.0,
- "reward": -0.3902343511581421,
- "reward_std": 0.31722894310951233,
- "rewards/GeneratorRewardFunction/mean": -0.3902343511581421,
- "rewards/GeneratorRewardFunction/std": 0.3172289729118347,
- "step": 9,
- "step_time": 12.047747722000054
- },
- {
- "clip_ratio/high_max": 0.005859375,
- "clip_ratio/high_mean": 0.005859375,
- "clip_ratio/low_mean": 0.0006510416860692203,
- "clip_ratio/low_min": 0.0006510416860692203,
- "clip_ratio/region_mean": 0.0065104165114462376,
- "entropy": 1.6177984476089478,
- "epoch": 0.4166666666666667,
- "grad_norm": 2.137237071990967,
- "kl": 0.04101690649986267,
- "learning_rate": 4.1e-06,
- "loss": -0.02161034755408764,
- "step": 10,
- "step_time": 0.2252395229999138
- },
- {
- "clip_ratio/high_max": 0.01692708395421505,
- "clip_ratio/high_mean": 0.01692708395421505,
- "clip_ratio/low_mean": 0.0065104165114462376,
- "clip_ratio/low_min": 0.0065104165114462376,
- "clip_ratio/region_mean": 0.0234375,
- "entropy": 1.038699746131897,
- "epoch": 0.4583333333333333,
- "grad_norm": 2.672621965408325,
- "kl": 0.031740155071020126,
- "learning_rate": 4.000000000000001e-06,
- "loss": 0.056199509650468826,
- "step": 11,
- "step_time": 0.22556489999999485
- },
- {
- "clip_ratio/high_max": 0.01822916604578495,
- "clip_ratio/high_mean": 0.01822916604578495,
- "clip_ratio/low_mean": 0.010416666977107525,
- "clip_ratio/low_min": 0.010416666977107525,
- "clip_ratio/region_mean": 0.02864583395421505,
- "entropy": 1.296442985534668,
- "epoch": 0.5,
- "grad_norm": 1.4488099813461304,
- "kl": 0.04755128547549248,
- "learning_rate": 3.900000000000001e-06,
- "loss": 0.02385079860687256,
- "step": 12,
- "step_time": 0.22498397900005784
- },
- {
- "clip_ratio/high_max": 0.0013020833721384406,
- "clip_ratio/high_mean": 0.0013020833721384406,
- "clip_ratio/low_mean": 0.0,
- "clip_ratio/low_min": 0.0,
- "clip_ratio/region_mean": 0.0013020833721384406,
- "completions/clipped_ratio": 1.0,
- "completions/max_length": 384.0,
- "completions/max_terminated_length": 0.0,
- "completions/mean_length": 384.0,
- "completions/mean_terminated_length": 0.0,
- "completions/min_length": 384.0,
- "completions/min_terminated_length": 0.0,
- "entropy": 1.3575109243392944,
- "epoch": 0.5416666666666666,
- "frac_reward_zero_std": 0.0,
- "grad_norm": 2.2914443016052246,
- "kl": 0.06257984787225723,
- "learning_rate": 3.8000000000000005e-06,
- "loss": -0.10538653284311295,
- "num_tokens": 101112.0,
- "reward": -0.22843749821186066,
- "reward_std": 0.294514924287796,
- "rewards/GeneratorRewardFunction/mean": -0.22843749821186066,
- "rewards/GeneratorRewardFunction/std": 0.2945149540901184,
- "step": 13,
- "step_time": 12.01501083200003
- },
- {
- "clip_ratio/high_max": 0.001953125,
- "clip_ratio/high_mean": 0.001953125,
- "clip_ratio/low_mean": 0.0,
- "clip_ratio/low_min": 0.0,
- "clip_ratio/region_mean": 0.001953125,
- "entropy": 1.2918612957000732,
- "epoch": 0.5833333333333334,
- "grad_norm": 3.0368542671203613,
- "kl": 0.04979195073246956,
- "learning_rate": 3.7e-06,
- "loss": -0.003113487036898732,
- "step": 14,
- "step_time": 0.21806825399994523
- },
- {
- "clip_ratio/high_max": 0.0026041667442768812,
- "clip_ratio/high_mean": 0.0026041667442768812,
- "clip_ratio/low_mean": 0.005859375,
- "clip_ratio/low_min": 0.005859375,
- "clip_ratio/region_mean": 0.008463541977107525,
- "entropy": 1.1081053018569946,
- "epoch": 0.625,
- "grad_norm": 3.5923683643341064,
- "kl": 0.06817911565303802,
- "learning_rate": 3.6000000000000003e-06,
- "loss": 0.15118412673473358,
- "step": 15,
- "step_time": 0.217887520999966
- },
- {
- "clip_ratio/high_max": 0.02018229104578495,
- "clip_ratio/high_mean": 0.02018229104578495,
- "clip_ratio/low_mean": 0.0026041667442768812,
- "clip_ratio/low_min": 0.0026041667442768812,
- "clip_ratio/region_mean": 0.02278645895421505,
- "entropy": 1.0803831815719604,
- "epoch": 0.6666666666666666,
- "grad_norm": 1.789110541343689,
- "kl": 0.056480005383491516,
- "learning_rate": 3.5e-06,
- "loss": -0.03890883922576904,
- "step": 16,
- "step_time": 0.21781940799996846
- },
- {
- "clip_ratio/high_max": 0.0,
- "clip_ratio/high_mean": 0.0,
- "clip_ratio/low_mean": 0.0,
- "clip_ratio/low_min": 0.0,
- "clip_ratio/region_mean": 0.0,
- "completions/clipped_ratio": 1.0,
- "completions/max_length": 384.0,
- "completions/max_terminated_length": 0.0,
- "completions/mean_length": 384.0,
- "completions/mean_terminated_length": 0.0,
- "completions/min_length": 384.0,
- "completions/min_terminated_length": 0.0,
- "entropy": 0.8709045052528381,
- "epoch": 0.7083333333333334,
- "frac_reward_zero_std": 0.0,
- "grad_norm": 1.329393982887268,
- "kl": 0.06073950603604317,
- "learning_rate": 3.4000000000000005e-06,
- "loss": -0.11920400708913803,
- "num_tokens": 126436.0,
- "reward": -0.2240625023841858,
- "reward_std": 0.2881968021392822,
- "rewards/GeneratorRewardFunction/mean": -0.2240625023841858,
- "rewards/GeneratorRewardFunction/std": 0.2881968021392822,
- "step": 17,
- "step_time": 12.08798373600007
- },
- {
- "clip_ratio/high_max": 0.0026041667442768812,
- "clip_ratio/high_mean": 0.0026041667442768812,
- "clip_ratio/low_mean": 0.0006510416860692203,
- "clip_ratio/low_min": 0.0006510416860692203,
- "clip_ratio/region_mean": 0.0032552082557231188,
- "entropy": 1.083386778831482,
- "epoch": 0.75,
- "grad_norm": 1.343295931816101,
- "kl": 0.0919194221496582,
- "learning_rate": 3.3000000000000006e-06,
- "loss": -0.007308408617973328,
- "step": 18,
- "step_time": 0.2253850289998809
- },
- {
- "clip_ratio/high_max": 0.005859375,
- "clip_ratio/high_mean": 0.005859375,
- "clip_ratio/low_mean": 0.0026041667442768812,
- "clip_ratio/low_min": 0.0026041667442768812,
- "clip_ratio/region_mean": 0.008463541977107525,
- "entropy": 1.406662940979004,
- "epoch": 0.7916666666666666,
- "grad_norm": 3.3420534133911133,
- "kl": 0.06450249999761581,
- "learning_rate": 3.2000000000000003e-06,
- "loss": 0.12472107261419296,
- "step": 19,
- "step_time": 0.22467486499999723
- },
- {
- "clip_ratio/high_max": 0.011067708022892475,
- "clip_ratio/high_mean": 0.011067708022892475,
- "clip_ratio/low_mean": 0.0032552082557231188,
- "clip_ratio/low_min": 0.0032552082557231188,
- "clip_ratio/region_mean": 0.014322916977107525,
- "entropy": 1.6491953134536743,
- "epoch": 0.8333333333333334,
- "grad_norm": 3.3672103881835938,
- "kl": 0.0773777961730957,
- "learning_rate": 3.1000000000000004e-06,
- "loss": 0.0035695277620106936,
- "step": 20,
- "step_time": 0.22389788999998927
- },
- {
- "clip_ratio/high_max": 0.0006510416860692203,
- "clip_ratio/high_mean": 0.0006510416860692203,
- "clip_ratio/low_mean": 0.0,
- "clip_ratio/low_min": 0.0,
- "clip_ratio/region_mean": 0.0006510416860692203,
- "completions/clipped_ratio": 1.0,
- "completions/max_length": 384.0,
- "completions/max_terminated_length": 0.0,
- "completions/mean_length": 384.0,
- "completions/mean_terminated_length": 0.0,
- "completions/min_length": 384.0,
- "completions/min_terminated_length": 0.0,
- "entropy": 1.4178005456924438,
- "epoch": 0.875,
- "frac_reward_zero_std": 0.0,
- "grad_norm": 2.6333799362182617,
- "kl": 0.07392226904630661,
- "learning_rate": 3e-06,
- "loss": -0.09101495891809464,
- "num_tokens": 151596.0,
- "reward": -0.15257811546325684,
- "reward_std": 0.24854345619678497,
- "rewards/GeneratorRewardFunction/mean": -0.15257811546325684,
- "rewards/GeneratorRewardFunction/std": 0.24854345619678497,
- "step": 21,
- "step_time": 12.020297105999816
- },
- {
- "clip_ratio/high_max": 0.0006510416860692203,
- "clip_ratio/high_mean": 0.0006510416860692203,
- "clip_ratio/low_mean": 0.0013020833721384406,
- "clip_ratio/low_min": 0.0013020833721384406,
- "clip_ratio/region_mean": 0.001953125,
- "entropy": 1.2036248445510864,
- "epoch": 0.9166666666666666,
- "grad_norm": 2.1499149799346924,
- "kl": 0.0772874653339386,
- "learning_rate": 2.9e-06,
- "loss": 0.08120749890804291,
- "step": 22,
- "step_time": 0.21995178900010615
- },
- {
- "clip_ratio/high_max": 0.0032552082557231188,
- "clip_ratio/high_mean": 0.0032552082557231188,
- "clip_ratio/low_mean": 0.0032552082557231188,
- "clip_ratio/low_min": 0.0032552082557231188,
- "clip_ratio/region_mean": 0.0065104165114462376,
- "entropy": 1.1966055631637573,
- "epoch": 0.9583333333333334,
- "grad_norm": 2.0064616203308105,
- "kl": 0.07331382483243942,
- "learning_rate": 2.8000000000000003e-06,
- "loss": 0.03140506148338318,
- "step": 23,
- "step_time": 0.21996421700009705
- },
- {
- "clip_ratio/high_max": 0.011067708022892475,
- "clip_ratio/high_mean": 0.011067708022892475,
- "clip_ratio/low_mean": 0.0006510416860692203,
- "clip_ratio/low_min": 0.0006510416860692203,
- "clip_ratio/region_mean": 0.01171875,
- "entropy": 0.9102082252502441,
- "epoch": 1.0,
- "grad_norm": 1.7175334692001343,
- "kl": 0.14611481130123138,
- "learning_rate": 2.7000000000000004e-06,
- "loss": -0.021010393276810646,
- "step": 24,
- "step_time": 0.21931950600014716
- },
- {
- "clip_ratio/high_max": 0.0,
- "clip_ratio/high_mean": 0.0,
- "clip_ratio/low_mean": 0.0,
- "clip_ratio/low_min": 0.0,
- "clip_ratio/region_mean": 0.0,
- "completions/clipped_ratio": 1.0,
- "completions/max_length": 384.0,
- "completions/max_terminated_length": 0.0,
- "completions/mean_length": 384.0,
- "completions/mean_terminated_length": 0.0,
- "completions/min_length": 384.0,
- "completions/min_terminated_length": 0.0,
- "entropy": 1.9153881072998047,
- "epoch": 1.0416666666666667,
- "frac_reward_zero_std": 0.0,
- "grad_norm": 2.3460445404052734,
- "kl": 0.09710023552179337,
- "learning_rate": 2.6e-06,
- "loss": 0.015220091678202152,
- "num_tokens": 177212.0,
- "reward": -0.15656250715255737,
- "reward_std": 0.22349287569522858,
- "rewards/GeneratorRewardFunction/mean": -0.15656250715255737,
- "rewards/GeneratorRewardFunction/std": 0.22349286079406738,
- "step": 25,
- "step_time": 12.153388549000056
- },
- {
- "clip_ratio/high_max": 0.00390625,
- "clip_ratio/high_mean": 0.00390625,
- "clip_ratio/low_mean": 0.0013020833721384406,
- "clip_ratio/low_min": 0.0013020833721384406,
- "clip_ratio/region_mean": 0.0052083334885537624,
- "entropy": 1.365325927734375,
- "epoch": 1.0833333333333333,
- "grad_norm": 1.8710312843322754,
- "kl": 0.0985046848654747,
- "learning_rate": 2.5e-06,
- "loss": -0.02838735282421112,
- "step": 26,
- "step_time": 0.22659933299996737
- },
- {
- "clip_ratio/high_max": 0.008463541977107525,
- "clip_ratio/high_mean": 0.008463541977107525,
- "clip_ratio/low_mean": 0.0013020833721384406,
- "clip_ratio/low_min": 0.0013020833721384406,
- "clip_ratio/region_mean": 0.009765625,
- "entropy": 1.2517439126968384,
- "epoch": 1.125,
- "grad_norm": 2.821958303451538,
- "kl": 0.09274079650640488,
- "learning_rate": 2.4000000000000003e-06,
- "loss": -0.007298170123249292,
- "step": 27,
- "step_time": 0.22647249999999985
- },
- {
- "clip_ratio/high_max": 0.0052083334885537624,
- "clip_ratio/high_mean": 0.0052083334885537624,
- "clip_ratio/low_mean": 0.009114583022892475,
- "clip_ratio/low_min": 0.009114583022892475,
- "clip_ratio/region_mean": 0.014322916977107525,
- "entropy": 2.0579044818878174,
- "epoch": 1.1666666666666667,
- "grad_norm": 3.259742259979248,
- "kl": 0.10746321082115173,
- "learning_rate": 2.3000000000000004e-06,
- "loss": 0.021702758967876434,
- "step": 28,
- "step_time": 0.22640677999993386
- },
- {
- "clip_ratio/high_max": 0.0,
- "clip_ratio/high_mean": 0.0,
- "clip_ratio/low_mean": 0.0,
- "clip_ratio/low_min": 0.0,
- "clip_ratio/region_mean": 0.0,
- "completions/clipped_ratio": 1.0,
- "completions/max_length": 384.0,
- "completions/max_terminated_length": 0.0,
- "completions/mean_length": 384.0,
- "completions/mean_terminated_length": 0.0,
- "completions/min_length": 384.0,
- "completions/min_terminated_length": 0.0,
- "entropy": 1.3725861310958862,
- "epoch": 1.2083333333333333,
- "frac_reward_zero_std": 0.0,
- "grad_norm": 1.8806989192962646,
- "kl": 0.11961983889341354,
- "learning_rate": 2.2e-06,
- "loss": 0.07187109440565109,
- "num_tokens": 202200.0,
- "reward": -0.0561029389500618,
- "reward_std": 0.314301997423172,
- "rewards/GeneratorRewardFunction/mean": -0.0561029389500618,
- "rewards/GeneratorRewardFunction/std": 0.314301997423172,
- "step": 29,
- "step_time": 13.662849896999887
- },
- {
- "clip_ratio/high_max": 0.001953125,
- "clip_ratio/high_mean": 0.001953125,
- "clip_ratio/low_mean": 0.0013020833721384406,
- "clip_ratio/low_min": 0.0013020833721384406,
- "clip_ratio/region_mean": 0.0032552082557231188,
- "entropy": 1.2213298082351685,
- "epoch": 1.25,
- "grad_norm": 2.1918396949768066,
- "kl": 0.12398240715265274,
- "learning_rate": 2.1000000000000002e-06,
- "loss": -0.052896980196237564,
- "step": 30,
- "step_time": 0.2210835590001352
- },
- {
- "clip_ratio/high_max": 0.001953125,
- "clip_ratio/high_mean": 0.001953125,
- "clip_ratio/low_mean": 0.001953125,
- "clip_ratio/low_min": 0.001953125,
- "clip_ratio/region_mean": 0.00390625,
- "entropy": 1.2683231830596924,
- "epoch": 1.2916666666666667,
- "grad_norm": 2.524726390838623,
- "kl": 0.14297537505626678,
- "learning_rate": 2.0000000000000003e-06,
- "loss": 0.12745414674282074,
- "step": 31,
- "step_time": 0.22096665699996265
- },
- {
- "clip_ratio/high_max": 0.0032552082557231188,
- "clip_ratio/high_mean": 0.0032552082557231188,
- "clip_ratio/low_mean": 0.0006510416860692203,
- "clip_ratio/low_min": 0.0006510416860692203,
- "clip_ratio/region_mean": 0.00390625,
- "entropy": 1.0583091974258423,
- "epoch": 1.3333333333333333,
- "grad_norm": 2.408073902130127,
- "kl": 0.0881701335310936,
- "learning_rate": 1.9000000000000002e-06,
592
- "loss": -0.14430458843708038,
593
- "step": 32,
594
- "step_time": 0.21999255600007928
595
- },
596
- {
597
- "clip_ratio/high_max": 0.0,
598
- "clip_ratio/high_mean": 0.0,
599
- "clip_ratio/low_mean": 0.0,
600
- "clip_ratio/low_min": 0.0,
601
- "clip_ratio/region_mean": 0.0,
602
- "completions/clipped_ratio": 1.0,
603
- "completions/max_length": 384.0,
604
- "completions/max_terminated_length": 0.0,
605
- "completions/mean_length": 384.0,
606
- "completions/mean_terminated_length": 0.0,
607
- "completions/min_length": 384.0,
608
- "completions/min_terminated_length": 0.0,
609
- "entropy": 1.3751106262207031,
610
- "epoch": 1.375,
611
- "frac_reward_zero_std": 0.0,
612
- "grad_norm": 2.4002864360809326,
613
- "kl": 0.0900505781173706,
614
- "learning_rate": 1.8000000000000001e-06,
615
- "loss": 0.06270528584718704,
616
- "num_tokens": 227556.0,
617
- "reward": -0.11414062231779099,
618
- "reward_std": 0.21683935821056366,
619
- "rewards/GeneratorRewardFunction/mean": -0.11414062231779099,
620
- "rewards/GeneratorRewardFunction/std": 0.21683938801288605,
621
- "step": 33,
622
- "step_time": 12.070902493999938
623
- },
624
- {
625
- "clip_ratio/high_max": 0.0045572915114462376,
626
- "clip_ratio/high_mean": 0.0045572915114462376,
627
- "clip_ratio/low_mean": 0.0,
628
- "clip_ratio/low_min": 0.0,
629
- "clip_ratio/region_mean": 0.0045572915114462376,
630
- "entropy": 1.2606719732284546,
631
- "epoch": 1.4166666666666667,
632
- "grad_norm": 1.671729326248169,
633
- "kl": 0.1210540160536766,
634
- "learning_rate": 1.7000000000000002e-06,
635
- "loss": -0.04401962831616402,
636
- "step": 34,
637
- "step_time": 0.2276347459999215
638
- },
639
- {
640
- "clip_ratio/high_max": 0.0013020833721384406,
641
- "clip_ratio/high_mean": 0.0013020833721384406,
642
- "clip_ratio/low_mean": 0.001953125,
643
- "clip_ratio/low_min": 0.001953125,
644
- "clip_ratio/region_mean": 0.0032552082557231188,
645
- "entropy": 1.2780500650405884,
646
- "epoch": 1.4583333333333333,
647
- "grad_norm": 2.278010845184326,
648
- "kl": 0.11484409123659134,
649
- "learning_rate": 1.6000000000000001e-06,
650
- "loss": -0.08475238084793091,
651
- "step": 35,
652
- "step_time": 0.22882699100000536
653
- },
654
- {
655
- "clip_ratio/high_max": 0.0052083334885537624,
656
- "clip_ratio/high_mean": 0.0052083334885537624,
657
- "clip_ratio/low_mean": 0.0032552082557231188,
658
- "clip_ratio/low_min": 0.0032552082557231188,
659
- "clip_ratio/region_mean": 0.008463541977107525,
660
- "entropy": 1.0553101301193237,
661
- "epoch": 1.5,
662
- "grad_norm": 1.582037091255188,
663
- "kl": 0.12029703706502914,
664
- "learning_rate": 1.5e-06,
665
- "loss": 0.06627888232469559,
666
- "step": 36,
667
- "step_time": 0.22751581399984389
668
- },
669
- {
670
- "clip_ratio/high_max": 0.0,
671
- "clip_ratio/high_mean": 0.0,
672
- "clip_ratio/low_mean": 0.0,
673
- "clip_ratio/low_min": 0.0,
674
- "clip_ratio/region_mean": 0.0,
675
- "completions/clipped_ratio": 1.0,
676
- "completions/max_length": 384.0,
677
- "completions/max_terminated_length": 0.0,
678
- "completions/mean_length": 384.0,
679
- "completions/mean_terminated_length": 0.0,
680
- "completions/min_length": 384.0,
681
- "completions/min_terminated_length": 0.0,
682
- "entropy": 1.0647958517074585,
683
- "epoch": 1.5416666666666665,
684
- "frac_reward_zero_std": 0.0,
685
- "grad_norm": 2.1763558387756348,
686
- "kl": 0.08708903193473816,
687
- "learning_rate": 1.4000000000000001e-06,
688
- "loss": -0.00017260713502764702,
689
- "num_tokens": 252640.0,
690
- "reward": -0.10210937261581421,
691
- "reward_std": 0.19573244452476501,
692
- "rewards/GeneratorRewardFunction/mean": -0.10210937261581421,
693
- "rewards/GeneratorRewardFunction/std": 0.1957324594259262,
694
- "step": 37,
695
- "step_time": 12.015305628000078
696
- },
697
- {
698
- "clip_ratio/high_max": 0.0026041667442768812,
699
- "clip_ratio/high_mean": 0.0026041667442768812,
700
- "clip_ratio/low_mean": 0.0,
701
- "clip_ratio/low_min": 0.0,
702
- "clip_ratio/region_mean": 0.0026041667442768812,
703
- "entropy": 1.0041130781173706,
704
- "epoch": 1.5833333333333335,
705
- "grad_norm": 1.6093602180480957,
706
- "kl": 0.11537543684244156,
707
- "learning_rate": 1.3e-06,
708
- "loss": -0.12453166395425797,
709
- "step": 38,
710
- "step_time": 0.22048816200003785
711
- },
712
- {
713
- "clip_ratio/high_max": 0.0045572915114462376,
714
- "clip_ratio/high_mean": 0.0045572915114462376,
715
- "clip_ratio/low_mean": 0.0013020833721384406,
716
- "clip_ratio/low_min": 0.0013020833721384406,
717
- "clip_ratio/region_mean": 0.005859375,
718
- "entropy": 1.500306487083435,
719
- "epoch": 1.625,
720
- "grad_norm": 3.409069299697876,
721
- "kl": 0.10904627293348312,
722
- "learning_rate": 1.2000000000000002e-06,
723
- "loss": 0.12661518156528473,
724
- "step": 39,
725
- "step_time": 0.22087437000004684
726
- },
727
- {
728
- "clip_ratio/high_max": 0.0078125,
729
- "clip_ratio/high_mean": 0.0078125,
730
- "clip_ratio/low_mean": 0.0013020833721384406,
731
- "clip_ratio/low_min": 0.0013020833721384406,
732
- "clip_ratio/region_mean": 0.009114583022892475,
733
- "entropy": 1.0560635328292847,
734
- "epoch": 1.6666666666666665,
735
- "grad_norm": 2.0718417167663574,
736
- "kl": 0.11926760524511337,
737
- "learning_rate": 1.1e-06,
738
- "loss": -0.0004449083062354475,
739
- "step": 40,
740
- "step_time": 0.2202887500000088
741
- }
742
- ],
743
- "logging_steps": 1,
744
- "max_steps": 50,
745
- "num_input_tokens_seen": 252640,
746
- "num_train_epochs": 3,
747
- "save_steps": 10,
748
- "stateful_callbacks": {
749
- "TrainerControl": {
750
- "args": {
751
- "should_epoch_stop": false,
752
- "should_evaluate": false,
753
- "should_log": false,
754
- "should_save": true,
755
- "should_training_stop": false
756
- },
757
- "attributes": {}
758
- }
759
- },
760
- "total_flos": 0.0,
761
- "train_batch_size": 4,
762
- "trial_name": null,
763
- "trial_params": null
764
- }
self_play_hf_a10g_train/round_001/generator_train/checkpoint-40/training_args.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:31ec66b64f432daf7616434296713e432d134face96e308f2ebc175e2e26f025
- size 7249
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/chat_template.jinja DELETED
@@ -1,54 +0,0 @@
- {%- if tools %}
- {{- '<|im_start|>system\n' }}
- {%- if messages[0]['role'] == 'system' %}
- {{- messages[0]['content'] }}
- {%- else %}
- {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
- {%- endif %}
- {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
- {%- for tool in tools %}
- {{- "\n" }}
- {{- tool | tojson }}
- {%- endfor %}
- {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
- {%- else %}
- {%- if messages[0]['role'] == 'system' %}
- {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
- {%- else %}
- {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
- {%- endif %}
- {%- endif %}
- {%- for message in messages %}
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
- {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
- {%- elif message.role == "assistant" %}
- {{- '<|im_start|>' + message.role }}
- {%- if message.content %}
- {{- '\n' + message.content }}
- {%- endif %}
- {%- for tool_call in message.tool_calls %}
- {%- if tool_call.function is defined %}
- {%- set tool_call = tool_call.function %}
- {%- endif %}
- {{- '\n<tool_call>\n{"name": "' }}
- {{- tool_call.name }}
- {{- '", "arguments": ' }}
- {{- tool_call.arguments | tojson }}
- {{- '}\n</tool_call>' }}
- {%- endfor %}
- {{- '<|im_end|>\n' }}
- {%- elif message.role == "tool" %}
- {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
- {{- '<|im_start|>user' }}
- {%- endif %}
- {{- '\n<tool_response>\n' }}
- {{- message.content }}
- {{- '\n</tool_response>' }}
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
- {{- '<|im_end|>\n' }}
- {%- endif %}
- {%- endif %}
- {%- endfor %}
- {%- if add_generation_prompt %}
- {{- '<|im_start|>assistant\n' }}
- {%- endif %}
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/config.json DELETED
@@ -1,57 +0,0 @@
- {
- "architectures": [
- "Qwen2ForCausalLM"
- ],
- "attention_dropout": 0.0,
- "bos_token_id": null,
- "dtype": "float32",
- "eos_token_id": 151645,
- "hidden_act": "silu",
- "hidden_size": 896,
- "initializer_range": 0.02,
- "intermediate_size": 4864,
- "layer_types": [
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention"
- ],
- "max_position_embeddings": 32768,
- "max_window_layers": 21,
- "model_type": "qwen2",
- "num_attention_heads": 14,
- "num_hidden_layers": 24,
- "num_key_value_heads": 2,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_parameters": {
- "rope_theta": 1000000.0,
- "rope_type": "default"
- },
- "sliding_window": null,
- "tie_word_embeddings": true,
- "transformers_version": "5.6.2",
- "use_cache": false,
- "use_sliding_window": false,
- "vocab_size": 151936
- }
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/generation_config.json DELETED
@@ -1,13 +0,0 @@
- {
- "do_sample": true,
- "eos_token_id": [
- 151645,
- 151643
- ],
- "pad_token_id": 151643,
- "repetition_penalty": 1.1,
- "temperature": 0.7,
- "top_k": 20,
- "top_p": 0.8,
- "transformers_version": "5.6.2"
- }
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c6fa4eed67a84ce4076ba3848a078496971cd34ba048c794e52cc3b4aab54a27
- size 1976163472
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/optimizer.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b145cc09cf03081708247bd99e0dd46e23f798d922e5e7df9e75880345e1d969
- size 3952509771
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/rng_state.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:7876773bbbd765b5b540a43519e4809b559e65cdfa3f4e9508024dc7702f2f6e
- size 14645
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/scheduler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:3cf220ca534359fdb729d8e74ed3b0c609c54e6d591e1d2478f5521fc51fba05
- size 1465
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/tokenizer.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
- size 11421892
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/tokenizer_config.json DELETED
@@ -1,32 +0,0 @@
- {
- "add_prefix_space": false,
- "backend": "tokenizers",
- "bos_token": null,
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|im_end|>",
- "errors": "replace",
- "extra_special_tokens": [
- "<|im_start|>",
- "<|im_end|>",
- "<|object_ref_start|>",
- "<|object_ref_end|>",
- "<|box_start|>",
- "<|box_end|>",
- "<|quad_start|>",
- "<|quad_end|>",
- "<|vision_start|>",
- "<|vision_end|>",
- "<|vision_pad|>",
- "<|image_pad|>",
- "<|video_pad|>"
- ],
- "is_local": false,
- "local_files_only": false,
- "model_max_length": 131072,
- "pad_token": "<|endoftext|>",
- "padding_side": "left",
- "split_special_tokens": false,
- "tokenizer_class": "Qwen2Tokenizer",
- "truncation_side": "left",
- "unk_token": null
- }
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/trainer_state.json DELETED
@@ -1,953 +0,0 @@
- {
- "best_global_step": null,
- "best_metric": null,
- "best_model_checkpoint": null,
- "epoch": 2.0833333333333335,
- "eval_steps": 500,
- "global_step": 50,
- "is_hyper_param_search": false,
- "is_local_process_zero": true,
- "is_world_process_zero": true,
- "log_history": [
- {
- "clip_ratio/high_max": 0.001953125,
- "clip_ratio/high_mean": 0.001953125,
- "clip_ratio/low_mean": 0.0,
- "clip_ratio/low_min": 0.0,
- "clip_ratio/region_mean": 0.001953125,
- "completions/clipped_ratio": 1.0,
- "completions/max_length": 384.0,
- "completions/max_terminated_length": 0.0,
- "completions/mean_length": 384.0,
- "completions/mean_terminated_length": 0.0,
- "completions/min_length": 384.0,
- "completions/min_terminated_length": 0.0,
- "entropy": 1.9362258911132812,
- "epoch": 0.041666666666666664,
- "frac_reward_zero_std": 0.0,
- "grad_norm": 3.3426833152770996,
- "kl": 0.0005498419050127268,
- "learning_rate": 5e-06,
- "loss": 0.13995857536792755,
- "num_tokens": 25244.0,
- "reward": -0.4352343678474426,
- "reward_std": 0.306624174118042,
- "rewards/GeneratorRewardFunction/mean": -0.4352343678474426,
- "rewards/GeneratorRewardFunction/std": 0.306624174118042,
- "step": 1,
- "step_time": 12.578062469000088
- },
- {
- "clip_ratio/high_max": 0.00390625,
- "clip_ratio/high_mean": 0.00390625,
- "clip_ratio/low_mean": 0.0026041667442768812,
- "clip_ratio/low_min": 0.0026041667442768812,
- "clip_ratio/region_mean": 0.0065104165114462376,
- "entropy": 1.2686206102371216,
- "epoch": 0.08333333333333333,
- "grad_norm": 2.8547239303588867,
- "kl": 0.001546451705507934,
- "learning_rate": 4.9000000000000005e-06,
- "loss": -0.06681232899427414,
- "step": 2,
- "step_time": 0.22036709600001814
- },
- {
- "clip_ratio/high_max": 0.012369791977107525,
- "clip_ratio/high_mean": 0.012369791977107525,
- "clip_ratio/low_mean": 0.015625,
- "clip_ratio/low_min": 0.015625,
- "clip_ratio/region_mean": 0.02799479104578495,
- "entropy": 1.8668650388717651,
- "epoch": 0.125,
- "grad_norm": 2.4686105251312256,
- "kl": 0.005345983896404505,
- "learning_rate": 4.800000000000001e-06,
- "loss": 0.010777520947158337,
- "step": 3,
- "step_time": 0.2199646679999887
- },
- {
- "clip_ratio/high_max": 0.02083333395421505,
- "clip_ratio/high_mean": 0.02083333395421505,
- "clip_ratio/low_mean": 0.010416666977107525,
- "clip_ratio/low_min": 0.010416666977107525,
- "clip_ratio/region_mean": 0.03125,
- "entropy": 1.1842881441116333,
- "epoch": 0.16666666666666666,
- "grad_norm": 1.569398045539856,
- "kl": 0.0072342646308243275,
- "learning_rate": 4.7e-06,
- "loss": -0.08198019117116928,
- "step": 4,
- "step_time": 0.2201611520000597
- },
- {
- "clip_ratio/high_max": 0.001953125,
- "clip_ratio/high_mean": 0.001953125,
- "clip_ratio/low_mean": 0.0006510416860692203,
- "clip_ratio/low_min": 0.0006510416860692203,
- "clip_ratio/region_mean": 0.0026041667442768812,
- "completions/clipped_ratio": 1.0,
- "completions/max_length": 384.0,
- "completions/max_terminated_length": 0.0,
- "completions/mean_length": 384.0,
- "completions/mean_terminated_length": 0.0,
- "completions/min_length": 384.0,
- "completions/min_terminated_length": 0.0,
- "entropy": 1.3128995895385742,
- "epoch": 0.20833333333333334,
- "frac_reward_zero_std": 0.0,
- "grad_norm": 2.202421188354492,
- "kl": 0.0129135362803936,
- "learning_rate": 4.600000000000001e-06,
- "loss": -0.0841849148273468,
- "num_tokens": 50440.0,
- "reward": -0.3341406285762787,
- "reward_std": 0.3155691623687744,
- "rewards/GeneratorRewardFunction/mean": -0.3341406285762787,
- "rewards/GeneratorRewardFunction/std": 0.3155691623687744,
- "step": 5,
- "step_time": 12.076087126999937
- },
- {
- "clip_ratio/high_max": 0.0026041667442768812,
- "clip_ratio/high_mean": 0.0026041667442768812,
- "clip_ratio/low_mean": 0.0052083334885537624,
- "clip_ratio/low_min": 0.0052083334885537624,
- "clip_ratio/region_mean": 0.0078125,
- "entropy": 1.3001914024353027,
- "epoch": 0.25,
- "grad_norm": 2.854139804840088,
- "kl": 0.01436698716133833,
- "learning_rate": 4.5e-06,
- "loss": 0.02869725041091442,
- "step": 6,
- "step_time": 0.22715311399997518
- },
- {
- "clip_ratio/high_max": 0.0071614584885537624,
- "clip_ratio/high_mean": 0.0071614584885537624,
- "clip_ratio/low_mean": 0.0071614584885537624,
- "clip_ratio/low_min": 0.0071614584885537624,
- "clip_ratio/region_mean": 0.014322916977107525,
- "entropy": 1.0331100225448608,
- "epoch": 0.2916666666666667,
- "grad_norm": 1.9297211170196533,
- "kl": 0.01791433058679104,
- "learning_rate": 4.4e-06,
- "loss": -0.028683962300419807,
- "step": 7,
- "step_time": 0.22586069900000894
- },
- {
- "clip_ratio/high_max": 0.029296875,
- "clip_ratio/high_mean": 0.029296875,
- "clip_ratio/low_mean": 0.01171875,
- "clip_ratio/low_min": 0.01171875,
- "clip_ratio/region_mean": 0.041015625,
- "entropy": 1.1462408304214478,
- "epoch": 0.3333333333333333,
- "grad_norm": 2.57124924659729,
- "kl": 0.0388585664331913,
- "learning_rate": 4.3e-06,
- "loss": 0.08592668920755386,
- "step": 8,
- "step_time": 0.22552726200001416
- },
- {
- "clip_ratio/high_max": 0.0,
- "clip_ratio/high_mean": 0.0,
- "clip_ratio/low_mean": 0.0,
- "clip_ratio/low_min": 0.0,
- "clip_ratio/region_mean": 0.0,
- "completions/clipped_ratio": 1.0,
- "completions/max_length": 384.0,
- "completions/max_terminated_length": 0.0,
- "completions/mean_length": 384.0,
- "completions/mean_terminated_length": 0.0,
- "completions/min_length": 384.0,
- "completions/min_terminated_length": 0.0,
- "entropy": 0.999983549118042,
- "epoch": 0.375,
- "frac_reward_zero_std": 0.25,
- "grad_norm": 2.0052192211151123,
- "kl": 0.030047910287976265,
- "learning_rate": 4.2000000000000004e-06,
- "loss": -0.05663062259554863,
- "num_tokens": 75884.0,
- "reward": -0.3902343511581421,
- "reward_std": 0.31722894310951233,
- "rewards/GeneratorRewardFunction/mean": -0.3902343511581421,
- "rewards/GeneratorRewardFunction/std": 0.3172289729118347,
- "step": 9,
- "step_time": 12.047747722000054
- },
- {
- "clip_ratio/high_max": 0.005859375,
- "clip_ratio/high_mean": 0.005859375,
- "clip_ratio/low_mean": 0.0006510416860692203,
- "clip_ratio/low_min": 0.0006510416860692203,
- "clip_ratio/region_mean": 0.0065104165114462376,
- "entropy": 1.6177984476089478,
- "epoch": 0.4166666666666667,
- "grad_norm": 2.137237071990967,
- "kl": 0.04101690649986267,
- "learning_rate": 4.1e-06,
- "loss": -0.02161034755408764,
- "step": 10,
- "step_time": 0.2252395229999138
- },
- {
- "clip_ratio/high_max": 0.01692708395421505,
- "clip_ratio/high_mean": 0.01692708395421505,
- "clip_ratio/low_mean": 0.0065104165114462376,
- "clip_ratio/low_min": 0.0065104165114462376,
- "clip_ratio/region_mean": 0.0234375,
- "entropy": 1.038699746131897,
- "epoch": 0.4583333333333333,
- "grad_norm": 2.672621965408325,
- "kl": 0.031740155071020126,
- "learning_rate": 4.000000000000001e-06,
- "loss": 0.056199509650468826,
- "step": 11,
- "step_time": 0.22556489999999485
- },
- {
- "clip_ratio/high_max": 0.01822916604578495,
- "clip_ratio/high_mean": 0.01822916604578495,
- "clip_ratio/low_mean": 0.010416666977107525,
- "clip_ratio/low_min": 0.010416666977107525,
- "clip_ratio/region_mean": 0.02864583395421505,
- "entropy": 1.296442985534668,
- "epoch": 0.5,
- "grad_norm": 1.4488099813461304,
- "kl": 0.04755128547549248,
- "learning_rate": 3.900000000000001e-06,
- "loss": 0.02385079860687256,
- "step": 12,
- "step_time": 0.22498397900005784
- },
- {
- "clip_ratio/high_max": 0.0013020833721384406,
- "clip_ratio/high_mean": 0.0013020833721384406,
- "clip_ratio/low_mean": 0.0,
- "clip_ratio/low_min": 0.0,
- "clip_ratio/region_mean": 0.0013020833721384406,
- "completions/clipped_ratio": 1.0,
- "completions/max_length": 384.0,
- "completions/max_terminated_length": 0.0,
- "completions/mean_length": 384.0,
- "completions/mean_terminated_length": 0.0,
- "completions/min_length": 384.0,
- "completions/min_terminated_length": 0.0,
- "entropy": 1.3575109243392944,
- "epoch": 0.5416666666666666,
- "frac_reward_zero_std": 0.0,
- "grad_norm": 2.2914443016052246,
- "kl": 0.06257984787225723,
- "learning_rate": 3.8000000000000005e-06,
- "loss": -0.10538653284311295,
- "num_tokens": 101112.0,
- "reward": -0.22843749821186066,
- "reward_std": 0.294514924287796,
- "rewards/GeneratorRewardFunction/mean": -0.22843749821186066,
- "rewards/GeneratorRewardFunction/std": 0.2945149540901184,
- "step": 13,
- "step_time": 12.01501083200003
- },
- {
- "clip_ratio/high_max": 0.001953125,
- "clip_ratio/high_mean": 0.001953125,
- "clip_ratio/low_mean": 0.0,
- "clip_ratio/low_min": 0.0,
- "clip_ratio/region_mean": 0.001953125,
- "entropy": 1.2918612957000732,
- "epoch": 0.5833333333333334,
- "grad_norm": 3.0368542671203613,
- "kl": 0.04979195073246956,
- "learning_rate": 3.7e-06,
- "loss": -0.003113487036898732,
- "step": 14,
- "step_time": 0.21806825399994523
- },
- {
- "clip_ratio/high_max": 0.0026041667442768812,
- "clip_ratio/high_mean": 0.0026041667442768812,
- "clip_ratio/low_mean": 0.005859375,
- "clip_ratio/low_min": 0.005859375,
- "clip_ratio/region_mean": 0.008463541977107525,
- "entropy": 1.1081053018569946,
- "epoch": 0.625,
- "grad_norm": 3.5923683643341064,
- "kl": 0.06817911565303802,
- "learning_rate": 3.6000000000000003e-06,
- "loss": 0.15118412673473358,
- "step": 15,
- "step_time": 0.217887520999966
- },
- {
- "clip_ratio/high_max": 0.02018229104578495,
- "clip_ratio/high_mean": 0.02018229104578495,
- "clip_ratio/low_mean": 0.0026041667442768812,
- "clip_ratio/low_min": 0.0026041667442768812,
- "clip_ratio/region_mean": 0.02278645895421505,
- "entropy": 1.0803831815719604,
- "epoch": 0.6666666666666666,
- "grad_norm": 1.789110541343689,
- "kl": 0.056480005383491516,
- "learning_rate": 3.5e-06,
- "loss": -0.03890883922576904,
- "step": 16,
- "step_time": 0.21781940799996846
- },
- {
- "clip_ratio/high_max": 0.0,
- "clip_ratio/high_mean": 0.0,
- "clip_ratio/low_mean": 0.0,
- "clip_ratio/low_min": 0.0,
- "clip_ratio/region_mean": 0.0,
- "completions/clipped_ratio": 1.0,
- "completions/max_length": 384.0,
- "completions/max_terminated_length": 0.0,
- "completions/mean_length": 384.0,
- "completions/mean_terminated_length": 0.0,
- "completions/min_length": 384.0,
- "completions/min_terminated_length": 0.0,
- "entropy": 0.8709045052528381,
- "epoch": 0.7083333333333334,
- "frac_reward_zero_std": 0.0,
- "grad_norm": 1.329393982887268,
- "kl": 0.06073950603604317,
- "learning_rate": 3.4000000000000005e-06,
- "loss": -0.11920400708913803,
- "num_tokens": 126436.0,
- "reward": -0.2240625023841858,
- "reward_std": 0.2881968021392822,
- "rewards/GeneratorRewardFunction/mean": -0.2240625023841858,
- "rewards/GeneratorRewardFunction/std": 0.2881968021392822,
- "step": 17,
- "step_time": 12.08798373600007
- },
- {
- "clip_ratio/high_max": 0.0026041667442768812,
- "clip_ratio/high_mean": 0.0026041667442768812,
- "clip_ratio/low_mean": 0.0006510416860692203,
- "clip_ratio/low_min": 0.0006510416860692203,
- "clip_ratio/region_mean": 0.0032552082557231188,
- "entropy": 1.083386778831482,
- "epoch": 0.75,
- "grad_norm": 1.343295931816101,
- "kl": 0.0919194221496582,
- "learning_rate": 3.3000000000000006e-06,
- "loss": -0.007308408617973328,
- "step": 18,
- "step_time": 0.2253850289998809
- },
- {
- "clip_ratio/high_max": 0.005859375,
- "clip_ratio/high_mean": 0.005859375,
- "clip_ratio/low_mean": 0.0026041667442768812,
- "clip_ratio/low_min": 0.0026041667442768812,
- "clip_ratio/region_mean": 0.008463541977107525,
- "entropy": 1.406662940979004,
- "epoch": 0.7916666666666666,
- "grad_norm": 3.3420534133911133,
- "kl": 0.06450249999761581,
- "learning_rate": 3.2000000000000003e-06,
- "loss": 0.12472107261419296,
- "step": 19,
- "step_time": 0.22467486499999723
- },
- {
- "clip_ratio/high_max": 0.011067708022892475,
- "clip_ratio/high_mean": 0.011067708022892475,
- "clip_ratio/low_mean": 0.0032552082557231188,
- "clip_ratio/low_min": 0.0032552082557231188,
- "clip_ratio/region_mean": 0.014322916977107525,
- "entropy": 1.6491953134536743,
- "epoch": 0.8333333333333334,
- "grad_norm": 3.3672103881835938,
- "kl": 0.0773777961730957,
- "learning_rate": 3.1000000000000004e-06,
- "loss": 0.0035695277620106936,
- "step": 20,
- "step_time": 0.22389788999998927
- },
- {
- "clip_ratio/high_max": 0.0006510416860692203,
- "clip_ratio/high_mean": 0.0006510416860692203,
- "clip_ratio/low_mean": 0.0,
- "clip_ratio/low_min": 0.0,
- "clip_ratio/region_mean": 0.0006510416860692203,
- "completions/clipped_ratio": 1.0,
- "completions/max_length": 384.0,
- "completions/max_terminated_length": 0.0,
- "completions/mean_length": 384.0,
- "completions/mean_terminated_length": 0.0,
- "completions/min_length": 384.0,
- "completions/min_terminated_length": 0.0,
- "entropy": 1.4178005456924438,
- "epoch": 0.875,
- "frac_reward_zero_std": 0.0,
- "grad_norm": 2.6333799362182617,
- "kl": 0.07392226904630661,
- "learning_rate": 3e-06,
- "loss": -0.09101495891809464,
- "num_tokens": 151596.0,
- "reward": -0.15257811546325684,
- "reward_std": 0.24854345619678497,
- "rewards/GeneratorRewardFunction/mean": -0.15257811546325684,
- "rewards/GeneratorRewardFunction/std": 0.24854345619678497,
- "step": 21,
- "step_time": 12.020297105999816
- },
- {
- "clip_ratio/high_max": 0.0006510416860692203,
- "clip_ratio/high_mean": 0.0006510416860692203,
- "clip_ratio/low_mean": 0.0013020833721384406,
- "clip_ratio/low_min": 0.0013020833721384406,
- "clip_ratio/region_mean": 0.001953125,
- "entropy": 1.2036248445510864,
- "epoch": 0.9166666666666666,
- "grad_norm": 2.1499149799346924,
- "kl": 0.0772874653339386,
- "learning_rate": 2.9e-06,
- "loss": 0.08120749890804291,
- "step": 22,
- "step_time": 0.21995178900010615
- },
- {
- "clip_ratio/high_max": 0.0032552082557231188,
- "clip_ratio/high_mean": 0.0032552082557231188,
- "clip_ratio/low_mean": 0.0032552082557231188,
- "clip_ratio/low_min": 0.0032552082557231188,
- "clip_ratio/region_mean": 0.0065104165114462376,
- "entropy": 1.1966055631637573,
- "epoch": 0.9583333333333334,
- "grad_norm": 2.0064616203308105,
- "kl": 0.07331382483243942,
- "learning_rate": 2.8000000000000003e-06,
- "loss": 0.03140506148338318,
- "step": 23,
- "step_time": 0.21996421700009705
- },
- {
- "clip_ratio/high_max": 0.011067708022892475,
- "clip_ratio/high_mean": 0.011067708022892475,
- "clip_ratio/low_mean": 0.0006510416860692203,
- "clip_ratio/low_min": 0.0006510416860692203,
- "clip_ratio/region_mean": 0.01171875,
- "entropy": 0.9102082252502441,
- "epoch": 1.0,
- "grad_norm": 1.7175334692001343,
- "kl": 0.14611481130123138,
- "learning_rate": 2.7000000000000004e-06,
- "loss": -0.021010393276810646,
- "step": 24,
- "step_time": 0.21931950600014716
- },
- {
- "clip_ratio/high_max": 0.0,
- "clip_ratio/high_mean": 0.0,
- "clip_ratio/low_mean": 0.0,
- "clip_ratio/low_min": 0.0,
- "clip_ratio/region_mean": 0.0,
- "completions/clipped_ratio": 1.0,
- "completions/max_length": 384.0,
- "completions/max_terminated_length": 0.0,
- "completions/mean_length": 384.0,
- "completions/mean_terminated_length": 0.0,
- "completions/min_length": 384.0,
- "completions/min_terminated_length": 0.0,
- "entropy": 1.9153881072998047,
- "epoch": 1.0416666666666667,
- "frac_reward_zero_std": 0.0,
- "grad_norm": 2.3460445404052734,
- "kl": 0.09710023552179337,
- "learning_rate": 2.6e-06,
- "loss": 0.015220091678202152,
- "num_tokens": 177212.0,
- "reward": -0.15656250715255737,
- "reward_std": 0.22349287569522858,
- "rewards/GeneratorRewardFunction/mean": -0.15656250715255737,
- "rewards/GeneratorRewardFunction/std": 0.22349286079406738,
- "step": 25,
- "step_time": 12.153388549000056
- },
- {
- "clip_ratio/high_max": 0.00390625,
480
- "clip_ratio/high_mean": 0.00390625,
481
- "clip_ratio/low_mean": 0.0013020833721384406,
482
- "clip_ratio/low_min": 0.0013020833721384406,
483
- "clip_ratio/region_mean": 0.0052083334885537624,
484
- "entropy": 1.365325927734375,
485
- "epoch": 1.0833333333333333,
486
- "grad_norm": 1.8710312843322754,
487
- "kl": 0.0985046848654747,
488
- "learning_rate": 2.5e-06,
489
- "loss": -0.02838735282421112,
490
- "step": 26,
491
- "step_time": 0.22659933299996737
492
- },
493
- {
494
- "clip_ratio/high_max": 0.008463541977107525,
495
- "clip_ratio/high_mean": 0.008463541977107525,
496
- "clip_ratio/low_mean": 0.0013020833721384406,
497
- "clip_ratio/low_min": 0.0013020833721384406,
498
- "clip_ratio/region_mean": 0.009765625,
499
- "entropy": 1.2517439126968384,
500
- "epoch": 1.125,
501
- "grad_norm": 2.821958303451538,
502
- "kl": 0.09274079650640488,
503
- "learning_rate": 2.4000000000000003e-06,
504
- "loss": -0.007298170123249292,
505
- "step": 27,
506
- "step_time": 0.22647249999999985
507
- },
508
- {
509
- "clip_ratio/high_max": 0.0052083334885537624,
510
- "clip_ratio/high_mean": 0.0052083334885537624,
511
- "clip_ratio/low_mean": 0.009114583022892475,
512
- "clip_ratio/low_min": 0.009114583022892475,
513
- "clip_ratio/region_mean": 0.014322916977107525,
514
- "entropy": 2.0579044818878174,
515
- "epoch": 1.1666666666666667,
516
- "grad_norm": 3.259742259979248,
517
- "kl": 0.10746321082115173,
518
- "learning_rate": 2.3000000000000004e-06,
519
- "loss": 0.021702758967876434,
520
- "step": 28,
521
- "step_time": 0.22640677999993386
522
- },
523
- {
524
- "clip_ratio/high_max": 0.0,
525
- "clip_ratio/high_mean": 0.0,
526
- "clip_ratio/low_mean": 0.0,
527
- "clip_ratio/low_min": 0.0,
528
- "clip_ratio/region_mean": 0.0,
529
- "completions/clipped_ratio": 1.0,
530
- "completions/max_length": 384.0,
531
- "completions/max_terminated_length": 0.0,
532
- "completions/mean_length": 384.0,
533
- "completions/mean_terminated_length": 0.0,
534
- "completions/min_length": 384.0,
535
- "completions/min_terminated_length": 0.0,
536
- "entropy": 1.3725861310958862,
537
- "epoch": 1.2083333333333333,
538
- "frac_reward_zero_std": 0.0,
539
- "grad_norm": 1.8806989192962646,
540
- "kl": 0.11961983889341354,
541
- "learning_rate": 2.2e-06,
542
- "loss": 0.07187109440565109,
543
- "num_tokens": 202200.0,
544
- "reward": -0.0561029389500618,
545
- "reward_std": 0.314301997423172,
546
- "rewards/GeneratorRewardFunction/mean": -0.0561029389500618,
547
- "rewards/GeneratorRewardFunction/std": 0.314301997423172,
548
- "step": 29,
549
- "step_time": 13.662849896999887
550
- },
551
- {
552
- "clip_ratio/high_max": 0.001953125,
553
- "clip_ratio/high_mean": 0.001953125,
554
- "clip_ratio/low_mean": 0.0013020833721384406,
555
- "clip_ratio/low_min": 0.0013020833721384406,
556
- "clip_ratio/region_mean": 0.0032552082557231188,
557
- "entropy": 1.2213298082351685,
558
- "epoch": 1.25,
559
- "grad_norm": 2.1918396949768066,
560
- "kl": 0.12398240715265274,
561
- "learning_rate": 2.1000000000000002e-06,
562
- "loss": -0.052896980196237564,
563
- "step": 30,
564
- "step_time": 0.2210835590001352
565
- },
566
- {
567
- "clip_ratio/high_max": 0.001953125,
568
- "clip_ratio/high_mean": 0.001953125,
569
- "clip_ratio/low_mean": 0.001953125,
570
- "clip_ratio/low_min": 0.001953125,
571
- "clip_ratio/region_mean": 0.00390625,
572
- "entropy": 1.2683231830596924,
573
- "epoch": 1.2916666666666667,
574
- "grad_norm": 2.524726390838623,
575
- "kl": 0.14297537505626678,
576
- "learning_rate": 2.0000000000000003e-06,
577
- "loss": 0.12745414674282074,
578
- "step": 31,
579
- "step_time": 0.22096665699996265
580
- },
581
- {
582
- "clip_ratio/high_max": 0.0032552082557231188,
583
- "clip_ratio/high_mean": 0.0032552082557231188,
584
- "clip_ratio/low_mean": 0.0006510416860692203,
585
- "clip_ratio/low_min": 0.0006510416860692203,
586
- "clip_ratio/region_mean": 0.00390625,
587
- "entropy": 1.0583091974258423,
588
- "epoch": 1.3333333333333333,
589
- "grad_norm": 2.408073902130127,
590
- "kl": 0.0881701335310936,
591
- "learning_rate": 1.9000000000000002e-06,
592
- "loss": -0.14430458843708038,
593
- "step": 32,
594
- "step_time": 0.21999255600007928
595
- },
596
- {
597
- "clip_ratio/high_max": 0.0,
598
- "clip_ratio/high_mean": 0.0,
599
- "clip_ratio/low_mean": 0.0,
600
- "clip_ratio/low_min": 0.0,
601
- "clip_ratio/region_mean": 0.0,
602
- "completions/clipped_ratio": 1.0,
603
- "completions/max_length": 384.0,
604
- "completions/max_terminated_length": 0.0,
605
- "completions/mean_length": 384.0,
606
- "completions/mean_terminated_length": 0.0,
607
- "completions/min_length": 384.0,
608
- "completions/min_terminated_length": 0.0,
609
- "entropy": 1.3751106262207031,
610
- "epoch": 1.375,
611
- "frac_reward_zero_std": 0.0,
612
- "grad_norm": 2.4002864360809326,
613
- "kl": 0.0900505781173706,
614
- "learning_rate": 1.8000000000000001e-06,
615
- "loss": 0.06270528584718704,
616
- "num_tokens": 227556.0,
617
- "reward": -0.11414062231779099,
618
- "reward_std": 0.21683935821056366,
619
- "rewards/GeneratorRewardFunction/mean": -0.11414062231779099,
620
- "rewards/GeneratorRewardFunction/std": 0.21683938801288605,
621
- "step": 33,
622
- "step_time": 12.070902493999938
623
- },
624
- {
625
- "clip_ratio/high_max": 0.0045572915114462376,
626
- "clip_ratio/high_mean": 0.0045572915114462376,
627
- "clip_ratio/low_mean": 0.0,
628
- "clip_ratio/low_min": 0.0,
629
- "clip_ratio/region_mean": 0.0045572915114462376,
630
- "entropy": 1.2606719732284546,
631
- "epoch": 1.4166666666666667,
632
- "grad_norm": 1.671729326248169,
633
- "kl": 0.1210540160536766,
634
- "learning_rate": 1.7000000000000002e-06,
635
- "loss": -0.04401962831616402,
636
- "step": 34,
637
- "step_time": 0.2276347459999215
638
- },
639
- {
640
- "clip_ratio/high_max": 0.0013020833721384406,
641
- "clip_ratio/high_mean": 0.0013020833721384406,
642
- "clip_ratio/low_mean": 0.001953125,
643
- "clip_ratio/low_min": 0.001953125,
644
- "clip_ratio/region_mean": 0.0032552082557231188,
645
- "entropy": 1.2780500650405884,
646
- "epoch": 1.4583333333333333,
647
- "grad_norm": 2.278010845184326,
648
- "kl": 0.11484409123659134,
649
- "learning_rate": 1.6000000000000001e-06,
650
- "loss": -0.08475238084793091,
651
- "step": 35,
652
- "step_time": 0.22882699100000536
653
- },
654
- {
655
- "clip_ratio/high_max": 0.0052083334885537624,
656
- "clip_ratio/high_mean": 0.0052083334885537624,
657
- "clip_ratio/low_mean": 0.0032552082557231188,
658
- "clip_ratio/low_min": 0.0032552082557231188,
659
- "clip_ratio/region_mean": 0.008463541977107525,
660
- "entropy": 1.0553101301193237,
661
- "epoch": 1.5,
662
- "grad_norm": 1.582037091255188,
663
- "kl": 0.12029703706502914,
664
- "learning_rate": 1.5e-06,
665
- "loss": 0.06627888232469559,
666
- "step": 36,
667
- "step_time": 0.22751581399984389
668
- },
669
- {
670
- "clip_ratio/high_max": 0.0,
671
- "clip_ratio/high_mean": 0.0,
672
- "clip_ratio/low_mean": 0.0,
673
- "clip_ratio/low_min": 0.0,
674
- "clip_ratio/region_mean": 0.0,
675
- "completions/clipped_ratio": 1.0,
676
- "completions/max_length": 384.0,
677
- "completions/max_terminated_length": 0.0,
678
- "completions/mean_length": 384.0,
679
- "completions/mean_terminated_length": 0.0,
680
- "completions/min_length": 384.0,
681
- "completions/min_terminated_length": 0.0,
682
- "entropy": 1.0647958517074585,
683
- "epoch": 1.5416666666666665,
684
- "frac_reward_zero_std": 0.0,
685
- "grad_norm": 2.1763558387756348,
686
- "kl": 0.08708903193473816,
687
- "learning_rate": 1.4000000000000001e-06,
688
- "loss": -0.00017260713502764702,
689
- "num_tokens": 252640.0,
690
- "reward": -0.10210937261581421,
691
- "reward_std": 0.19573244452476501,
692
- "rewards/GeneratorRewardFunction/mean": -0.10210937261581421,
693
- "rewards/GeneratorRewardFunction/std": 0.1957324594259262,
694
- "step": 37,
695
- "step_time": 12.015305628000078
696
- },
697
- {
698
- "clip_ratio/high_max": 0.0026041667442768812,
699
- "clip_ratio/high_mean": 0.0026041667442768812,
700
- "clip_ratio/low_mean": 0.0,
701
- "clip_ratio/low_min": 0.0,
702
- "clip_ratio/region_mean": 0.0026041667442768812,
703
- "entropy": 1.0041130781173706,
704
- "epoch": 1.5833333333333335,
705
- "grad_norm": 1.6093602180480957,
706
- "kl": 0.11537543684244156,
707
- "learning_rate": 1.3e-06,
708
- "loss": -0.12453166395425797,
709
- "step": 38,
710
- "step_time": 0.22048816200003785
711
- },
712
- {
713
- "clip_ratio/high_max": 0.0045572915114462376,
714
- "clip_ratio/high_mean": 0.0045572915114462376,
715
- "clip_ratio/low_mean": 0.0013020833721384406,
716
- "clip_ratio/low_min": 0.0013020833721384406,
717
- "clip_ratio/region_mean": 0.005859375,
718
- "entropy": 1.500306487083435,
719
- "epoch": 1.625,
720
- "grad_norm": 3.409069299697876,
721
- "kl": 0.10904627293348312,
722
- "learning_rate": 1.2000000000000002e-06,
723
- "loss": 0.12661518156528473,
724
- "step": 39,
725
- "step_time": 0.22087437000004684
726
- },
727
- {
728
- "clip_ratio/high_max": 0.0078125,
729
- "clip_ratio/high_mean": 0.0078125,
730
- "clip_ratio/low_mean": 0.0013020833721384406,
731
- "clip_ratio/low_min": 0.0013020833721384406,
732
- "clip_ratio/region_mean": 0.009114583022892475,
733
- "entropy": 1.0560635328292847,
734
- "epoch": 1.6666666666666665,
735
- "grad_norm": 2.0718417167663574,
736
- "kl": 0.11926760524511337,
737
- "learning_rate": 1.1e-06,
738
- "loss": -0.0004449083062354475,
739
- "step": 40,
740
- "step_time": 0.2202887500000088
741
- },
742
- {
743
- "clip_ratio/high_max": 0.0,
744
- "clip_ratio/high_mean": 0.0,
745
- "clip_ratio/low_mean": 0.0013020833721384406,
746
- "clip_ratio/low_min": 0.0013020833721384406,
747
- "clip_ratio/region_mean": 0.0013020833721384406,
748
- "completions/clipped_ratio": 1.0,
749
- "completions/max_length": 384.0,
750
- "completions/max_terminated_length": 0.0,
751
- "completions/mean_length": 384.0,
752
- "completions/mean_terminated_length": 0.0,
753
- "completions/min_length": 384.0,
754
- "completions/min_terminated_length": 0.0,
755
- "entropy": 1.0184931755065918,
756
- "epoch": 1.7083333333333335,
757
- "frac_reward_zero_std": 0.0,
758
- "grad_norm": 1.9755194187164307,
759
- "kl": 0.1180298700928688,
760
- "learning_rate": 1.0000000000000002e-06,
761
- "loss": 0.03202051296830177,
762
- "num_tokens": 277896.0,
763
- "reward": -0.06937499344348907,
764
- "reward_std": 0.1560969203710556,
765
- "rewards/GeneratorRewardFunction/mean": -0.06937499344348907,
766
- "rewards/GeneratorRewardFunction/std": 0.1560969203710556,
767
- "step": 41,
768
- "step_time": 12.091393732999904
769
- },
770
- {
771
- "clip_ratio/high_max": 0.0032552082557231188,
772
- "clip_ratio/high_mean": 0.0032552082557231188,
773
- "clip_ratio/low_mean": 0.0006510416860692203,
774
- "clip_ratio/low_min": 0.0006510416860692203,
775
- "clip_ratio/region_mean": 0.00390625,
776
- "entropy": 0.8101570010185242,
777
- "epoch": 1.75,
778
- "grad_norm": 2.101008653640747,
779
- "kl": 0.13180766999721527,
780
- "learning_rate": 9.000000000000001e-07,
781
- "loss": -0.03199642524123192,
782
- "step": 42,
783
- "step_time": 0.22810021400005098
784
- },
785
- {
786
- "clip_ratio/high_max": 0.001953125,
787
- "clip_ratio/high_mean": 0.001953125,
788
- "clip_ratio/low_mean": 0.0006510416860692203,
789
- "clip_ratio/low_min": 0.0006510416860692203,
790
- "clip_ratio/region_mean": 0.0026041667442768812,
791
- "entropy": 0.9268913269042969,
792
- "epoch": 1.7916666666666665,
793
- "grad_norm": 2.1574151515960693,
794
- "kl": 0.11732880026102066,
795
- "learning_rate": 8.000000000000001e-07,
796
- "loss": 0.0002514577645342797,
797
- "step": 43,
798
- "step_time": 0.22811048399989886
799
- },
800
- {
801
- "clip_ratio/high_max": 0.00390625,
802
- "clip_ratio/high_mean": 0.00390625,
803
- "clip_ratio/low_mean": 0.0,
804
- "clip_ratio/low_min": 0.0,
805
- "clip_ratio/region_mean": 0.00390625,
806
- "entropy": 1.145074486732483,
807
- "epoch": 1.8333333333333335,
808
- "grad_norm": 2.5536458492279053,
809
- "kl": 0.12928128242492676,
810
- "learning_rate": 7.000000000000001e-07,
811
- "loss": 0.0016053098952397704,
812
- "step": 44,
813
- "step_time": 0.22788419799985604
814
- },
815
- {
816
- "clip_ratio/high_max": 0.0006510416860692203,
817
- "clip_ratio/high_mean": 0.0006510416860692203,
818
- "clip_ratio/low_mean": 0.0,
819
- "clip_ratio/low_min": 0.0,
820
- "clip_ratio/region_mean": 0.0006510416860692203,
821
- "completions/clipped_ratio": 1.0,
822
- "completions/max_length": 384.0,
823
- "completions/max_terminated_length": 0.0,
824
- "completions/mean_length": 384.0,
825
- "completions/mean_terminated_length": 0.0,
826
- "completions/min_length": 384.0,
827
- "completions/min_terminated_length": 0.0,
828
- "entropy": 1.1803818941116333,
829
- "epoch": 1.875,
830
- "frac_reward_zero_std": 0.0,
831
- "grad_norm": 3.2263009548187256,
832
- "kl": 0.13209813833236694,
833
- "learning_rate": 6.000000000000001e-07,
834
- "loss": 0.1281612068414688,
835
- "num_tokens": 303192.0,
836
- "reward": -0.11374999582767487,
837
- "reward_std": 0.18029142916202545,
838
- "rewards/GeneratorRewardFunction/mean": -0.11374999582767487,
839
- "rewards/GeneratorRewardFunction/std": 0.18029142916202545,
840
- "step": 45,
841
- "step_time": 12.014625670999976
842
- },
843
- {
844
- "clip_ratio/high_max": 0.0026041667442768812,
845
- "clip_ratio/high_mean": 0.0026041667442768812,
846
- "clip_ratio/low_mean": 0.0,
847
- "clip_ratio/low_min": 0.0,
848
- "clip_ratio/region_mean": 0.0026041667442768812,
849
- "entropy": 1.6430233716964722,
850
- "epoch": 1.9166666666666665,
851
- "grad_norm": 2.463127851486206,
852
- "kl": 0.11944004148244858,
853
- "learning_rate": 5.000000000000001e-07,
854
- "loss": -0.01078779250383377,
855
- "step": 46,
856
- "step_time": 0.22117237800011935
857
- },
858
- {
859
- "clip_ratio/high_max": 0.0013020833721384406,
860
- "clip_ratio/high_mean": 0.0013020833721384406,
861
- "clip_ratio/low_mean": 0.0,
862
- "clip_ratio/low_min": 0.0,
863
- "clip_ratio/region_mean": 0.0013020833721384406,
864
- "entropy": 1.1240859031677246,
865
- "epoch": 1.9583333333333335,
866
- "grad_norm": 2.1054372787475586,
867
- "kl": 0.13911886513233185,
868
- "learning_rate": 4.0000000000000003e-07,
869
- "loss": 0.001417159684933722,
870
- "step": 47,
871
- "step_time": 0.2201927370001613
872
- },
873
- {
874
- "clip_ratio/high_max": 0.0032552082557231188,
875
- "clip_ratio/high_mean": 0.0032552082557231188,
876
- "clip_ratio/low_mean": 0.0,
877
- "clip_ratio/low_min": 0.0,
878
- "clip_ratio/region_mean": 0.0032552082557231188,
879
- "entropy": 1.3605166673660278,
880
- "epoch": 2.0,
881
- "grad_norm": 1.7440528869628906,
882
- "kl": 0.14588220417499542,
883
- "learning_rate": 3.0000000000000004e-07,
884
- "loss": -0.11717051267623901,
885
- "step": 48,
886
- "step_time": 0.21969574700005978
887
- },
888
- {
889
- "clip_ratio/high_max": 0.0,
890
- "clip_ratio/high_mean": 0.0,
891
- "clip_ratio/low_mean": 0.0,
892
- "clip_ratio/low_min": 0.0,
893
- "clip_ratio/region_mean": 0.0,
894
- "completions/clipped_ratio": 1.0,
895
- "completions/max_length": 384.0,
896
- "completions/max_terminated_length": 0.0,
897
- "completions/mean_length": 384.0,
898
- "completions/mean_terminated_length": 0.0,
899
- "completions/min_length": 384.0,
900
- "completions/min_terminated_length": 0.0,
901
- "entropy": 0.9781540036201477,
902
- "epoch": 2.0416666666666665,
903
- "frac_reward_zero_std": 0.0,
904
- "grad_norm": 2.4057631492614746,
905
- "kl": 0.14009785652160645,
906
- "learning_rate": 2.0000000000000002e-07,
907
- "loss": 0.06281977146863937,
908
- "num_tokens": 328804.0,
909
- "reward": -0.07187499850988388,
910
- "reward_std": 0.11617336422204971,
911
- "rewards/GeneratorRewardFunction/mean": -0.07187499850988388,
912
- "rewards/GeneratorRewardFunction/std": 0.11617336422204971,
913
- "step": 49,
914
- "step_time": 12.04901073699989
915
- },
916
- {
917
- "clip_ratio/high_max": 0.0013020833721384406,
918
- "clip_ratio/high_mean": 0.0013020833721384406,
919
- "clip_ratio/low_mean": 0.0,
920
- "clip_ratio/low_min": 0.0,
921
- "clip_ratio/region_mean": 0.0013020833721384406,
922
- "entropy": 1.6572185754776,
923
- "epoch": 2.0833333333333335,
924
- "grad_norm": 2.6693296432495117,
925
- "kl": 0.13599954545497894,
926
- "learning_rate": 1.0000000000000001e-07,
927
- "loss": -0.16521048545837402,
928
- "step": 50,
929
- "step_time": 0.23019724899995708
930
- }
931
- ],
932
- "logging_steps": 1,
933
- "max_steps": 50,
934
- "num_input_tokens_seen": 328804,
935
- "num_train_epochs": 3,
936
- "save_steps": 10,
937
- "stateful_callbacks": {
938
- "TrainerControl": {
939
- "args": {
940
- "should_epoch_stop": false,
941
- "should_evaluate": false,
942
- "should_log": false,
943
- "should_save": true,
944
- "should_training_stop": true
945
- },
946
- "attributes": {}
947
- }
948
- },
949
- "total_flos": 0.0,
950
- "train_batch_size": 4,
951
- "trial_name": null,
952
- "trial_params": null
953
- }
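The deleted `trainer_state.json` interleaves two kinds of entries: rollout steps that log `reward`, `reward_std`, and `num_tokens`, and gradient-only steps that log just `loss`/`kl`. A minimal sketch of pulling the reward trajectory back out of such a file (the `reward_trajectory` helper is written here for illustration, not part of any library):

```python
# Sketch: extract (step, reward) pairs from a trainer_state.json like the
# one deleted above. Only rollout steps carry a "reward" key.
def reward_trajectory(state: dict) -> list:
    """Return (step, reward) pairs from the log history, in logged order."""
    return [(e["step"], e["reward"]) for e in state["log_history"] if "reward" in e]

# Minimal in-memory fixture mirroring the deleted log (values copied from
# steps 49-50 above); in practice you would json.load() the checkpoint file.
state = {"log_history": [
    {"step": 49, "reward": -0.07187499850988388, "loss": 0.06281977146863937},
    {"step": 50, "loss": -0.16521048545837402},  # gradient-only step, no reward
]}
print(reward_trajectory(state))  # [(49, -0.07187499850988388)]
```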
self_play_hf_a10g_train/round_001/generator_train/checkpoint-50/training_args.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:31ec66b64f432daf7616434296713e432d134face96e308f2ebc175e2e26f025
- size 7249
self_play_hf_a10g_train/round_001/generator_train/final_model/chat_template.jinja DELETED
@@ -1,54 +0,0 @@
- {%- if tools %}
-     {{- '<|im_start|>system\n' }}
-     {%- if messages[0]['role'] == 'system' %}
-         {{- messages[0]['content'] }}
-     {%- else %}
-         {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
-     {%- endif %}
-     {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
-     {%- for tool in tools %}
-         {{- "\n" }}
-         {{- tool | tojson }}
-     {%- endfor %}
-     {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
- {%- else %}
-     {%- if messages[0]['role'] == 'system' %}
-         {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
-     {%- else %}
-         {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
-     {%- endif %}
- {%- endif %}
- {%- for message in messages %}
-     {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
-         {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
-     {%- elif message.role == "assistant" %}
-         {{- '<|im_start|>' + message.role }}
-         {%- if message.content %}
-             {{- '\n' + message.content }}
-         {%- endif %}
-         {%- for tool_call in message.tool_calls %}
-             {%- if tool_call.function is defined %}
-                 {%- set tool_call = tool_call.function %}
-             {%- endif %}
-             {{- '\n<tool_call>\n{"name": "' }}
-             {{- tool_call.name }}
-             {{- '", "arguments": ' }}
-             {{- tool_call.arguments | tojson }}
-             {{- '}\n</tool_call>' }}
-         {%- endfor %}
-         {{- '<|im_end|>\n' }}
-     {%- elif message.role == "tool" %}
-         {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
-             {{- '<|im_start|>user' }}
-         {%- endif %}
-         {{- '\n<tool_response>\n' }}
-         {{- message.content }}
-         {{- '\n</tool_response>' }}
-         {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
-             {{- '<|im_end|>\n' }}
-         {%- endif %}
-     {%- endif %}
- {%- endfor %}
- {%- if add_generation_prompt %}
-     {{- '<|im_start|>assistant\n' }}
- {%- endif %}
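On its plain path (no `tools`, no `tool_calls`), the deleted `chat_template.jinja` emits standard ChatML: a default Qwen system message if none is supplied, one `<|im_start|>role\n...<|im_end|>\n` block per message, and a trailing `<|im_start|>assistant\n` when `add_generation_prompt` is set. A minimal sketch of that path in Python (`render_chatml` is a hypothetical helper for illustration, not a transformers API):

```python
# Sketch of the no-tools path of the deleted Qwen chat template.
DEFAULT_SYSTEM = ("You are Qwen, created by Alibaba Cloud. "
                  "You are a helpful assistant.")

def render_chatml(messages, add_generation_prompt=True):
    # Prepend the default system message if the caller did not supply one.
    if not messages or messages[0]["role"] != "system":
        messages = [{"role": "system", "content": DEFAULT_SYSTEM}] + messages
    # One ChatML block per message.
    text = "".join(f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n"
                   for m in messages)
    # Open the assistant turn for generation.
    if add_generation_prompt:
        text += "<|im_start|>assistant\n"
    return text

print(render_chatml([{"role": "user", "content": "hi"}]))
```

In practice the real template is applied via the tokenizer's `apply_chat_template`; this sketch only mirrors the output layout.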
self_play_hf_a10g_train/round_001/generator_train/final_model/config.json DELETED
@@ -1,57 +0,0 @@
- {
-   "architectures": [
-     "Qwen2ForCausalLM"
-   ],
-   "attention_dropout": 0.0,
-   "bos_token_id": null,
-   "dtype": "float32",
-   "eos_token_id": 151645,
-   "hidden_act": "silu",
-   "hidden_size": 896,
-   "initializer_range": 0.02,
-   "intermediate_size": 4864,
-   "layer_types": [
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention",
-     "full_attention"
-   ],
-   "max_position_embeddings": 32768,
-   "max_window_layers": 21,
-   "model_type": "qwen2",
-   "num_attention_heads": 14,
-   "num_hidden_layers": 24,
-   "num_key_value_heads": 2,
-   "pad_token_id": 151643,
-   "rms_norm_eps": 1e-06,
-   "rope_parameters": {
-     "rope_theta": 1000000.0,
-     "rope_type": "default"
-   },
-   "sliding_window": null,
-   "tie_word_embeddings": true,
-   "transformers_version": "5.6.2",
-   "use_cache": false,
-   "use_sliding_window": false,
-   "vocab_size": 151936
- }
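The config above identifies the model as Qwen2-0.5B-class, and its shape fields are enough to sanity-check the ~1.98 GB float32 `model.safetensors` deleted below. A rough parameter-count sketch, assuming the standard Qwen2 layout (biases on q/k/v projections but not o_proj, tied input/output embeddings):

```python
# Estimate parameters of the deleted config and compare against the stored
# float32 safetensors size (1,976,163,472 bytes; the small gap is metadata).
hidden, inter, vocab = 896, 4864, 151936
layers, n_heads, n_kv = 24, 14, 2
head_dim = hidden // n_heads  # 64

embed = vocab * hidden  # tied with the LM head, so counted once
attn = (hidden * hidden + hidden                         # q_proj + bias
        + 2 * (hidden * n_kv * head_dim + n_kv * head_dim)  # k/v_proj + biases
        + hidden * hidden)                               # o_proj (no bias)
mlp = 3 * hidden * inter                                 # gate, up, down proj
norms = 2 * hidden                                       # two RMSNorms per layer

total = embed + layers * (attn + mlp + norms) + hidden   # + final RMSNorm
print(total)      # ~4.94e8 parameters
print(4 * total)  # float32 bytes, within ~32 kB of the stored file size
```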
self_play_hf_a10g_train/round_001/generator_train/final_model/generation_config.json DELETED
@@ -1,13 +0,0 @@
- {
-   "do_sample": true,
-   "eos_token_id": [
-     151645,
-     151643
-   ],
-   "pad_token_id": 151643,
-   "repetition_penalty": 1.1,
-   "temperature": 0.7,
-   "top_k": 20,
-   "top_p": 0.8,
-   "transformers_version": "5.6.2"
- }
self_play_hf_a10g_train/round_001/generator_train/final_model/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c6fa4eed67a84ce4076ba3848a078496971cd34ba048c794e52cc3b4aab54a27
- size 1976163472
self_play_hf_a10g_train/round_001/generator_train/final_model/tokenizer.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
- size 11421892
self_play_hf_a10g_train/round_001/generator_train/final_model/tokenizer_config.json DELETED
@@ -1,32 +0,0 @@
- {
-   "add_prefix_space": false,
-   "backend": "tokenizers",
-   "bos_token": null,
-   "clean_up_tokenization_spaces": false,
-   "eos_token": "<|im_end|>",
-   "errors": "replace",
-   "extra_special_tokens": [
-     "<|im_start|>",
-     "<|im_end|>",
-     "<|object_ref_start|>",
-     "<|object_ref_end|>",
-     "<|box_start|>",
-     "<|box_end|>",
-     "<|quad_start|>",
-     "<|quad_end|>",
-     "<|vision_start|>",
-     "<|vision_end|>",
-     "<|vision_pad|>",
-     "<|image_pad|>",
-     "<|video_pad|>"
-   ],
-   "is_local": false,
-   "local_files_only": false,
-   "model_max_length": 131072,
-   "pad_token": "<|endoftext|>",
-   "padding_side": "left",
-   "split_special_tokens": false,
-   "tokenizer_class": "Qwen2Tokenizer",
-   "truncation_side": "left",
-   "unk_token": null
- }
self_play_hf_a10g_train/round_001/generator_train/final_model/training_args.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:31ec66b64f432daf7616434296713e432d134face96e308f2ebc175e2e26f025
- size 7249