Instructions to use Lomesh7777/salespath-grpo with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use Lomesh7777/salespath-grpo with PEFT:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("unsloth/Qwen2.5-1.5B-Instruct")
model = PeftModel.from_pretrained(base_model, "Lomesh7777/salespath-grpo")
```
- Transformers
How to use Lomesh7777/salespath-grpo with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Lomesh7777/salespath-grpo")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("Lomesh7777/salespath-grpo", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use Lomesh7777/salespath-grpo with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "Lomesh7777/salespath-grpo"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Lomesh7777/salespath-grpo",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker
docker model run hf.co/Lomesh7777/salespath-grpo
- SGLang
How to use Lomesh7777/salespath-grpo with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "Lomesh7777/salespath-grpo" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Lomesh7777/salespath-grpo",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "Lomesh7777/salespath-grpo" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Lomesh7777/salespath-grpo",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
- Docker Model Runner
How to use Lomesh7777/salespath-grpo with Docker Model Runner:
docker model run hf.co/Lomesh7777/salespath-grpo
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.2195121951219512, | |
| "eval_steps": 500, | |
| "global_step": 50, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 131.0, | |
| "completions/max_terminated_length": 131.0, | |
| "completions/mean_length": 60.10416793823242, | |
| "completions/mean_terminated_length": 60.10416793823242, | |
| "completions/min_length": 28.0, | |
| "completions/min_terminated_length": 28.0, | |
| "entropy": 1.2584454119205475, | |
| "epoch": 0.024390243902439025, | |
| "frac_reward_zero_std": 0.1666666716337204, | |
| "grad_norm": 0.09302648901939392, | |
| "kl": 1.2248776783962967e-05, | |
| "learning_rate": 0.0, | |
| "loss": -0.0423424206674099, | |
| "num_tokens": 23029.0, | |
| "reward": 0.5082165002822876, | |
| "reward_std": 0.27811428904533386, | |
| "rewards/true_env_reward_fn/mean": 0.5082164406776428, | |
| "rewards/true_env_reward_fn/std": 0.27811428904533386, | |
| "step": 1, | |
| "step_time": 11.815711200999885 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 161.0, | |
| "completions/max_terminated_length": 161.0, | |
| "completions/mean_length": 55.875, | |
| "completions/mean_terminated_length": 55.875, | |
| "completions/min_length": 6.0, | |
| "completions/min_terminated_length": 6.0, | |
| "entropy": 1.3789870142936707, | |
| "epoch": 0.04878048780487805, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.11938872188329697, | |
| "kl": 1.2672078355535632e-05, | |
| "learning_rate": 2.4390243902439023e-08, | |
| "loss": -0.11833255738019943, | |
| "num_tokens": 57015.0, | |
| "reward": 0.1327376663684845, | |
| "reward_std": 0.241567462682724, | |
| "rewards/true_env_reward_fn/mean": 0.1327376663684845, | |
| "rewards/true_env_reward_fn/std": 0.241567462682724, | |
| "step": 2, | |
| "step_time": 13.493524850000085 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 124.0, | |
| "completions/max_terminated_length": 124.0, | |
| "completions/mean_length": 63.79166793823242, | |
| "completions/mean_terminated_length": 63.79166793823242, | |
| "completions/min_length": 7.0, | |
| "completions/min_terminated_length": 7.0, | |
| "entropy": 1.315225213766098, | |
| "epoch": 0.07317073170731707, | |
| "frac_reward_zero_std": 0.1666666716337204, | |
| "grad_norm": 0.08519645780324936, | |
| "kl": 1.2407871281538974e-05, | |
| "learning_rate": 4.878048780487805e-08, | |
| "loss": -0.03654177859425545, | |
| "num_tokens": 86989.0, | |
| "reward": 0.3152047097682953, | |
| "reward_std": 0.3069385886192322, | |
| "rewards/true_env_reward_fn/mean": 0.3152047097682953, | |
| "rewards/true_env_reward_fn/std": 0.30693864822387695, | |
| "step": 3, | |
| "step_time": 11.449303891999875 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 226.0, | |
| "completions/max_terminated_length": 226.0, | |
| "completions/mean_length": 77.20833587646484, | |
| "completions/mean_terminated_length": 77.20833587646484, | |
| "completions/min_length": 29.0, | |
| "completions/min_terminated_length": 29.0, | |
| "entropy": 1.338063895702362, | |
| "epoch": 0.0975609756097561, | |
| "frac_reward_zero_std": 0.1666666716337204, | |
| "grad_norm": 0.08080132305622101, | |
| "kl": 1.239982589140709e-05, | |
| "learning_rate": 7.317073170731706e-08, | |
| "loss": 0.053779490292072296, | |
| "num_tokens": 112007.0, | |
| "reward": 0.4893929362297058, | |
| "reward_std": 0.28476035594940186, | |
| "rewards/true_env_reward_fn/mean": 0.4893929064273834, | |
| "rewards/true_env_reward_fn/std": 0.28476035594940186, | |
| "step": 4, | |
| "step_time": 18.835909622000145 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 212.0, | |
| "completions/max_terminated_length": 212.0, | |
| "completions/mean_length": 67.41667175292969, | |
| "completions/mean_terminated_length": 67.41667175292969, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 37.0, | |
| "entropy": 1.3855182826519012, | |
| "epoch": 0.12195121951219512, | |
| "frac_reward_zero_std": 0.1666666716337204, | |
| "grad_norm": 0.08783729374408722, | |
| "kl": 1.1660237760224845e-05, | |
| "learning_rate": 9.75609756097561e-08, | |
| "loss": -0.026884621009230614, | |
| "num_tokens": 135883.0, | |
| "reward": 0.48575252294540405, | |
| "reward_std": 0.335994690656662, | |
| "rewards/true_env_reward_fn/mean": 0.48575249314308167, | |
| "rewards/true_env_reward_fn/std": 0.335994690656662, | |
| "step": 5, | |
| "step_time": 14.435845696000001 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 164.0, | |
| "completions/max_terminated_length": 164.0, | |
| "completions/mean_length": 71.29167175292969, | |
| "completions/mean_terminated_length": 71.29167175292969, | |
| "completions/min_length": 26.0, | |
| "completions/min_terminated_length": 26.0, | |
| "entropy": 1.2962585091590881, | |
| "epoch": 0.14634146341463414, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.08510823547840118, | |
| "kl": 1.241418908648484e-05, | |
| "learning_rate": 1.219512195121951e-07, | |
| "loss": -0.05353507027029991, | |
| "num_tokens": 157537.0, | |
| "reward": 0.47622889280319214, | |
| "reward_std": 0.3605790138244629, | |
| "rewards/true_env_reward_fn/mean": 0.47622886300086975, | |
| "rewards/true_env_reward_fn/std": 0.3605790138244629, | |
| "step": 6, | |
| "step_time": 13.232063896999989 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 114.0, | |
| "completions/max_terminated_length": 114.0, | |
| "completions/mean_length": 69.45833587646484, | |
| "completions/mean_terminated_length": 69.45833587646484, | |
| "completions/min_length": 44.0, | |
| "completions/min_terminated_length": 44.0, | |
| "entropy": 1.273663192987442, | |
| "epoch": 0.17073170731707318, | |
| "frac_reward_zero_std": 0.1666666716337204, | |
| "grad_norm": 0.0775279700756073, | |
| "kl": 1.2900356978207128e-05, | |
| "learning_rate": 1.4634146341463413e-07, | |
| "loss": -0.010494321584701538, | |
| "num_tokens": 179167.0, | |
| "reward": 0.5062826871871948, | |
| "reward_std": 0.18032674491405487, | |
| "rewards/true_env_reward_fn/mean": 0.5062826871871948, | |
| "rewards/true_env_reward_fn/std": 0.18032673001289368, | |
| "step": 7, | |
| "step_time": 9.810652986000036 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 287.0, | |
| "completions/max_terminated_length": 287.0, | |
| "completions/mean_length": 65.54167175292969, | |
| "completions/mean_terminated_length": 65.54167175292969, | |
| "completions/min_length": 35.0, | |
| "completions/min_terminated_length": 35.0, | |
| "entropy": 1.255563884973526, | |
| "epoch": 0.1951219512195122, | |
| "frac_reward_zero_std": 0.1666666716337204, | |
| "grad_norm": 0.07464194297790527, | |
| "kl": 1.1561841347429436e-05, | |
| "learning_rate": 1.7073170731707317e-07, | |
| "loss": 0.0830899029970169, | |
| "num_tokens": 201865.0, | |
| "reward": 0.38212963938713074, | |
| "reward_std": 0.29894331097602844, | |
| "rewards/true_env_reward_fn/mean": 0.38212963938713074, | |
| "rewards/true_env_reward_fn/std": 0.29894331097602844, | |
| "step": 8, | |
| "step_time": 19.874756868999953 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 122.0, | |
| "completions/max_terminated_length": 122.0, | |
| "completions/mean_length": 68.33333587646484, | |
| "completions/mean_terminated_length": 68.33333587646484, | |
| "completions/min_length": 38.0, | |
| "completions/min_terminated_length": 38.0, | |
| "entropy": 1.2706169188022614, | |
| "epoch": 0.21951219512195122, | |
| "frac_reward_zero_std": 0.6666666865348816, | |
| "grad_norm": 0.049192048609256744, | |
| "kl": 1.157601468548819e-05, | |
| "learning_rate": 1.951219512195122e-07, | |
| "loss": 0.010864660143852234, | |
| "num_tokens": 219953.0, | |
| "reward": 0.6740004420280457, | |
| "reward_std": 0.18809831142425537, | |
| "rewards/true_env_reward_fn/mean": 0.6740004420280457, | |
| "rewards/true_env_reward_fn/std": 0.18809829652309418, | |
| "step": 9, | |
| "step_time": 9.458149736999985 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 148.0, | |
| "completions/max_terminated_length": 148.0, | |
| "completions/mean_length": 59.833335876464844, | |
| "completions/mean_terminated_length": 59.833335876464844, | |
| "completions/min_length": 19.0, | |
| "completions/min_terminated_length": 19.0, | |
| "entropy": 1.1927059888839722, | |
| "epoch": 0.24390243902439024, | |
| "frac_reward_zero_std": 0.6666666865348816, | |
| "grad_norm": 0.0561092346906662, | |
| "kl": 1.0622535000948119e-05, | |
| "learning_rate": 2.195121951219512e-07, | |
| "loss": -0.02407176047563553, | |
| "num_tokens": 244913.0, | |
| "reward": 0.5113257169723511, | |
| "reward_std": 0.32156965136528015, | |
| "rewards/true_env_reward_fn/mean": 0.5113256573677063, | |
| "rewards/true_env_reward_fn/std": 0.32156962156295776, | |
| "step": 10, | |
| "step_time": 14.219840567000006 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 115.0, | |
| "completions/max_terminated_length": 115.0, | |
| "completions/mean_length": 65.47917175292969, | |
| "completions/mean_terminated_length": 65.47917175292969, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 36.0, | |
| "entropy": 1.2782267928123474, | |
| "epoch": 0.2682926829268293, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.05816411226987839, | |
| "kl": 1.2071807759639341e-05, | |
| "learning_rate": 2.439024390243902e-07, | |
| "loss": 0.007693461142480373, | |
| "num_tokens": 269080.0, | |
| "reward": 0.37106746435165405, | |
| "reward_std": 0.26608046889305115, | |
| "rewards/true_env_reward_fn/mean": 0.37106743454933167, | |
| "rewards/true_env_reward_fn/std": 0.26608046889305115, | |
| "step": 11, | |
| "step_time": 9.271131832999913 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 143.0, | |
| "completions/max_terminated_length": 143.0, | |
| "completions/mean_length": 67.9375, | |
| "completions/mean_terminated_length": 67.9375, | |
| "completions/min_length": 41.0, | |
| "completions/min_terminated_length": 41.0, | |
| "entropy": 1.3190773129463196, | |
| "epoch": 0.2926829268292683, | |
| "frac_reward_zero_std": 0.1666666716337204, | |
| "grad_norm": 0.09140665084123611, | |
| "kl": 1.2069708191120299e-05, | |
| "learning_rate": 2.682926829268293e-07, | |
| "loss": 0.07185906916856766, | |
| "num_tokens": 291317.0, | |
| "reward": 0.4376159906387329, | |
| "reward_std": 0.27247554063796997, | |
| "rewards/true_env_reward_fn/mean": 0.4376159906387329, | |
| "rewards/true_env_reward_fn/std": 0.27247554063796997, | |
| "step": 12, | |
| "step_time": 12.184364300000084 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 122.0, | |
| "completions/max_terminated_length": 122.0, | |
| "completions/mean_length": 66.54167175292969, | |
| "completions/mean_terminated_length": 66.54167175292969, | |
| "completions/min_length": 25.0, | |
| "completions/min_terminated_length": 25.0, | |
| "entropy": 1.3555113077163696, | |
| "epoch": 0.3170731707317073, | |
| "frac_reward_zero_std": 0.1666666716337204, | |
| "grad_norm": 0.08824986964464188, | |
| "kl": 1.2127976788178785e-05, | |
| "learning_rate": 2.9268292682926825e-07, | |
| "loss": -0.0217185840010643, | |
| "num_tokens": 313623.0, | |
| "reward": 0.5092746615409851, | |
| "reward_std": 0.3137436807155609, | |
| "rewards/true_env_reward_fn/mean": 0.5092746615409851, | |
| "rewards/true_env_reward_fn/std": 0.3137436509132385, | |
| "step": 13, | |
| "step_time": 10.720424850000086 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 188.0, | |
| "completions/max_terminated_length": 188.0, | |
| "completions/mean_length": 69.3125, | |
| "completions/mean_terminated_length": 69.3125, | |
| "completions/min_length": 47.0, | |
| "completions/min_terminated_length": 47.0, | |
| "entropy": 1.3283279240131378, | |
| "epoch": 0.34146341463414637, | |
| "frac_reward_zero_std": 0.6666666865348816, | |
| "grad_norm": 0.05055573210120201, | |
| "kl": 1.3128182672517141e-05, | |
| "learning_rate": 3.170731707317073e-07, | |
| "loss": -0.024722743779420853, | |
| "num_tokens": 339118.0, | |
| "reward": 0.45545920729637146, | |
| "reward_std": 0.18457132577896118, | |
| "rewards/true_env_reward_fn/mean": 0.45545920729637146, | |
| "rewards/true_env_reward_fn/std": 0.18457134068012238, | |
| "step": 14, | |
| "step_time": 14.965493325000011 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 181.0, | |
| "completions/max_terminated_length": 181.0, | |
| "completions/mean_length": 66.45833587646484, | |
| "completions/mean_terminated_length": 66.45833587646484, | |
| "completions/min_length": 24.0, | |
| "completions/min_terminated_length": 24.0, | |
| "entropy": 1.2629931271076202, | |
| "epoch": 0.36585365853658536, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.06967486441135406, | |
| "kl": 1.1465989928183262e-05, | |
| "learning_rate": 3.4146341463414634e-07, | |
| "loss": 0.046319857239723206, | |
| "num_tokens": 366364.0, | |
| "reward": 0.4448578357696533, | |
| "reward_std": 0.24966756999492645, | |
| "rewards/true_env_reward_fn/mean": 0.4448578357696533, | |
| "rewards/true_env_reward_fn/std": 0.24966755509376526, | |
| "step": 15, | |
| "step_time": 13.628413805999912 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 194.0, | |
| "completions/max_terminated_length": 194.0, | |
| "completions/mean_length": 69.04167175292969, | |
| "completions/mean_terminated_length": 69.04167175292969, | |
| "completions/min_length": 38.0, | |
| "completions/min_terminated_length": 38.0, | |
| "entropy": 1.2793545722961426, | |
| "epoch": 0.3902439024390244, | |
| "frac_reward_zero_std": 0.6666666865348816, | |
| "grad_norm": 0.04725664108991623, | |
| "kl": 1.1130929124192335e-05, | |
| "learning_rate": 3.6585365853658536e-07, | |
| "loss": 0.006799306720495224, | |
| "num_tokens": 392926.0, | |
| "reward": 0.414639949798584, | |
| "reward_std": 0.2748004198074341, | |
| "rewards/true_env_reward_fn/mean": 0.414639949798584, | |
| "rewards/true_env_reward_fn/std": 0.2748004198074341, | |
| "step": 16, | |
| "step_time": 14.229579036999894 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 195.0, | |
| "completions/max_terminated_length": 195.0, | |
| "completions/mean_length": 76.4375, | |
| "completions/mean_terminated_length": 76.4375, | |
| "completions/min_length": 31.0, | |
| "completions/min_terminated_length": 31.0, | |
| "entropy": 1.3106227219104767, | |
| "epoch": 0.4146341463414634, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.06872504949569702, | |
| "kl": 1.2065312830600305e-05, | |
| "learning_rate": 3.902439024390244e-07, | |
| "loss": 0.036527130752801895, | |
| "num_tokens": 419219.0, | |
| "reward": 0.49165210127830505, | |
| "reward_std": 0.267509400844574, | |
| "rewards/true_env_reward_fn/mean": 0.49165210127830505, | |
| "rewards/true_env_reward_fn/std": 0.267509400844574, | |
| "step": 17, | |
| "step_time": 17.023353198999985 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 159.0, | |
| "completions/max_terminated_length": 159.0, | |
| "completions/mean_length": 71.72917175292969, | |
| "completions/mean_terminated_length": 71.72917175292969, | |
| "completions/min_length": 39.0, | |
| "completions/min_terminated_length": 39.0, | |
| "entropy": 1.3780030608177185, | |
| "epoch": 0.43902439024390244, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.05453665927052498, | |
| "kl": 1.2325858278927626e-05, | |
| "learning_rate": 4.146341463414634e-07, | |
| "loss": 0.01989848166704178, | |
| "num_tokens": 442822.0, | |
| "reward": 0.5288735032081604, | |
| "reward_std": 0.2950553297996521, | |
| "rewards/true_env_reward_fn/mean": 0.5288735032081604, | |
| "rewards/true_env_reward_fn/std": 0.2950552701950073, | |
| "step": 18, | |
| "step_time": 11.965533113999868 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 106.0, | |
| "completions/max_terminated_length": 106.0, | |
| "completions/mean_length": 65.4375, | |
| "completions/mean_terminated_length": 65.4375, | |
| "completions/min_length": 42.0, | |
| "completions/min_terminated_length": 42.0, | |
| "entropy": 1.3424750864505768, | |
| "epoch": 0.4634146341463415, | |
| "frac_reward_zero_std": 0.1666666716337204, | |
| "grad_norm": 0.09814280271530151, | |
| "kl": 1.2686515219684225e-05, | |
| "learning_rate": 4.390243902439024e-07, | |
| "loss": 0.06940581649541855, | |
| "num_tokens": 467275.0, | |
| "reward": 0.5175753831863403, | |
| "reward_std": 0.2811976969242096, | |
| "rewards/true_env_reward_fn/mean": 0.5175753235816956, | |
| "rewards/true_env_reward_fn/std": 0.2811976969242096, | |
| "step": 19, | |
| "step_time": 10.33812468799988 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 244.0, | |
| "completions/max_terminated_length": 244.0, | |
| "completions/mean_length": 65.10417175292969, | |
| "completions/mean_terminated_length": 65.10417175292969, | |
| "completions/min_length": 34.0, | |
| "completions/min_terminated_length": 34.0, | |
| "entropy": 1.1681120097637177, | |
| "epoch": 0.4878048780487805, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.09474422037601471, | |
| "kl": 1.2183225862827385e-05, | |
| "learning_rate": 4.634146341463415e-07, | |
| "loss": 0.05423373728990555, | |
| "num_tokens": 494320.0, | |
| "reward": 0.48628994822502136, | |
| "reward_std": 0.25381213426589966, | |
| "rewards/true_env_reward_fn/mean": 0.48628994822502136, | |
| "rewards/true_env_reward_fn/std": 0.25381216406822205, | |
| "step": 20, | |
| "step_time": 17.317542748000164 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 157.0, | |
| "completions/max_terminated_length": 157.0, | |
| "completions/mean_length": 62.395835876464844, | |
| "completions/mean_terminated_length": 62.395835876464844, | |
| "completions/min_length": 28.0, | |
| "completions/min_terminated_length": 28.0, | |
| "entropy": 1.2504475116729736, | |
| "epoch": 0.5121951219512195, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.0819205492734909, | |
| "kl": 1.0698822279664455e-05, | |
| "learning_rate": 4.878048780487804e-07, | |
| "loss": 0.05607657879590988, | |
| "num_tokens": 518323.0, | |
| "reward": 0.4693639278411865, | |
| "reward_std": 0.32881346344947815, | |
| "rewards/true_env_reward_fn/mean": 0.4693639278411865, | |
| "rewards/true_env_reward_fn/std": 0.32881346344947815, | |
| "step": 21, | |
| "step_time": 12.20283881399996 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 144.0, | |
| "completions/max_terminated_length": 144.0, | |
| "completions/mean_length": 68.91667175292969, | |
| "completions/mean_terminated_length": 68.91667175292969, | |
| "completions/min_length": 31.0, | |
| "completions/min_terminated_length": 31.0, | |
| "entropy": 1.2199381291866302, | |
| "epoch": 0.5365853658536586, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.06889473646879196, | |
| "kl": 1.1745656820494332e-05, | |
| "learning_rate": 5.121951219512195e-07, | |
| "loss": -0.017973195761442184, | |
| "num_tokens": 543591.0, | |
| "reward": 0.49388420581817627, | |
| "reward_std": 0.2952423393726349, | |
| "rewards/true_env_reward_fn/mean": 0.49388420581817627, | |
| "rewards/true_env_reward_fn/std": 0.2952423095703125, | |
| "step": 22, | |
| "step_time": 11.211206898000114 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 107.0, | |
| "completions/max_terminated_length": 107.0, | |
| "completions/mean_length": 65.625, | |
| "completions/mean_terminated_length": 65.625, | |
| "completions/min_length": 41.0, | |
| "completions/min_terminated_length": 41.0, | |
| "entropy": 1.2588726878166199, | |
| "epoch": 0.5609756097560976, | |
| "frac_reward_zero_std": 0.1666666716337204, | |
| "grad_norm": 0.08144447952508926, | |
| "kl": 1.2306870758038713e-05, | |
| "learning_rate": 5.365853658536586e-07, | |
| "loss": 0.02826106920838356, | |
| "num_tokens": 567973.0, | |
| "reward": 0.48142755031585693, | |
| "reward_std": 0.26756224036216736, | |
| "rewards/true_env_reward_fn/mean": 0.48142755031585693, | |
| "rewards/true_env_reward_fn/std": 0.26756221055984497, | |
| "step": 23, | |
| "step_time": 10.428452587999914 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 122.0, | |
| "completions/max_terminated_length": 122.0, | |
| "completions/mean_length": 59.5625, | |
| "completions/mean_terminated_length": 59.5625, | |
| "completions/min_length": 21.0, | |
| "completions/min_terminated_length": 21.0, | |
| "entropy": 1.384379804134369, | |
| "epoch": 0.5853658536585366, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.11170398443937302, | |
| "kl": 1.2296073691686615e-05, | |
| "learning_rate": 5.609756097560975e-07, | |
| "loss": 0.07271970808506012, | |
| "num_tokens": 590248.0, | |
| "reward": 0.38166365027427673, | |
| "reward_std": 0.34809473156929016, | |
| "rewards/true_env_reward_fn/mean": 0.38166365027427673, | |
| "rewards/true_env_reward_fn/std": 0.3480947017669678, | |
| "step": 24, | |
| "step_time": 11.223491792000118 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 123.0, | |
| "completions/max_terminated_length": 123.0, | |
| "completions/mean_length": 63.35416793823242, | |
| "completions/mean_terminated_length": 63.35416793823242, | |
| "completions/min_length": 27.0, | |
| "completions/min_terminated_length": 27.0, | |
| "entropy": 1.3013385236263275, | |
| "epoch": 0.6097560975609756, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.10069931298494339, | |
| "kl": 1.2947949016961502e-05, | |
| "learning_rate": 5.853658536585365e-07, | |
| "loss": 0.033605337142944336, | |
| "num_tokens": 615345.0, | |
| "reward": 0.5046355724334717, | |
| "reward_std": 0.2754679322242737, | |
| "rewards/true_env_reward_fn/mean": 0.5046355128288269, | |
| "rewards/true_env_reward_fn/std": 0.2754679322242737, | |
| "step": 25, | |
| "step_time": 10.92509102200006 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 101.0, | |
| "completions/max_terminated_length": 101.0, | |
| "completions/mean_length": 61.41666793823242, | |
| "completions/mean_terminated_length": 61.41666793823242, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 37.0, | |
| "entropy": 1.2652399837970734, | |
| "epoch": 0.6341463414634146, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.07595694065093994, | |
| "kl": 1.151612354988174e-05, | |
| "learning_rate": 6.097560975609756e-07, | |
| "loss": 0.04607678949832916, | |
| "num_tokens": 644749.0, | |
| "reward": 0.3311978578567505, | |
| "reward_std": 0.21527718007564545, | |
| "rewards/true_env_reward_fn/mean": 0.3311978578567505, | |
| "rewards/true_env_reward_fn/std": 0.21527719497680664, | |
| "step": 26, | |
| "step_time": 10.458724108999945 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 140.0, | |
| "completions/max_terminated_length": 140.0, | |
| "completions/mean_length": 71.25, | |
| "completions/mean_terminated_length": 71.25, | |
| "completions/min_length": 28.0, | |
| "completions/min_terminated_length": 28.0, | |
| "entropy": 1.193794459104538, | |
| "epoch": 0.6585365853658537, | |
| "frac_reward_zero_std": 0.1666666716337204, | |
| "grad_norm": 0.07690244168043137, | |
| "kl": 1.2164698546257569e-05, | |
| "learning_rate": 6.341463414634146e-07, | |
| "loss": 0.00818883627653122, | |
| "num_tokens": 671153.0, | |
| "reward": 0.3635203242301941, | |
| "reward_std": 0.23849114775657654, | |
| "rewards/true_env_reward_fn/mean": 0.3635202944278717, | |
| "rewards/true_env_reward_fn/std": 0.23849113285541534, | |
| "step": 27, | |
| "step_time": 14.364785926000081 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 111.0, | |
| "completions/max_terminated_length": 111.0, | |
| "completions/mean_length": 63.4375, | |
| "completions/mean_terminated_length": 63.4375, | |
| "completions/min_length": 32.0, | |
| "completions/min_terminated_length": 32.0, | |
| "entropy": 1.2883787751197815, | |
| "epoch": 0.6829268292682927, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.0902288407087326, | |
| "kl": 1.1798915693361778e-05, | |
| "learning_rate": 6.585365853658536e-07, | |
| "loss": 0.038317371159791946, | |
| "num_tokens": 697614.0, | |
| "reward": 0.44166144728660583, | |
| "reward_std": 0.25748196244239807, | |
| "rewards/true_env_reward_fn/mean": 0.44166144728660583, | |
| "rewards/true_env_reward_fn/std": 0.25748199224472046, | |
| "step": 28, | |
| "step_time": 10.888908384999922 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 238.0, | |
| "completions/max_terminated_length": 238.0, | |
| "completions/mean_length": 69.60417175292969, | |
| "completions/mean_terminated_length": 69.60417175292969, | |
| "completions/min_length": 39.0, | |
| "completions/min_terminated_length": 39.0, | |
| "entropy": 1.3002805709838867, | |
| "epoch": 0.7073170731707317, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.07522639632225037, | |
| "kl": 1.2230455695316778e-05, | |
| "learning_rate": 6.829268292682927e-07, | |
| "loss": 0.031045034527778625, | |
| "num_tokens": 719187.0, | |
| "reward": 0.5349087119102478, | |
| "reward_std": 0.29909756779670715, | |
| "rewards/true_env_reward_fn/mean": 0.5349087119102478, | |
| "rewards/true_env_reward_fn/std": 0.29909753799438477, | |
| "step": 29, | |
| "step_time": 15.510035302999995 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 189.0, | |
| "completions/max_terminated_length": 189.0, | |
| "completions/mean_length": 70.91667175292969, | |
| "completions/mean_terminated_length": 70.91667175292969, | |
| "completions/min_length": 33.0, | |
| "completions/min_terminated_length": 33.0, | |
| "entropy": 1.2718828916549683, | |
| "epoch": 0.7317073170731707, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.06776711344718933, | |
| "kl": 1.2617916354429326e-05, | |
| "learning_rate": 7.073170731707316e-07, | |
| "loss": 0.09301326423883438, | |
| "num_tokens": 744095.0, | |
| "reward": 0.43472790718078613, | |
| "reward_std": 0.3138841986656189, | |
| "rewards/true_env_reward_fn/mean": 0.43472790718078613, | |
| "rewards/true_env_reward_fn/std": 0.3138841688632965, | |
| "step": 30, | |
| "step_time": 14.50245602599989 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 153.0, | |
| "completions/max_terminated_length": 153.0, | |
| "completions/mean_length": 69.77083587646484, | |
| "completions/mean_terminated_length": 69.77083587646484, | |
| "completions/min_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "entropy": 1.2918945252895355, | |
| "epoch": 0.7560975609756098, | |
| "frac_reward_zero_std": 0.1666666716337204, | |
| "grad_norm": 0.08040682971477509, | |
| "kl": 1.2672349157583085e-05, | |
| "learning_rate": 7.317073170731707e-07, | |
| "loss": 0.0367550291121006, | |
| "num_tokens": 764612.0, | |
| "reward": 0.5134401321411133, | |
| "reward_std": 0.19073942303657532, | |
| "rewards/true_env_reward_fn/mean": 0.5134401321411133, | |
| "rewards/true_env_reward_fn/std": 0.19073940813541412, | |
| "step": 31, | |
| "step_time": 11.06186091799998 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 269.0, | |
| "completions/max_terminated_length": 269.0, | |
| "completions/mean_length": 71.79167175292969, | |
| "completions/mean_terminated_length": 71.79167175292969, | |
| "completions/min_length": 21.0, | |
| "completions/min_terminated_length": 21.0, | |
| "entropy": 1.1679067015647888, | |
| "epoch": 0.7804878048780488, | |
| "frac_reward_zero_std": 0.1666666716337204, | |
| "grad_norm": 0.0744430273771286, | |
| "kl": 1.2661263326663175e-05, | |
| "learning_rate": 7.560975609756097e-07, | |
| "loss": 0.05885648727416992, | |
| "num_tokens": 782058.0, | |
| "reward": 0.5372593402862549, | |
| "reward_std": 0.18350909650325775, | |
| "rewards/true_env_reward_fn/mean": 0.5372593402862549, | |
| "rewards/true_env_reward_fn/std": 0.18350908160209656, | |
| "step": 32, | |
| "step_time": 15.808748693000211 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 265.0, | |
| "completions/max_terminated_length": 265.0, | |
| "completions/mean_length": 76.79167175292969, | |
| "completions/mean_terminated_length": 76.79167175292969, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 37.0, | |
| "entropy": 1.1829756796360016, | |
| "epoch": 0.8048780487804879, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.051698025315999985, | |
| "kl": 1.0996191576850833e-05, | |
| "learning_rate": 7.804878048780488e-07, | |
| "loss": 0.010143717750906944, | |
| "num_tokens": 810472.0, | |
| "reward": 0.4369215667247772, | |
| "reward_std": 0.30869919061660767, | |
| "rewards/true_env_reward_fn/mean": 0.4369215667247772, | |
| "rewards/true_env_reward_fn/std": 0.30869919061660767, | |
| "step": 33, | |
| "step_time": 24.20358999299981 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 137.0, | |
| "completions/max_terminated_length": 137.0, | |
| "completions/mean_length": 61.85416793823242, | |
| "completions/mean_terminated_length": 61.85416793823242, | |
| "completions/min_length": 31.0, | |
| "completions/min_terminated_length": 31.0, | |
| "entropy": 1.2468958497047424, | |
| "epoch": 0.8292682926829268, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.09706687182188034, | |
| "kl": 1.2097383432774222e-05, | |
| "learning_rate": 8.048780487804878e-07, | |
| "loss": 0.026558157056570053, | |
| "num_tokens": 836713.0, | |
| "reward": 0.3587157428264618, | |
| "reward_std": 0.2754887044429779, | |
| "rewards/true_env_reward_fn/mean": 0.3587157428264618, | |
| "rewards/true_env_reward_fn/std": 0.2754887044429779, | |
| "step": 34, | |
| "step_time": 12.218407348999904 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 115.0, | |
| "completions/max_terminated_length": 115.0, | |
| "completions/mean_length": 59.5625, | |
| "completions/mean_terminated_length": 59.5625, | |
| "completions/min_length": 33.0, | |
| "completions/min_terminated_length": 33.0, | |
| "entropy": 1.2368170320987701, | |
| "epoch": 0.8536585365853658, | |
| "frac_reward_zero_std": 0.1666666716337204, | |
| "grad_norm": 0.08964981138706207, | |
| "kl": 1.3131634887031396e-05, | |
| "learning_rate": 8.292682926829268e-07, | |
| "loss": -0.01139204390347004, | |
| "num_tokens": 860028.0, | |
| "reward": 0.49109315872192383, | |
| "reward_std": 0.20359393954277039, | |
| "rewards/true_env_reward_fn/mean": 0.49109315872192383, | |
| "rewards/true_env_reward_fn/std": 0.20359393954277039, | |
| "step": 35, | |
| "step_time": 9.66908789599995 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 102.0, | |
| "completions/max_terminated_length": 102.0, | |
| "completions/mean_length": 66.02083587646484, | |
| "completions/mean_terminated_length": 66.02083587646484, | |
| "completions/min_length": 41.0, | |
| "completions/min_terminated_length": 41.0, | |
| "entropy": 1.1611860394477844, | |
| "epoch": 0.8780487804878049, | |
| "frac_reward_zero_std": 0.1666666716337204, | |
| "grad_norm": 0.08851195871829987, | |
| "kl": 1.2570341596074286e-05, | |
| "learning_rate": 8.536585365853657e-07, | |
| "loss": 0.021737128496170044, | |
| "num_tokens": 883189.0, | |
| "reward": 0.46058258414268494, | |
| "reward_std": 0.2632383108139038, | |
| "rewards/true_env_reward_fn/mean": 0.46058258414268494, | |
| "rewards/true_env_reward_fn/std": 0.2632383108139038, | |
| "step": 36, | |
| "step_time": 8.370980583999994 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 135.0, | |
| "completions/max_terminated_length": 135.0, | |
| "completions/mean_length": 75.58333587646484, | |
| "completions/mean_terminated_length": 75.58333587646484, | |
| "completions/min_length": 47.0, | |
| "completions/min_terminated_length": 47.0, | |
| "entropy": 1.37085822224617, | |
| "epoch": 0.9024390243902439, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.05852028727531433, | |
| "kl": 1.2957561011717189e-05, | |
| "learning_rate": 8.780487804878048e-07, | |
| "loss": -0.024281952530145645, | |
| "num_tokens": 906801.0, | |
| "reward": 0.5022324323654175, | |
| "reward_std": 0.11637427657842636, | |
| "rewards/true_env_reward_fn/mean": 0.5022324323654175, | |
| "rewards/true_env_reward_fn/std": 0.11637428402900696, | |
| "step": 37, | |
| "step_time": 10.285125336999727 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 141.0, | |
| "completions/max_terminated_length": 141.0, | |
| "completions/mean_length": 65.14583587646484, | |
| "completions/mean_terminated_length": 65.14583587646484, | |
| "completions/min_length": 30.0, | |
| "completions/min_terminated_length": 30.0, | |
| "entropy": 1.2760809361934662, | |
| "epoch": 0.926829268292683, | |
| "frac_reward_zero_std": 0.1666666716337204, | |
| "grad_norm": 0.09105321019887924, | |
| "kl": 1.3129126955391257e-05, | |
| "learning_rate": 9.024390243902439e-07, | |
| "loss": -0.011838603764772415, | |
| "num_tokens": 929536.0, | |
| "reward": 0.49639374017715454, | |
| "reward_std": 0.32166802883148193, | |
| "rewards/true_env_reward_fn/mean": 0.49639371037483215, | |
| "rewards/true_env_reward_fn/std": 0.32166802883148193, | |
| "step": 38, | |
| "step_time": 12.449738128000035 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 124.0, | |
| "completions/max_terminated_length": 124.0, | |
| "completions/mean_length": 72.08333587646484, | |
| "completions/mean_terminated_length": 72.08333587646484, | |
| "completions/min_length": 43.0, | |
| "completions/min_terminated_length": 43.0, | |
| "entropy": 1.2545586228370667, | |
| "epoch": 0.9512195121951219, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.06919296830892563, | |
| "kl": 1.459557256566768e-05, | |
| "learning_rate": 9.26829268292683e-07, | |
| "loss": 0.021831180900335312, | |
| "num_tokens": 950388.0, | |
| "reward": 0.4879913330078125, | |
| "reward_std": 0.24854585528373718, | |
| "rewards/true_env_reward_fn/mean": 0.4879913330078125, | |
| "rewards/true_env_reward_fn/std": 0.24854585528373718, | |
| "step": 39, | |
| "step_time": 10.279209028999958 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 179.0, | |
| "completions/max_terminated_length": 179.0, | |
| "completions/mean_length": 74.20833587646484, | |
| "completions/mean_terminated_length": 74.20833587646484, | |
| "completions/min_length": 38.0, | |
| "completions/min_terminated_length": 38.0, | |
| "entropy": 1.2255937159061432, | |
| "epoch": 0.975609756097561, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.06352153420448303, | |
| "kl": 1.2041192348988261e-05, | |
| "learning_rate": 9.512195121951218e-07, | |
| "loss": -0.013997981324791908, | |
| "num_tokens": 981254.0, | |
| "reward": 0.39802420139312744, | |
| "reward_std": 0.20212584733963013, | |
| "rewards/true_env_reward_fn/mean": 0.39802420139312744, | |
| "rewards/true_env_reward_fn/std": 0.20212584733963013, | |
| "step": 40, | |
| "step_time": 13.58010066599968 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 139.0, | |
| "completions/max_terminated_length": 139.0, | |
| "completions/mean_length": 75.04167175292969, | |
| "completions/mean_terminated_length": 75.04167175292969, | |
| "completions/min_length": 38.0, | |
| "completions/min_terminated_length": 38.0, | |
| "entropy": 1.2703719735145569, | |
| "epoch": 1.0, | |
| "frac_reward_zero_std": 0.6666666865348816, | |
| "grad_norm": 0.045169439166784286, | |
| "kl": 1.1270850109212915e-05, | |
| "learning_rate": 9.756097560975609e-07, | |
| "loss": -0.010194316506385803, | |
| "num_tokens": 1009968.0, | |
| "reward": 0.4517599940299988, | |
| "reward_std": 0.11791092902421951, | |
| "rewards/true_env_reward_fn/mean": 0.4517599642276764, | |
| "rewards/true_env_reward_fn/std": 0.11791091412305832, | |
| "step": 41, | |
| "step_time": 10.35077203700007 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 109.0, | |
| "completions/max_terminated_length": 109.0, | |
| "completions/mean_length": 64.33333587646484, | |
| "completions/mean_terminated_length": 64.33333587646484, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 37.0, | |
| "entropy": 1.329576164484024, | |
| "epoch": 1.024390243902439, | |
| "frac_reward_zero_std": 0.1666666716337204, | |
| "grad_norm": 0.08522730320692062, | |
| "kl": 1.4469044799625408e-05, | |
| "learning_rate": 1e-06, | |
| "loss": -0.00014946190640330315, | |
| "num_tokens": 1039032.0, | |
| "reward": 0.33548423647880554, | |
| "reward_std": 0.22271563112735748, | |
| "rewards/true_env_reward_fn/mean": 0.33548423647880554, | |
| "rewards/true_env_reward_fn/std": 0.22271563112735748, | |
| "step": 42, | |
| "step_time": 10.548370664999993 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 372.0, | |
| "completions/max_terminated_length": 372.0, | |
| "completions/mean_length": 70.02083587646484, | |
| "completions/mean_terminated_length": 70.02083587646484, | |
| "completions/min_length": 24.0, | |
| "completions/min_terminated_length": 24.0, | |
| "entropy": 1.2357364892959595, | |
| "epoch": 1.048780487804878, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.07030358910560608, | |
| "kl": 1.3562755839302554e-05, | |
| "learning_rate": 9.999818789066163e-07, | |
| "loss": -0.02616041898727417, | |
| "num_tokens": 1060833.0, | |
| "reward": 0.5167371034622192, | |
| "reward_std": 0.24280032515525818, | |
| "rewards/true_env_reward_fn/mean": 0.5167370438575745, | |
| "rewards/true_env_reward_fn/std": 0.24280032515525818, | |
| "step": 43, | |
| "step_time": 24.089396637999698 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 234.0, | |
| "completions/max_terminated_length": 234.0, | |
| "completions/mean_length": 77.47917175292969, | |
| "completions/mean_terminated_length": 77.47917175292969, | |
| "completions/min_length": 14.0, | |
| "completions/min_terminated_length": 14.0, | |
| "entropy": 1.1693778038024902, | |
| "epoch": 1.0731707317073171, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.07017157226800919, | |
| "kl": 1.332453393843025e-05, | |
| "learning_rate": 9.999275169399612e-07, | |
| "loss": -0.006466507911682129, | |
| "num_tokens": 1088648.0, | |
| "reward": 0.4498252272605896, | |
| "reward_std": 0.21398545801639557, | |
| "rewards/true_env_reward_fn/mean": 0.4498251974582672, | |
| "rewards/true_env_reward_fn/std": 0.21398545801639557, | |
| "step": 44, | |
| "step_time": 19.39071501599983 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 186.0, | |
| "completions/max_terminated_length": 186.0, | |
| "completions/mean_length": 72.16667175292969, | |
| "completions/mean_terminated_length": 72.16667175292969, | |
| "completions/min_length": 41.0, | |
| "completions/min_terminated_length": 41.0, | |
| "entropy": 1.3268415927886963, | |
| "epoch": 1.0975609756097562, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.06632921099662781, | |
| "kl": 1.4458733630817733e-05, | |
| "learning_rate": 9.99836918040428e-07, | |
| "loss": -0.03534461930394173, | |
| "num_tokens": 1117096.0, | |
| "reward": 0.4053138196468353, | |
| "reward_std": 0.21476909518241882, | |
| "rewards/true_env_reward_fn/mean": 0.4053138196468353, | |
| "rewards/true_env_reward_fn/std": 0.21476909518241882, | |
| "step": 45, | |
| "step_time": 13.893569495999827 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 160.0, | |
| "completions/max_terminated_length": 160.0, | |
| "completions/mean_length": 70.16667175292969, | |
| "completions/mean_terminated_length": 70.16667175292969, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 36.0, | |
| "entropy": 1.2670875787734985, | |
| "epoch": 1.1219512195121952, | |
| "frac_reward_zero_std": 0.1666666716337204, | |
| "grad_norm": 0.08321154117584229, | |
| "kl": 1.4837954950053245e-05, | |
| "learning_rate": 9.997100887750215e-07, | |
| "loss": -0.039235007017850876, | |
| "num_tokens": 1136480.0, | |
| "reward": 0.48141974210739136, | |
| "reward_std": 0.2837103307247162, | |
| "rewards/true_env_reward_fn/mean": 0.48141971230506897, | |
| "rewards/true_env_reward_fn/std": 0.2837103009223938, | |
| "step": 46, | |
| "step_time": 10.50698806499986 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 159.0, | |
| "completions/max_terminated_length": 159.0, | |
| "completions/mean_length": 76.1875, | |
| "completions/mean_terminated_length": 76.1875, | |
| "completions/min_length": 32.0, | |
| "completions/min_terminated_length": 32.0, | |
| "entropy": 1.3037313222885132, | |
| "epoch": 1.146341463414634, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.061912886798381805, | |
| "kl": 1.283655774386716e-05, | |
| "learning_rate": 9.995470383368808e-07, | |
| "loss": -0.01992109790444374, | |
| "num_tokens": 1162249.0, | |
| "reward": 0.49922606348991394, | |
| "reward_std": 0.2621309757232666, | |
| "rewards/true_env_reward_fn/mean": 0.49922606348991394, | |
| "rewards/true_env_reward_fn/std": 0.2621309757232666, | |
| "step": 47, | |
| "step_time": 12.964419044000124 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 231.0, | |
| "completions/max_terminated_length": 231.0, | |
| "completions/mean_length": 71.375, | |
| "completions/mean_terminated_length": 71.375, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 37.0, | |
| "entropy": 1.2007178366184235, | |
| "epoch": 1.170731707317073, | |
| "frac_reward_zero_std": 0.1666666716337204, | |
| "grad_norm": 0.0889662653207779, | |
| "kl": 1.6228528693318367e-05, | |
| "learning_rate": 9.993477785446149e-07, | |
| "loss": 0.045945264399051666, | |
| "num_tokens": 1184555.0, | |
| "reward": 0.42501482367515564, | |
| "reward_std": 0.27350595593452454, | |
| "rewards/true_env_reward_fn/mean": 0.42501482367515564, | |
| "rewards/true_env_reward_fn/std": 0.27350592613220215, | |
| "step": 48, | |
| "step_time": 17.23041258299986 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 86.0, | |
| "completions/max_terminated_length": 86.0, | |
| "completions/mean_length": 55.9375, | |
| "completions/mean_terminated_length": 55.9375, | |
| "completions/min_length": 31.0, | |
| "completions/min_terminated_length": 31.0, | |
| "entropy": 1.182040810585022, | |
| "epoch": 1.1951219512195121, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.08547856658697128, | |
| "kl": 1.571832831359643e-05, | |
| "learning_rate": 9.991123238414453e-07, | |
| "loss": 0.02548346482217312, | |
| "num_tokens": 1208384.0, | |
| "reward": 0.3845663070678711, | |
| "reward_std": 0.315467894077301, | |
| "rewards/true_env_reward_fn/mean": 0.3845663070678711, | |
| "rewards/true_env_reward_fn/std": 0.31546786427497864, | |
| "step": 49, | |
| "step_time": 8.691208415999881 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 109.0, | |
| "completions/max_terminated_length": 109.0, | |
| "completions/mean_length": 64.75, | |
| "completions/mean_terminated_length": 64.75, | |
| "completions/min_length": 34.0, | |
| "completions/min_terminated_length": 34.0, | |
| "entropy": 1.2306177020072937, | |
| "epoch": 1.2195121951219512, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.07395736873149872, | |
| "kl": 1.2643881973417592e-05, | |
| "learning_rate": 9.988406912941589e-07, | |
| "loss": -0.04186868295073509, | |
| "num_tokens": 1227700.0, | |
| "reward": 0.5068289637565613, | |
| "reward_std": 0.31324177980422974, | |
| "rewards/true_env_reward_fn/mean": 0.5068289637565613, | |
| "rewards/true_env_reward_fn/std": 0.31324175000190735, | |
| "step": 50, | |
| "step_time": 10.162109979000206 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 410, | |
| "num_input_tokens_seen": 1227700, | |
| "num_train_epochs": 10, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 12, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |