Jerry999 committed 6 days ago

Commit

02dfe15

verified ·

1 Parent(s): 987f783

Upload checkpoints/math_operations/lora_sft_primitive_atomic_50k

Browse files

Files changed (50) hide show

.gitattributes +4 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/adapter_config.json +46 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/adapter_model.safetensors +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/chat_template.jinja +4 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/optimizer.pt +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/rng_state.pth +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/scheduler.pt +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/tokenizer.json +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/tokenizer_config.json +29 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/tokens_state. +1 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/trainer_state.json +1500 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/training_args.bin +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/adapter_config.json +46 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/adapter_model.safetensors +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/chat_template.jinja +4 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/optimizer.pt +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/rng_state.pth +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/scheduler.pt +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/tokenizer.json +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/tokenizer_config.json +29 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/tokens_state. +1 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/trainer_state.json +2966 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/training_args.bin +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/adapter_config.json +46 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/adapter_model.safetensors +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/chat_template.jinja +4 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/optimizer.pt +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/rng_state.pth +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/scheduler.pt +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/tokenizer.json +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/tokenizer_config.json +29 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/tokens_state. +1 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/trainer_state.json +0 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/training_args.bin +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/adapter_config.json +46 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/adapter_model.safetensors +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/chat_template.jinja +4 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/optimizer.pt +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/rng_state.pth +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/scheduler.pt +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/tokenizer.json +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/tokenizer_config.json +29 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/tokens_state. +1 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/trainer_state.json +0 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/training_args.bin +3 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/debug.log +0 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/eval_results_easy_ops/balanced_test_alpaca_converted.jsonl +0 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/eval_results_easy_ops/balanced_test_alpaca_results.jsonl +0 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/eval_results_easy_ops/eval_results.csv +2 -0
checkpoints/math_operations/lora_sft_primitive_atomic_50k/eval_results_easy_ops/eval_summary.json +133 -0

.gitattributes CHANGED Viewed

@@ -63,3 +63,7 @@ checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/checkpoint-1248/tokenizer.j
 checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/checkpoint-1274/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/checkpoint-1300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/math_operations/base_model_eval/eval_results_easy_ops/balanced_test_alpaca_results.jsonl filter=lfs diff=lfs merge=lfs -text

 checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/checkpoint-1274/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/checkpoint-1300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/math_operations/base_model_eval/eval_results_easy_ops/balanced_test_alpaca_results.jsonl filter=lfs diff=lfs merge=lfs -text
+checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/tokenizer.json filter=lfs diff=lfs merge=lfs -text

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "q_proj",
+    "up_proj"
+  ],
+  "target_parameters": [],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:379e91d5a96f500546d4939abf324418d8037973997e317959dee26b1120871d
+size 264308896

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,4 @@

+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
+' + message['content'] + '<|im_end|>' + '
+'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
+' }}{% endif %}

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5c868a01dbfce25472c802e9bb70d445af05f005b7200ed0296eeb2afef96ff3
+size 528915403

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ac2f7da54075bc45ef2073674c010a395ea84101521997fd9e15096792e2601
+size 14645

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2edfde6394729c32e1a44395988017d981d5c693031d697a92775ac9a22761ec
+size 1465

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": true,
+  "model_max_length": 1010000,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/tokens_state. ADDED Viewed

	@@ -0,0 +1 @@


+{"total": 16904192, "trainable": 5351770}

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1500 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.000969696969697,
+  "eval_steps": 516,
+  "global_step": 1031,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0,
+      "eval_loss": 0.8898435831069946,
+      "eval_ppl": 2.43475,
+      "eval_runtime": 12.6383,
+      "eval_samples_per_second": 15.825,
+      "eval_steps_per_second": 7.912,
+      "memory/device_reserved (GiB)": 13.84,
+      "memory/max_active (GiB)": 13.69,
+      "memory/max_allocated (GiB)": 13.69,
+      "step": 0
+    },
+    {
+      "epoch": 0.009696969696969697,
+      "grad_norm": 2.995619058609009,
+      "learning_rate": 3.4951456310679615e-06,
+      "loss": 0.8680612564086914,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 2.38229,
+      "step": 10,
+      "tokens/total": 163840,
+      "tokens/train_per_sec_per_gpu": 14.27,
+      "tokens/trainable": 51990
+    },
+    {
+      "epoch": 0.019393939393939394,
+      "grad_norm": 2.1244935989379883,
+      "learning_rate": 7.378640776699029e-06,
+      "loss": 0.7699687004089355,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 2.1597,
+      "step": 20,
+      "tokens/total": 327680,
+      "tokens/train_per_sec_per_gpu": 16.06,
+      "tokens/trainable": 104391
+    },
+    {
+      "epoch": 0.02909090909090909,
+      "grad_norm": 0.9706138372421265,
+      "learning_rate": 1.1262135922330098e-05,
+      "loss": 0.5319457054138184,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.70224,
+      "step": 30,
+      "tokens/total": 491520,
+      "tokens/train_per_sec_per_gpu": 16.48,
+      "tokens/trainable": 156787
+    },
+    {
+      "epoch": 0.03878787878787879,
+      "grad_norm": 0.7689842581748962,
+      "learning_rate": 1.5145631067961166e-05,
+      "loss": 0.30234951972961427,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.35303,
+      "step": 40,
+      "tokens/total": 655360,
+      "tokens/train_per_sec_per_gpu": 14.84,
+      "tokens/trainable": 208924
+    },
+    {
+      "epoch": 0.048484848484848485,
+      "grad_norm": 0.45850396156311035,
+      "learning_rate": 1.9029126213592234e-05,
+      "loss": 0.1519382953643799,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.16409,
+      "step": 50,
+      "tokens/total": 819200,
+      "tokens/train_per_sec_per_gpu": 14.61,
+      "tokens/trainable": 261170
+    },
+    {
+      "epoch": 0.05818181818181818,
+      "grad_norm": 0.41381561756134033,
+      "learning_rate": 2.29126213592233e-05,
+      "loss": 0.062263429164886475,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.06424,
+      "step": 60,
+      "tokens/total": 983040,
+      "tokens/train_per_sec_per_gpu": 14.19,
+      "tokens/trainable": 313808
+    },
+    {
+      "epoch": 0.06787878787878789,
+      "grad_norm": 0.4865979254245758,
+      "learning_rate": 2.6796116504854367e-05,
+      "loss": 0.018695920705795288,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.01887,
+      "step": 70,
+      "tokens/total": 1146880,
+      "tokens/train_per_sec_per_gpu": 14.62,
+      "tokens/trainable": 366068
+    },
+    {
+      "epoch": 0.07757575757575758,
+      "grad_norm": 0.39099738001823425,
+      "learning_rate": 3.067961165048544e-05,
+      "loss": 0.006136053055524826,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00615,
+      "step": 80,
+      "tokens/total": 1310720,
+      "tokens/train_per_sec_per_gpu": 13.81,
+      "tokens/trainable": 418120
+    },
+    {
+      "epoch": 0.08727272727272728,
+      "grad_norm": 0.08230593055486679,
+      "learning_rate": 3.456310679611651e-05,
+      "loss": 0.004204501211643219,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00421,
+      "step": 90,
+      "tokens/total": 1474560,
+      "tokens/train_per_sec_per_gpu": 15.07,
+      "tokens/trainable": 470244
+    },
+    {
+      "epoch": 0.09696969696969697,
+      "grad_norm": 0.13297680020332336,
+      "learning_rate": 3.844660194174757e-05,
+      "loss": 0.0036250378936529158,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00363,
+      "step": 100,
+      "tokens/total": 1638400,
+      "tokens/train_per_sec_per_gpu": 14.91,
+      "tokens/trainable": 522666
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.2430051565170288,
+      "learning_rate": 4.2330097087378647e-05,
+      "loss": 0.003873714804649353,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00388,
+      "step": 110,
+      "tokens/total": 1802240,
+      "tokens/train_per_sec_per_gpu": 14.17,
+      "tokens/trainable": 574329
+    },
+    {
+      "epoch": 0.11636363636363636,
+      "grad_norm": 0.09347938001155853,
+      "learning_rate": 4.621359223300971e-05,
+      "loss": 0.00237951148301363,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00238,
+      "step": 120,
+      "tokens/total": 1966080,
+      "tokens/train_per_sec_per_gpu": 14.33,
+      "tokens/trainable": 626194
+    },
+    {
+      "epoch": 0.12606060606060607,
+      "grad_norm": 0.13388365507125854,
+      "learning_rate": 5.0097087378640786e-05,
+      "loss": 0.0015400107949972153,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00154,
+      "step": 130,
+      "tokens/total": 2129920,
+      "tokens/train_per_sec_per_gpu": 14.01,
+      "tokens/trainable": 678140
+    },
+    {
+      "epoch": 0.13575757575757577,
+      "grad_norm": 0.13342970609664917,
+      "learning_rate": 5.398058252427185e-05,
+      "loss": 0.001996887102723122,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.002,
+      "step": 140,
+      "tokens/total": 2293760,
+      "tokens/train_per_sec_per_gpu": 14.41,
+      "tokens/trainable": 730201
+    },
+    {
+      "epoch": 0.14545454545454545,
+      "grad_norm": 0.0299234539270401,
+      "learning_rate": 5.786407766990292e-05,
+      "loss": 0.0015132850036025046,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00151,
+      "step": 150,
+      "tokens/total": 2457600,
+      "tokens/train_per_sec_per_gpu": 15.8,
+      "tokens/trainable": 782196
+    },
+    {
+      "epoch": 0.15515151515151515,
+      "grad_norm": 0.04437975212931633,
+      "learning_rate": 6.174757281553398e-05,
+      "loss": 0.0012883609160780907,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00129,
+      "step": 160,
+      "tokens/total": 2621440,
+      "tokens/train_per_sec_per_gpu": 14.64,
+      "tokens/trainable": 833614
+    },
+    {
+      "epoch": 0.16484848484848486,
+      "grad_norm": 0.014039761386811733,
+      "learning_rate": 6.563106796116505e-05,
+      "loss": 0.0011639594100415706,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00116,
+      "step": 170,
+      "tokens/total": 2785280,
+      "tokens/train_per_sec_per_gpu": 13.95,
+      "tokens/trainable": 885591
+    },
+    {
+      "epoch": 0.17454545454545456,
+      "grad_norm": 0.0033261056523770094,
+      "learning_rate": 6.951456310679612e-05,
+      "loss": 0.0007388167083263397,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00074,
+      "step": 180,
+      "tokens/total": 2949120,
+      "tokens/train_per_sec_per_gpu": 14.37,
+      "tokens/trainable": 937712
+    },
+    {
+      "epoch": 0.18424242424242424,
+      "grad_norm": 0.010476192459464073,
+      "learning_rate": 7.339805825242719e-05,
+      "loss": 0.0008642122149467469,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00086,
+      "step": 190,
+      "tokens/total": 3112960,
+      "tokens/train_per_sec_per_gpu": 15.52,
+      "tokens/trainable": 989913
+    },
+    {
+      "epoch": 0.19393939393939394,
+      "grad_norm": 0.01253255270421505,
+      "learning_rate": 7.728155339805826e-05,
+      "loss": 0.0007610846310853958,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00076,
+      "step": 200,
+      "tokens/total": 3276800,
+      "tokens/train_per_sec_per_gpu": 14.17,
+      "tokens/trainable": 1041978
+    },
+    {
+      "epoch": 0.20363636363636364,
+      "grad_norm": 0.01779557578265667,
+      "learning_rate": 8.116504854368933e-05,
+      "loss": 0.0007697530556470156,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00077,
+      "step": 210,
+      "tokens/total": 3440640,
+      "tokens/train_per_sec_per_gpu": 14.12,
+      "tokens/trainable": 1093395
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.16895800828933716,
+      "learning_rate": 8.504854368932039e-05,
+      "loss": 0.0006535804830491542,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00065,
+      "step": 220,
+      "tokens/total": 3604480,
+      "tokens/train_per_sec_per_gpu": 14.72,
+      "tokens/trainable": 1145329
+    },
+    {
+      "epoch": 0.22303030303030302,
+      "grad_norm": 0.08973463624715805,
+      "learning_rate": 8.893203883495146e-05,
+      "loss": 0.0009510296396911145,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00095,
+      "step": 230,
+      "tokens/total": 3768320,
+      "tokens/train_per_sec_per_gpu": 14.67,
+      "tokens/trainable": 1197537
+    },
+    {
+      "epoch": 0.23272727272727273,
+      "grad_norm": 0.044939588755369186,
+      "learning_rate": 9.281553398058253e-05,
+      "loss": 0.001187363639473915,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00119,
+      "step": 240,
+      "tokens/total": 3932160,
+      "tokens/train_per_sec_per_gpu": 15.39,
+      "tokens/trainable": 1249924
+    },
+    {
+      "epoch": 0.24242424242424243,
+      "grad_norm": 0.08850465714931488,
+      "learning_rate": 9.66990291262136e-05,
+      "loss": 0.0013382930308580398,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00134,
+      "step": 250,
+      "tokens/total": 4096000,
+      "tokens/train_per_sec_per_gpu": 15.06,
+      "tokens/trainable": 1301558
+    },
+    {
+      "epoch": 0.25212121212121213,
+      "grad_norm": 0.101528100669384,
+      "learning_rate": 0.00010058252427184467,
+      "loss": 0.0008709387853741646,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00087,
+      "step": 260,
+      "tokens/total": 4259840,
+      "tokens/train_per_sec_per_gpu": 15.16,
+      "tokens/trainable": 1353706
+    },
+    {
+      "epoch": 0.26181818181818184,
+      "grad_norm": 0.08298433572053909,
+      "learning_rate": 0.00010446601941747574,
+      "loss": 0.0013300922699272632,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00133,
+      "step": 270,
+      "tokens/total": 4423680,
+      "tokens/train_per_sec_per_gpu": 15.11,
+      "tokens/trainable": 1405519
+    },
+    {
+      "epoch": 0.27151515151515154,
+      "grad_norm": 0.03734389320015907,
+      "learning_rate": 0.00010834951456310681,
+      "loss": 0.0006868645548820495,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00069,
+      "step": 280,
+      "tokens/total": 4587520,
+      "tokens/train_per_sec_per_gpu": 15.07,
+      "tokens/trainable": 1457494
+    },
+    {
+      "epoch": 0.2812121212121212,
+      "grad_norm": 0.07898428291082382,
+      "learning_rate": 0.00011223300970873786,
+      "loss": 0.0013550779782235622,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00136,
+      "step": 290,
+      "tokens/total": 4751360,
+      "tokens/train_per_sec_per_gpu": 14.75,
+      "tokens/trainable": 1509320
+    },
+    {
+      "epoch": 0.2909090909090909,
+      "grad_norm": 0.06320006400346756,
+      "learning_rate": 0.00011611650485436893,
+      "loss": 0.0010121697559952736,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00101,
+      "step": 300,
+      "tokens/total": 4915200,
+      "tokens/train_per_sec_per_gpu": 14.19,
+      "tokens/trainable": 1561332
+    },
+    {
+      "epoch": 0.3006060606060606,
+      "grad_norm": 0.013749867677688599,
+      "learning_rate": 0.00012,
+      "loss": 0.0006499682553112507,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00065,
+      "step": 310,
+      "tokens/total": 5079040,
+      "tokens/train_per_sec_per_gpu": 14.84,
+      "tokens/trainable": 1613189
+    },
+    {
+      "epoch": 0.3103030303030303,
+      "grad_norm": 0.033964402973651886,
+      "learning_rate": 0.00012388349514563107,
+      "loss": 0.0008866124786436558,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00089,
+      "step": 320,
+      "tokens/total": 5242880,
+      "tokens/train_per_sec_per_gpu": 15.78,
+      "tokens/trainable": 1665681
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.04327597841620445,
+      "learning_rate": 0.00012776699029126213,
+      "loss": 0.0005569641944020987,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00056,
+      "step": 330,
+      "tokens/total": 5406720,
+      "tokens/train_per_sec_per_gpu": 14.92,
+      "tokens/trainable": 1718317
+    },
+    {
+      "epoch": 0.3296969696969697,
+      "grad_norm": 0.02717934548854828,
+      "learning_rate": 0.0001316504854368932,
+      "loss": 0.0003776244120672345,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00038,
+      "step": 340,
+      "tokens/total": 5570560,
+      "tokens/train_per_sec_per_gpu": 14.42,
+      "tokens/trainable": 1770210
+    },
+    {
+      "epoch": 0.3393939393939394,
+      "grad_norm": 0.0028237912338227034,
+      "learning_rate": 0.0001355339805825243,
+      "loss": 0.0005292522720992566,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00053,
+      "step": 350,
+      "tokens/total": 5734400,
+      "tokens/train_per_sec_per_gpu": 16.4,
+      "tokens/trainable": 1821987
+    },
+    {
+      "epoch": 0.3490909090909091,
+      "grad_norm": 0.0310799703001976,
+      "learning_rate": 0.00013941747572815535,
+      "loss": 0.0006786303594708443,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00068,
+      "step": 360,
+      "tokens/total": 5898240,
+      "tokens/train_per_sec_per_gpu": 14.72,
+      "tokens/trainable": 1874266
+    },
+    {
+      "epoch": 0.35878787878787877,
+      "grad_norm": 0.17325043678283691,
+      "learning_rate": 0.0001433009708737864,
+      "loss": 0.0013975565321743487,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0014,
+      "step": 370,
+      "tokens/total": 6062080,
+      "tokens/train_per_sec_per_gpu": 13.73,
+      "tokens/trainable": 1926124
+    },
+    {
+      "epoch": 0.36848484848484847,
+      "grad_norm": 0.07738752663135529,
+      "learning_rate": 0.0001471844660194175,
+      "loss": 0.0006820175796747208,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00068,
+      "step": 380,
+      "tokens/total": 6225920,
+      "tokens/train_per_sec_per_gpu": 14.04,
+      "tokens/trainable": 1978693
+    },
+    {
+      "epoch": 0.3781818181818182,
+      "grad_norm": 0.10022349655628204,
+      "learning_rate": 0.00015106796116504855,
+      "loss": 0.00063879219815135,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00064,
+      "step": 390,
+      "tokens/total": 6389760,
+      "tokens/train_per_sec_per_gpu": 13.34,
+      "tokens/trainable": 2030378
+    },
+    {
+      "epoch": 0.3878787878787879,
+      "grad_norm": 0.0495997779071331,
+      "learning_rate": 0.00015495145631067963,
+      "loss": 0.0021283581852912905,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00213,
+      "step": 400,
+      "tokens/total": 6553600,
+      "tokens/train_per_sec_per_gpu": 15.34,
+      "tokens/trainable": 2083047
+    },
+    {
+      "epoch": 0.3975757575757576,
+      "grad_norm": 0.07361701130867004,
+      "learning_rate": 0.0001588349514563107,
+      "loss": 0.001862115040421486,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00186,
+      "step": 410,
+      "tokens/total": 6717440,
+      "tokens/train_per_sec_per_gpu": 14.23,
+      "tokens/trainable": 2135527
+    },
+    {
+      "epoch": 0.4072727272727273,
+      "grad_norm": 0.05466209724545479,
+      "learning_rate": 0.00016271844660194174,
+      "loss": 0.0011581303551793098,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00116,
+      "step": 420,
+      "tokens/total": 6881280,
+      "tokens/train_per_sec_per_gpu": 14.77,
+      "tokens/trainable": 2187636
+    },
+    {
+      "epoch": 0.416969696969697,
+      "grad_norm": 0.04331392049789429,
+      "learning_rate": 0.00016660194174757283,
+      "loss": 0.0051729224622249605,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00519,
+      "step": 430,
+      "tokens/total": 7045120,
+      "tokens/train_per_sec_per_gpu": 13.76,
+      "tokens/trainable": 2239006
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.05931795388460159,
+      "learning_rate": 0.00017048543689320388,
+      "loss": 0.00242764875292778,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00243,
+      "step": 440,
+      "tokens/total": 7208960,
+      "tokens/train_per_sec_per_gpu": 14.59,
+      "tokens/trainable": 2290540
+    },
+    {
+      "epoch": 0.43636363636363634,
+      "grad_norm": 0.04634418711066246,
+      "learning_rate": 0.00017436893203883494,
+      "loss": 0.001389546226710081,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00139,
+      "step": 450,
+      "tokens/total": 7372800,
+      "tokens/train_per_sec_per_gpu": 14.78,
+      "tokens/trainable": 2341852
+    },
+    {
+      "epoch": 0.44606060606060605,
+      "grad_norm": 0.04817213863134384,
+      "learning_rate": 0.00017825242718446602,
+      "loss": 0.001370794139802456,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00137,
+      "step": 460,
+      "tokens/total": 7536640,
+      "tokens/train_per_sec_per_gpu": 13.77,
+      "tokens/trainable": 2393320
+    },
+    {
+      "epoch": 0.45575757575757575,
+      "grad_norm": 0.011335949413478374,
+      "learning_rate": 0.00018213592233009708,
+      "loss": 0.0009715131483972073,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00097,
+      "step": 470,
+      "tokens/total": 7700480,
+      "tokens/train_per_sec_per_gpu": 14.52,
+      "tokens/trainable": 2445170
+    },
+    {
+      "epoch": 0.46545454545454545,
+      "grad_norm": 0.05298445746302605,
+      "learning_rate": 0.00018601941747572816,
+      "loss": 0.0008222623728215694,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00082,
+      "step": 480,
+      "tokens/total": 7864320,
+      "tokens/train_per_sec_per_gpu": 13.87,
+      "tokens/trainable": 2497473
+    },
+    {
+      "epoch": 0.47515151515151516,
+      "grad_norm": 0.061686884611845016,
+      "learning_rate": 0.00018990291262135925,
+      "loss": 0.000748783303424716,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00075,
+      "step": 490,
+      "tokens/total": 8028160,
+      "tokens/train_per_sec_per_gpu": 15.41,
+      "tokens/trainable": 2549206
+    },
+    {
+      "epoch": 0.48484848484848486,
+      "grad_norm": 0.03281249850988388,
+      "learning_rate": 0.0001937864077669903,
+      "loss": 0.0006062469445168972,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00061,
+      "step": 500,
+      "tokens/total": 8192000,
+      "tokens/train_per_sec_per_gpu": 14.49,
+      "tokens/trainable": 2600583
+    },
+    {
+      "epoch": 0.49454545454545457,
+      "grad_norm": 0.008482079952955246,
+      "learning_rate": 0.0001976699029126214,
+      "loss": 0.0008583014830946922,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00086,
+      "step": 510,
+      "tokens/total": 8355840,
+      "tokens/train_per_sec_per_gpu": 13.86,
+      "tokens/trainable": 2652927
+    },
+    {
+      "epoch": 0.5003636363636363,
+      "eval_loss": 0.0009036393603309989,
+      "eval_ppl": 1.0009,
+      "eval_runtime": 12.7872,
+      "eval_samples_per_second": 15.641,
+      "eval_steps_per_second": 7.82,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "step": 516
+    },
+    {
+      "epoch": 0.5042424242424243,
+      "grad_norm": 0.04333305358886719,
+      "learning_rate": 0.0001999996332640321,
+      "loss": 0.0005093200132250785,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00051,
+      "step": 520,
+      "tokens/total": 8519680,
+      "tokens/train_per_sec_per_gpu": 14.09,
+      "tokens/trainable": 2705083
+    },
+    {
+      "epoch": 0.5139393939393939,
+      "grad_norm": 0.02485118806362152,
+      "learning_rate": 0.00019999550751528488,
+      "loss": 0.0006649125367403031,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00067,
+      "step": 530,
+      "tokens/total": 8683520,
+      "tokens/train_per_sec_per_gpu": 14.44,
+      "tokens/trainable": 2756975
+    },
+    {
+      "epoch": 0.5236363636363637,
+      "grad_norm": 0.03736363351345062,
+      "learning_rate": 0.00019998679778759294,
+      "loss": 0.0006726076360791921,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00067,
+      "step": 540,
+      "tokens/total": 8847360,
+      "tokens/train_per_sec_per_gpu": 14.16,
+      "tokens/trainable": 2808076
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.05156765505671501,
+      "learning_rate": 0.0001999735044802263,
+      "loss": 0.000789718609303236,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00079,
+      "step": 550,
+      "tokens/total": 9011200,
+      "tokens/train_per_sec_per_gpu": 16.36,
+      "tokens/trainable": 2859893
+    },
+    {
+      "epoch": 0.5430303030303031,
+      "grad_norm": 0.647550106048584,
+      "learning_rate": 0.00019995562820257474,
+      "loss": 0.003008325584232807,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00301,
+      "step": 560,
+      "tokens/total": 9175040,
+      "tokens/train_per_sec_per_gpu": 14.21,
+      "tokens/trainable": 2911399
+    },
+    {
+      "epoch": 0.5527272727272727,
+      "grad_norm": 0.185165673494339,
+      "learning_rate": 0.00019993316977411993,
+      "loss": 0.013715097308158874,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.01381,
+      "step": 570,
+      "tokens/total": 9338880,
+      "tokens/train_per_sec_per_gpu": 13.85,
+      "tokens/trainable": 2962403
+    },
+    {
+      "epoch": 0.5624242424242424,
+      "grad_norm": 0.2401553839445114,
+      "learning_rate": 0.0001999061302243977,
+      "loss": 0.009026474505662917,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00907,
+      "step": 580,
+      "tokens/total": 9502720,
+      "tokens/train_per_sec_per_gpu": 14.38,
+      "tokens/trainable": 3015083
+    },
+    {
+      "epoch": 0.5721212121212121,
+      "grad_norm": 0.08092579245567322,
+      "learning_rate": 0.000199874510792951,
+      "loss": 0.005716494470834732,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00573,
+      "step": 590,
+      "tokens/total": 9666560,
+      "tokens/train_per_sec_per_gpu": 16.38,
+      "tokens/trainable": 3066501
+    },
+    {
+      "epoch": 0.5818181818181818,
+      "grad_norm": 3.418715476989746,
+      "learning_rate": 0.00019983831292927305,
+      "loss": 0.048504295945167544,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0497,
+      "step": 600,
+      "tokens/total": 9830400,
+      "tokens/train_per_sec_per_gpu": 14.23,
+      "tokens/trainable": 3118633
+    },
+    {
+      "epoch": 0.5915151515151515,
+      "grad_norm": 0.2194036841392517,
+      "learning_rate": 0.00019979753829274085,
+      "loss": 0.03429323434829712,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.03489,
+      "step": 610,
+      "tokens/total": 9994240,
+      "tokens/train_per_sec_per_gpu": 13.14,
+      "tokens/trainable": 3170577
+    },
+    {
+      "epoch": 0.6012121212121212,
+      "grad_norm": 0.022929901257157326,
+      "learning_rate": 0.0001997521887525391,
+      "loss": 0.0015171168372035027,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00152,
+      "step": 620,
+      "tokens/total": 10158080,
+      "tokens/train_per_sec_per_gpu": 14.24,
+      "tokens/trainable": 3221696
+    },
+    {
+      "epoch": 0.610909090909091,
+      "grad_norm": 0.10083670169115067,
+      "learning_rate": 0.00019970226638757458,
+      "loss": 0.0025377947837114333,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00254,
+      "step": 630,
+      "tokens/total": 10321920,
+      "tokens/train_per_sec_per_gpu": 14.7,
+      "tokens/trainable": 3273775
+    },
+    {
+      "epoch": 0.6206060606060606,
+      "grad_norm": 0.01761380024254322,
+      "learning_rate": 0.00019964777348638083,
+      "loss": 0.002281896211206913,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00228,
+      "step": 640,
+      "tokens/total": 10485760,
+      "tokens/train_per_sec_per_gpu": 14.89,
+      "tokens/trainable": 3325516
+    },
+    {
+      "epoch": 0.6303030303030303,
+      "grad_norm": 0.004510029684752226,
+      "learning_rate": 0.00019958871254701315,
+      "loss": 0.0009477110579609871,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00095,
+      "step": 650,
+      "tokens/total": 10649600,
+      "tokens/train_per_sec_per_gpu": 16.46,
+      "tokens/trainable": 3377214
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.05332477018237114,
+      "learning_rate": 0.0001995250862769342,
+      "loss": 0.0005660496186465025,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00057,
+      "step": 660,
+      "tokens/total": 10813440,
+      "tokens/train_per_sec_per_gpu": 14.52,
+      "tokens/trainable": 3428627
+    },
+    {
+      "epoch": 0.6496969696969697,
+      "grad_norm": 0.03861689195036888,
+      "learning_rate": 0.0001994568975928899,
+      "loss": 0.0008976863697171211,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0009,
+      "step": 670,
+      "tokens/total": 10977280,
+      "tokens/train_per_sec_per_gpu": 15.66,
+      "tokens/trainable": 3480170
+    },
+    {
+      "epoch": 0.6593939393939394,
+      "grad_norm": 0.021123304963111877,
+      "learning_rate": 0.00019938414962077553,
+      "loss": 0.0009612766094505787,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00096,
+      "step": 680,
+      "tokens/total": 11141120,
+      "tokens/train_per_sec_per_gpu": 15.15,
+      "tokens/trainable": 3532037
+    },
+    {
+      "epoch": 0.6690909090909091,
+      "grad_norm": 0.02421347238123417,
+      "learning_rate": 0.00019930684569549264,
+      "loss": 0.001021684519946575,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00102,
+      "step": 690,
+      "tokens/total": 11304960,
+      "tokens/train_per_sec_per_gpu": 14.16,
+      "tokens/trainable": 3583461
+    },
+    {
+      "epoch": 0.6787878787878788,
+      "grad_norm": 0.05008835345506668,
+      "learning_rate": 0.00019922498936079613,
+      "loss": 0.0007617876864969731,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00076,
+      "step": 700,
+      "tokens/total": 11468800,
+      "tokens/train_per_sec_per_gpu": 14.08,
+      "tokens/trainable": 3634649
+    },
+    {
+      "epoch": 0.6884848484848485,
+      "grad_norm": 0.035733792930841446,
+      "learning_rate": 0.00019913858436913171,
+      "loss": 0.0012347914278507232,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00124,
+      "step": 710,
+      "tokens/total": 11632640,
+      "tokens/train_per_sec_per_gpu": 14.45,
+      "tokens/trainable": 3685786
+    },
+    {
+      "epoch": 0.6981818181818182,
+      "grad_norm": 0.010948767885565758,
+      "learning_rate": 0.00019904763468146393,
+      "loss": 0.0008165687322616577,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00082,
+      "step": 720,
+      "tokens/total": 11796480,
+      "tokens/train_per_sec_per_gpu": 15.77,
+      "tokens/trainable": 3737566
+    },
+    {
+      "epoch": 0.7078787878787879,
+      "grad_norm": 0.03577027469873428,
+      "learning_rate": 0.00019895214446709463,
+      "loss": 0.001333119161427021,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00133,
+      "step": 730,
+      "tokens/total": 11960320,
+      "tokens/train_per_sec_per_gpu": 13.98,
+      "tokens/trainable": 3789817
+    },
+    {
+      "epoch": 0.7175757575757575,
+      "grad_norm": 0.03971279785037041,
+      "learning_rate": 0.00019885211810347184,
+      "loss": 0.0011184611357748508,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00112,
+      "step": 740,
+      "tokens/total": 12124160,
+      "tokens/train_per_sec_per_gpu": 14.67,
+      "tokens/trainable": 3841912
+    },
+    {
+      "epoch": 0.7272727272727273,
+      "grad_norm": 0.06546575576066971,
+      "learning_rate": 0.00019874756017598894,
+      "loss": 0.0012452728115022182,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00125,
+      "step": 750,
+      "tokens/total": 12288000,
+      "tokens/train_per_sec_per_gpu": 14.58,
+      "tokens/trainable": 3893725
+    },
+    {
+      "epoch": 0.7369696969696969,
+      "grad_norm": 0.047058816999197006,
+      "learning_rate": 0.00019863847547777467,
+      "loss": 0.0008146104402840138,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00081,
+      "step": 760,
+      "tokens/total": 12451840,
+      "tokens/train_per_sec_per_gpu": 13.49,
+      "tokens/trainable": 3945033
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.028811641037464142,
+      "learning_rate": 0.00019852486900947327,
+      "loss": 0.0008652995340526104,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00087,
+      "step": 770,
+      "tokens/total": 12615680,
+      "tokens/train_per_sec_per_gpu": 15.12,
+      "tokens/trainable": 3996749
+    },
+    {
+      "epoch": 0.7563636363636363,
+      "grad_norm": 0.012203546240925789,
+      "learning_rate": 0.0001984067459790153,
+      "loss": 0.000670672720298171,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00067,
+      "step": 780,
+      "tokens/total": 12779520,
+      "tokens/train_per_sec_per_gpu": 13.71,
+      "tokens/trainable": 4048173
+    },
+    {
+      "epoch": 0.7660606060606061,
+      "grad_norm": 0.016218814998865128,
+      "learning_rate": 0.0001982841118013789,
+      "loss": 0.00046353964135050776,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00046,
+      "step": 790,
+      "tokens/total": 12943360,
+      "tokens/train_per_sec_per_gpu": 15.1,
+      "tokens/trainable": 4099789
+    },
+    {
+      "epoch": 0.7757575757575758,
+      "grad_norm": 0.034673016518354416,
+      "learning_rate": 0.00019815697209834147,
+      "loss": 0.000707306619733572,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00071,
+      "step": 800,
+      "tokens/total": 13107200,
+      "tokens/train_per_sec_per_gpu": 14.45,
+      "tokens/trainable": 4150960
+    },
+    {
+      "epoch": 0.7854545454545454,
+      "grad_norm": 0.0022127812262624502,
+      "learning_rate": 0.00019802533269822208,
+      "loss": 0.00021896373946219682,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00022,
+      "step": 810,
+      "tokens/total": 13271040,
+      "tokens/train_per_sec_per_gpu": 14.75,
+      "tokens/trainable": 4202984
+    },
+    {
+      "epoch": 0.7951515151515152,
+      "grad_norm": 0.000919274752959609,
+      "learning_rate": 0.00019788919963561422,
+      "loss": 0.00043264860287308695,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00043,
+      "step": 820,
+      "tokens/total": 13434880,
+      "tokens/train_per_sec_per_gpu": 14.06,
+      "tokens/trainable": 4254907
+    },
+    {
+      "epoch": 0.8048484848484848,
+      "grad_norm": 0.007699873298406601,
+      "learning_rate": 0.00019774857915110913,
+      "loss": 0.0003196246922016144,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00032,
+      "step": 830,
+      "tokens/total": 13598720,
+      "tokens/train_per_sec_per_gpu": 14.75,
+      "tokens/trainable": 4306095
+    },
+    {
+      "epoch": 0.8145454545454546,
+      "grad_norm": 0.015523642301559448,
+      "learning_rate": 0.00019760347769100987,
+      "loss": 0.0004476988688111305,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00045,
+      "step": 840,
+      "tokens/total": 13762560,
+      "tokens/train_per_sec_per_gpu": 14.14,
+      "tokens/trainable": 4357442
+    },
+    {
+      "epoch": 0.8242424242424242,
+      "grad_norm": 0.013460986316204071,
+      "learning_rate": 0.00019745390190703565,
+      "loss": 0.0004673306830227375,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00047,
+      "step": 850,
+      "tokens/total": 13926400,
+      "tokens/train_per_sec_per_gpu": 14.1,
+      "tokens/trainable": 4409277
+    },
+    {
+      "epoch": 0.833939393939394,
+      "grad_norm": 0.0014691110700368881,
+      "learning_rate": 0.0001972998586560169,
+      "loss": 0.0003277578856796026,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00033,
+      "step": 860,
+      "tokens/total": 14090240,
+      "tokens/train_per_sec_per_gpu": 14.28,
+      "tokens/trainable": 4460714
+    },
+    {
+      "epoch": 0.8436363636363636,
+      "grad_norm": 0.001358041656203568,
+      "learning_rate": 0.00019714135499958112,
+      "loss": 0.00032470382284373046,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00032,
+      "step": 870,
+      "tokens/total": 14254080,
+      "tokens/train_per_sec_per_gpu": 13.85,
+      "tokens/trainable": 4511989
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.04510723799467087,
+      "learning_rate": 0.0001969783982038289,
+      "loss": 0.00023182881996035575,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00023,
+      "step": 880,
+      "tokens/total": 14417920,
+      "tokens/train_per_sec_per_gpu": 15.41,
+      "tokens/trainable": 4563354
+    },
+    {
+      "epoch": 0.863030303030303,
+      "grad_norm": 0.14508692920207977,
+      "learning_rate": 0.00019681099573900113,
+      "loss": 0.00026136748492717744,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00026,
+      "step": 890,
+      "tokens/total": 14581760,
+      "tokens/train_per_sec_per_gpu": 13.85,
+      "tokens/trainable": 4615691
+    },
+    {
+      "epoch": 0.8727272727272727,
+      "grad_norm": 0.010969490744173527,
+      "learning_rate": 0.00019663915527913625,
+      "loss": 0.00016044279327616097,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00016,
+      "step": 900,
+      "tokens/total": 14745600,
+      "tokens/train_per_sec_per_gpu": 15.76,
+      "tokens/trainable": 4667433
+    },
+    {
+      "epoch": 0.8824242424242424,
+      "grad_norm": 0.03874114155769348,
+      "learning_rate": 0.00019646288470171868,
+      "loss": 0.0004159804433584213,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00042,
+      "step": 910,
+      "tokens/total": 14909440,
+      "tokens/train_per_sec_per_gpu": 16.01,
+      "tokens/trainable": 4719807
+    },
+    {
+      "epoch": 0.8921212121212121,
+      "grad_norm": 0.044620465487241745,
+      "learning_rate": 0.00019628219208731756,
+      "loss": 0.0006739750038832426,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00067,
+      "step": 920,
+      "tokens/total": 15073280,
+      "tokens/train_per_sec_per_gpu": 15.05,
+      "tokens/trainable": 4771772
+    },
+    {
+      "epoch": 0.9018181818181819,
+      "grad_norm": 0.024856949225068092,
+      "learning_rate": 0.00019609708571921645,
+      "loss": 0.00039347023703157903,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00039,
+      "step": 930,
+      "tokens/total": 15237120,
+      "tokens/train_per_sec_per_gpu": 15.16,
+      "tokens/trainable": 4823415
+    },
+    {
+      "epoch": 0.9115151515151515,
+      "grad_norm": 0.022198157384991646,
+      "learning_rate": 0.0001959075740830335,
+      "loss": 0.0005907822400331497,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00059,
+      "step": 940,
+      "tokens/total": 15400960,
+      "tokens/train_per_sec_per_gpu": 15.36,
+      "tokens/trainable": 4875269
+    },
+    {
+      "epoch": 0.9212121212121213,
+      "grad_norm": 0.01670038513839245,
+      "learning_rate": 0.00019571366586633245,
+      "loss": 0.00027316866908222437,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00027,
+      "step": 950,
+      "tokens/total": 15564800,
+      "tokens/train_per_sec_per_gpu": 15.11,
+      "tokens/trainable": 4927244
+    },
+    {
+      "epoch": 0.9309090909090909,
+      "grad_norm": 0.021392742171883583,
+      "learning_rate": 0.00019551536995822454,
+      "loss": 0.0004320886451750994,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00043,
+      "step": 960,
+      "tokens/total": 15728640,
+      "tokens/train_per_sec_per_gpu": 14.16,
+      "tokens/trainable": 4979068
+    },
+    {
+      "epoch": 0.9406060606060606,
+      "grad_norm": 0.028143158182501793,
+      "learning_rate": 0.00019531269544896076,
+      "loss": 0.0005637989845126868,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00056,
+      "step": 970,
+      "tokens/total": 15892480,
+      "tokens/train_per_sec_per_gpu": 14.26,
+      "tokens/trainable": 5030980
+    },
+    {
+      "epoch": 0.9503030303030303,
+      "grad_norm": 0.077091746032238,
+      "learning_rate": 0.00019510565162951537,
+      "loss": 0.0010597245767712594,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00106,
+      "step": 980,
+      "tokens/total": 16056320,
+      "tokens/train_per_sec_per_gpu": 14.04,
+      "tokens/trainable": 5082759
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.04455556347966194,
+      "learning_rate": 0.00019489424799115984,
+      "loss": 0.0009517236612737179,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00095,
+      "step": 990,
+      "tokens/total": 16220160,
+      "tokens/train_per_sec_per_gpu": 13.04,
+      "tokens/trainable": 5134379
+    },
+    {
+      "epoch": 0.9696969696969697,
+      "grad_norm": 0.03573840856552124,
+      "learning_rate": 0.00019467849422502784,
+      "loss": 0.0008812972344458103,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00088,
+      "step": 1000,
+      "tokens/total": 16384000,
+      "tokens/train_per_sec_per_gpu": 15.23,
+      "tokens/trainable": 5186184
+    },
+    {
+      "epoch": 0.9793939393939394,
+      "grad_norm": 0.0006549305398948491,
+      "learning_rate": 0.0001944584002216709,
+      "loss": 0.0006358013488352299,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00064,
+      "step": 1010,
+      "tokens/total": 16547840,
+      "tokens/train_per_sec_per_gpu": 16.1,
+      "tokens/trainable": 5238320
+    },
+    {
+      "epoch": 0.9890909090909091,
+      "grad_norm": 0.021742813289165497,
+      "learning_rate": 0.00019423397607060507,
+      "loss": 0.000400003744289279,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0004,
+      "step": 1020,
+      "tokens/total": 16711680,
+      "tokens/train_per_sec_per_gpu": 14.53,
+      "tokens/trainable": 5290445
+    },
+    {
+      "epoch": 0.9987878787878788,
+      "grad_norm": 0.04323820024728775,
+      "learning_rate": 0.00019400523205984833,
+      "loss": 0.0002954686991870403,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0003,
+      "step": 1030,
+      "tokens/total": 16875520,
+      "tokens/train_per_sec_per_gpu": 14.98,
+      "tokens/trainable": 5342720
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 5155,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 1031,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.7522967515417805e+17,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d29b464b8810e63db4689f2a7488bb151d3c44002b850563c9f99c9489ec58c9
+size 7121

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "q_proj",
+    "up_proj"
+  ],
+  "target_parameters": [],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:da62714aaa23848ef165e457db4ac64c154ba06f2678a79b5c9a5f4d9131e877
+size 264308896

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,4 @@

+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
+' + message['content'] + '<|im_end|>' + '
+'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
+' }}{% endif %}

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f9568a797d761234200086dde3789d57bc2b2944e366ab3e5a3273a96b0515b3
+size 528915403

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1d132e1337dcddf36eb41c4686b7b1f64060722a1c210d58a733ddcfc9c9fb9b
+size 14645

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8fad802e90d13070878b8d7c99bc22b3028181f43a8f5425264542fd04c806af
+size 1465

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": true,
+  "model_max_length": 1010000,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/tokens_state. ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"total": 33808384, "trainable": 10703590}

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2966 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.001939393939394,
+  "eval_steps": 516,
+  "global_step": 2062,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0,
+      "eval_loss": 0.8898435831069946,
+      "eval_ppl": 2.43475,
+      "eval_runtime": 12.6383,
+      "eval_samples_per_second": 15.825,
+      "eval_steps_per_second": 7.912,
+      "memory/device_reserved (GiB)": 13.84,
+      "memory/max_active (GiB)": 13.69,
+      "memory/max_allocated (GiB)": 13.69,
+      "step": 0
+    },
+    {
+      "epoch": 0.009696969696969697,
+      "grad_norm": 2.995619058609009,
+      "learning_rate": 3.4951456310679615e-06,
+      "loss": 0.8680612564086914,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 2.38229,
+      "step": 10,
+      "tokens/total": 163840,
+      "tokens/train_per_sec_per_gpu": 14.27,
+      "tokens/trainable": 51990
+    },
+    {
+      "epoch": 0.019393939393939394,
+      "grad_norm": 2.1244935989379883,
+      "learning_rate": 7.378640776699029e-06,
+      "loss": 0.7699687004089355,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 2.1597,
+      "step": 20,
+      "tokens/total": 327680,
+      "tokens/train_per_sec_per_gpu": 16.06,
+      "tokens/trainable": 104391
+    },
+    {
+      "epoch": 0.02909090909090909,
+      "grad_norm": 0.9706138372421265,
+      "learning_rate": 1.1262135922330098e-05,
+      "loss": 0.5319457054138184,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.70224,
+      "step": 30,
+      "tokens/total": 491520,
+      "tokens/train_per_sec_per_gpu": 16.48,
+      "tokens/trainable": 156787
+    },
+    {
+      "epoch": 0.03878787878787879,
+      "grad_norm": 0.7689842581748962,
+      "learning_rate": 1.5145631067961166e-05,
+      "loss": 0.30234951972961427,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.35303,
+      "step": 40,
+      "tokens/total": 655360,
+      "tokens/train_per_sec_per_gpu": 14.84,
+      "tokens/trainable": 208924
+    },
+    {
+      "epoch": 0.048484848484848485,
+      "grad_norm": 0.45850396156311035,
+      "learning_rate": 1.9029126213592234e-05,
+      "loss": 0.1519382953643799,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.16409,
+      "step": 50,
+      "tokens/total": 819200,
+      "tokens/train_per_sec_per_gpu": 14.61,
+      "tokens/trainable": 261170
+    },
+    {
+      "epoch": 0.05818181818181818,
+      "grad_norm": 0.41381561756134033,
+      "learning_rate": 2.29126213592233e-05,
+      "loss": 0.062263429164886475,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.06424,
+      "step": 60,
+      "tokens/total": 983040,
+      "tokens/train_per_sec_per_gpu": 14.19,
+      "tokens/trainable": 313808
+    },
+    {
+      "epoch": 0.06787878787878789,
+      "grad_norm": 0.4865979254245758,
+      "learning_rate": 2.6796116504854367e-05,
+      "loss": 0.018695920705795288,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.01887,
+      "step": 70,
+      "tokens/total": 1146880,
+      "tokens/train_per_sec_per_gpu": 14.62,
+      "tokens/trainable": 366068
+    },
+    {
+      "epoch": 0.07757575757575758,
+      "grad_norm": 0.39099738001823425,
+      "learning_rate": 3.067961165048544e-05,
+      "loss": 0.006136053055524826,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00615,
+      "step": 80,
+      "tokens/total": 1310720,
+      "tokens/train_per_sec_per_gpu": 13.81,
+      "tokens/trainable": 418120
+    },
+    {
+      "epoch": 0.08727272727272728,
+      "grad_norm": 0.08230593055486679,
+      "learning_rate": 3.456310679611651e-05,
+      "loss": 0.004204501211643219,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00421,
+      "step": 90,
+      "tokens/total": 1474560,
+      "tokens/train_per_sec_per_gpu": 15.07,
+      "tokens/trainable": 470244
+    },
+    {
+      "epoch": 0.09696969696969697,
+      "grad_norm": 0.13297680020332336,
+      "learning_rate": 3.844660194174757e-05,
+      "loss": 0.0036250378936529158,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00363,
+      "step": 100,
+      "tokens/total": 1638400,
+      "tokens/train_per_sec_per_gpu": 14.91,
+      "tokens/trainable": 522666
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.2430051565170288,
+      "learning_rate": 4.2330097087378647e-05,
+      "loss": 0.003873714804649353,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00388,
+      "step": 110,
+      "tokens/total": 1802240,
+      "tokens/train_per_sec_per_gpu": 14.17,
+      "tokens/trainable": 574329
+    },
+    {
+      "epoch": 0.11636363636363636,
+      "grad_norm": 0.09347938001155853,
+      "learning_rate": 4.621359223300971e-05,
+      "loss": 0.00237951148301363,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00238,
+      "step": 120,
+      "tokens/total": 1966080,
+      "tokens/train_per_sec_per_gpu": 14.33,
+      "tokens/trainable": 626194
+    },
+    {
+      "epoch": 0.12606060606060607,
+      "grad_norm": 0.13388365507125854,
+      "learning_rate": 5.0097087378640786e-05,
+      "loss": 0.0015400107949972153,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00154,
+      "step": 130,
+      "tokens/total": 2129920,
+      "tokens/train_per_sec_per_gpu": 14.01,
+      "tokens/trainable": 678140
+    },
+    {
+      "epoch": 0.13575757575757577,
+      "grad_norm": 0.13342970609664917,
+      "learning_rate": 5.398058252427185e-05,
+      "loss": 0.001996887102723122,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.002,
+      "step": 140,
+      "tokens/total": 2293760,
+      "tokens/train_per_sec_per_gpu": 14.41,
+      "tokens/trainable": 730201
+    },
+    {
+      "epoch": 0.14545454545454545,
+      "grad_norm": 0.0299234539270401,
+      "learning_rate": 5.786407766990292e-05,
+      "loss": 0.0015132850036025046,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00151,
+      "step": 150,
+      "tokens/total": 2457600,
+      "tokens/train_per_sec_per_gpu": 15.8,
+      "tokens/trainable": 782196
+    },
+    {
+      "epoch": 0.15515151515151515,
+      "grad_norm": 0.04437975212931633,
+      "learning_rate": 6.174757281553398e-05,
+      "loss": 0.0012883609160780907,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00129,
+      "step": 160,
+      "tokens/total": 2621440,
+      "tokens/train_per_sec_per_gpu": 14.64,
+      "tokens/trainable": 833614
+    },
+    {
+      "epoch": 0.16484848484848486,
+      "grad_norm": 0.014039761386811733,
+      "learning_rate": 6.563106796116505e-05,
+      "loss": 0.0011639594100415706,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00116,
+      "step": 170,
+      "tokens/total": 2785280,
+      "tokens/train_per_sec_per_gpu": 13.95,
+      "tokens/trainable": 885591
+    },
+    {
+      "epoch": 0.17454545454545456,
+      "grad_norm": 0.0033261056523770094,
+      "learning_rate": 6.951456310679612e-05,
+      "loss": 0.0007388167083263397,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00074,
+      "step": 180,
+      "tokens/total": 2949120,
+      "tokens/train_per_sec_per_gpu": 14.37,
+      "tokens/trainable": 937712
+    },
+    {
+      "epoch": 0.18424242424242424,
+      "grad_norm": 0.010476192459464073,
+      "learning_rate": 7.339805825242719e-05,
+      "loss": 0.0008642122149467469,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00086,
+      "step": 190,
+      "tokens/total": 3112960,
+      "tokens/train_per_sec_per_gpu": 15.52,
+      "tokens/trainable": 989913
+    },
+    {
+      "epoch": 0.19393939393939394,
+      "grad_norm": 0.01253255270421505,
+      "learning_rate": 7.728155339805826e-05,
+      "loss": 0.0007610846310853958,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00076,
+      "step": 200,
+      "tokens/total": 3276800,
+      "tokens/train_per_sec_per_gpu": 14.17,
+      "tokens/trainable": 1041978
+    },
+    {
+      "epoch": 0.20363636363636364,
+      "grad_norm": 0.01779557578265667,
+      "learning_rate": 8.116504854368933e-05,
+      "loss": 0.0007697530556470156,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00077,
+      "step": 210,
+      "tokens/total": 3440640,
+      "tokens/train_per_sec_per_gpu": 14.12,
+      "tokens/trainable": 1093395
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.16895800828933716,
+      "learning_rate": 8.504854368932039e-05,
+      "loss": 0.0006535804830491542,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00065,
+      "step": 220,
+      "tokens/total": 3604480,
+      "tokens/train_per_sec_per_gpu": 14.72,
+      "tokens/trainable": 1145329
+    },
+    {
+      "epoch": 0.22303030303030302,
+      "grad_norm": 0.08973463624715805,
+      "learning_rate": 8.893203883495146e-05,
+      "loss": 0.0009510296396911145,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00095,
+      "step": 230,
+      "tokens/total": 3768320,
+      "tokens/train_per_sec_per_gpu": 14.67,
+      "tokens/trainable": 1197537
+    },
+    {
+      "epoch": 0.23272727272727273,
+      "grad_norm": 0.044939588755369186,
+      "learning_rate": 9.281553398058253e-05,
+      "loss": 0.001187363639473915,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00119,
+      "step": 240,
+      "tokens/total": 3932160,
+      "tokens/train_per_sec_per_gpu": 15.39,
+      "tokens/trainable": 1249924
+    },
+    {
+      "epoch": 0.24242424242424243,
+      "grad_norm": 0.08850465714931488,
+      "learning_rate": 9.66990291262136e-05,
+      "loss": 0.0013382930308580398,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00134,
+      "step": 250,
+      "tokens/total": 4096000,
+      "tokens/train_per_sec_per_gpu": 15.06,
+      "tokens/trainable": 1301558
+    },
+    {
+      "epoch": 0.25212121212121213,
+      "grad_norm": 0.101528100669384,
+      "learning_rate": 0.00010058252427184467,
+      "loss": 0.0008709387853741646,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00087,
+      "step": 260,
+      "tokens/total": 4259840,
+      "tokens/train_per_sec_per_gpu": 15.16,
+      "tokens/trainable": 1353706
+    },
+    {
+      "epoch": 0.26181818181818184,
+      "grad_norm": 0.08298433572053909,
+      "learning_rate": 0.00010446601941747574,
+      "loss": 0.0013300922699272632,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00133,
+      "step": 270,
+      "tokens/total": 4423680,
+      "tokens/train_per_sec_per_gpu": 15.11,
+      "tokens/trainable": 1405519
+    },
+    {
+      "epoch": 0.27151515151515154,
+      "grad_norm": 0.03734389320015907,
+      "learning_rate": 0.00010834951456310681,
+      "loss": 0.0006868645548820495,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00069,
+      "step": 280,
+      "tokens/total": 4587520,
+      "tokens/train_per_sec_per_gpu": 15.07,
+      "tokens/trainable": 1457494
+    },
+    {
+      "epoch": 0.2812121212121212,
+      "grad_norm": 0.07898428291082382,
+      "learning_rate": 0.00011223300970873786,
+      "loss": 0.0013550779782235622,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00136,
+      "step": 290,
+      "tokens/total": 4751360,
+      "tokens/train_per_sec_per_gpu": 14.75,
+      "tokens/trainable": 1509320
+    },
+    {
+      "epoch": 0.2909090909090909,
+      "grad_norm": 0.06320006400346756,
+      "learning_rate": 0.00011611650485436893,
+      "loss": 0.0010121697559952736,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00101,
+      "step": 300,
+      "tokens/total": 4915200,
+      "tokens/train_per_sec_per_gpu": 14.19,
+      "tokens/trainable": 1561332
+    },
+    {
+      "epoch": 0.3006060606060606,
+      "grad_norm": 0.013749867677688599,
+      "learning_rate": 0.00012,
+      "loss": 0.0006499682553112507,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00065,
+      "step": 310,
+      "tokens/total": 5079040,
+      "tokens/train_per_sec_per_gpu": 14.84,
+      "tokens/trainable": 1613189
+    },
+    {
+      "epoch": 0.3103030303030303,
+      "grad_norm": 0.033964402973651886,
+      "learning_rate": 0.00012388349514563107,
+      "loss": 0.0008866124786436558,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00089,
+      "step": 320,
+      "tokens/total": 5242880,
+      "tokens/train_per_sec_per_gpu": 15.78,
+      "tokens/trainable": 1665681
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.04327597841620445,
+      "learning_rate": 0.00012776699029126213,
+      "loss": 0.0005569641944020987,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00056,
+      "step": 330,
+      "tokens/total": 5406720,
+      "tokens/train_per_sec_per_gpu": 14.92,
+      "tokens/trainable": 1718317
+    },
+    {
+      "epoch": 0.3296969696969697,
+      "grad_norm": 0.02717934548854828,
+      "learning_rate": 0.0001316504854368932,
+      "loss": 0.0003776244120672345,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00038,
+      "step": 340,
+      "tokens/total": 5570560,
+      "tokens/train_per_sec_per_gpu": 14.42,
+      "tokens/trainable": 1770210
+    },
+    {
+      "epoch": 0.3393939393939394,
+      "grad_norm": 0.0028237912338227034,
+      "learning_rate": 0.0001355339805825243,
+      "loss": 0.0005292522720992566,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00053,
+      "step": 350,
+      "tokens/total": 5734400,
+      "tokens/train_per_sec_per_gpu": 16.4,
+      "tokens/trainable": 1821987
+    },
+    {
+      "epoch": 0.3490909090909091,
+      "grad_norm": 0.0310799703001976,
+      "learning_rate": 0.00013941747572815535,
+      "loss": 0.0006786303594708443,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00068,
+      "step": 360,
+      "tokens/total": 5898240,
+      "tokens/train_per_sec_per_gpu": 14.72,
+      "tokens/trainable": 1874266
+    },
+    {
+      "epoch": 0.35878787878787877,
+      "grad_norm": 0.17325043678283691,
+      "learning_rate": 0.0001433009708737864,
+      "loss": 0.0013975565321743487,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0014,
+      "step": 370,
+      "tokens/total": 6062080,
+      "tokens/train_per_sec_per_gpu": 13.73,
+      "tokens/trainable": 1926124
+    },
+    {
+      "epoch": 0.36848484848484847,
+      "grad_norm": 0.07738752663135529,
+      "learning_rate": 0.0001471844660194175,
+      "loss": 0.0006820175796747208,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00068,
+      "step": 380,
+      "tokens/total": 6225920,
+      "tokens/train_per_sec_per_gpu": 14.04,
+      "tokens/trainable": 1978693
+    },
+    {
+      "epoch": 0.3781818181818182,
+      "grad_norm": 0.10022349655628204,
+      "learning_rate": 0.00015106796116504855,
+      "loss": 0.00063879219815135,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00064,
+      "step": 390,
+      "tokens/total": 6389760,
+      "tokens/train_per_sec_per_gpu": 13.34,
+      "tokens/trainable": 2030378
+    },
+    {
+      "epoch": 0.3878787878787879,
+      "grad_norm": 0.0495997779071331,
+      "learning_rate": 0.00015495145631067963,
+      "loss": 0.0021283581852912905,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00213,
+      "step": 400,
+      "tokens/total": 6553600,
+      "tokens/train_per_sec_per_gpu": 15.34,
+      "tokens/trainable": 2083047
+    },
+    {
+      "epoch": 0.3975757575757576,
+      "grad_norm": 0.07361701130867004,
+      "learning_rate": 0.0001588349514563107,
+      "loss": 0.001862115040421486,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00186,
+      "step": 410,
+      "tokens/total": 6717440,
+      "tokens/train_per_sec_per_gpu": 14.23,
+      "tokens/trainable": 2135527
+    },
+    {
+      "epoch": 0.4072727272727273,
+      "grad_norm": 0.05466209724545479,
+      "learning_rate": 0.00016271844660194174,
+      "loss": 0.0011581303551793098,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00116,
+      "step": 420,
+      "tokens/total": 6881280,
+      "tokens/train_per_sec_per_gpu": 14.77,
+      "tokens/trainable": 2187636
+    },
+    {
+      "epoch": 0.416969696969697,
+      "grad_norm": 0.04331392049789429,
+      "learning_rate": 0.00016660194174757283,
+      "loss": 0.0051729224622249605,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00519,
+      "step": 430,
+      "tokens/total": 7045120,
+      "tokens/train_per_sec_per_gpu": 13.76,
+      "tokens/trainable": 2239006
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.05931795388460159,
+      "learning_rate": 0.00017048543689320388,
+      "loss": 0.00242764875292778,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00243,
+      "step": 440,
+      "tokens/total": 7208960,
+      "tokens/train_per_sec_per_gpu": 14.59,
+      "tokens/trainable": 2290540
+    },
+    {
+      "epoch": 0.43636363636363634,
+      "grad_norm": 0.04634418711066246,
+      "learning_rate": 0.00017436893203883494,
+      "loss": 0.001389546226710081,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00139,
+      "step": 450,
+      "tokens/total": 7372800,
+      "tokens/train_per_sec_per_gpu": 14.78,
+      "tokens/trainable": 2341852
+    },
+    {
+      "epoch": 0.44606060606060605,
+      "grad_norm": 0.04817213863134384,
+      "learning_rate": 0.00017825242718446602,
+      "loss": 0.001370794139802456,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00137,
+      "step": 460,
+      "tokens/total": 7536640,
+      "tokens/train_per_sec_per_gpu": 13.77,
+      "tokens/trainable": 2393320
+    },
+    {
+      "epoch": 0.45575757575757575,
+      "grad_norm": 0.011335949413478374,
+      "learning_rate": 0.00018213592233009708,
+      "loss": 0.0009715131483972073,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00097,
+      "step": 470,
+      "tokens/total": 7700480,
+      "tokens/train_per_sec_per_gpu": 14.52,
+      "tokens/trainable": 2445170
+    },
+    {
+      "epoch": 0.46545454545454545,
+      "grad_norm": 0.05298445746302605,
+      "learning_rate": 0.00018601941747572816,
+      "loss": 0.0008222623728215694,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00082,
+      "step": 480,
+      "tokens/total": 7864320,
+      "tokens/train_per_sec_per_gpu": 13.87,
+      "tokens/trainable": 2497473
+    },
+    {
+      "epoch": 0.47515151515151516,
+      "grad_norm": 0.061686884611845016,
+      "learning_rate": 0.00018990291262135925,
+      "loss": 0.000748783303424716,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00075,
+      "step": 490,
+      "tokens/total": 8028160,
+      "tokens/train_per_sec_per_gpu": 15.41,
+      "tokens/trainable": 2549206
+    },
+    {
+      "epoch": 0.48484848484848486,
+      "grad_norm": 0.03281249850988388,
+      "learning_rate": 0.0001937864077669903,
+      "loss": 0.0006062469445168972,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00061,
+      "step": 500,
+      "tokens/total": 8192000,
+      "tokens/train_per_sec_per_gpu": 14.49,
+      "tokens/trainable": 2600583
+    },
+    {
+      "epoch": 0.49454545454545457,
+      "grad_norm": 0.008482079952955246,
+      "learning_rate": 0.0001976699029126214,
+      "loss": 0.0008583014830946922,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00086,
+      "step": 510,
+      "tokens/total": 8355840,
+      "tokens/train_per_sec_per_gpu": 13.86,
+      "tokens/trainable": 2652927
+    },
+    {
+      "epoch": 0.5003636363636363,
+      "eval_loss": 0.0009036393603309989,
+      "eval_ppl": 1.0009,
+      "eval_runtime": 12.7872,
+      "eval_samples_per_second": 15.641,
+      "eval_steps_per_second": 7.82,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "step": 516
+    },
+    {
+      "epoch": 0.5042424242424243,
+      "grad_norm": 0.04333305358886719,
+      "learning_rate": 0.0001999996332640321,
+      "loss": 0.0005093200132250785,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00051,
+      "step": 520,
+      "tokens/total": 8519680,
+      "tokens/train_per_sec_per_gpu": 14.09,
+      "tokens/trainable": 2705083
+    },
+    {
+      "epoch": 0.5139393939393939,
+      "grad_norm": 0.02485118806362152,
+      "learning_rate": 0.00019999550751528488,
+      "loss": 0.0006649125367403031,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00067,
+      "step": 530,
+      "tokens/total": 8683520,
+      "tokens/train_per_sec_per_gpu": 14.44,
+      "tokens/trainable": 2756975
+    },
+    {
+      "epoch": 0.5236363636363637,
+      "grad_norm": 0.03736363351345062,
+      "learning_rate": 0.00019998679778759294,
+      "loss": 0.0006726076360791921,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00067,
+      "step": 540,
+      "tokens/total": 8847360,
+      "tokens/train_per_sec_per_gpu": 14.16,
+      "tokens/trainable": 2808076
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.05156765505671501,
+      "learning_rate": 0.0001999735044802263,
+      "loss": 0.000789718609303236,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00079,
+      "step": 550,
+      "tokens/total": 9011200,
+      "tokens/train_per_sec_per_gpu": 16.36,
+      "tokens/trainable": 2859893
+    },
+    {
+      "epoch": 0.5430303030303031,
+      "grad_norm": 0.647550106048584,
+      "learning_rate": 0.00019995562820257474,
+      "loss": 0.003008325584232807,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00301,
+      "step": 560,
+      "tokens/total": 9175040,
+      "tokens/train_per_sec_per_gpu": 14.21,
+      "tokens/trainable": 2911399
+    },
+    {
+      "epoch": 0.5527272727272727,
+      "grad_norm": 0.185165673494339,
+      "learning_rate": 0.00019993316977411993,
+      "loss": 0.013715097308158874,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.01381,
+      "step": 570,
+      "tokens/total": 9338880,
+      "tokens/train_per_sec_per_gpu": 13.85,
+      "tokens/trainable": 2962403
+    },
+    {
+      "epoch": 0.5624242424242424,
+      "grad_norm": 0.2401553839445114,
+      "learning_rate": 0.0001999061302243977,
+      "loss": 0.009026474505662917,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00907,
+      "step": 580,
+      "tokens/total": 9502720,
+      "tokens/train_per_sec_per_gpu": 14.38,
+      "tokens/trainable": 3015083
+    },
+    {
+      "epoch": 0.5721212121212121,
+      "grad_norm": 0.08092579245567322,
+      "learning_rate": 0.000199874510792951,
+      "loss": 0.005716494470834732,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00573,
+      "step": 590,
+      "tokens/total": 9666560,
+      "tokens/train_per_sec_per_gpu": 16.38,
+      "tokens/trainable": 3066501
+    },
+    {
+      "epoch": 0.5818181818181818,
+      "grad_norm": 3.418715476989746,
+      "learning_rate": 0.00019983831292927305,
+      "loss": 0.048504295945167544,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0497,
+      "step": 600,
+      "tokens/total": 9830400,
+      "tokens/train_per_sec_per_gpu": 14.23,
+      "tokens/trainable": 3118633
+    },
+    {
+      "epoch": 0.5915151515151515,
+      "grad_norm": 0.2194036841392517,
+      "learning_rate": 0.00019979753829274085,
+      "loss": 0.03429323434829712,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.03489,
+      "step": 610,
+      "tokens/total": 9994240,
+      "tokens/train_per_sec_per_gpu": 13.14,
+      "tokens/trainable": 3170577
+    },
+    {
+      "epoch": 0.6012121212121212,
+      "grad_norm": 0.022929901257157326,
+      "learning_rate": 0.0001997521887525391,
+      "loss": 0.0015171168372035027,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00152,
+      "step": 620,
+      "tokens/total": 10158080,
+      "tokens/train_per_sec_per_gpu": 14.24,
+      "tokens/trainable": 3221696
+    },
+    {
+      "epoch": 0.610909090909091,
+      "grad_norm": 0.10083670169115067,
+      "learning_rate": 0.00019970226638757458,
+      "loss": 0.0025377947837114333,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00254,
+      "step": 630,
+      "tokens/total": 10321920,
+      "tokens/train_per_sec_per_gpu": 14.7,
+      "tokens/trainable": 3273775
+    },
+    {
+      "epoch": 0.6206060606060606,
+      "grad_norm": 0.01761380024254322,
+      "learning_rate": 0.00019964777348638083,
+      "loss": 0.002281896211206913,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00228,
+      "step": 640,
+      "tokens/total": 10485760,
+      "tokens/train_per_sec_per_gpu": 14.89,
+      "tokens/trainable": 3325516
+    },
+    {
+      "epoch": 0.6303030303030303,
+      "grad_norm": 0.004510029684752226,
+      "learning_rate": 0.00019958871254701315,
+      "loss": 0.0009477110579609871,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00095,
+      "step": 650,
+      "tokens/total": 10649600,
+      "tokens/train_per_sec_per_gpu": 16.46,
+      "tokens/trainable": 3377214
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.05332477018237114,
+      "learning_rate": 0.0001995250862769342,
+      "loss": 0.0005660496186465025,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00057,
+      "step": 660,
+      "tokens/total": 10813440,
+      "tokens/train_per_sec_per_gpu": 14.52,
+      "tokens/trainable": 3428627
+    },
+    {
+      "epoch": 0.6496969696969697,
+      "grad_norm": 0.03861689195036888,
+      "learning_rate": 0.0001994568975928899,
+      "loss": 0.0008976863697171211,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0009,
+      "step": 670,
+      "tokens/total": 10977280,
+      "tokens/train_per_sec_per_gpu": 15.66,
+      "tokens/trainable": 3480170
+    },
+    {
+      "epoch": 0.6593939393939394,
+      "grad_norm": 0.021123304963111877,
+      "learning_rate": 0.00019938414962077553,
+      "loss": 0.0009612766094505787,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00096,
+      "step": 680,
+      "tokens/total": 11141120,
+      "tokens/train_per_sec_per_gpu": 15.15,
+      "tokens/trainable": 3532037
+    },
+    {
+      "epoch": 0.6690909090909091,
+      "grad_norm": 0.02421347238123417,
+      "learning_rate": 0.00019930684569549264,
+      "loss": 0.001021684519946575,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00102,
+      "step": 690,
+      "tokens/total": 11304960,
+      "tokens/train_per_sec_per_gpu": 14.16,
+      "tokens/trainable": 3583461
+    },
+    {
+      "epoch": 0.6787878787878788,
+      "grad_norm": 0.05008835345506668,
+      "learning_rate": 0.00019922498936079613,
+      "loss": 0.0007617876864969731,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00076,
+      "step": 700,
+      "tokens/total": 11468800,
+      "tokens/train_per_sec_per_gpu": 14.08,
+      "tokens/trainable": 3634649
+    },
+    {
+      "epoch": 0.6884848484848485,
+      "grad_norm": 0.035733792930841446,
+      "learning_rate": 0.00019913858436913171,
+      "loss": 0.0012347914278507232,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00124,
+      "step": 710,
+      "tokens/total": 11632640,
+      "tokens/train_per_sec_per_gpu": 14.45,
+      "tokens/trainable": 3685786
+    },
+    {
+      "epoch": 0.6981818181818182,
+      "grad_norm": 0.010948767885565758,
+      "learning_rate": 0.00019904763468146393,
+      "loss": 0.0008165687322616577,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00082,
+      "step": 720,
+      "tokens/total": 11796480,
+      "tokens/train_per_sec_per_gpu": 15.77,
+      "tokens/trainable": 3737566
+    },
+    {
+      "epoch": 0.7078787878787879,
+      "grad_norm": 0.03577027469873428,
+      "learning_rate": 0.00019895214446709463,
+      "loss": 0.001333119161427021,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00133,
+      "step": 730,
+      "tokens/total": 11960320,
+      "tokens/train_per_sec_per_gpu": 13.98,
+      "tokens/trainable": 3789817
+    },
+    {
+      "epoch": 0.7175757575757575,
+      "grad_norm": 0.03971279785037041,
+      "learning_rate": 0.00019885211810347184,
+      "loss": 0.0011184611357748508,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00112,
+      "step": 740,
+      "tokens/total": 12124160,
+      "tokens/train_per_sec_per_gpu": 14.67,
+      "tokens/trainable": 3841912
+    },
+    {
+      "epoch": 0.7272727272727273,
+      "grad_norm": 0.06546575576066971,
+      "learning_rate": 0.00019874756017598894,
+      "loss": 0.0012452728115022182,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00125,
+      "step": 750,
+      "tokens/total": 12288000,
+      "tokens/train_per_sec_per_gpu": 14.58,
+      "tokens/trainable": 3893725
+    },
+    {
+      "epoch": 0.7369696969696969,
+      "grad_norm": 0.047058816999197006,
+      "learning_rate": 0.00019863847547777467,
+      "loss": 0.0008146104402840138,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00081,
+      "step": 760,
+      "tokens/total": 12451840,
+      "tokens/train_per_sec_per_gpu": 13.49,
+      "tokens/trainable": 3945033
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.028811641037464142,
+      "learning_rate": 0.00019852486900947327,
+      "loss": 0.0008652995340526104,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00087,
+      "step": 770,
+      "tokens/total": 12615680,
+      "tokens/train_per_sec_per_gpu": 15.12,
+      "tokens/trainable": 3996749
+    },
+    {
+      "epoch": 0.7563636363636363,
+      "grad_norm": 0.012203546240925789,
+      "learning_rate": 0.0001984067459790153,
+      "loss": 0.000670672720298171,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00067,
+      "step": 780,
+      "tokens/total": 12779520,
+      "tokens/train_per_sec_per_gpu": 13.71,
+      "tokens/trainable": 4048173
+    },
+    {
+      "epoch": 0.7660606060606061,
+      "grad_norm": 0.016218814998865128,
+      "learning_rate": 0.0001982841118013789,
+      "loss": 0.00046353964135050776,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00046,
+      "step": 790,
+      "tokens/total": 12943360,
+      "tokens/train_per_sec_per_gpu": 15.1,
+      "tokens/trainable": 4099789
+    },
+    {
+      "epoch": 0.7757575757575758,
+      "grad_norm": 0.034673016518354416,
+      "learning_rate": 0.00019815697209834147,
+      "loss": 0.000707306619733572,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00071,
+      "step": 800,
+      "tokens/total": 13107200,
+      "tokens/train_per_sec_per_gpu": 14.45,
+      "tokens/trainable": 4150960
+    },
+    {
+      "epoch": 0.7854545454545454,
+      "grad_norm": 0.0022127812262624502,
+      "learning_rate": 0.00019802533269822208,
+      "loss": 0.00021896373946219682,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00022,
+      "step": 810,
+      "tokens/total": 13271040,
+      "tokens/train_per_sec_per_gpu": 14.75,
+      "tokens/trainable": 4202984
+    },
+    {
+      "epoch": 0.7951515151515152,
+      "grad_norm": 0.000919274752959609,
+      "learning_rate": 0.00019788919963561422,
+      "loss": 0.00043264860287308695,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00043,
+      "step": 820,
+      "tokens/total": 13434880,
+      "tokens/train_per_sec_per_gpu": 14.06,
+      "tokens/trainable": 4254907
+    },
+    {
+      "epoch": 0.8048484848484848,
+      "grad_norm": 0.007699873298406601,
+      "learning_rate": 0.00019774857915110913,
+      "loss": 0.0003196246922016144,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00032,
+      "step": 830,
+      "tokens/total": 13598720,
+      "tokens/train_per_sec_per_gpu": 14.75,
+      "tokens/trainable": 4306095
+    },
+    {
+      "epoch": 0.8145454545454546,
+      "grad_norm": 0.015523642301559448,
+      "learning_rate": 0.00019760347769100987,
+      "loss": 0.0004476988688111305,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00045,
+      "step": 840,
+      "tokens/total": 13762560,
+      "tokens/train_per_sec_per_gpu": 14.14,
+      "tokens/trainable": 4357442
+    },
+    {
+      "epoch": 0.8242424242424242,
+      "grad_norm": 0.013460986316204071,
+      "learning_rate": 0.00019745390190703565,
+      "loss": 0.0004673306830227375,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00047,
+      "step": 850,
+      "tokens/total": 13926400,
+      "tokens/train_per_sec_per_gpu": 14.1,
+      "tokens/trainable": 4409277
+    },
+    {
+      "epoch": 0.833939393939394,
+      "grad_norm": 0.0014691110700368881,
+      "learning_rate": 0.0001972998586560169,
+      "loss": 0.0003277578856796026,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00033,
+      "step": 860,
+      "tokens/total": 14090240,
+      "tokens/train_per_sec_per_gpu": 14.28,
+      "tokens/trainable": 4460714
+    },
+    {
+      "epoch": 0.8436363636363636,
+      "grad_norm": 0.001358041656203568,
+      "learning_rate": 0.00019714135499958112,
+      "loss": 0.00032470382284373046,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00032,
+      "step": 870,
+      "tokens/total": 14254080,
+      "tokens/train_per_sec_per_gpu": 13.85,
+      "tokens/trainable": 4511989
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.04510723799467087,
+      "learning_rate": 0.0001969783982038289,
+      "loss": 0.00023182881996035575,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00023,
+      "step": 880,
+      "tokens/total": 14417920,
+      "tokens/train_per_sec_per_gpu": 15.41,
+      "tokens/trainable": 4563354
+    },
+    {
+      "epoch": 0.863030303030303,
+      "grad_norm": 0.14508692920207977,
+      "learning_rate": 0.00019681099573900113,
+      "loss": 0.00026136748492717744,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00026,
+      "step": 890,
+      "tokens/total": 14581760,
+      "tokens/train_per_sec_per_gpu": 13.85,
+      "tokens/trainable": 4615691
+    },
+    {
+      "epoch": 0.8727272727272727,
+      "grad_norm": 0.010969490744173527,
+      "learning_rate": 0.00019663915527913625,
+      "loss": 0.00016044279327616097,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00016,
+      "step": 900,
+      "tokens/total": 14745600,
+      "tokens/train_per_sec_per_gpu": 15.76,
+      "tokens/trainable": 4667433
+    },
+    {
+      "epoch": 0.8824242424242424,
+      "grad_norm": 0.03874114155769348,
+      "learning_rate": 0.00019646288470171868,
+      "loss": 0.0004159804433584213,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00042,
+      "step": 910,
+      "tokens/total": 14909440,
+      "tokens/train_per_sec_per_gpu": 16.01,
+      "tokens/trainable": 4719807
+    },
+    {
+      "epoch": 0.8921212121212121,
+      "grad_norm": 0.044620465487241745,
+      "learning_rate": 0.00019628219208731756,
+      "loss": 0.0006739750038832426,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00067,
+      "step": 920,
+      "tokens/total": 15073280,
+      "tokens/train_per_sec_per_gpu": 15.05,
+      "tokens/trainable": 4771772
+    },
+    {
+      "epoch": 0.9018181818181819,
+      "grad_norm": 0.024856949225068092,
+      "learning_rate": 0.00019609708571921645,
+      "loss": 0.00039347023703157903,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00039,
+      "step": 930,
+      "tokens/total": 15237120,
+      "tokens/train_per_sec_per_gpu": 15.16,
+      "tokens/trainable": 4823415
+    },
+    {
+      "epoch": 0.9115151515151515,
+      "grad_norm": 0.022198157384991646,
+      "learning_rate": 0.0001959075740830335,
+      "loss": 0.0005907822400331497,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00059,
+      "step": 940,
+      "tokens/total": 15400960,
+      "tokens/train_per_sec_per_gpu": 15.36,
+      "tokens/trainable": 4875269
+    },
+    {
+      "epoch": 0.9212121212121213,
+      "grad_norm": 0.01670038513839245,
+      "learning_rate": 0.00019571366586633245,
+      "loss": 0.00027316866908222437,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00027,
+      "step": 950,
+      "tokens/total": 15564800,
+      "tokens/train_per_sec_per_gpu": 15.11,
+      "tokens/trainable": 4927244
+    },
+    {
+      "epoch": 0.9309090909090909,
+      "grad_norm": 0.021392742171883583,
+      "learning_rate": 0.00019551536995822454,
+      "loss": 0.0004320886451750994,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00043,
+      "step": 960,
+      "tokens/total": 15728640,
+      "tokens/train_per_sec_per_gpu": 14.16,
+      "tokens/trainable": 4979068
+    },
+    {
+      "epoch": 0.9406060606060606,
+      "grad_norm": 0.028143158182501793,
+      "learning_rate": 0.00019531269544896076,
+      "loss": 0.0005637989845126868,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00056,
+      "step": 970,
+      "tokens/total": 15892480,
+      "tokens/train_per_sec_per_gpu": 14.26,
+      "tokens/trainable": 5030980
+    },
+    {
+      "epoch": 0.9503030303030303,
+      "grad_norm": 0.077091746032238,
+      "learning_rate": 0.00019510565162951537,
+      "loss": 0.0010597245767712594,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00106,
+      "step": 980,
+      "tokens/total": 16056320,
+      "tokens/train_per_sec_per_gpu": 14.04,
+      "tokens/trainable": 5082759
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.04455556347966194,
+      "learning_rate": 0.00019489424799115984,
+      "loss": 0.0009517236612737179,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00095,
+      "step": 990,
+      "tokens/total": 16220160,
+      "tokens/train_per_sec_per_gpu": 13.04,
+      "tokens/trainable": 5134379
+    },
+    {
+      "epoch": 0.9696969696969697,
+      "grad_norm": 0.03573840856552124,
+      "learning_rate": 0.00019467849422502784,
+      "loss": 0.0008812972344458103,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00088,
+      "step": 1000,
+      "tokens/total": 16384000,
+      "tokens/train_per_sec_per_gpu": 15.23,
+      "tokens/trainable": 5186184
+    },
+    {
+      "epoch": 0.9793939393939394,
+      "grad_norm": 0.0006549305398948491,
+      "learning_rate": 0.0001944584002216709,
+      "loss": 0.0006358013488352299,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00064,
+      "step": 1010,
+      "tokens/total": 16547840,
+      "tokens/train_per_sec_per_gpu": 16.1,
+      "tokens/trainable": 5238320
+    },
+    {
+      "epoch": 0.9890909090909091,
+      "grad_norm": 0.021742813289165497,
+      "learning_rate": 0.00019423397607060507,
+      "loss": 0.000400003744289279,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0004,
+      "step": 1020,
+      "tokens/total": 16711680,
+      "tokens/train_per_sec_per_gpu": 14.53,
+      "tokens/trainable": 5290445
+    },
+    {
+      "epoch": 0.9987878787878788,
+      "grad_norm": 0.04323820024728775,
+      "learning_rate": 0.00019400523205984833,
+      "loss": 0.0002954686991870403,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0003,
+      "step": 1030,
+      "tokens/total": 16875520,
+      "tokens/train_per_sec_per_gpu": 14.98,
+      "tokens/trainable": 5342720
+    },
+    {
+      "epoch": 1.001939393939394,
+      "eval_loss": 0.00047458006883971393,
+      "eval_ppl": 1.00047,
+      "eval_runtime": 11.7938,
+      "eval_samples_per_second": 16.958,
+      "eval_steps_per_second": 8.479,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.73,
+      "memory/max_allocated (GiB)": 16.73,
+      "step": 1032
+    },
+    {
+      "epoch": 1.0096969696969698,
+      "grad_norm": 0.000988126266747713,
+      "learning_rate": 0.00019377217867544907,
+      "loss": 0.0004762394353747368,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00048,
+      "step": 1040,
+      "tokens/total": 17051648,
+      "tokens/train_per_sec_per_gpu": 14.47,
+      "tokens/trainable": 5398184
+    },
+    {
+      "epoch": 1.0193939393939393,
+      "grad_norm": 0.0011711094994097948,
+      "learning_rate": 0.00019353482660100537,
+      "loss": 0.00022675264626741408,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00023,
+      "step": 1050,
+      "tokens/total": 17215488,
+      "tokens/train_per_sec_per_gpu": 14.05,
+      "tokens/trainable": 5450329
+    },
+    {
+      "epoch": 1.029090909090909,
+      "grad_norm": 0.007319436874240637,
+      "learning_rate": 0.0001932931867171751,
+      "loss": 0.0003059083363041282,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00031,
+      "step": 1060,
+      "tokens/total": 17379328,
+      "tokens/train_per_sec_per_gpu": 13.66,
+      "tokens/trainable": 5502706
+    },
+    {
+      "epoch": 1.0387878787878788,
+      "grad_norm": 0.00967186689376831,
+      "learning_rate": 0.0001930472701011773,
+      "loss": 0.0003639918984845281,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00036,
+      "step": 1070,
+      "tokens/total": 17543168,
+      "tokens/train_per_sec_per_gpu": 15.36,
+      "tokens/trainable": 5554957
+    },
+    {
+      "epoch": 1.0484848484848486,
+      "grad_norm": 0.0018478024285286665,
+      "learning_rate": 0.00019279708802628437,
+      "loss": 0.0002576910424977541,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00026,
+      "step": 1080,
+      "tokens/total": 17707008,
+      "tokens/train_per_sec_per_gpu": 14.73,
+      "tokens/trainable": 5607534
+    },
+    {
+      "epoch": 1.0581818181818181,
+      "grad_norm": 0.018235478550195694,
+      "learning_rate": 0.00019254265196130517,
+      "loss": 0.0003647733014076948,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00036,
+      "step": 1090,
+      "tokens/total": 17870848,
+      "tokens/train_per_sec_per_gpu": 14.24,
+      "tokens/trainable": 5659689
+    },
+    {
+      "epoch": 1.0678787878787879,
+      "grad_norm": 0.024314021691679955,
+      "learning_rate": 0.0001922839735700593,
+      "loss": 0.00030459570698440077,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0003,
+      "step": 1100,
+      "tokens/total": 18034688,
+      "tokens/train_per_sec_per_gpu": 13.67,
+      "tokens/trainable": 5711346
+    },
+    {
+      "epoch": 1.0775757575757576,
+      "grad_norm": 0.0177497286349535,
+      "learning_rate": 0.0001920210647108425,
+      "loss": 0.00023341022897511722,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00023,
+      "step": 1110,
+      "tokens/total": 18198528,
+      "tokens/train_per_sec_per_gpu": 14.13,
+      "tokens/trainable": 5763094
+    },
+    {
+      "epoch": 1.0872727272727274,
+      "grad_norm": 0.005781313870102167,
+      "learning_rate": 0.00019175393743588295,
+      "loss": 0.0002974884817376733,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0003,
+      "step": 1120,
+      "tokens/total": 18362368,
+      "tokens/train_per_sec_per_gpu": 14.55,
+      "tokens/trainable": 5815101
+    },
+    {
+      "epoch": 1.096969696969697,
+      "grad_norm": 0.0026403339579701424,
+      "learning_rate": 0.00019148260399078887,
+      "loss": 0.00010604445124045015,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00011,
+      "step": 1130,
+      "tokens/total": 18526208,
+      "tokens/train_per_sec_per_gpu": 13.87,
+      "tokens/trainable": 5866763
+    },
+    {
+      "epoch": 1.1066666666666667,
+      "grad_norm": 0.03586777299642563,
+      "learning_rate": 0.000191207076813987,
+      "loss": 0.00027820770628750324,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00028,
+      "step": 1140,
+      "tokens/total": 18690048,
+      "tokens/train_per_sec_per_gpu": 13.83,
+      "tokens/trainable": 5918322
+    },
+    {
+      "epoch": 1.1163636363636364,
+      "grad_norm": 0.007715190295130014,
+      "learning_rate": 0.00019092736853615257,
+      "loss": 0.00029321699403226373,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00029,
+      "step": 1150,
+      "tokens/total": 18853888,
+      "tokens/train_per_sec_per_gpu": 13.95,
+      "tokens/trainable": 5970153
+    },
+    {
+      "epoch": 1.126060606060606,
+      "grad_norm": 0.05122547224164009,
+      "learning_rate": 0.00019064349197963013,
+      "loss": 0.0005070990417152643,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00051,
+      "step": 1160,
+      "tokens/total": 19017728,
+      "tokens/train_per_sec_per_gpu": 15.51,
+      "tokens/trainable": 6021741
+    },
+    {
+      "epoch": 1.1357575757575757,
+      "grad_norm": 0.032420564442873,
+      "learning_rate": 0.000190355460157846,
+      "loss": 0.00031497194431722163,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00032,
+      "step": 1170,
+      "tokens/total": 19181568,
+      "tokens/train_per_sec_per_gpu": 16.05,
+      "tokens/trainable": 6074092
+    },
+    {
+      "epoch": 1.1454545454545455,
+      "grad_norm": 0.03688061609864235,
+      "learning_rate": 0.00019006328627471132,
+      "loss": 0.0003225028282031417,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00032,
+      "step": 1180,
+      "tokens/total": 19345408,
+      "tokens/train_per_sec_per_gpu": 14.1,
+      "tokens/trainable": 6126315
+    },
+    {
+      "epoch": 1.1551515151515153,
+      "grad_norm": 0.03359396383166313,
+      "learning_rate": 0.00018976698372401716,
+      "loss": 0.0004557626787573099,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00046,
+      "step": 1190,
+      "tokens/total": 19509248,
+      "tokens/train_per_sec_per_gpu": 14.6,
+      "tokens/trainable": 6178392
+    },
+    {
+      "epoch": 1.1648484848484848,
+      "grad_norm": 0.020522581413388252,
+      "learning_rate": 0.0001894665660888202,
+      "loss": 0.0006435967981815339,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00064,
+      "step": 1200,
+      "tokens/total": 19673088,
+      "tokens/train_per_sec_per_gpu": 15.47,
+      "tokens/trainable": 6230984
+    },
+    {
+      "epoch": 1.1745454545454546,
+      "grad_norm": 0.0025893959682434797,
+      "learning_rate": 0.00018916204714082034,
+      "loss": 0.0005178887862712145,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00052,
+      "step": 1210,
+      "tokens/total": 19836928,
+      "tokens/train_per_sec_per_gpu": 14.13,
+      "tokens/trainable": 6282713
+    },
+    {
+      "epoch": 1.1842424242424243,
+      "grad_norm": 0.017288153991103172,
+      "learning_rate": 0.00018885344083972914,
+      "loss": 0.0005050559528172016,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00051,
+      "step": 1220,
+      "tokens/total": 20000768,
+      "tokens/train_per_sec_per_gpu": 14.31,
+      "tokens/trainable": 6334555
+    },
+    {
+      "epoch": 1.1939393939393939,
+      "grad_norm": 0.00206086877733469,
+      "learning_rate": 0.00018854076133263003,
+      "loss": 0.00020185327157378196,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0002,
+      "step": 1230,
+      "tokens/total": 20164608,
+      "tokens/train_per_sec_per_gpu": 14.72,
+      "tokens/trainable": 6386137
+    },
+    {
+      "epoch": 1.2036363636363636,
+      "grad_norm": 0.02184407040476799,
+      "learning_rate": 0.0001882240229533297,
+      "loss": 0.00048260441981256007,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00048,
+      "step": 1240,
+      "tokens/total": 20328448,
+      "tokens/train_per_sec_per_gpu": 14.35,
+      "tokens/trainable": 6437493
+    },
+    {
+      "epoch": 1.2133333333333334,
+      "grad_norm": 0.04215926304459572,
+      "learning_rate": 0.00018790324022170118,
+      "loss": 0.0003190681803971529,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00032,
+      "step": 1250,
+      "tokens/total": 20492288,
+      "tokens/train_per_sec_per_gpu": 14.51,
+      "tokens/trainable": 6488834
+    },
+    {
+      "epoch": 1.2230303030303031,
+      "grad_norm": 0.006890668533742428,
+      "learning_rate": 0.00018757842784301784,
+      "loss": 0.0005027144681662322,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0005,
+      "step": 1260,
+      "tokens/total": 20656128,
+      "tokens/train_per_sec_per_gpu": 14.26,
+      "tokens/trainable": 6540606
+    },
+    {
+      "epoch": 1.2327272727272727,
+      "grad_norm": 0.005489532835781574,
+      "learning_rate": 0.00018724960070727972,
+      "loss": 0.0006080259568989277,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00061,
+      "step": 1270,
+      "tokens/total": 20819968,
+      "tokens/train_per_sec_per_gpu": 13.92,
+      "tokens/trainable": 6592727
+    },
+    {
+      "epoch": 1.2424242424242424,
+      "grad_norm": 0.005877023097127676,
+      "learning_rate": 0.00018691677388853068,
+      "loss": 0.0006749071180820465,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00068,
+      "step": 1280,
+      "tokens/total": 20983808,
+      "tokens/train_per_sec_per_gpu": 14.93,
+      "tokens/trainable": 6645179
+    },
+    {
+      "epoch": 1.2521212121212122,
+      "grad_norm": 0.0061390516348183155,
+      "learning_rate": 0.00018657996264416745,
+      "loss": 0.0002642946550622582,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00026,
+      "step": 1290,
+      "tokens/total": 21147648,
+      "tokens/train_per_sec_per_gpu": 14.92,
+      "tokens/trainable": 6697406
+    },
+    {
+      "epoch": 1.2618181818181817,
+      "grad_norm": 0.03444842994213104,
+      "learning_rate": 0.0001862391824142402,
+      "loss": 0.0004464905709028244,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00045,
+      "step": 1300,
+      "tokens/total": 21311488,
+      "tokens/train_per_sec_per_gpu": 15.07,
+      "tokens/trainable": 6749589
+    },
+    {
+      "epoch": 1.2715151515151515,
+      "grad_norm": 0.0036635284777730703,
+      "learning_rate": 0.00018589444882074474,
+      "loss": 0.0002096141455695033,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00021,
+      "step": 1310,
+      "tokens/total": 21475328,
+      "tokens/train_per_sec_per_gpu": 13.69,
+      "tokens/trainable": 6801799
+    },
+    {
+      "epoch": 1.2812121212121212,
+      "grad_norm": 0.003200239036232233,
+      "learning_rate": 0.00018554577766690636,
+      "loss": 0.00026335257571190595,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00026,
+      "step": 1320,
+      "tokens/total": 21639168,
+      "tokens/train_per_sec_per_gpu": 14.58,
+      "tokens/trainable": 6854205
+    },
+    {
+      "epoch": 1.290909090909091,
+      "grad_norm": 0.00109296350274235,
+      "learning_rate": 0.0001851931849364554,
+      "loss": 0.0003910743165761232,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00039,
+      "step": 1330,
+      "tokens/total": 21803008,
+      "tokens/train_per_sec_per_gpu": 14.96,
+      "tokens/trainable": 6906145
+    },
+    {
+      "epoch": 1.3006060606060605,
+      "grad_norm": 0.0006913666147738695,
+      "learning_rate": 0.00018483668679289452,
+      "loss": 0.0003079640679061413,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00031,
+      "step": 1340,
+      "tokens/total": 21966848,
+      "tokens/train_per_sec_per_gpu": 15.13,
+      "tokens/trainable": 6957405
+    },
+    {
+      "epoch": 1.3103030303030303,
+      "grad_norm": 0.03036116063594818,
+      "learning_rate": 0.00018447629957875776,
+      "loss": 0.0003281526267528534,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00033,
+      "step": 1350,
+      "tokens/total": 22130688,
+      "tokens/train_per_sec_per_gpu": 15.08,
+      "tokens/trainable": 7009256
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.012580045498907566,
+      "learning_rate": 0.00018411203981486134,
+      "loss": 0.0006514057982712984,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00065,
+      "step": 1360,
+      "tokens/total": 22294528,
+      "tokens/train_per_sec_per_gpu": 14.66,
+      "tokens/trainable": 7060734
+    },
+    {
+      "epoch": 1.3296969696969696,
+      "grad_norm": 0.00828342791646719,
+      "learning_rate": 0.00018374392419954628,
+      "loss": 0.0003020781092345715,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0003,
+      "step": 1370,
+      "tokens/total": 22458368,
+      "tokens/train_per_sec_per_gpu": 15.09,
+      "tokens/trainable": 7112415
+    },
+    {
+      "epoch": 1.3393939393939394,
+      "grad_norm": 0.09482505917549133,
+      "learning_rate": 0.00018337196960791302,
+      "loss": 0.0006797847803682089,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00068,
+      "step": 1380,
+      "tokens/total": 22622208,
+      "tokens/train_per_sec_per_gpu": 15.03,
+      "tokens/trainable": 7164110
+    },
+    {
+      "epoch": 1.3490909090909091,
+      "grad_norm": 0.04534842446446419,
+      "learning_rate": 0.00018299619309104773,
+      "loss": 0.000729580270126462,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00073,
+      "step": 1390,
+      "tokens/total": 22786048,
+      "tokens/train_per_sec_per_gpu": 15.49,
+      "tokens/trainable": 7215797
+    },
+    {
+      "epoch": 1.3587878787878789,
+      "grad_norm": 0.010737202130258083,
+      "learning_rate": 0.00018261661187524072,
+      "loss": 0.0007514740340411663,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00075,
+      "step": 1400,
+      "tokens/total": 22949888,
+      "tokens/train_per_sec_per_gpu": 14.14,
+      "tokens/trainable": 7267691
+    },
+    {
+      "epoch": 1.3684848484848484,
+      "grad_norm": 0.05600081756711006,
+      "learning_rate": 0.00018223324336119672,
+      "loss": 0.001420076284557581,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00142,
+      "step": 1410,
+      "tokens/total": 23113728,
+      "tokens/train_per_sec_per_gpu": 15.3,
+      "tokens/trainable": 7319876
+    },
+    {
+      "epoch": 1.3781818181818182,
+      "grad_norm": 0.019460471346974373,
+      "learning_rate": 0.00018184610512323718,
+      "loss": 0.0022406818345189093,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00224,
+      "step": 1420,
+      "tokens/total": 23277568,
+      "tokens/train_per_sec_per_gpu": 14.38,
+      "tokens/trainable": 7371762
+    },
+    {
+      "epoch": 1.387878787878788,
+      "grad_norm": 0.03277068957686424,
+      "learning_rate": 0.00018145521490849477,
+      "loss": 0.000915923435240984,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00092,
+      "step": 1430,
+      "tokens/total": 23441408,
+      "tokens/train_per_sec_per_gpu": 14.66,
+      "tokens/trainable": 7423685
+    },
+    {
+      "epoch": 1.3975757575757575,
+      "grad_norm": 0.0156385600566864,
+      "learning_rate": 0.0001810605906360996,
+      "loss": 0.000897888746112585,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0009,
+      "step": 1440,
+      "tokens/total": 23605248,
+      "tokens/train_per_sec_per_gpu": 13.99,
+      "tokens/trainable": 7476266
+    },
+    {
+      "epoch": 1.4072727272727272,
+      "grad_norm": 0.01643913984298706,
+      "learning_rate": 0.00018066225039635794,
+      "loss": 0.000922933965921402,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00092,
+      "step": 1450,
+      "tokens/total": 23769088,
+      "tokens/train_per_sec_per_gpu": 14.57,
+      "tokens/trainable": 7528208
+    },
+    {
+      "epoch": 1.416969696969697,
+      "grad_norm": 0.024322666227817535,
+      "learning_rate": 0.00018026021244992287,
+      "loss": 0.0011652217246592045,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00117,
+      "step": 1460,
+      "tokens/total": 23932928,
+      "tokens/train_per_sec_per_gpu": 13.91,
+      "tokens/trainable": 7580038
+    },
+    {
+      "epoch": 1.4266666666666667,
+      "grad_norm": 0.05165834724903107,
+      "learning_rate": 0.0001798544952269572,
+      "loss": 0.0009731135331094265,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00097,
+      "step": 1470,
+      "tokens/total": 24096768,
+      "tokens/train_per_sec_per_gpu": 14.56,
+      "tokens/trainable": 7631772
+    },
+    {
+      "epoch": 1.4363636363636363,
+      "grad_norm": 0.02529827691614628,
+      "learning_rate": 0.0001794451173262885,
+      "loss": 0.0005802253726869822,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00058,
+      "step": 1480,
+      "tokens/total": 24260608,
+      "tokens/train_per_sec_per_gpu": 13.72,
+      "tokens/trainable": 7683048
+    },
+    {
+      "epoch": 1.446060606060606,
+      "grad_norm": 0.0670745000243187,
+      "learning_rate": 0.00017903209751455665,
+      "loss": 0.000642474414780736,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00064,
+      "step": 1490,
+      "tokens/total": 24424448,
+      "tokens/train_per_sec_per_gpu": 14.33,
+      "tokens/trainable": 7735332
+    },
+    {
+      "epoch": 1.4557575757575758,
+      "grad_norm": 0.02367187850177288,
+      "learning_rate": 0.00017861545472535348,
+      "loss": 0.00032834114972501993,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00033,
+      "step": 1500,
+      "tokens/total": 24588288,
+      "tokens/train_per_sec_per_gpu": 16.37,
+      "tokens/trainable": 7787186
+    },
+    {
+      "epoch": 1.4654545454545453,
+      "grad_norm": 0.011678172275424004,
+      "learning_rate": 0.00017819520805835475,
+      "loss": 0.0009690596722066403,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00097,
+      "step": 1510,
+      "tokens/total": 24752128,
+      "tokens/train_per_sec_per_gpu": 13.55,
+      "tokens/trainable": 7838878
+    },
+    {
+      "epoch": 1.475151515151515,
+      "grad_norm": 0.05298800393939018,
+      "learning_rate": 0.00017777137677844461,
+      "loss": 0.0009098535403609276,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00091,
+      "step": 1520,
+      "tokens/total": 24915968,
+      "tokens/train_per_sec_per_gpu": 14.33,
+      "tokens/trainable": 7890631
+    },
+    {
+      "epoch": 1.4848484848484849,
+      "grad_norm": 0.037918779999017715,
+      "learning_rate": 0.00017734398031483265,
+      "loss": 0.0006457697600126266,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00065,
+      "step": 1530,
+      "tokens/total": 25079808,
+      "tokens/train_per_sec_per_gpu": 13.25,
+      "tokens/trainable": 7942366
+    },
+    {
+      "epoch": 1.4945454545454546,
+      "grad_norm": 0.02729674056172371,
+      "learning_rate": 0.0001769130382601629,
+      "loss": 0.0009943137876689434,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00099,
+      "step": 1540,
+      "tokens/total": 25243648,
+      "tokens/train_per_sec_per_gpu": 14.37,
+      "tokens/trainable": 7994307
+    },
+    {
+      "epoch": 1.5023030303030303,
+      "eval_loss": 0.0006865999894216657,
+      "eval_ppl": 1.00069,
+      "eval_runtime": 12.127,
+      "eval_samples_per_second": 16.492,
+      "eval_steps_per_second": 8.246,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "step": 1548
+    },
+    {
+      "epoch": 1.5042424242424244,
+      "grad_norm": 0.053267233073711395,
+      "learning_rate": 0.00017647857036961592,
+      "loss": 0.0006284893956035375,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00063,
+      "step": 1550,
+      "tokens/total": 25407488,
+      "tokens/train_per_sec_per_gpu": 14.87,
+      "tokens/trainable": 8046124
+    },
+    {
+      "epoch": 1.513939393939394,
+      "grad_norm": 0.05232734978199005,
+      "learning_rate": 0.0001760405965600031,
+      "loss": 0.0005064161494374275,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00051,
+      "step": 1560,
+      "tokens/total": 25571328,
+      "tokens/train_per_sec_per_gpu": 14.39,
+      "tokens/trainable": 8098367
+    },
+    {
+      "epoch": 1.5236363636363637,
+      "grad_norm": 0.015440079383552074,
+      "learning_rate": 0.00017559913690885364,
+      "loss": 0.0004742793273180723,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00047,
+      "step": 1570,
+      "tokens/total": 25735168,
+      "tokens/train_per_sec_per_gpu": 14.19,
+      "tokens/trainable": 8150005
+    },
+    {
+      "epoch": 1.5333333333333332,
+      "grad_norm": 0.005799058359116316,
+      "learning_rate": 0.00017515421165349414,
+      "loss": 0.0005522690713405609,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00055,
+      "step": 1580,
+      "tokens/total": 25899008,
+      "tokens/train_per_sec_per_gpu": 14.94,
+      "tokens/trainable": 8201985
+    },
+    {
+      "epoch": 1.543030303030303,
+      "grad_norm": 0.025745827704668045,
+      "learning_rate": 0.00017470584119012094,
+      "loss": 0.0004415466450154781,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00044,
+      "step": 1590,
+      "tokens/total": 26062848,
+      "tokens/train_per_sec_per_gpu": 14.76,
+      "tokens/trainable": 8253407
+    },
+    {
+      "epoch": 1.5527272727272727,
+      "grad_norm": 0.006111942231655121,
+      "learning_rate": 0.00017425404607286508,
+      "loss": 0.0004033858887851238,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0004,
+      "step": 1600,
+      "tokens/total": 26226688,
+      "tokens/train_per_sec_per_gpu": 13.45,
+      "tokens/trainable": 8305596
+    },
+    {
+      "epoch": 1.5624242424242425,
+      "grad_norm": 0.01315031573176384,
+      "learning_rate": 0.00017379884701285,
+      "loss": 0.0006456051021814346,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00065,
+      "step": 1610,
+      "tokens/total": 26390528,
+      "tokens/train_per_sec_per_gpu": 15.34,
+      "tokens/trainable": 8357648
+    },
+    {
+      "epoch": 1.5721212121212123,
+      "grad_norm": 0.002383842132985592,
+      "learning_rate": 0.00017334026487724225,
+      "loss": 0.00028960562776774167,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00029,
+      "step": 1620,
+      "tokens/total": 26554368,
+      "tokens/train_per_sec_per_gpu": 14.29,
+      "tokens/trainable": 8410056
+    },
+    {
+      "epoch": 1.5818181818181818,
+      "grad_norm": 0.006294222082942724,
+      "learning_rate": 0.0001728783206882948,
+      "loss": 0.00025043871719390156,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00025,
+      "step": 1630,
+      "tokens/total": 26718208,
+      "tokens/train_per_sec_per_gpu": 15.1,
+      "tokens/trainable": 8461798
+    },
+    {
+      "epoch": 1.5915151515151515,
+      "grad_norm": 8.702854393050075e-05,
+      "learning_rate": 0.00017241303562238336,
+      "loss": 0.00012461008736863732,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00012,
+      "step": 1640,
+      "tokens/total": 26882048,
+      "tokens/train_per_sec_per_gpu": 15.61,
+      "tokens/trainable": 8514035
+    },
+    {
+      "epoch": 1.601212121212121,
+      "grad_norm": 0.07624056935310364,
+      "learning_rate": 0.00017194443100903558,
+      "loss": 0.00024855402298271654,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00025,
+      "step": 1650,
+      "tokens/total": 27045888,
+      "tokens/train_per_sec_per_gpu": 14.48,
+      "tokens/trainable": 8565875
+    },
+    {
+      "epoch": 1.6109090909090908,
+      "grad_norm": 0.02497026138007641,
+      "learning_rate": 0.00017147252832995337,
+      "loss": 0.00044286823831498625,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00044,
+      "step": 1660,
+      "tokens/total": 27209728,
+      "tokens/train_per_sec_per_gpu": 14.47,
+      "tokens/trainable": 8617912
+    },
+    {
+      "epoch": 1.6206060606060606,
+      "grad_norm": 0.0016530955908820033,
+      "learning_rate": 0.00017099734921802802,
+      "loss": 0.0003104714211076498,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00031,
+      "step": 1670,
+      "tokens/total": 27373568,
+      "tokens/train_per_sec_per_gpu": 13.53,
+      "tokens/trainable": 8669875
+    },
+    {
+      "epoch": 1.6303030303030304,
+      "grad_norm": 0.02621961385011673,
+      "learning_rate": 0.00017051891545634854,
+      "loss": 0.0004010321106761694,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0004,
+      "step": 1680,
+      "tokens/total": 27537408,
+      "tokens/train_per_sec_per_gpu": 16.09,
+      "tokens/trainable": 8721709
+    },
+    {
+      "epoch": 1.6400000000000001,
+      "grad_norm": 0.043721288442611694,
+      "learning_rate": 0.00017003724897720316,
+      "loss": 0.00042473864741623404,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00042,
+      "step": 1690,
+      "tokens/total": 27701248,
+      "tokens/train_per_sec_per_gpu": 14.84,
+      "tokens/trainable": 8773762
+    },
+    {
+      "epoch": 1.6496969696969697,
+      "grad_norm": 0.01791808009147644,
+      "learning_rate": 0.00016955237186107387,
+      "loss": 0.0003858121577650309,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00039,
+      "step": 1700,
+      "tokens/total": 27865088,
+      "tokens/train_per_sec_per_gpu": 14.87,
+      "tokens/trainable": 8825435
+    },
+    {
+      "epoch": 1.6593939393939394,
+      "grad_norm": 0.017175329849123955,
+      "learning_rate": 0.0001690643063356241,
+      "loss": 0.0003785108681768179,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00038,
+      "step": 1710,
+      "tokens/total": 28028928,
+      "tokens/train_per_sec_per_gpu": 13.63,
+      "tokens/trainable": 8877227
+    },
+    {
+      "epoch": 1.669090909090909,
+      "grad_norm": 0.03429865464568138,
+      "learning_rate": 0.0001685730747746799,
+      "loss": 0.0003128159558400512,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00031,
+      "step": 1720,
+      "tokens/total": 28192768,
+      "tokens/train_per_sec_per_gpu": 13.42,
+      "tokens/trainable": 8928835
+    },
+    {
+      "epoch": 1.6787878787878787,
+      "grad_norm": 0.008623798377811909,
+      "learning_rate": 0.0001680786996972043,
+      "loss": 0.0008884714916348457,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00089,
+      "step": 1730,
+      "tokens/total": 28356608,
+      "tokens/train_per_sec_per_gpu": 14.8,
+      "tokens/trainable": 8979863
+    },
+    {
+      "epoch": 1.6884848484848485,
+      "grad_norm": 0.007137796841561794,
+      "learning_rate": 0.00016758120376626488,
+      "loss": 0.000342932902276516,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00034,
+      "step": 1740,
+      "tokens/total": 28520448,
+      "tokens/train_per_sec_per_gpu": 13.64,
+      "tokens/trainable": 9031317
+    },
+    {
+      "epoch": 1.6981818181818182,
+      "grad_norm": 0.006754934322088957,
+      "learning_rate": 0.00016708060978799493,
+      "loss": 0.00031610706355422735,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00032,
+      "step": 1750,
+      "tokens/total": 28684288,
+      "tokens/train_per_sec_per_gpu": 16.63,
+      "tokens/trainable": 9082925
+    },
+    {
+      "epoch": 1.707878787878788,
+      "grad_norm": 0.012158721685409546,
+      "learning_rate": 0.00016657694071054794,
+      "loss": 0.00039324900135397913,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00039,
+      "step": 1760,
+      "tokens/total": 28848128,
+      "tokens/train_per_sec_per_gpu": 14.31,
+      "tokens/trainable": 9134535
+    },
+    {
+      "epoch": 1.7175757575757575,
+      "grad_norm": 0.04653792828321457,
+      "learning_rate": 0.00016607021962304565,
+      "loss": 0.0003617320442572236,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00036,
+      "step": 1770,
+      "tokens/total": 29011968,
+      "tokens/train_per_sec_per_gpu": 14.01,
+      "tokens/trainable": 9186666
+    },
+    {
+      "epoch": 1.7272727272727273,
+      "grad_norm": 0.009638557210564613,
+      "learning_rate": 0.00016556046975451963,
+      "loss": 0.00031410730443894865,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00031,
+      "step": 1780,
+      "tokens/total": 29175808,
+      "tokens/train_per_sec_per_gpu": 14.23,
+      "tokens/trainable": 9238529
+    },
+    {
+      "epoch": 1.7369696969696968,
+      "grad_norm": 0.017064686864614487,
+      "learning_rate": 0.0001650477144728462,
+      "loss": 0.00043909624218940735,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00044,
+      "step": 1790,
+      "tokens/total": 29339648,
+      "tokens/train_per_sec_per_gpu": 14.08,
+      "tokens/trainable": 9290289
+    },
+    {
+      "epoch": 1.7466666666666666,
+      "grad_norm": 0.0022802259773015976,
+      "learning_rate": 0.00016453197728367563,
+      "loss": 0.00032380607444792986,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00032,
+      "step": 1800,
+      "tokens/total": 29503488,
+      "tokens/train_per_sec_per_gpu": 13.73,
+      "tokens/trainable": 9341953
+    },
+    {
+      "epoch": 1.7563636363636363,
+      "grad_norm": 0.0036841712426394224,
+      "learning_rate": 0.00016401328182935417,
+      "loss": 0.0006712255533784627,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00067,
+      "step": 1810,
+      "tokens/total": 29667328,
+      "tokens/train_per_sec_per_gpu": 16.36,
+      "tokens/trainable": 9393126
+    },
+    {
+      "epoch": 1.766060606060606,
+      "grad_norm": 0.0006454121321439743,
+      "learning_rate": 0.0001634916518878404,
+      "loss": 0.00010477005271241069,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0001,
+      "step": 1820,
+      "tokens/total": 29831168,
+      "tokens/train_per_sec_per_gpu": 14.7,
+      "tokens/trainable": 9444494
+    },
+    {
+      "epoch": 1.7757575757575759,
+      "grad_norm": 0.035474907606840134,
+      "learning_rate": 0.00016296711137161535,
+      "loss": 0.00034273902419954536,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00034,
+      "step": 1830,
+      "tokens/total": 29995008,
+      "tokens/train_per_sec_per_gpu": 14.78,
+      "tokens/trainable": 9496432
+    },
+    {
+      "epoch": 1.7854545454545454,
+      "grad_norm": 0.0042278701439499855,
+      "learning_rate": 0.00016243968432658605,
+      "loss": 0.0004896576981991529,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00049,
+      "step": 1840,
+      "tokens/total": 30158848,
+      "tokens/train_per_sec_per_gpu": 15.01,
+      "tokens/trainable": 9547913
+    },
+    {
+      "epoch": 1.7951515151515152,
+      "grad_norm": 0.008337569423019886,
+      "learning_rate": 0.00016190939493098344,
+      "loss": 0.0003711160738021135,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00037,
+      "step": 1850,
+      "tokens/total": 30322688,
+      "tokens/train_per_sec_per_gpu": 14.24,
+      "tokens/trainable": 9599023
+    },
+    {
+      "epoch": 1.8048484848484847,
+      "grad_norm": 0.033457424491643906,
+      "learning_rate": 0.00016137626749425377,
+      "loss": 0.0005191094242036343,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00052,
+      "step": 1860,
+      "tokens/total": 30486528,
+      "tokens/train_per_sec_per_gpu": 14.35,
+      "tokens/trainable": 9651048
+    },
+    {
+      "epoch": 1.8145454545454545,
+      "grad_norm": 0.014811063185334206,
+      "learning_rate": 0.0001608403264559445,
+      "loss": 0.0002689486602321267,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00027,
+      "step": 1870,
+      "tokens/total": 30650368,
+      "tokens/train_per_sec_per_gpu": 14.52,
+      "tokens/trainable": 9703354
+    },
+    {
+      "epoch": 1.8242424242424242,
+      "grad_norm": 0.011829032562673092,
+      "learning_rate": 0.00016030159638458376,
+      "loss": 0.0003055253764614463,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00031,
+      "step": 1880,
+      "tokens/total": 30814208,
+      "tokens/train_per_sec_per_gpu": 14.05,
+      "tokens/trainable": 9755371
+    },
+    {
+      "epoch": 1.833939393939394,
+      "grad_norm": 0.003898326540365815,
+      "learning_rate": 0.00015976010197655397,
+      "loss": 0.00023026440758258104,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00023,
+      "step": 1890,
+      "tokens/total": 30978048,
+      "tokens/train_per_sec_per_gpu": 13.89,
+      "tokens/trainable": 9807011
+    },
+    {
+      "epoch": 1.8436363636363637,
+      "grad_norm": 0.00993694830685854,
+      "learning_rate": 0.00015921586805496004,
+      "loss": 0.000414779270067811,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00041,
+      "step": 1900,
+      "tokens/total": 31141888,
+      "tokens/train_per_sec_per_gpu": 14.42,
+      "tokens/trainable": 9859849
+    },
+    {
+      "epoch": 1.8533333333333335,
+      "grad_norm": 0.00715588079765439,
+      "learning_rate": 0.0001586689195684911,
+      "loss": 0.0004666011780500412,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00047,
+      "step": 1910,
+      "tokens/total": 31305728,
+      "tokens/train_per_sec_per_gpu": 14.16,
+      "tokens/trainable": 9911712
+    },
+    {
+      "epoch": 1.863030303030303,
+      "grad_norm": 0.021137356758117676,
+      "learning_rate": 0.000158119281590277,
+      "loss": 0.00046254890039563177,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00046,
+      "step": 1920,
+      "tokens/total": 31469568,
+      "tokens/train_per_sec_per_gpu": 14.81,
+      "tokens/trainable": 9963813
+    },
+    {
+      "epoch": 1.8727272727272726,
+      "grad_norm": 0.0023340010084211826,
+      "learning_rate": 0.000157566979316739,
+      "loss": 0.0004919813480228185,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00049,
+      "step": 1930,
+      "tokens/total": 31633408,
+      "tokens/train_per_sec_per_gpu": 15.8,
+      "tokens/trainable": 10015724
+    },
+    {
+      "epoch": 1.8824242424242423,
+      "grad_norm": 0.01151804905384779,
+      "learning_rate": 0.00015701203806643433,
+      "loss": 0.00023937469813972712,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00024,
+      "step": 1940,
+      "tokens/total": 31797248,
+      "tokens/train_per_sec_per_gpu": 14.32,
+      "tokens/trainable": 10067073
+    },
+    {
+      "epoch": 1.892121212121212,
+      "grad_norm": 0.016535570845007896,
+      "learning_rate": 0.00015645448327889603,
+      "loss": 0.00021827330347150563,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00022,
+      "step": 1950,
+      "tokens/total": 31961088,
+      "tokens/train_per_sec_per_gpu": 14.48,
+      "tokens/trainable": 10119393
+    },
+    {
+      "epoch": 1.9018181818181819,
+      "grad_norm": 0.0034130853600800037,
+      "learning_rate": 0.00015589434051346634,
+      "loss": 0.00017861993983387948,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00018,
+      "step": 1960,
+      "tokens/total": 32124928,
+      "tokens/train_per_sec_per_gpu": 14.23,
+      "tokens/trainable": 10171930
+    },
+    {
+      "epoch": 1.9115151515151516,
+      "grad_norm": 0.02398502826690674,
+      "learning_rate": 0.0001553316354481253,
+      "loss": 0.00014141426654532552,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00014,
+      "step": 1970,
+      "tokens/total": 32288768,
+      "tokens/train_per_sec_per_gpu": 15.59,
+      "tokens/trainable": 10223639
+    },
+    {
+      "epoch": 1.9212121212121214,
+      "grad_norm": 0.0007365989149548113,
+      "learning_rate": 0.00015476639387831343,
+      "loss": 0.00011406640987843275,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00011,
+      "step": 1980,
+      "tokens/total": 32452608,
+      "tokens/train_per_sec_per_gpu": 13.45,
+      "tokens/trainable": 10275019
+    },
+    {
+      "epoch": 1.930909090909091,
+      "grad_norm": 0.028317851945757866,
+      "learning_rate": 0.00015419864171574944,
+      "loss": 0.0004076042678207159,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00041,
+      "step": 1990,
+      "tokens/total": 32616448,
+      "tokens/train_per_sec_per_gpu": 14.68,
+      "tokens/trainable": 10327234
+    },
+    {
+      "epoch": 1.9406060606060604,
+      "grad_norm": 0.0007216805825009942,
+      "learning_rate": 0.00015362840498724215,
+      "loss": 0.0002287053968757391,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00023,
+      "step": 2000,
+      "tokens/total": 32780288,
+      "tokens/train_per_sec_per_gpu": 14.77,
+      "tokens/trainable": 10379906
+    },
+    {
+      "epoch": 1.9503030303030302,
+      "grad_norm": 0.021391045302152634,
+      "learning_rate": 0.00015305570983349743,
+      "loss": 0.0006855262909084558,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00069,
+      "step": 2010,
+      "tokens/total": 32944128,
+      "tokens/train_per_sec_per_gpu": 13.75,
+      "tokens/trainable": 10431864
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.014411289244890213,
+      "learning_rate": 0.00015248058250792008,
+      "loss": 0.00020992583595216274,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00021,
+      "step": 2020,
+      "tokens/total": 33107968,
+      "tokens/train_per_sec_per_gpu": 14.32,
+      "tokens/trainable": 10483503
+    },
+    {
+      "epoch": 1.9696969696969697,
+      "grad_norm": 0.0019180785166099668,
+      "learning_rate": 0.00015190304937540993,
+      "loss": 0.000295165297575295,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0003,
+      "step": 2030,
+      "tokens/total": 33271808,
+      "tokens/train_per_sec_per_gpu": 15.32,
+      "tokens/trainable": 10534682
+    },
+    {
+      "epoch": 1.9793939393939395,
+      "grad_norm": 0.027906686067581177,
+      "learning_rate": 0.00015132313691115367,
+      "loss": 0.00030230602715164423,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0003,
+      "step": 2040,
+      "tokens/total": 33435648,
+      "tokens/train_per_sec_per_gpu": 13.52,
+      "tokens/trainable": 10586848
+    },
+    {
+      "epoch": 1.9890909090909092,
+      "grad_norm": 0.030775317922234535,
+      "learning_rate": 0.00015074087169941085,
+      "loss": 0.00011671001557260752,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00012,
+      "step": 2050,
+      "tokens/total": 33599488,
+      "tokens/train_per_sec_per_gpu": 14.23,
+      "tokens/trainable": 10638485
+    },
+    {
+      "epoch": 1.9987878787878788,
+      "grad_norm": 0.054577309638261795,
+      "learning_rate": 0.00015015628043229523,
+      "loss": 0.0003703285474330187,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00037,
+      "step": 2060,
+      "tokens/total": 33763328,
+      "tokens/train_per_sec_per_gpu": 14.81,
+      "tokens/trainable": 10689855
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 5155,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 1031,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.504593503083561e+17,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d29b464b8810e63db4689f2a7488bb151d3c44002b850563c9f99c9489ec58c9
+size 7121

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "q_proj",
+    "up_proj"
+  ],
+  "target_parameters": [],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:304cf3c64ceeb4dbb87e6d765e3fdd3d8b3df46600c6e4d2ab994562417e6d49
+size 264308896

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,4 @@

+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
+' + message['content'] + '<|im_end|>' + '
+'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
+' }}{% endif %}

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c7d8f1e89d2e8184d2cc04e29ba3277d83504548164114bf1fa45b8def190b14
+size 528915403

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:11373fdb5420e35d6d93ff498e2565c10ff01f1d221981eab3aa5b4440e7e839
+size 14645

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:67dd49c975b5d448314d39403a62311e9125e433e71f19378514313c6ecb95fd
+size 1465

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": true,
+  "model_max_length": 1010000,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/tokens_state. ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"total": 50685952, "trainable": 16045130}

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d29b464b8810e63db4689f2a7488bb151d3c44002b850563c9f99c9489ec58c9
+size 7121

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "q_proj",
+    "up_proj"
+  ],
+  "target_parameters": [],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7907c6a742aff25f84719b2e90e16acc2e79bd97ad9e7127dbd22e6e86445cc0
+size 264308896

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,4 @@

+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
+' + message['content'] + '<|im_end|>' + '
+'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
+' }}{% endif %}

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1243521d4e81e503200c0a7fe4556360192e75412d7a01df57b733f134d517d2
+size 528915403

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f603d8f7ba790664405ac7fd41c632b9b529eac52f0f9d90a909cf98e312030e
+size 14645

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a257e56f88f21994d367dc800f9fb8e354b66c8cc6ee4d584b76332e1d572c3c
+size 1465

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": true,
+  "model_max_length": 1010000,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/tokens_state. ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"total": 67588096, "trainable": 21396854}

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d29b464b8810e63db4689f2a7488bb151d3c44002b850563c9f99c9489ec58c9
+size 7121

checkpoints/math_operations/lora_sft_primitive_atomic_50k/debug.log ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/lora_sft_primitive_atomic_50k/eval_results_easy_ops/balanced_test_alpaca_converted.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/lora_sft_primitive_atomic_50k/eval_results_easy_ops/balanced_test_alpaca_results.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/lora_sft_primitive_atomic_50k/eval_results_easy_ops/eval_results.csv ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ category,filename,total,correct,accuracy,format_found,format_accuracy,errors_count
2	+ math_operations,balanced_test_alpaca_results,500,8,1.60,500,100.00,492

checkpoints/math_operations/lora_sft_primitive_atomic_50k/eval_results_easy_ops/eval_summary.json ADDED Viewed

	@@ -0,0 +1,133 @@

+{
+  "overall": {
+    "total": 500,
+    "correct": 8,
+    "accuracy": 1.6,
+    "format_found": 500,
+    "format_accuracy": 100.0
+  },
+  "per_operation": {
+    "a": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "b": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "c": {
+      "total": 25,
+      "correct": 1,
+      "accuracy": 4.0,
+      "format_found": 25
+    },
+    "d": {
+      "total": 25,
+      "correct": 1,
+      "accuracy": 4.0,
+      "format_found": 25
+    },
+    "e": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "f": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "g": {
+      "total": 25,
+      "correct": 2,
+      "accuracy": 8.0,
+      "format_found": 25
+    },
+    "h": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "i": {
+      "total": 25,
+      "correct": 1,
+      "accuracy": 4.0,
+      "format_found": 25
+    },
+    "j": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "k": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "l": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "m": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "n": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "o": {
+      "total": 25,
+      "correct": 1,
+      "accuracy": 4.0,
+      "format_found": 25
+    },
+    "p": {
+      "total": 25,
+      "correct": 2,
+      "accuracy": 8.0,
+      "format_found": 25
+    },
+    "q": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "r": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "s": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "t": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    }
+  },
+  "n_errors": 492,
+  "results_file": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/math_operations/lora_sft_primitive_atomic_50k/eval_results_easy_ops/balanced_test_alpaca_results.jsonl"
+}