Jerry999 committed 4 days ago

Commit

8b68ee6

verified ·

1 Parent(s): 7280ef9

Upload checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308

Browse files

Files changed (30) hide show

.gitattributes +2 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/chat_template.jinja +4 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/config.json +71 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/generation_config.json +12 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/model.safetensors +3 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/optimizer.pt +3 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/rng_state.pth +3 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/scheduler.pt +3 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/tokenizer.json +3 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/tokenizer_config.json +29 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/tokens_state. +1 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/trainer_state.json +2994 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/training_args.bin +3 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/chat_template.jinja +4 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/config.json +71 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/generation_config.json +12 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/model.safetensors +3 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/optimizer.pt +3 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/rng_state.pth +3 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/scheduler.pt +3 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/tokenizer.json +3 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/tokenizer_config.json +29 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/tokens_state. +1 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/trainer_state.json +0 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/training_args.bin +3 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/debug.log +0 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/eval_results_easy_ops/balanced_test_alpaca_converted.jsonl +0 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/eval_results_easy_ops/balanced_test_alpaca_results.jsonl +0 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/eval_results_easy_ops/eval_results.csv +2 -0
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/eval_results_easy_ops/eval_summary.json +133 -0

.gitattributes CHANGED Viewed

@@ -70,3 +70,5 @@ checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/tokeni
 checkpoints/math_operations/lora_sft_primitive_atomic_50k_t20260305/checkpoint-3090/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/math_operations/lora_sft_primitive_atomic_50k_t20260305/checkpoint-4120/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/math_operations/lora_sft_primitive_atomic_50k_t20260305/checkpoint-5150/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 checkpoints/math_operations/lora_sft_primitive_atomic_50k_t20260305/checkpoint-3090/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/math_operations/lora_sft_primitive_atomic_50k_t20260305/checkpoint-4120/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/math_operations/lora_sft_primitive_atomic_50k_t20260305/checkpoint-5150/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/tokenizer.json filter=lfs diff=lfs merge=lfs -text

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,4 @@

+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
+' + message['content'] + '<|im_end|>' + '
+'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
+' }}{% endif %}

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/config.json ADDED Viewed

	@@ -0,0 +1,71 @@

+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 262144,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 5000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.0.0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/generation_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "5.0.0"
+}

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:41548483ee88f462265e30988e470cae80e429f667488e27290ea8dcd96c7df8
+size 8822894520

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b120fe9ab7d83a8f3aa901047393a5faab6a2e2a98a720c552f360e6688766b
+size 16090225449

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ea11996454b5587fcf33ae0ab5cf14b2031bf5f53f8c2ed5a48e87de31e29c84
+size 14645

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0c33112fd93bfc97f8f9bcbedcd3ae38bbd63fe54948a8b1440778efd51de260
+size 1465

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": true,
+  "model_max_length": 1010000,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/tokens_state. ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"total": 34121728, "trainable": 10689603}

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2994 @@

+{
+  "best_global_step": 1563,
+  "best_metric": 0.0007253550575114787,
+  "best_model_checkpoint": null,
+  "epoch": 1.9992800575953924,
+  "eval_steps": 521,
+  "global_step": 2082,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0,
+      "eval_loss": 0.891994059085846,
+      "eval_ppl": 2.43999,
+      "eval_runtime": 17.3569,
+      "eval_samples_per_second": 11.523,
+      "eval_steps_per_second": 11.523,
+      "memory/device_reserved (GiB)": 10.64,
+      "memory/max_active (GiB)": 10.41,
+      "memory/max_allocated (GiB)": 10.41,
+      "step": 0
+    },
+    {
+      "epoch": 0.009599232061435085,
+      "grad_norm": 19.625,
+      "learning_rate": 2.884615384615385e-06,
+      "loss": 0.8230592727661132,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 2.27746,
+      "step": 10,
+      "tokens/total": 163840,
+      "tokens/train_per_sec_per_gpu": 8.53,
+      "tokens/trainable": 51937
+    },
+    {
+      "epoch": 0.01919846412287017,
+      "grad_norm": 6.0625,
+      "learning_rate": 6.08974358974359e-06,
+      "loss": 0.4511241436004639,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.57008,
+      "step": 20,
+      "tokens/total": 327680,
+      "tokens/train_per_sec_per_gpu": 8.58,
+      "tokens/trainable": 103351
+    },
+    {
+      "epoch": 0.028797696184305256,
+      "grad_norm": 3.796875,
+      "learning_rate": 9.294871794871795e-06,
+      "loss": 0.13991469144821167,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.15018,
+      "step": 30,
+      "tokens/total": 491520,
+      "tokens/train_per_sec_per_gpu": 8.33,
+      "tokens/trainable": 154482
+    },
+    {
+      "epoch": 0.03839692824574034,
+      "grad_norm": 1.6171875,
+      "learning_rate": 1.25e-05,
+      "loss": 0.017821089923381807,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.01798,
+      "step": 40,
+      "tokens/total": 655360,
+      "tokens/train_per_sec_per_gpu": 7.78,
+      "tokens/trainable": 205247
+    },
+    {
+      "epoch": 0.04799616030717543,
+      "grad_norm": 1.0546875,
+      "learning_rate": 1.5705128205128205e-05,
+      "loss": 0.005615117400884629,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00563,
+      "step": 50,
+      "tokens/total": 819200,
+      "tokens/train_per_sec_per_gpu": 9.73,
+      "tokens/trainable": 257032
+    },
+    {
+      "epoch": 0.05759539236861051,
+      "grad_norm": 0.72265625,
+      "learning_rate": 1.891025641025641e-05,
+      "loss": 0.003246866911649704,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00325,
+      "step": 60,
+      "tokens/total": 983040,
+      "tokens/train_per_sec_per_gpu": 7.76,
+      "tokens/trainable": 309407
+    },
+    {
+      "epoch": 0.06719462443004559,
+      "grad_norm": 0.609375,
+      "learning_rate": 2.2115384615384616e-05,
+      "loss": 0.002711128443479538,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00271,
+      "step": 70,
+      "tokens/total": 1146880,
+      "tokens/train_per_sec_per_gpu": 7.92,
+      "tokens/trainable": 361385
+    },
+    {
+      "epoch": 0.07679385649148068,
+      "grad_norm": 0.359375,
+      "learning_rate": 2.5320512820512822e-05,
+      "loss": 0.00267685167491436,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00268,
+      "step": 80,
+      "tokens/total": 1310720,
+      "tokens/train_per_sec_per_gpu": 7.9,
+      "tokens/trainable": 413011
+    },
+    {
+      "epoch": 0.08639308855291576,
+      "grad_norm": 1.3359375,
+      "learning_rate": 2.8525641025641025e-05,
+      "loss": 0.002553700841963291,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00256,
+      "step": 90,
+      "tokens/total": 1474560,
+      "tokens/train_per_sec_per_gpu": 7.84,
+      "tokens/trainable": 464413
+    },
+    {
+      "epoch": 0.09599232061435085,
+      "grad_norm": 0.1640625,
+      "learning_rate": 3.1730769230769234e-05,
+      "loss": 0.003567858040332794,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00357,
+      "step": 100,
+      "tokens/total": 1638400,
+      "tokens/train_per_sec_per_gpu": 7.55,
+      "tokens/trainable": 516315
+    },
+    {
+      "epoch": 0.10559155267578593,
+      "grad_norm": 0.7734375,
+      "learning_rate": 3.4935897435897436e-05,
+      "loss": 0.002936176210641861,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00294,
+      "step": 110,
+      "tokens/total": 1802240,
+      "tokens/train_per_sec_per_gpu": 8.34,
+      "tokens/trainable": 568044
+    },
+    {
+      "epoch": 0.11519078473722102,
+      "grad_norm": 0.333984375,
+      "learning_rate": 3.814102564102564e-05,
+      "loss": 0.0027721570804715157,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00278,
+      "step": 120,
+      "tokens/total": 1966080,
+      "tokens/train_per_sec_per_gpu": 9.06,
+      "tokens/trainable": 620018
+    },
+    {
+      "epoch": 0.1247900167986561,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 4.134615384615385e-05,
+      "loss": 0.0024721408262848854,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00248,
+      "step": 130,
+      "tokens/total": 2129920,
+      "tokens/train_per_sec_per_gpu": 7.08,
+      "tokens/trainable": 670706
+    },
+    {
+      "epoch": 0.13438924886009118,
+      "grad_norm": 0.337890625,
+      "learning_rate": 4.455128205128206e-05,
+      "loss": 0.003622889146208763,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00363,
+      "step": 140,
+      "tokens/total": 2293760,
+      "tokens/train_per_sec_per_gpu": 8.19,
+      "tokens/trainable": 722137
+    },
+    {
+      "epoch": 0.14398848092152627,
+      "grad_norm": 0.298828125,
+      "learning_rate": 4.775641025641026e-05,
+      "loss": 0.002823374792933464,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00283,
+      "step": 150,
+      "tokens/total": 2457600,
+      "tokens/train_per_sec_per_gpu": 7.49,
+      "tokens/trainable": 773351
+    },
+    {
+      "epoch": 0.15358771298296137,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 5.096153846153846e-05,
+      "loss": 0.00175777580589056,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00176,
+      "step": 160,
+      "tokens/total": 2621440,
+      "tokens/train_per_sec_per_gpu": 9.77,
+      "tokens/trainable": 824611
+    },
+    {
+      "epoch": 0.16318694504439646,
+      "grad_norm": 0.216796875,
+      "learning_rate": 5.4166666666666664e-05,
+      "loss": 0.0025411507114768027,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00254,
+      "step": 170,
+      "tokens/total": 2785280,
+      "tokens/train_per_sec_per_gpu": 8.71,
+      "tokens/trainable": 875746
+    },
+    {
+      "epoch": 0.17278617710583152,
+      "grad_norm": 0.310546875,
+      "learning_rate": 5.737179487179487e-05,
+      "loss": 0.0037163086235523224,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00372,
+      "step": 180,
+      "tokens/total": 2949120,
+      "tokens/train_per_sec_per_gpu": 7.88,
+      "tokens/trainable": 927000
+    },
+    {
+      "epoch": 0.18238540916726662,
+      "grad_norm": 0.3125,
+      "learning_rate": 6.0576923076923076e-05,
+      "loss": 0.0028080834075808526,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00281,
+      "step": 190,
+      "tokens/total": 3112960,
+      "tokens/train_per_sec_per_gpu": 9.06,
+      "tokens/trainable": 978689
+    },
+    {
+      "epoch": 0.1919846412287017,
+      "grad_norm": 2.765625,
+      "learning_rate": 6.378205128205128e-05,
+      "loss": 0.05687007904052734,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.05852,
+      "step": 200,
+      "tokens/total": 3276800,
+      "tokens/train_per_sec_per_gpu": 9.12,
+      "tokens/trainable": 1029912
+    },
+    {
+      "epoch": 0.2015838732901368,
+      "grad_norm": 2.046875,
+      "learning_rate": 6.698717948717949e-05,
+      "loss": 0.021659491956233977,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.0219,
+      "step": 210,
+      "tokens/total": 3440640,
+      "tokens/train_per_sec_per_gpu": 8.6,
+      "tokens/trainable": 1081023
+    },
+    {
+      "epoch": 0.21118310535157186,
+      "grad_norm": 2.21875,
+      "learning_rate": 7.019230769230769e-05,
+      "loss": 0.012803517282009125,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.01289,
+      "step": 220,
+      "tokens/total": 3604480,
+      "tokens/train_per_sec_per_gpu": 9.27,
+      "tokens/trainable": 1132719
+    },
+    {
+      "epoch": 0.22078233741300696,
+      "grad_norm": 0.53125,
+      "learning_rate": 7.339743589743589e-05,
+      "loss": 0.012191119790077209,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.01227,
+      "step": 230,
+      "tokens/total": 3768320,
+      "tokens/train_per_sec_per_gpu": 8.39,
+      "tokens/trainable": 1184217
+    },
+    {
+      "epoch": 0.23038156947444205,
+      "grad_norm": 4.0625,
+      "learning_rate": 7.660256410256411e-05,
+      "loss": 0.011262766271829604,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.01133,
+      "step": 240,
+      "tokens/total": 3932160,
+      "tokens/train_per_sec_per_gpu": 7.51,
+      "tokens/trainable": 1235669
+    },
+    {
+      "epoch": 0.23998080153587714,
+      "grad_norm": 22.125,
+      "learning_rate": 7.980769230769231e-05,
+      "loss": 0.015024600923061371,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.01514,
+      "step": 250,
+      "tokens/total": 4096000,
+      "tokens/train_per_sec_per_gpu": 8.82,
+      "tokens/trainable": 1287445
+    },
+    {
+      "epoch": 0.2495800335973122,
+      "grad_norm": 0.68359375,
+      "learning_rate": 8.301282051282053e-05,
+      "loss": 0.01258222907781601,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.01266,
+      "step": 260,
+      "tokens/total": 4259840,
+      "tokens/train_per_sec_per_gpu": 9.24,
+      "tokens/trainable": 1339971
+    },
+    {
+      "epoch": 0.2591792656587473,
+      "grad_norm": 5.375,
+      "learning_rate": 8.621794871794873e-05,
+      "loss": 0.019256196916103363,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.01944,
+      "step": 270,
+      "tokens/total": 4423680,
+      "tokens/train_per_sec_per_gpu": 7.26,
+      "tokens/trainable": 1391647
+    },
+    {
+      "epoch": 0.26877849772018236,
+      "grad_norm": 0.6015625,
+      "learning_rate": 8.942307692307693e-05,
+      "loss": 0.03441511988639832,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.03501,
+      "step": 280,
+      "tokens/total": 4587520,
+      "tokens/train_per_sec_per_gpu": 8.07,
+      "tokens/trainable": 1443110
+    },
+    {
+      "epoch": 0.27837772978161746,
+      "grad_norm": 1.78125,
+      "learning_rate": 9.262820512820513e-05,
+      "loss": 0.0174191877245903,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.01757,
+      "step": 290,
+      "tokens/total": 4751360,
+      "tokens/train_per_sec_per_gpu": 8.75,
+      "tokens/trainable": 1495030
+    },
+    {
+      "epoch": 0.28797696184305255,
+      "grad_norm": 0.34375,
+      "learning_rate": 9.583333333333334e-05,
+      "loss": 0.010925143957138062,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.01099,
+      "step": 300,
+      "tokens/total": 4915200,
+      "tokens/train_per_sec_per_gpu": 8.65,
+      "tokens/trainable": 1545313
+    },
+    {
+      "epoch": 0.29757619390448764,
+      "grad_norm": 0.3125,
+      "learning_rate": 9.903846153846155e-05,
+      "loss": 0.010976283252239228,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.01104,
+      "step": 310,
+      "tokens/total": 5079040,
+      "tokens/train_per_sec_per_gpu": 9.3,
+      "tokens/trainable": 1596578
+    },
+    {
+      "epoch": 0.30717542596592273,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.0001,
+      "loss": 0.007863689959049226,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00789,
+      "step": 320,
+      "tokens/total": 5242880,
+      "tokens/train_per_sec_per_gpu": 8.65,
+      "tokens/trainable": 1648111
+    },
+    {
+      "epoch": 0.3167746580273578,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0052565749734640125,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00527,
+      "step": 330,
+      "tokens/total": 5406720,
+      "tokens/train_per_sec_per_gpu": 9.0,
+      "tokens/trainable": 1699588
+    },
+    {
+      "epoch": 0.3263738900887929,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 0.009607769548892975,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00965,
+      "step": 340,
+      "tokens/total": 5570560,
+      "tokens/train_per_sec_per_gpu": 9.17,
+      "tokens/trainable": 1751730
+    },
+    {
+      "epoch": 0.33597312215022795,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0001,
+      "loss": 0.007573225349187851,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.0076,
+      "step": 350,
+      "tokens/total": 5734400,
+      "tokens/train_per_sec_per_gpu": 9.0,
+      "tokens/trainable": 1802979
+    },
+    {
+      "epoch": 0.34557235421166305,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.0001,
+      "loss": 0.006453585624694824,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00647,
+      "step": 360,
+      "tokens/total": 5898240,
+      "tokens/train_per_sec_per_gpu": 9.38,
+      "tokens/trainable": 1853732
+    },
+    {
+      "epoch": 0.35517158627309814,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.006070464849472046,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00609,
+      "step": 370,
+      "tokens/total": 6062080,
+      "tokens/train_per_sec_per_gpu": 8.22,
+      "tokens/trainable": 1904439
+    },
+    {
+      "epoch": 0.36477081833453323,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.005775686353445053,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00579,
+      "step": 380,
+      "tokens/total": 6225920,
+      "tokens/train_per_sec_per_gpu": 8.22,
+      "tokens/trainable": 1955620
+    },
+    {
+      "epoch": 0.3743700503959683,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.005018413811922073,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00503,
+      "step": 390,
+      "tokens/total": 6389760,
+      "tokens/train_per_sec_per_gpu": 9.88,
+      "tokens/trainable": 2007340
+    },
+    {
+      "epoch": 0.3839692824574034,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 0.003989457339048386,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.004,
+      "step": 400,
+      "tokens/total": 6553600,
+      "tokens/train_per_sec_per_gpu": 7.98,
+      "tokens/trainable": 2059592
+    },
+    {
+      "epoch": 0.3935685145188385,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.004139231517910957,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00415,
+      "step": 410,
+      "tokens/total": 6717440,
+      "tokens/train_per_sec_per_gpu": 8.12,
+      "tokens/trainable": 2111025
+    },
+    {
+      "epoch": 0.4031677465802736,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 0.004084679111838341,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00409,
+      "step": 420,
+      "tokens/total": 6881280,
+      "tokens/train_per_sec_per_gpu": 9.09,
+      "tokens/trainable": 2161939
+    },
+    {
+      "epoch": 0.41276697864170864,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0030223120003938673,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00303,
+      "step": 430,
+      "tokens/total": 7045120,
+      "tokens/train_per_sec_per_gpu": 7.33,
+      "tokens/trainable": 2213313
+    },
+    {
+      "epoch": 0.42236621070314373,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0029419407248497008,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00295,
+      "step": 440,
+      "tokens/total": 7208960,
+      "tokens/train_per_sec_per_gpu": 8.21,
+      "tokens/trainable": 2264334
+    },
+    {
+      "epoch": 0.4319654427645788,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0034121278673410415,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00342,
+      "step": 450,
+      "tokens/total": 7372800,
+      "tokens/train_per_sec_per_gpu": 7.85,
+      "tokens/trainable": 2315460
+    },
+    {
+      "epoch": 0.4415646748260139,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.002534863166511059,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00254,
+      "step": 460,
+      "tokens/total": 7536640,
+      "tokens/train_per_sec_per_gpu": 8.37,
+      "tokens/trainable": 2366296
+    },
+    {
+      "epoch": 0.451163906887449,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0022289998829364776,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00223,
+      "step": 470,
+      "tokens/total": 7700480,
+      "tokens/train_per_sec_per_gpu": 8.15,
+      "tokens/trainable": 2417678
+    },
+    {
+      "epoch": 0.4607631389488841,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.002741745673120022,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00275,
+      "step": 480,
+      "tokens/total": 7864320,
+      "tokens/train_per_sec_per_gpu": 9.5,
+      "tokens/trainable": 2469401
+    },
+    {
+      "epoch": 0.4703623710103192,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0031233657151460647,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00313,
+      "step": 490,
+      "tokens/total": 8028160,
+      "tokens/train_per_sec_per_gpu": 9.42,
+      "tokens/trainable": 2520706
+    },
+    {
+      "epoch": 0.4799616030717543,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0001,
+      "loss": 0.007012879848480225,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00704,
+      "step": 500,
+      "tokens/total": 8192000,
+      "tokens/train_per_sec_per_gpu": 8.62,
+      "tokens/trainable": 2571927
+    },
+    {
+      "epoch": 0.4895608351331893,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.0001,
+      "loss": 0.007117580622434616,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.00714,
+      "step": 510,
+      "tokens/total": 8355840,
+      "tokens/train_per_sec_per_gpu": 8.66,
+      "tokens/trainable": 2623451
+    },
+    {
+      "epoch": 0.4991600671946244,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.004591656103730202,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "ppl": 1.0046,
+      "step": 520,
+      "tokens/total": 8519680,
+      "tokens/train_per_sec_per_gpu": 9.12,
+      "tokens/trainable": 2674443
+    },
+    {
+      "epoch": 0.5001199904007679,
+      "eval_loss": 0.0049089775420725346,
+      "eval_ppl": 1.00492,
+      "eval_runtime": 9.006,
+      "eval_samples_per_second": 22.207,
+      "eval_steps_per_second": 22.207,
+      "memory/device_reserved (GiB)": 36.5,
+      "memory/max_active (GiB)": 33.97,
+      "memory/max_allocated (GiB)": 33.97,
+      "step": 521
+    },
+    {
+      "epoch": 0.5087592992560596,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.0001,
+      "loss": 0.004098504409193992,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.96,
+      "memory/max_allocated (GiB)": 33.96,
+      "ppl": 1.00411,
+      "step": 530,
+      "tokens/total": 8683520,
+      "tokens/train_per_sec_per_gpu": 7.44,
+      "tokens/trainable": 2725882
+    },
+    {
+      "epoch": 0.5183585313174947,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0001,
+      "loss": 0.004386116191744805,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.0044,
+      "step": 540,
+      "tokens/total": 8847360,
+      "tokens/train_per_sec_per_gpu": 8.34,
+      "tokens/trainable": 2777518
+    },
+    {
+      "epoch": 0.5279577633789296,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.003661666065454483,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00367,
+      "step": 550,
+      "tokens/total": 9011200,
+      "tokens/train_per_sec_per_gpu": 9.27,
+      "tokens/trainable": 2828761
+    },
+    {
+      "epoch": 0.5375569954403647,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0033755451440811157,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00338,
+      "step": 560,
+      "tokens/total": 9175040,
+      "tokens/train_per_sec_per_gpu": 8.99,
+      "tokens/trainable": 2879740
+    },
+    {
+      "epoch": 0.5471562275017998,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.003271551802754402,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00328,
+      "step": 570,
+      "tokens/total": 9338880,
+      "tokens/train_per_sec_per_gpu": 8.89,
+      "tokens/trainable": 2931031
+    },
+    {
+      "epoch": 0.5567554595632349,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.00392816960811615,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00394,
+      "step": 580,
+      "tokens/total": 9502720,
+      "tokens/train_per_sec_per_gpu": 8.26,
+      "tokens/trainable": 2981776
+    },
+    {
+      "epoch": 0.56635469162467,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0031796425580978395,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00318,
+      "step": 590,
+      "tokens/total": 9666560,
+      "tokens/train_per_sec_per_gpu": 8.26,
+      "tokens/trainable": 3032962
+    },
+    {
+      "epoch": 0.5759539236861051,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.002615358680486679,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00262,
+      "step": 600,
+      "tokens/total": 9830400,
+      "tokens/train_per_sec_per_gpu": 7.91,
+      "tokens/trainable": 3084477
+    },
+    {
+      "epoch": 0.5855531557475402,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0032230135053396224,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00323,
+      "step": 610,
+      "tokens/total": 9994240,
+      "tokens/train_per_sec_per_gpu": 8.79,
+      "tokens/trainable": 3135491
+    },
+    {
+      "epoch": 0.5951523878089753,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.002824882231652737,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00283,
+      "step": 620,
+      "tokens/total": 10158080,
+      "tokens/train_per_sec_per_gpu": 7.72,
+      "tokens/trainable": 3186671
+    },
+    {
+      "epoch": 0.6047516198704104,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0019276419654488564,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00193,
+      "step": 630,
+      "tokens/total": 10321920,
+      "tokens/train_per_sec_per_gpu": 9.45,
+      "tokens/trainable": 3238286
+    },
+    {
+      "epoch": 0.6143508519318455,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.0023364221677184107,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00234,
+      "step": 640,
+      "tokens/total": 10485760,
+      "tokens/train_per_sec_per_gpu": 8.34,
+      "tokens/trainable": 3290171
+    },
+    {
+      "epoch": 0.6239500839932806,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0024619314819574354,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00246,
+      "step": 650,
+      "tokens/total": 10649600,
+      "tokens/train_per_sec_per_gpu": 8.26,
+      "tokens/trainable": 3341228
+    },
+    {
+      "epoch": 0.6335493160547156,
+      "grad_norm": 0.017333984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0014947694726288319,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.0015,
+      "step": 660,
+      "tokens/total": 10813440,
+      "tokens/train_per_sec_per_gpu": 7.53,
+      "tokens/trainable": 3392191
+    },
+    {
+      "epoch": 0.6431485481161507,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0016866009682416916,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00169,
+      "step": 670,
+      "tokens/total": 10977280,
+      "tokens/train_per_sec_per_gpu": 7.27,
+      "tokens/trainable": 3442697
+    },
+    {
+      "epoch": 0.6527477801775858,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0014289443381130696,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00143,
+      "step": 680,
+      "tokens/total": 11141120,
+      "tokens/train_per_sec_per_gpu": 9.48,
+      "tokens/trainable": 3494178
+    },
+    {
+      "epoch": 0.6623470122390209,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0012737856246531009,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00127,
+      "step": 690,
+      "tokens/total": 11304960,
+      "tokens/train_per_sec_per_gpu": 9.0,
+      "tokens/trainable": 3545238
+    },
+    {
+      "epoch": 0.6719462443004559,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0016797658056020737,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00168,
+      "step": 700,
+      "tokens/total": 11468800,
+      "tokens/train_per_sec_per_gpu": 8.11,
+      "tokens/trainable": 3595974
+    },
+    {
+      "epoch": 0.681545476361891,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0012735738418996334,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00127,
+      "step": 710,
+      "tokens/total": 11632640,
+      "tokens/train_per_sec_per_gpu": 7.24,
+      "tokens/trainable": 3646406
+    },
+    {
+      "epoch": 0.6911447084233261,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0016826316714286804,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00168,
+      "step": 720,
+      "tokens/total": 11796480,
+      "tokens/train_per_sec_per_gpu": 8.32,
+      "tokens/trainable": 3697893
+    },
+    {
+      "epoch": 0.7007439404847612,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.001028579194098711,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00103,
+      "step": 730,
+      "tokens/total": 11960320,
+      "tokens/train_per_sec_per_gpu": 7.29,
+      "tokens/trainable": 3749410
+    },
+    {
+      "epoch": 0.7103431725461963,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0013211018405854702,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00132,
+      "step": 740,
+      "tokens/total": 12124160,
+      "tokens/train_per_sec_per_gpu": 7.22,
+      "tokens/trainable": 3800410
+    },
+    {
+      "epoch": 0.7199424046076314,
+      "grad_norm": 3.546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5068239688873291,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.66001,
+      "step": 750,
+      "tokens/total": 12288000,
+      "tokens/train_per_sec_per_gpu": 8.82,
+      "tokens/trainable": 3852024
+    },
+    {
+      "epoch": 0.7295416366690665,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.0001,
+      "loss": 0.042395052313804624,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.04331,
+      "step": 760,
+      "tokens/total": 12451840,
+      "tokens/train_per_sec_per_gpu": 9.1,
+      "tokens/trainable": 3903571
+    },
+    {
+      "epoch": 0.7391408687305016,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.054154080152511594,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.05565,
+      "step": 770,
+      "tokens/total": 12615680,
+      "tokens/train_per_sec_per_gpu": 8.87,
+      "tokens/trainable": 3954797
+    },
+    {
+      "epoch": 0.7487401007919366,
+      "grad_norm": 0.62109375,
+      "learning_rate": 0.0001,
+      "loss": 0.015584257245063782,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.01571,
+      "step": 780,
+      "tokens/total": 12779520,
+      "tokens/train_per_sec_per_gpu": 7.53,
+      "tokens/trainable": 4005775
+    },
+    {
+      "epoch": 0.7583393328533717,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0001,
+      "loss": 0.02899232506752014,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.02942,
+      "step": 790,
+      "tokens/total": 12943360,
+      "tokens/train_per_sec_per_gpu": 8.92,
+      "tokens/trainable": 4056679
+    },
+    {
+      "epoch": 0.7679385649148068,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0001,
+      "loss": 0.011905992776155472,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.01198,
+      "step": 800,
+      "tokens/total": 13107200,
+      "tokens/train_per_sec_per_gpu": 9.32,
+      "tokens/trainable": 4107997
+    },
+    {
+      "epoch": 0.7775377969762419,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0001,
+      "loss": 0.006751462817192078,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00677,
+      "step": 810,
+      "tokens/total": 13271040,
+      "tokens/train_per_sec_per_gpu": 7.92,
+      "tokens/trainable": 4158984
+    },
+    {
+      "epoch": 0.787137029037677,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0001,
+      "loss": 0.005448491126298904,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00546,
+      "step": 820,
+      "tokens/total": 13434880,
+      "tokens/train_per_sec_per_gpu": 9.18,
+      "tokens/trainable": 4210162
+    },
+    {
+      "epoch": 0.7967362610991121,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.00531839057803154,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00533,
+      "step": 830,
+      "tokens/total": 13598720,
+      "tokens/train_per_sec_per_gpu": 7.73,
+      "tokens/trainable": 4261207
+    },
+    {
+      "epoch": 0.8063354931605472,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.0001,
+      "loss": 0.004298893362283706,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00431,
+      "step": 840,
+      "tokens/total": 13762560,
+      "tokens/train_per_sec_per_gpu": 8.2,
+      "tokens/trainable": 4312249
+    },
+    {
+      "epoch": 0.8159347252219823,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.006566829234361649,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00659,
+      "step": 850,
+      "tokens/total": 13926400,
+      "tokens/train_per_sec_per_gpu": 9.0,
+      "tokens/trainable": 4363740
+    },
+    {
+      "epoch": 0.8255339572834173,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0052708122879266735,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00528,
+      "step": 860,
+      "tokens/total": 14090240,
+      "tokens/train_per_sec_per_gpu": 8.52,
+      "tokens/trainable": 4414546
+    },
+    {
+      "epoch": 0.8351331893448524,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.004131903126835823,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00414,
+      "step": 870,
+      "tokens/total": 14254080,
+      "tokens/train_per_sec_per_gpu": 9.16,
+      "tokens/trainable": 4465604
+    },
+    {
+      "epoch": 0.8447324214062875,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.003341007232666016,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00335,
+      "step": 880,
+      "tokens/total": 14417920,
+      "tokens/train_per_sec_per_gpu": 9.26,
+      "tokens/trainable": 4516755
+    },
+    {
+      "epoch": 0.8543316534677226,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0022021437063813208,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.0022,
+      "step": 890,
+      "tokens/total": 14581760,
+      "tokens/train_per_sec_per_gpu": 9.37,
+      "tokens/trainable": 4568307
+    },
+    {
+      "epoch": 0.8639308855291576,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.001743432879447937,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00174,
+      "step": 900,
+      "tokens/total": 14745600,
+      "tokens/train_per_sec_per_gpu": 7.47,
+      "tokens/trainable": 4619967
+    },
+    {
+      "epoch": 0.8735301175905927,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.001867898181080818,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00187,
+      "step": 910,
+      "tokens/total": 14909440,
+      "tokens/train_per_sec_per_gpu": 9.38,
+      "tokens/trainable": 4671271
+    },
+    {
+      "epoch": 0.8831293496520278,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0017737392336130142,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00178,
+      "step": 920,
+      "tokens/total": 15073280,
+      "tokens/train_per_sec_per_gpu": 9.17,
+      "tokens/trainable": 4723106
+    },
+    {
+      "epoch": 0.8927285817134629,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0029280630871653555,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00293,
+      "step": 930,
+      "tokens/total": 15237120,
+      "tokens/train_per_sec_per_gpu": 8.5,
+      "tokens/trainable": 4774009
+    },
+    {
+      "epoch": 0.902327813774898,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.002748473361134529,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00275,
+      "step": 940,
+      "tokens/total": 15400960,
+      "tokens/train_per_sec_per_gpu": 8.9,
+      "tokens/trainable": 4825081
+    },
+    {
+      "epoch": 0.9119270458363331,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0015982367098331452,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.0016,
+      "step": 950,
+      "tokens/total": 15564800,
+      "tokens/train_per_sec_per_gpu": 9.43,
+      "tokens/trainable": 4877201
+    },
+    {
+      "epoch": 0.9215262778977682,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0018960090354084968,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.0019,
+      "step": 960,
+      "tokens/total": 15728640,
+      "tokens/train_per_sec_per_gpu": 9.03,
+      "tokens/trainable": 4929476
+    },
+    {
+      "epoch": 0.9311255099592033,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0017032548785209656,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.0017,
+      "step": 970,
+      "tokens/total": 15892480,
+      "tokens/train_per_sec_per_gpu": 8.97,
+      "tokens/trainable": 4981090
+    },
+    {
+      "epoch": 0.9407247420206384,
+      "grad_norm": 0.0380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0012425887398421764,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00124,
+      "step": 980,
+      "tokens/total": 16056320,
+      "tokens/train_per_sec_per_gpu": 8.64,
+      "tokens/trainable": 5032187
+    },
+    {
+      "epoch": 0.9503239740820735,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0016014887019991874,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.0016,
+      "step": 990,
+      "tokens/total": 16220160,
+      "tokens/train_per_sec_per_gpu": 9.25,
+      "tokens/trainable": 5083255
+    },
+    {
+      "epoch": 0.9599232061435086,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0015840081498026848,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00159,
+      "step": 1000,
+      "tokens/total": 16384000,
+      "tokens/train_per_sec_per_gpu": 8.04,
+      "tokens/trainable": 5133643
+    },
+    {
+      "epoch": 0.9695224382049437,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0026744097471237183,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00268,
+      "step": 1010,
+      "tokens/total": 16547840,
+      "tokens/train_per_sec_per_gpu": 7.69,
+      "tokens/trainable": 5185486
+    },
+    {
+      "epoch": 0.9791216702663786,
+      "grad_norm": 0.0306396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0014427711255848407,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00144,
+      "step": 1020,
+      "tokens/total": 16711680,
+      "tokens/train_per_sec_per_gpu": 7.37,
+      "tokens/trainable": 5236078
+    },
+    {
+      "epoch": 0.9887209023278137,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0015522641129791736,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00155,
+      "step": 1030,
+      "tokens/total": 16875520,
+      "tokens/train_per_sec_per_gpu": 8.48,
+      "tokens/trainable": 5287369
+    },
+    {
+      "epoch": 0.9983201343892488,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.001351279579102993,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00135,
+      "step": 1040,
+      "tokens/total": 17039360,
+      "tokens/train_per_sec_per_gpu": 8.94,
+      "tokens/trainable": 5338389
+    },
+    {
+      "epoch": 1.0009599232061435,
+      "eval_loss": 0.001463641761802137,
+      "eval_ppl": 1.00146,
+      "eval_runtime": 8.8734,
+      "eval_samples_per_second": 22.539,
+      "eval_steps_per_second": 22.539,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "step": 1042
+    },
+    {
+      "epoch": 1.0086393088552916,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0011446304619312287,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.96,
+      "memory/max_allocated (GiB)": 33.96,
+      "ppl": 1.00115,
+      "step": 1050,
+      "tokens/total": 17213440,
+      "tokens/train_per_sec_per_gpu": 10.07,
+      "tokens/trainable": 5392886
+    },
+    {
+      "epoch": 1.0182385409167267,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.001120314747095108,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00112,
+      "step": 1060,
+      "tokens/total": 17377280,
+      "tokens/train_per_sec_per_gpu": 8.48,
+      "tokens/trainable": 5444495
+    },
+    {
+      "epoch": 1.0278377729781618,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0010949315503239632,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.0011,
+      "step": 1070,
+      "tokens/total": 17541120,
+      "tokens/train_per_sec_per_gpu": 8.47,
+      "tokens/trainable": 5496208
+    },
+    {
+      "epoch": 1.037437005039597,
+      "grad_norm": 0.0184326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.000931826326996088,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00093,
+      "step": 1080,
+      "tokens/total": 17704960,
+      "tokens/train_per_sec_per_gpu": 7.56,
+      "tokens/trainable": 5547530
+    },
+    {
+      "epoch": 1.047036237101032,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0007877454161643981,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00079,
+      "step": 1090,
+      "tokens/total": 17868800,
+      "tokens/train_per_sec_per_gpu": 8.39,
+      "tokens/trainable": 5598017
+    },
+    {
+      "epoch": 1.056635469162467,
+      "grad_norm": 0.029052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.000616989703848958,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00062,
+      "step": 1100,
+      "tokens/total": 18032640,
+      "tokens/train_per_sec_per_gpu": 8.31,
+      "tokens/trainable": 5649223
+    },
+    {
+      "epoch": 1.0662347012239022,
+      "grad_norm": 0.054443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0008214156143367291,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00082,
+      "step": 1110,
+      "tokens/total": 18196480,
+      "tokens/train_per_sec_per_gpu": 9.55,
+      "tokens/trainable": 5701017
+    },
+    {
+      "epoch": 1.0758339332853373,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0007145676761865615,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00071,
+      "step": 1120,
+      "tokens/total": 18360320,
+      "tokens/train_per_sec_per_gpu": 7.91,
+      "tokens/trainable": 5752591
+    },
+    {
+      "epoch": 1.0854331653467724,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.0010736193507909775,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00107,
+      "step": 1130,
+      "tokens/total": 18524160,
+      "tokens/train_per_sec_per_gpu": 9.44,
+      "tokens/trainable": 5803980
+    },
+    {
+      "epoch": 1.0950323974082075,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0010662767104804515,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00107,
+      "step": 1140,
+      "tokens/total": 18688000,
+      "tokens/train_per_sec_per_gpu": 8.0,
+      "tokens/trainable": 5854955
+    },
+    {
+      "epoch": 1.1046316294696423,
+      "grad_norm": 0.0257568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0005197681020945311,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00052,
+      "step": 1150,
+      "tokens/total": 18851840,
+      "tokens/train_per_sec_per_gpu": 7.34,
+      "tokens/trainable": 5905676
+    },
+    {
+      "epoch": 1.1142308615310774,
+      "grad_norm": 0.0172119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0009939110837876796,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00099,
+      "step": 1160,
+      "tokens/total": 19015680,
+      "tokens/train_per_sec_per_gpu": 7.12,
+      "tokens/trainable": 5956999
+    },
+    {
+      "epoch": 1.1238300935925125,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0008747033774852752,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00088,
+      "step": 1170,
+      "tokens/total": 19179520,
+      "tokens/train_per_sec_per_gpu": 7.56,
+      "tokens/trainable": 6008446
+    },
+    {
+      "epoch": 1.1334293256539476,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0009802436456084252,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00098,
+      "step": 1180,
+      "tokens/total": 19343360,
+      "tokens/train_per_sec_per_gpu": 8.36,
+      "tokens/trainable": 6060444
+    },
+    {
+      "epoch": 1.1430285577153827,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0006262516602873802,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00063,
+      "step": 1190,
+      "tokens/total": 19507200,
+      "tokens/train_per_sec_per_gpu": 9.44,
+      "tokens/trainable": 6112318
+    },
+    {
+      "epoch": 1.1526277897768178,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0008734981529414654,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00087,
+      "step": 1200,
+      "tokens/total": 19671040,
+      "tokens/train_per_sec_per_gpu": 8.47,
+      "tokens/trainable": 6163992
+    },
+    {
+      "epoch": 1.1622270218382529,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0009970812126994133,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.001,
+      "step": 1210,
+      "tokens/total": 19834880,
+      "tokens/train_per_sec_per_gpu": 7.83,
+      "tokens/trainable": 6214313
+    },
+    {
+      "epoch": 1.171826253899688,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0009464750066399575,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00095,
+      "step": 1220,
+      "tokens/total": 19998720,
+      "tokens/train_per_sec_per_gpu": 9.08,
+      "tokens/trainable": 6265730
+    },
+    {
+      "epoch": 1.181425485961123,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.001540043018758297,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00154,
+      "step": 1230,
+      "tokens/total": 20162560,
+      "tokens/train_per_sec_per_gpu": 9.53,
+      "tokens/trainable": 6317338
+    },
+    {
+      "epoch": 1.1910247180225582,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.001301754917949438,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.0013,
+      "step": 1240,
+      "tokens/total": 20326400,
+      "tokens/train_per_sec_per_gpu": 6.71,
+      "tokens/trainable": 6368478
+    },
+    {
+      "epoch": 1.2006239500839933,
+      "grad_norm": 0.01708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0007991308346390724,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.0008,
+      "step": 1250,
+      "tokens/total": 20490240,
+      "tokens/train_per_sec_per_gpu": 8.48,
+      "tokens/trainable": 6420144
+    },
+    {
+      "epoch": 1.2102231821454283,
+      "grad_norm": 0.05908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0011655298061668874,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00117,
+      "step": 1260,
+      "tokens/total": 20654080,
+      "tokens/train_per_sec_per_gpu": 9.43,
+      "tokens/trainable": 6471183
+    },
+    {
+      "epoch": 1.2198224142068634,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0007856052368879318,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00079,
+      "step": 1270,
+      "tokens/total": 20817920,
+      "tokens/train_per_sec_per_gpu": 9.04,
+      "tokens/trainable": 6522523
+    },
+    {
+      "epoch": 1.2294216462682985,
+      "grad_norm": 0.037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0009363952092826366,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00094,
+      "step": 1280,
+      "tokens/total": 20981760,
+      "tokens/train_per_sec_per_gpu": 8.04,
+      "tokens/trainable": 6573574
+    },
+    {
+      "epoch": 1.2390208783297336,
+      "grad_norm": 0.04052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0008232606574892998,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00082,
+      "step": 1290,
+      "tokens/total": 21145600,
+      "tokens/train_per_sec_per_gpu": 8.19,
+      "tokens/trainable": 6624754
+    },
+    {
+      "epoch": 1.2486201103911687,
+      "grad_norm": 0.045166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0007726194337010384,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00077,
+      "step": 1300,
+      "tokens/total": 21309440,
+      "tokens/train_per_sec_per_gpu": 8.56,
+      "tokens/trainable": 6676467
+    },
+    {
+      "epoch": 1.2582193424526038,
+      "grad_norm": 0.0289306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0007264631800353527,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00073,
+      "step": 1310,
+      "tokens/total": 21473280,
+      "tokens/train_per_sec_per_gpu": 6.8,
+      "tokens/trainable": 6727717
+    },
+    {
+      "epoch": 1.267818574514039,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0010542750358581543,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00105,
+      "step": 1320,
+      "tokens/total": 21637120,
+      "tokens/train_per_sec_per_gpu": 8.18,
+      "tokens/trainable": 6778400
+    },
+    {
+      "epoch": 1.277417806575474,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0007948096841573715,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.0008,
+      "step": 1330,
+      "tokens/total": 21800960,
+      "tokens/train_per_sec_per_gpu": 8.11,
+      "tokens/trainable": 6829680
+    },
+    {
+      "epoch": 1.287017038636909,
+      "grad_norm": 0.05224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0010158532299101354,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00102,
+      "step": 1340,
+      "tokens/total": 21964800,
+      "tokens/train_per_sec_per_gpu": 8.58,
+      "tokens/trainable": 6880865
+    },
+    {
+      "epoch": 1.2966162706983442,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0007738139480352402,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00077,
+      "step": 1350,
+      "tokens/total": 22128640,
+      "tokens/train_per_sec_per_gpu": 7.45,
+      "tokens/trainable": 6932720
+    },
+    {
+      "epoch": 1.3062155027597793,
+      "grad_norm": 0.01361083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.00043031726963818075,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00043,
+      "step": 1360,
+      "tokens/total": 22292480,
+      "tokens/train_per_sec_per_gpu": 7.66,
+      "tokens/trainable": 6983558
+    },
+    {
+      "epoch": 1.3158147348212144,
+      "grad_norm": 0.06689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005287491250783205,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00053,
+      "step": 1370,
+      "tokens/total": 22456320,
+      "tokens/train_per_sec_per_gpu": 8.04,
+      "tokens/trainable": 7035633
+    },
+    {
+      "epoch": 1.3254139668826495,
+      "grad_norm": 0.03173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0010193496011197567,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00102,
+      "step": 1380,
+      "tokens/total": 22620160,
+      "tokens/train_per_sec_per_gpu": 8.95,
+      "tokens/trainable": 7087613
+    },
+    {
+      "epoch": 1.3350131989440845,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0008692140690982342,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00087,
+      "step": 1390,
+      "tokens/total": 22784000,
+      "tokens/train_per_sec_per_gpu": 9.06,
+      "tokens/trainable": 7138863
+    },
+    {
+      "epoch": 1.3446124310055196,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0008631485514342784,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00086,
+      "step": 1400,
+      "tokens/total": 22947840,
+      "tokens/train_per_sec_per_gpu": 8.53,
+      "tokens/trainable": 7190354
+    },
+    {
+      "epoch": 1.3542116630669545,
+      "grad_norm": 0.028564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0010508927516639233,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00105,
+      "step": 1410,
+      "tokens/total": 23111680,
+      "tokens/train_per_sec_per_gpu": 8.97,
+      "tokens/trainable": 7241829
+    },
+    {
+      "epoch": 1.3638108951283896,
+      "grad_norm": 0.068359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0008184337988495827,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00082,
+      "step": 1420,
+      "tokens/total": 23275520,
+      "tokens/train_per_sec_per_gpu": 7.94,
+      "tokens/trainable": 7292651
+    },
+    {
+      "epoch": 1.3734101271898247,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0009149087592959404,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00092,
+      "step": 1430,
+      "tokens/total": 23439360,
+      "tokens/train_per_sec_per_gpu": 8.76,
+      "tokens/trainable": 7344149
+    },
+    {
+      "epoch": 1.3830093592512598,
+      "grad_norm": 0.0322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.000701209157705307,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.0007,
+      "step": 1440,
+      "tokens/total": 23603200,
+      "tokens/train_per_sec_per_gpu": 7.78,
+      "tokens/trainable": 7394829
+    },
+    {
+      "epoch": 1.3926085913126949,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0005787152796983719,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00058,
+      "step": 1450,
+      "tokens/total": 23767040,
+      "tokens/train_per_sec_per_gpu": 9.0,
+      "tokens/trainable": 7446945
+    },
+    {
+      "epoch": 1.40220782337413,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0005680257920175791,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00057,
+      "step": 1460,
+      "tokens/total": 23930880,
+      "tokens/train_per_sec_per_gpu": 7.56,
+      "tokens/trainable": 7498437
+    },
+    {
+      "epoch": 1.411807055435565,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.001165725290775299,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00117,
+      "step": 1470,
+      "tokens/total": 24094720,
+      "tokens/train_per_sec_per_gpu": 6.74,
+      "tokens/trainable": 7549736
+    },
+    {
+      "epoch": 1.4214062874970002,
+      "grad_norm": 0.039794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0007651465013623238,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00077,
+      "step": 1480,
+      "tokens/total": 24258560,
+      "tokens/train_per_sec_per_gpu": 9.69,
+      "tokens/trainable": 7601091
+    },
+    {
+      "epoch": 1.4310055195584352,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.000851003173738718,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00085,
+      "step": 1490,
+      "tokens/total": 24422400,
+      "tokens/train_per_sec_per_gpu": 9.01,
+      "tokens/trainable": 7652714
+    },
+    {
+      "epoch": 1.4406047516198703,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0009756641462445259,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00098,
+      "step": 1500,
+      "tokens/total": 24586240,
+      "tokens/train_per_sec_per_gpu": 8.81,
+      "tokens/trainable": 7703464
+    },
+    {
+      "epoch": 1.4502039836813054,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 0.001250309031456709,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00125,
+      "step": 1510,
+      "tokens/total": 24750080,
+      "tokens/train_per_sec_per_gpu": 8.21,
+      "tokens/trainable": 7754782
+    },
+    {
+      "epoch": 1.4598032157427405,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0014243194833397864,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00143,
+      "step": 1520,
+      "tokens/total": 24913920,
+      "tokens/train_per_sec_per_gpu": 7.23,
+      "tokens/trainable": 7806224
+    },
+    {
+      "epoch": 1.4694024478041756,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0011884530074894428,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00119,
+      "step": 1530,
+      "tokens/total": 25077760,
+      "tokens/train_per_sec_per_gpu": 8.68,
+      "tokens/trainable": 7857372
+    },
+    {
+      "epoch": 1.4790016798656107,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0008188777603209019,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00082,
+      "step": 1540,
+      "tokens/total": 25241600,
+      "tokens/train_per_sec_per_gpu": 9.24,
+      "tokens/trainable": 7909316
+    },
+    {
+      "epoch": 1.4886009119270458,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0008213745430111885,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00082,
+      "step": 1550,
+      "tokens/total": 25405440,
+      "tokens/train_per_sec_per_gpu": 8.18,
+      "tokens/trainable": 7960179
+    },
+    {
+      "epoch": 1.498200143988481,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0010140080004930497,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00101,
+      "step": 1560,
+      "tokens/total": 25569280,
+      "tokens/train_per_sec_per_gpu": 9.44,
+      "tokens/trainable": 8011395
+    },
+    {
+      "epoch": 1.5010799136069113,
+      "eval_loss": 0.0007253550575114787,
+      "eval_ppl": 1.00073,
+      "eval_runtime": 9.2519,
+      "eval_samples_per_second": 21.617,
+      "eval_steps_per_second": 21.617,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "step": 1563
+    },
+    {
+      "epoch": 1.507799376049916,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0008709205314517022,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.96,
+      "memory/max_allocated (GiB)": 33.96,
+      "ppl": 1.00087,
+      "step": 1570,
+      "tokens/total": 25733120,
+      "tokens/train_per_sec_per_gpu": 9.84,
+      "tokens/trainable": 8062743
+    },
+    {
+      "epoch": 1.517398608111351,
+      "grad_norm": 0.004058837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.00034918386954814197,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00035,
+      "step": 1580,
+      "tokens/total": 25896960,
+      "tokens/train_per_sec_per_gpu": 9.47,
+      "tokens/trainable": 8113834
+    },
+    {
+      "epoch": 1.5269978401727862,
+      "grad_norm": 0.0030517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003432748606428504,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00034,
+      "step": 1590,
+      "tokens/total": 26060800,
+      "tokens/train_per_sec_per_gpu": 7.8,
+      "tokens/trainable": 8165964
+    },
+    {
+      "epoch": 1.5365970722342213,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0005480392836034298,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00055,
+      "step": 1600,
+      "tokens/total": 26224640,
+      "tokens/train_per_sec_per_gpu": 9.52,
+      "tokens/trainable": 8217121
+    },
+    {
+      "epoch": 1.5461963042956564,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0005881413817405701,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00059,
+      "step": 1610,
+      "tokens/total": 26388480,
+      "tokens/train_per_sec_per_gpu": 9.15,
+      "tokens/trainable": 8268301
+    },
+    {
+      "epoch": 1.5557955363570914,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004818507470190525,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00048,
+      "step": 1620,
+      "tokens/total": 26552320,
+      "tokens/train_per_sec_per_gpu": 8.56,
+      "tokens/trainable": 8320320
+    },
+    {
+      "epoch": 1.5653947684185265,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0007268225774168969,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00073,
+      "step": 1630,
+      "tokens/total": 26716160,
+      "tokens/train_per_sec_per_gpu": 8.69,
+      "tokens/trainable": 8372031
+    },
+    {
+      "epoch": 1.5749940004799616,
+      "grad_norm": 0.0147705078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0006106278859078884,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00061,
+      "step": 1640,
+      "tokens/total": 26880000,
+      "tokens/train_per_sec_per_gpu": 9.11,
+      "tokens/trainable": 8422723
+    },
+    {
+      "epoch": 1.5845932325413967,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0009039029479026795,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.0009,
+      "step": 1650,
+      "tokens/total": 27043840,
+      "tokens/train_per_sec_per_gpu": 9.5,
+      "tokens/trainable": 8474243
+    },
+    {
+      "epoch": 1.5941924646028318,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0013276168145239353,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00133,
+      "step": 1660,
+      "tokens/total": 27207680,
+      "tokens/train_per_sec_per_gpu": 7.72,
+      "tokens/trainable": 8524834
+    },
+    {
+      "epoch": 1.603791696664267,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0019244521856307984,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00193,
+      "step": 1670,
+      "tokens/total": 27371520,
+      "tokens/train_per_sec_per_gpu": 8.16,
+      "tokens/trainable": 8576115
+    },
+    {
+      "epoch": 1.613390928725702,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0014983797445893288,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.0015,
+      "step": 1680,
+      "tokens/total": 27535360,
+      "tokens/train_per_sec_per_gpu": 8.39,
+      "tokens/trainable": 8627475
+    },
+    {
+      "epoch": 1.622990160787137,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0012701219879090787,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00127,
+      "step": 1690,
+      "tokens/total": 27699200,
+      "tokens/train_per_sec_per_gpu": 7.92,
+      "tokens/trainable": 8678654
+    },
+    {
+      "epoch": 1.6325893928485722,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0013377158902585506,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00134,
+      "step": 1700,
+      "tokens/total": 27863040,
+      "tokens/train_per_sec_per_gpu": 8.21,
+      "tokens/trainable": 8729739
+    },
+    {
+      "epoch": 1.6421886249100073,
+      "grad_norm": 0.06298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.001280638948082924,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00128,
+      "step": 1710,
+      "tokens/total": 28026880,
+      "tokens/train_per_sec_per_gpu": 8.23,
+      "tokens/trainable": 8781378
+    },
+    {
+      "epoch": 1.6517878569714424,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0007919369265437127,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00079,
+      "step": 1720,
+      "tokens/total": 28190720,
+      "tokens/train_per_sec_per_gpu": 8.31,
+      "tokens/trainable": 8832775
+    },
+    {
+      "epoch": 1.6613870890328775,
+      "grad_norm": 0.05322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0013359258882701397,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00134,
+      "step": 1730,
+      "tokens/total": 28354560,
+      "tokens/train_per_sec_per_gpu": 7.45,
+      "tokens/trainable": 8884196
+    },
+    {
+      "epoch": 1.6709863210943126,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0010936973616480828,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00109,
+      "step": 1740,
+      "tokens/total": 28518400,
+      "tokens/train_per_sec_per_gpu": 8.84,
+      "tokens/trainable": 8935603
+    },
+    {
+      "epoch": 1.6805855531557476,
+      "grad_norm": 0.0216064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0009528477676212788,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00095,
+      "step": 1750,
+      "tokens/total": 28682240,
+      "tokens/train_per_sec_per_gpu": 7.94,
+      "tokens/trainable": 8987083
+    },
+    {
+      "epoch": 1.6901847852171827,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0006039697211235762,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.0006,
+      "step": 1760,
+      "tokens/total": 28846080,
+      "tokens/train_per_sec_per_gpu": 8.42,
+      "tokens/trainable": 9038209
+    },
+    {
+      "epoch": 1.6997840172786178,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0006967922672629356,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.0007,
+      "step": 1770,
+      "tokens/total": 29009920,
+      "tokens/train_per_sec_per_gpu": 8.48,
+      "tokens/trainable": 9088795
+    },
+    {
+      "epoch": 1.709383249340053,
+      "grad_norm": 0.016357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0008365864865481854,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00084,
+      "step": 1780,
+      "tokens/total": 29173760,
+      "tokens/train_per_sec_per_gpu": 9.15,
+      "tokens/trainable": 9140155
+    },
+    {
+      "epoch": 1.718982481401488,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0005419908091425895,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00054,
+      "step": 1790,
+      "tokens/total": 29337600,
+      "tokens/train_per_sec_per_gpu": 9.11,
+      "tokens/trainable": 9191457
+    },
+    {
+      "epoch": 1.728581713462923,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0008764880709350109,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00088,
+      "step": 1800,
+      "tokens/total": 29501440,
+      "tokens/train_per_sec_per_gpu": 9.69,
+      "tokens/trainable": 9242381
+    },
+    {
+      "epoch": 1.7381809455243582,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0016637198626995088,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00167,
+      "step": 1810,
+      "tokens/total": 29665280,
+      "tokens/train_per_sec_per_gpu": 7.77,
+      "tokens/trainable": 9294105
+    },
+    {
+      "epoch": 1.7477801775857933,
+      "grad_norm": 0.057861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0023251190781593324,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00233,
+      "step": 1820,
+      "tokens/total": 29829120,
+      "tokens/train_per_sec_per_gpu": 9.14,
+      "tokens/trainable": 9345913
+    },
+    {
+      "epoch": 1.7573794096472284,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.004122686386108398,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00413,
+      "step": 1830,
+      "tokens/total": 29992960,
+      "tokens/train_per_sec_per_gpu": 8.09,
+      "tokens/trainable": 9397553
+    },
+    {
+      "epoch": 1.7669786417086633,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 0.00475989505648613,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00477,
+      "step": 1840,
+      "tokens/total": 30156800,
+      "tokens/train_per_sec_per_gpu": 9.12,
+      "tokens/trainable": 9448264
+    },
+    {
+      "epoch": 1.7765778737700983,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0030113702639937403,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00302,
+      "step": 1850,
+      "tokens/total": 30320640,
+      "tokens/train_per_sec_per_gpu": 8.87,
+      "tokens/trainable": 9499714
+    },
+    {
+      "epoch": 1.7861771058315334,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0021218497306108473,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00212,
+      "step": 1860,
+      "tokens/total": 30484480,
+      "tokens/train_per_sec_per_gpu": 8.7,
+      "tokens/trainable": 9551484
+    },
+    {
+      "epoch": 1.7957763378929685,
+      "grad_norm": 0.0595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0019322805106639861,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00193,
+      "step": 1870,
+      "tokens/total": 30648320,
+      "tokens/train_per_sec_per_gpu": 8.71,
+      "tokens/trainable": 9603376
+    },
+    {
+      "epoch": 1.8053755699544036,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.002130831032991409,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00213,
+      "step": 1880,
+      "tokens/total": 30812160,
+      "tokens/train_per_sec_per_gpu": 8.24,
+      "tokens/trainable": 9654126
+    },
+    {
+      "epoch": 1.8149748020158387,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0013952101580798626,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.0014,
+      "step": 1890,
+      "tokens/total": 30976000,
+      "tokens/train_per_sec_per_gpu": 9.71,
+      "tokens/trainable": 9704260
+    },
+    {
+      "epoch": 1.8245740340772738,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0013564865104854107,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00136,
+      "step": 1900,
+      "tokens/total": 31139840,
+      "tokens/train_per_sec_per_gpu": 8.05,
+      "tokens/trainable": 9755570
+    },
+    {
+      "epoch": 1.834173266138709,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0014380639418959617,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00144,
+      "step": 1910,
+      "tokens/total": 31303680,
+      "tokens/train_per_sec_per_gpu": 8.37,
+      "tokens/trainable": 9806775
+    },
+    {
+      "epoch": 1.843772498200144,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0013548688031733036,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00136,
+      "step": 1920,
+      "tokens/total": 31467520,
+      "tokens/train_per_sec_per_gpu": 8.63,
+      "tokens/trainable": 9858270
+    },
+    {
+      "epoch": 1.853371730261579,
+      "grad_norm": 0.02587890625,
+      "learning_rate": 0.0001,
+      "loss": 0.001307238917797804,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00131,
+      "step": 1930,
+      "tokens/total": 31631360,
+      "tokens/train_per_sec_per_gpu": 8.62,
+      "tokens/trainable": 9909516
+    },
+    {
+      "epoch": 1.8629709623230142,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.001157990377396345,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00116,
+      "step": 1940,
+      "tokens/total": 31795200,
+      "tokens/train_per_sec_per_gpu": 8.34,
+      "tokens/trainable": 9960863
+    },
+    {
+      "epoch": 1.8725701943844493,
+      "grad_norm": 0.0263671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0011683990247547626,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00117,
+      "step": 1950,
+      "tokens/total": 31959040,
+      "tokens/train_per_sec_per_gpu": 7.79,
+      "tokens/trainable": 10013368
+    },
+    {
+      "epoch": 1.8821694264458844,
+      "grad_norm": 0.05908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0007935081608593464,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00079,
+      "step": 1960,
+      "tokens/total": 32122880,
+      "tokens/train_per_sec_per_gpu": 9.08,
+      "tokens/trainable": 10064774
+    },
+    {
+      "epoch": 1.8917686585073195,
+      "grad_norm": 0.055419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0008186689577996731,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00082,
+      "step": 1970,
+      "tokens/total": 32286720,
+      "tokens/train_per_sec_per_gpu": 8.17,
+      "tokens/trainable": 10116172
+    },
+    {
+      "epoch": 1.9013678905687545,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0007270051632076502,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00073,
+      "step": 1980,
+      "tokens/total": 32450560,
+      "tokens/train_per_sec_per_gpu": 8.34,
+      "tokens/trainable": 10167347
+    },
+    {
+      "epoch": 1.9109671226301894,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0011019655503332615,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.0011,
+      "step": 1990,
+      "tokens/total": 32614400,
+      "tokens/train_per_sec_per_gpu": 9.52,
+      "tokens/trainable": 10218140
+    },
+    {
+      "epoch": 1.9205663546916245,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0009611468762159347,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00096,
+      "step": 2000,
+      "tokens/total": 32778240,
+      "tokens/train_per_sec_per_gpu": 6.9,
+      "tokens/trainable": 10269179
+    },
+    {
+      "epoch": 1.9301655867530596,
+      "grad_norm": 0.0247802734375,
+      "learning_rate": 0.0001,
+      "loss": 0.000824358593672514,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00082,
+      "step": 2010,
+      "tokens/total": 32942080,
+      "tokens/train_per_sec_per_gpu": 8.34,
+      "tokens/trainable": 10320155
+    },
+    {
+      "epoch": 1.9397648188144947,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0006628005299717188,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00066,
+      "step": 2020,
+      "tokens/total": 33105920,
+      "tokens/train_per_sec_per_gpu": 6.81,
+      "tokens/trainable": 10371059
+    },
+    {
+      "epoch": 1.9493640508759298,
+      "grad_norm": 0.0291748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0009558702819049359,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00096,
+      "step": 2030,
+      "tokens/total": 33269760,
+      "tokens/train_per_sec_per_gpu": 8.88,
+      "tokens/trainable": 10422089
+    },
+    {
+      "epoch": 1.9589632829373649,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0006137116346508264,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00061,
+      "step": 2040,
+      "tokens/total": 33433600,
+      "tokens/train_per_sec_per_gpu": 7.84,
+      "tokens/trainable": 10473462
+    },
+    {
+      "epoch": 1.9685625149988,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0007995942607522011,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.0008,
+      "step": 2050,
+      "tokens/total": 33597440,
+      "tokens/train_per_sec_per_gpu": 8.46,
+      "tokens/trainable": 10524388
+    },
+    {
+      "epoch": 1.978161747060235,
+      "grad_norm": 0.01318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0012844327837228775,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00129,
+      "step": 2060,
+      "tokens/total": 33761280,
+      "tokens/train_per_sec_per_gpu": 8.64,
+      "tokens/trainable": 10575622
+    },
+    {
+      "epoch": 1.9877609791216702,
+      "grad_norm": 0.0113525390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0011016235686838627,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.0011,
+      "step": 2070,
+      "tokens/total": 33925120,
+      "tokens/train_per_sec_per_gpu": 9.57,
+      "tokens/trainable": 10627503
+    },
+    {
+      "epoch": 1.9973602111831052,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0009904997423291206,
+      "memory/device_reserved (GiB)": 35.97,
+      "memory/max_active (GiB)": 33.95,
+      "memory/max_allocated (GiB)": 33.95,
+      "ppl": 1.00099,
+      "step": 2080,
+      "tokens/total": 34088960,
+      "tokens/train_per_sec_per_gpu": 8.51,
+      "tokens/trainable": 10679027
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3123,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 1041,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.438902357896724e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9bc5dc0a6b631434a1e530ec14cbf9d04e0cb0394c28ae6df258badbdff9da4e
+size 7121

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,4 @@

+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
+' + message['content'] + '<|im_end|>' + '
+'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
+' }}{% endif %}

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/config.json ADDED Viewed

	@@ -0,0 +1,71 @@

+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 262144,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 5000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.0.0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/generation_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "5.0.0"
+}

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:20f299eec38f2ee6b3400fd956fdd92266f72da5225fa3b04e2fe1e66ccf72d5
+size 8822894520

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b1bc46ef1ffa4a3e07cac67e81070ecec954323920d27e8b2388f5f89d6909ec
+size 16090225449

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:20ea3a198ff666cb4ace1c684b598fe43fc7c3c276b83efc553a1b787e12a304
+size 14645

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:78c3c62dddcf61ce76eba74e4febde7485ae697ca0a51e1ac7b67acf61c1d077
+size 1465

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": true,
+  "model_max_length": 1010000,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/tokens_state. ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"total": 51173376, "trainable": 16031558}

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9bc5dc0a6b631434a1e530ec14cbf9d04e0cb0394c28ae6df258badbdff9da4e
+size 7121

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/debug.log ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/eval_results_easy_ops/balanced_test_alpaca_converted.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/eval_results_easy_ops/balanced_test_alpaca_results.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/eval_results_easy_ops/eval_results.csv ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ category,filename,total,correct,accuracy,format_found,format_accuracy,errors_count
2	+ math_operations,balanced_test_alpaca_results,500,3,0.60,500,100.00,497

checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/eval_results_easy_ops/eval_summary.json ADDED Viewed

	@@ -0,0 +1,133 @@

+{
+  "overall": {
+    "total": 500,
+    "correct": 3,
+    "accuracy": 0.6,
+    "format_found": 500,
+    "format_accuracy": 100.0
+  },
+  "per_operation": {
+    "a": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "b": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "c": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "d": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "e": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "f": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "g": {
+      "total": 25,
+      "correct": 1,
+      "accuracy": 4.0,
+      "format_found": 25
+    },
+    "h": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "i": {
+      "total": 25,
+      "correct": 1,
+      "accuracy": 4.0,
+      "format_found": 25
+    },
+    "j": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "k": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "l": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "m": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "n": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "o": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "p": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "q": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "r": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "s": {
+      "total": 25,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 25
+    },
+    "t": {
+      "total": 25,
+      "correct": 1,
+      "accuracy": 4.0,
+      "format_found": 25
+    }
+  },
+  "n_errors": 497,
+  "results_file": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/eval_results_easy_ops/balanced_test_alpaca_results.jsonl"
+}