diff --git "a/debug.log" "b/debug.log"
new file mode 100644--- /dev/null
+++ "b/debug.log"
@@ -0,0 +1,604 @@
+[2026-01-06 06:31:33,733] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:5347] baseline 0.000GB ()
+[2026-01-06 06:31:33,734] [INFO] [axolotl.cli.config.load_cfg:248] [PID:5347] config:
+{
+  "activation_offloading": false,
+  "axolotl_config_path": "sft-axolotl-olmo3-7b-think.yaml",
+  "base_model": "allenai/Olmo-3-1025-7B",
+  "base_model_config": "allenai/Olmo-3-1025-7B",
+  "batch_size": 8,
+  "bf16": true,
+  "capabilities": {
+    "bf16": true,
+    "compute_capability": "sm_90",
+    "fp8": false,
+    "n_gpu": 2,
+    "n_node": 1
+  },
+  "chat_template": "jinja",
+  "chat_template_jinja": "/workspace/data/model-output/chat_template.jinja",
+  "context_parallel_size": 2,
+  "dataloader_num_workers": 2,
+  "dataloader_pin_memory": true,
+  "dataloader_prefetch_factor": 256,
+  "dataset_prepared_path": "last_run_prepared",
+  "dataset_processes": 48,
+  "datasets": [
+    {
+      "chat_template": "tokenizer_default",
+      "field_messages": "messages",
+      "message_property_mappings": {
+        "content": "content",
+        "role": "role"
+      },
+      "path": "dataset-tfs-mk-IMP-SOS-processed-olmo3-think.jsonl",
+      "roles": {
+        "assistant": [
+          "assistant"
+        ],
+        "system": [
+          "system"
+        ],
+        "user": [
+          "user"
+        ]
+      },
+      "roles_to_train": [
+        "assistant"
+      ],
+      "trust_remote_code": false,
+      "type": "chat_template"
+    }
+  ],
+  "ddp": true,
+  "device": "cuda:0",
+  "device_map": {
+    "": 0
+  },
+  "dion_rank_fraction": 1.0,
+  "dion_rank_multiple_of": 1,
+  "env_capabilities": {
+    "torch_version": "2.7.1"
+  },
+  "eval_batch_size": 1,
+  "eval_causal_lm_metrics": [
+    "sacrebleu",
+    "comet",
+    "ter",
+    "chrf"
+  ],
+  "eval_max_new_tokens": 128,
+  "eval_sample_packing": true,
+  "eval_steps": 0.5,
+  "eval_table_size": 0,
+  "evals_per_epoch": 1,
+  "experimental_skip_move_to_device": true,
+  "flash_attention": true,
+  "fp16": false,
+  "gradient_accumulation_steps": 4,
+  "gradient_checkpointing": true,
+  "gradient_checkpointing_kwargs": {
+    "use_reentrant": true
+  },
+  "group_by_length": false,
+  "hub_model_id": "Auditt/O37BB",
+  "include_tkps": true,
+  "learning_rate": 1e-05,
+  "lisa_layers_attribute": "model.layers",
+  "load_best_model_at_end": false,
+  "load_in_4bit": false,
+  "load_in_8bit": false,
+  "local_rank": 0,
+  "logging_steps": 1,
+  "lora_dropout": 0.0,
+  "loraplus_lr_embedding": 1e-06,
+  "lr_scheduler": "cosine",
+  "mean_resizing_embeddings": false,
+  "micro_batch_size": 1,
+  "model_config_type": "olmo3",
+  "num_epochs": 2.0,
+  "optimizer": "adamw_torch",
+  "output_dir": "/workspace/data/model-output-base",
+  "pad_to_sequence_len": true,
+  "pretrain_multipack_attn": true,
+  "profiler_steps_start": 0,
+  "qlora_sharded_model_loading": false,
+  "ray_num_workers": 1,
+  "resources_per_worker": {
+    "GPU": 1
+  },
+  "ring_attn_func": "varlen_llama3",
+  "sample_packing": true,
+  "sample_packing_bin_size": 200,
+  "sample_packing_group_size": 100000,
+  "save_only_model": false,
+  "save_safetensors": true,
+  "sequence_len": 60000,
+  "shuffle_before_merging_datasets": false,
+  "shuffle_merged_datasets": true,
+  "skip_prepare_dataset": false,
+  "streaming_multipack_buffer_size": 10000,
+  "strict": false,
+  "tensor_parallel_size": 1,
+  "tf32": true,
+  "tiled_mlp_use_original_mlp": true,
+  "tokenizer_config": "allenai/Olmo-3-1025-7B",
+  "tokenizer_save_jinja_files": true,
+  "tokens": [
+    "\ud801\udd32",
+    "\ud801\udd3e",
+    "\u3009",
+    "\ud835\udf0e",
+    "\u22c1",
+    "\ud801\udd60",
+    "\ud801\udd5c",
+    "\ud801\udd38",
+    "\u2227",
+    "\u2265",
+    "\ud801\udd5f",
+    "\ud801\udd56",
+    "\u27c2",
+    "\ud801\udd4f",
+    "\u22c0",
+    "\ud801\udd63",
+    "\ud801\udd43",
+    "\ud801\udd59",
+    "\ud801\udd55",
+    "\u03c7",
+    "\ud801\udd4a",
+    "\u3008",
+    "\ud801\udd50",
+    "\ud801\udd3b",
+    "\ud801\udd40",
+    "\ud801\udd33",
+    "\u2260",
+    "\ud801\udd37",
+    "\u2264",
+    "\ud801\udd5e",
+    "\ud801\udd31",
+    "\ud801\udd42",
+    "\u21a6",
+    "\ud801\udd4e",
+    "\u2192",
+    "\ud801\udd5b",
+    "\ud801\udd30",
+    "\u03b5"
+  ],
+  "torch_dtype": "torch.bfloat16",
+  "train_on_inputs": false,
+  "trl": {
+    "log_completions": false,
+    "mask_truncated_completions": false,
+    "ref_model_mixup_alpha": 0.9,
+    "ref_model_sync_steps": 64,
+    "scale_rewards": true,
+    "sync_ref_model": false,
+    "use_vllm": false,
+    "vllm_server_host": "0.0.0.0",
+    "vllm_server_port": 8000
+  },
+  "trust_remote_code": true,
+  "use_ray": false,
+  "val_set_size": 0.1,
+  "vllm": {
+    "device": "auto",
+    "dtype": "auto",
+    "gpu_memory_utilization": 0.9,
+    "host": "0.0.0.0",
+    "port": 8000
+  },
+  "weight_decay": 0.0,
+  "world_size": 2
+}
+[2026-01-06 06:31:34,116] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:5347] EOS: 100257 / <|endoftext|>
+[2026-01-06 06:31:34,116] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:5347] BOS: None / None
+[2026-01-06 06:31:34,117] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:5347] PAD: 100277 / <|pad|>
+[2026-01-06 06:31:34,117] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:5347] UNK: 100257 / <|endoftext|>
+[2026-01-06 06:31:48,563] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:470] [PID:5347] Loading prepared dataset from disk at last_run_prepared/521442581534a9837f30b55bdde4d057...
+[2026-01-06 06:31:48,578] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:404] [PID:5347] total_num_tokens: 4_377_664
+[2026-01-06 06:31:48,586] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:422] [PID:5347] `total_supervised_tokens: 3_345_873`
+[2026-01-06 06:31:49,583] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.4595468044281006
+[2026-01-06 06:31:50,030] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.44755101203918457
+[2026-01-06 06:31:50,477] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.44657182693481445
+[2026-01-06 06:31:50,952] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.4744563102722168
+[2026-01-06 06:31:52,230] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:5347] gather_len_batches: [78, 78]
+[2026-01-06 06:31:52,231] [WARNING] [py.warnings._showwarnmsg:110] [PID:5347] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. 
+  warnings.warn(  # warn only once
+
+[2026-01-06 06:31:52,413] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:481] [PID:5347] data_loader_len: 9
+[2026-01-06 06:31:52,429] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:497] [PID:5347] sample_packing_eff_est across ranks: [0.9235578179359436, 0.9235578179359436]
+[2026-01-06 06:31:52,430] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:509] [PID:5347] sample_packing_eff_est: None
+[2026-01-06 06:31:52,430] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:520] [PID:5347] total_num_steps: 36
+[2026-01-06 06:31:52,542] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:404] [PID:5347] total_num_tokens: 39_860_654
+[2026-01-06 06:31:53,267] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:422] [PID:5347] `total_supervised_tokens: 30_599_793`
+[2026-01-06 06:31:54,333] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.5050613880157471
+[2026-01-06 06:31:54,830] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.49697208404541016
+[2026-01-06 06:31:55,317] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.48656487464904785
+[2026-01-06 06:31:55,795] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.47708654403686523
+[2026-01-06 06:31:55,796] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:5347] gather_len_batches: [699, 697]
+[2026-01-06 06:31:55,797] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:481] [PID:5347] data_loader_len: 87
+[2026-01-06 06:31:55,798] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:497] [PID:5347] sample_packing_eff_est across ranks: [0.9517825841903687, 0.947709321975708]
+[2026-01-06 06:31:55,798] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:509] [PID:5347] sample_packing_eff_est: 0.96
+[2026-01-06 06:31:55,802] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:520] [PID:5347] total_num_steps: 348
+[2026-01-06 06:31:55,802] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:5347] Maximum number of steps set at 348
+[2026-01-06 06:31:55,821] [DEBUG] [axolotl.train.setup_model_and_tokenizer:65] [PID:5347] Loading tokenizer... allenai/Olmo-3-1025-7B
+[2026-01-06 06:31:56,142] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:5347] EOS: 100257 / <|endoftext|>
+[2026-01-06 06:31:56,142] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:5347] BOS: None / None
+[2026-01-06 06:31:56,142] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:5347] PAD: 100277 / <|pad|>
+[2026-01-06 06:31:56,142] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:5347] UNK: 100257 / <|endoftext|>
+[2026-01-06 06:31:56,144] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:5347] Loading model
+[2026-01-06 06:31:56,194] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:5347] Patched Trainer.evaluation_loop with nanmean loss calculation
+[2026-01-06 06:31:56,195] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:5347] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
+[2026-01-06 06:31:56,197] [DEBUG] [axolotl.monkeypatch.transformers.trainer_context_parallel.patch_prepare_context_parallel_inputs:66] [PID:5347] Patched Trainer._prepare_context_parallel_inputs for FlashAttention + CP
+[2026-01-06 06:31:56,198] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:5347] Applying multipack dataloader patch for sample packing...
+Loading checkpoint shards:   0%|                                                                                               | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:  33%|█████████████████████████████                                                          | 1/3 [00:01<00:02,  1.06s/it]Loading checkpoint shards:  67%|██████████████████████████████████████████████████████████                             | 2/3 [00:02<00:01,  1.03s/it]Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.00it/s]Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.01s/it]
+generation_config.json:   0%|                                                                                             | 0.00/69.0 [00:00<?, ?B/s]generation_config.json: 100%|██████████████████████████████████████████████████████████████████████████████████████| 69.0/69.0 [00:00<00:00, 789kB/s]
+[2026-01-06 06:32:35,096] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:5347] Converting modules to torch.bfloat16
+[2026-01-06 06:32:35,099] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:5347] Memory usage after model load 15.893GB (+15.893GB allocated, +18.191GB reserved)
+[2026-01-06 06:32:39,828] [INFO] [axolotl.train.save_initial_configs:402] [PID:5347] Pre-saving tokenizer to /workspace/data/model-output-base...
+[2026-01-06 06:32:39,939] [INFO] [axolotl.train.save_initial_configs:407] [PID:5347] Pre-saving model config to /workspace/data/model-output-base...
+[2026-01-06 06:32:39,943] [INFO] [axolotl.monkeypatch.ring_attn.patch.register_ring_attn_from_device_mesh:154] [PID:5347] Enabling ring attention sequence parallelism using DeviceMesh dimension '('cp',)'
+[2026-01-06 06:32:39,943] [INFO] [axolotl.monkeypatch.ring_attn.patch.register_ring_attn_from_device_mesh:174] [PID:5347] Sequence parallel degree: 2, mesh shape: torch.Size([2])
+[2026-01-06 06:32:39,943] [INFO] [axolotl.train.execute_training:196] [PID:5347] Starting trainer...
+[2026-01-06 06:32:50,347] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.6783885955810547
+[2026-01-06 06:32:50,988] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.6399080753326416
+[2026-01-06 06:32:51,625] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.6367971897125244
+[2026-01-06 06:32:52,267] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.6410634517669678
+[2026-01-06 06:32:53,405] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:5347] gather_len_batches: [700, 700]
+[2026-01-06 06:32:53,406] [WARNING] [py.warnings._showwarnmsg:110] [PID:5347] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. 
+  warnings.warn(  # warn only once
+
+  0%|                                                                                                                        | 0/348 [00:00<?, ?it/s][2026-01-06 06:32:53,498] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:5347] Running evaluation step...
+[2026-01-06 06:32:55,925] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.6825177669525146
+[2026-01-06 06:32:56,556] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.6304805278778076
+[2026-01-06 06:32:57,222] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.6658525466918945
+[2026-01-06 06:32:57,878] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.6549110412597656
+[2026-01-06 06:32:58,138] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:5347] gather_len_batches: [78, 78]
+
+  0%|                                                                                                                         | 0/78 [00:00<?, ?it/s][A
+  3%|██▉                                                                                                              | 2/78 [00:01<00:49,  1.54it/s][A
+  4%|████▎                                                                                                            | 3/78 [00:02<01:17,  1.04s/it][A
+  5%|█████▊                                                                                                           | 4/78 [00:04<01:31,  1.24s/it][A
+  6%|███████▏                                                                                                         | 5/78 [00:06<01:43,  1.42s/it][A
+  8%|████████▋                                                                                                        | 6/78 [00:07<01:49,  1.52s/it][A
+  9%|██████████▏                                                                                                      | 7/78 [00:09<01:49,  1.54s/it][A
+ 10%|███████████▌                                                                                                     | 8/78 [00:11<01:51,  1.59s/it][A
+ 12%|█████████████                                                                                                    | 9/78 [00:12<01:50,  1.61s/it][A
+ 13%|██████████████▎                                                                                                 | 10/78 [00:14<01:50,  1.62s/it][A
+ 14%|███████████████▊                                                                                                | 11/78 [00:16<01:48,  1.62s/it][A
+ 15%|█████████████████▏                                                                                              | 12/78 [00:17<01:47,  1.63s/it][A
+ 17%|██████████████████▋                                                                                             | 13/78 [00:19<01:46,  1.64s/it][A
+ 18%|████████████████████                                                                                            | 14/78 [00:21<01:43,  1.62s/it][A
+ 19%|█████████████████████▌                                                                                          | 15/78 [00:22<01:41,  1.61s/it][A
+ 21%|██████████████████████▉                                                                                         | 16/78 [00:24<01:39,  1.60s/it][A
+ 22%|████████████████████████▍                                                                                       | 17/78 [00:25<01:40,  1.65s/it][A
+ 23%|█████████████████████████▊                                                                                      | 18/78 [00:27<01:37,  1.62s/it][A
+ 24%|███████████████████████████▎                                                                                    | 19/78 [00:29<01:36,  1.64s/it][A
+ 26%|████████████████████████████▋                                                                                   | 20/78 [00:30<01:34,  1.63s/it][A
+ 27%|██████████████████████████████▏                                                                                 | 21/78 [00:32<01:33,  1.64s/it][A
+ 28%|███████████████████████████████▌                                                                                | 22/78 [00:34<01:30,  1.62s/it][A
+ 29%|█████████████████████████████████                                                                               | 23/78 [00:35<01:27,  1.59s/it][A
+ 31%|██████████████████████████████████▍                                                                             | 24/78 [00:37<01:25,  1.58s/it][A
+ 32%|███████████████████████████████████▉                                                                            | 25/78 [00:38<01:23,  1.58s/it][A
+ 33%|█████████████████████████████████████▎                                                                          | 26/78 [00:40<01:22,  1.58s/it][A
+ 35%|████████████████████████████████████���█▊                                                                         | 27/78 [00:41<01:20,  1.58s/it][A
+ 36%|████████████████████████████████████████▏                                                                       | 28/78 [00:43<01:19,  1.59s/it][A
+ 37%|█████████████████████████████████████████▋                                                                      | 29/78 [00:45<01:19,  1.62s/it][A
+ 38%|███████████████████████████████████████████                                                                     | 30/78 [00:46<01:17,  1.61s/it][A
+ 40%|████████████████████████████████████████████▌                                                                   | 31/78 [00:48<01:14,  1.59s/it][A
+ 41%|█████████████████████████████████████████████▉                                                                  | 32/78 [00:49<01:13,  1.59s/it][A
+ 42%|███████████████████████████████████████████████▍                                                                | 33/78 [00:51<01:12,  1.62s/it][A
+ 44%|████████████████████████████████████████████████▊                                                               | 34/78 [00:53<01:12,  1.64s/it][A
+ 45%|██████████████████████████████████████████████████▎                                                             | 35/78 [00:54<01:10,  1.65s/it][A
+ 46%|███████████████████████████████████████████████████▋                                                            | 36/78 [00:56<01:08,  1.62s/it][A
+ 47%|█████████████████████████████████████████████████████▏                                                          | 37/78 [00:58<01:06,  1.63s/it][A
+ 49%|██████████████████████████████████████████████████████▌                                                         | 38/78 [00:59<01:04,  1.62s/it][A
+ 50%|████████████████████████████████████████████████████████                                                        | 39/78 [01:01<01:02,  1.61s/it][A
+ 51%|█████████████████████████████████████████████████████████▍                                                      | 40/78 [01:02<01:01,  1.61s/it][A
+ 53%|██████████████████████████████████████████████████████████▊                                                     | 41/78 [01:04<01:01,  1.66s/it][A
+ 54%|████████████████████████████████████████████████████████████▎                                                   | 42/78 [01:06<00:59,  1.65s/it][A
+ 55%|█████████████████████████████████████████████████████████████▋                                                  | 43/78 [01:08<00:58,  1.67s/it][A
+ 56%|███████████████████████████████████████████████████████████████▏                                                | 44/78 [01:09<00:57,  1.69s/it][A
+ 58%|████████████████████████████████████████████████████████████████▌                                               | 45/78 [01:11<00:56,  1.72s/it][A
+ 59%|██████████████████████████████████████████████████████████████████                                              | 46/78 [01:13<00:55,  1.72s/it][A
+ 60%|█████��█████████████████████████████████████████████████████████████▍                                            | 47/78 [01:15<00:53,  1.73s/it][A
+ 62%|████████████████████████████████████████████████████████████████████▉                                           | 48/78 [01:16<00:51,  1.71s/it][A
+ 63%|██████████████████████████████████████████████████████████████████████▎                                         | 49/78 [01:18<00:48,  1.69s/it][A
+ 64%|███████████████████████████████████████████████████████████████████████▊                                        | 50/78 [01:20<00:46,  1.68s/it][A
+ 65%|█████████████████████████████████████████████████████████████████████████▏                                      | 51/78 [01:21<00:45,  1.67s/it][A
+ 67%|██████████████████████████████████████████████████████████████████████████▋                                     | 52/78 [01:23<00:43,  1.66s/it][A
+ 68%|████████████████████████████████████████████████████████████████████████████                                    | 53/78 [01:25<00:42,  1.69s/it][A
+ 69%|█████████████████████████████████████████████████████████████████████████████▌                                  | 54/78 [01:26<00:39,  1.66s/it][A
+ 71%|██████████████████████████████████████████████████████████████████████████████▉                                 | 55/78 [01:28<00:37,  1.65s/it][A
+ 72%|████████████████████████████████████████████████████████████████████████████████▍                               | 56/78 [01:29<00:35,  1.62s/it][A
+ 73%|█████████████████████████████████████████████████████████████████████████████████▊                              | 57/78 [01:31<00:33,  1.60s/it][A
+ 74%|███████████████████████████████████████████████████████████████████████████████████▎                            | 58/78 [01:32<00:31,  1.60s/it][A
+ 76%|████████████████████████████████████████████████████████████████████████████████████▋                           | 59/78 [01:34<00:30,  1.58s/it][A
+ 77%|██████████████████████████████████████████████████████████████████████████████████████▏                         | 60/78 [01:36<00:28,  1.58s/it][A
+ 78%|███████████████████████████████████████████████████████████████████████████████████████▌                        | 61/78 [01:37<00:26,  1.57s/it][A
+ 79%|█████████████████████████████████████████████████████████████████████████████████████████                       | 62/78 [01:39<00:25,  1.57s/it][A
+ 81%|██████████████████████████████████████████████████████████████████████████████████████████▍                     | 63/78 [01:40<00:24,  1.61s/it][A
+ 82%|███████████████████████████████████████████████████████████████████████████████████████████▉                    | 64/78 [01:42<00:22,  1.61s/it][A
+ 83%|█████████████████████████████████████████████████████████████████████████████████████████████▎                  | 65/78 [01:44<00:21,  1.63s/it][A
+ 85%|██████████████████████████████████████████████████████████████████████████████████████████████▊                 | 66/78 [01:45<00:19,  1.60s/it][A
+ 86%|████████████████████████████████████████████████████████████████████████████████████████████████▏               | 67/78 [01:47<00:17,  1.61s/it][A
+ 87%|█████████████████████████████████████████████████████████████████████████████████████████████████▋              | 68/78 [01:49<00:16,  1.63s/it][A
+ 88%|███████████████████████████████████████████████████████████████████████████████████████████████████             | 69/78 [01:50<00:14,  1.64s/it][A
+ 90%|████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 70/78 [01:52<00:13,  1.66s/it][A
+ 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 71/78 [01:54<00:11,  1.67s/it][A
+ 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 72/78 [01:55<00:10,  1.67s/it][A
+ 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 73/78 [01:57<00:08,  1.66s/it][A
+ 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 74/78 [01:58<00:06,  1.64s/it][A
+ 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 75/78 [02:00<00:04,  1.64s/it][A
+ 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 76/78 [02:02<00:03,  1.65s/it][A
+ 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 77/78 [02:03<00:01,  1.63s/it][A
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78/78 [02:05<00:00,  1.62s/it][A                                                                                                                                                     
+                                                                                                                                                     [A{'eval_loss': 1.0679770708084106, 'eval_runtime': 129.1457, 'eval_samples_per_second': 1.44, 'eval_steps_per_second': 0.72, 'memory/max_active (GiB)': 58.72, 'memory/max_allocated (GiB)': 55.5, 'memory/device_reserved (GiB)': 65.44, 'epoch': 0}
+  0%|                                                                                                                        | 0/348 [02:13<?, ?it/s]
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78/78 [02:05<00:00,  1.62s/it][A
+                                                                                                                                                     [A  0%|▎                                                                                                           | 1/348 [02:29<14:24:11, 149.43s/it]                                                                                                                                                     {'loss': 4.9629, 'grad_norm': 57.0, 'learning_rate': 0.0, 'memory/max_active (GiB)': 81.64, 'memory/max_allocated (GiB)': 81.64, 'memory/device_reserved (GiB)': 83.59, 'tokens_per_second_per_gpu': 124835.68, 'epoch': 0.01}
+  0%|▎                                                                                                           | 1/348 [02:29<14:24:11, 149.43s/it]  1%|▋                                                                                                             | 2/348 [02:43<6:42:55, 69.87s/it]                                                                                                                                                     {'loss': 4.4324, 'grad_norm': 45.25, 'learning_rate': 1.0000000000000002e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2715.0, 'epoch': 0.01}
+  1%|▋                                                                                                             | 2/348 [02:43<6:42:55, 69.87s/it]  1%|▉                                                                                                             | 3/348 [02:57<4:13:55, 44.16s/it]                                                                                                                                                     {'loss': 4.3548, 'grad_norm': 63.5, 'learning_rate': 2.0000000000000003e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2697.0, 'epoch': 0.02}
+  1%|▉                                                                                                             | 3/348 [02:57<4:13:55, 44.16s/it]  1%|█▎                                                                                                            | 4/348 [03:10<3:03:18, 31.97s/it]                                                                                                                                                     {'loss': 4.2327, 'grad_norm': 48.0, 'learning_rate': 3e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2655.0, 'epoch': 0.02}
+  1%|█▎                                                                                                            | 4/348 [03:10<3:03:18, 31.97s/it]  1%|█▌                                                                                                            | 5/348 [03:23<2:24:33, 25.29s/it]                                                                                                                                                     {'loss': 4.218, 'grad_norm': 46.0, 'learning_rate': 4.000000000000001e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2937.2, 'epoch': 0.03}
+  1%|█▌                                                                                                            | 5/348 [03:23<2:24:33, 25.29s/it]  2%|█▉                                                                                                            | 6/348 [03:37<2:02:04, 21.42s/it]                                                                                                                                                     {'loss': 4.0187, 'grad_norm': 39.75, 'learning_rate': 5e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2416.65, 'epoch': 0.03}
+  2%|█▉                                                                                                            | 6/348 [03:37<2:02:04, 21.42s/it]  2%|██▏                                                                                                           | 7/348 [03:51<1:47:01, 18.83s/it]                                                                                                                                                     {'loss': 3.7985, 'grad_norm': 47.25, 'learning_rate': 6e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3094.42, 'epoch': 0.04}
+  2%|██▏                                                                                                           | 7/348 [03:51<1:47:01, 18.83s/it]  2%|██▌                                                                                                           | 8/348 [04:05<1:38:15, 17.34s/it]                                                                                                                                                     {'loss': 3.6982, 'grad_norm': 39.5, 'learning_rate': 7e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2475.28, 'epoch': 0.05}
+  2%|██▌                                                                                                           | 8/348 [04:05<1:38:15, 17.34s/it]  3%|██▊                                                                                                           | 9/348 [04:18<1:31:02, 16.11s/it]                                                                                                                                                     {'loss': 3.5344, 'grad_norm': 42.75, 'learning_rate': 8.000000000000001e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2863.05, 'epoch': 0.05}
+  3%|██▊                                                                                                           | 9/348 [04:18<1:31:02, 16.11s/it]  3%|███▏                                                                                                         | 10/348 [04:32<1:26:45, 15.40s/it]                                                                                                                                                     {'loss': 2.9472, 'grad_norm': 37.5, 'learning_rate': 9e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2346.54, 'epoch': 0.06}
+  3%|███▏                                                                                                         | 10/348 [04:32<1:26:45, 15.40s/it]  3%|███▍                                                                                                         | 11/348 [04:46<1:23:36, 14.89s/it]                                                                                                                                                     {'loss': 2.1595, 'grad_norm': 30.875, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2992.9, 'epoch': 0.06}
+  3%|███▍                                                                                                         | 11/348 [04:46<1:23:36, 14.89s/it]  3%|███▊                                                                                                         | 12/348 [05:00<1:21:15, 14.51s/it]                                                                                                                                                     {'loss': 1.5164, 'grad_norm': 24.0, 'learning_rate': 9.999784025127187e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2840.54, 'epoch': 0.07}
+  3%|███▊                                                                                                         | 12/348 [05:00<1:21:15, 14.51s/it]  4%|████                                                                                                         | 13/348 [05:13<1:19:31, 14.24s/it]                                                                                                                                                     {'loss': 1.4181, 'grad_norm': 12.8125, 'learning_rate': 9.999136119166803e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2374.7, 'epoch': 0.07}
+  4%|████                                                                                                         | 13/348 [05:13<1:19:31, 14.24s/it]  4%|████▍                                                                                                        | 14/348 [05:27<1:18:20, 14.07s/it]                                                                                                                                                     {'loss': 1.1872, 'grad_norm': 7.84375, 'learning_rate': 9.998056338091415e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2537.33, 'epoch': 0.08}
+  4%|████▍                                                                                                        | 14/348 [05:27<1:18:20, 14.07s/it]  4%|████▋                                                                                                        | 15/348 [05:40<1:17:16, 13.92s/it]                                                                                                                                                     {'loss': 1.1644, 'grad_norm': 6.0, 'learning_rate': 9.99654477518325e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2530.44, 'epoch': 0.09}
+  4%|████▋                                                                                                        | 15/348 [05:40<1:17:16, 13.92s/it]  5%|█████                                                                                                        | 16/348 [05:54<1:16:58, 13.91s/it]                                                                                                                                                     {'loss': 1.045, 'grad_norm': 4.9375, 'learning_rate': 9.994601561026156e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2575.3, 'epoch': 0.09}
+  5%|█████                                                                                                        | 16/348 [05:54<1:16:58, 13.91s/it]  5%|█████▎                                                                                                       | 17/348 [06:08<1:16:29, 13.86s/it]                                                                                                                                                     {'loss': 1.1482, 'grad_norm': 5.09375, 'learning_rate': 9.9922268634943e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2933.78, 'epoch': 0.1}
+  5%|█████▎                                                                                                       | 17/348 [06:08<1:16:29, 13.86s/it]  5%|█████▋                                                                                                       | 18/348 [06:22<1:15:43, 13.77s/it]                                                                                                                                                     {'loss': 1.0253, 'grad_norm': 5.71875, 'learning_rate': 9.989420887737684e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2700.81, 'epoch': 0.1}
+  5%|█████▋                                                                                                       | 18/348 [06:22<1:15:43, 13.77s/it]  5%|█████▉                                                                                                       | 19/348 [06:35<1:14:59, 13.68s/it]                                                                                                                                                     {'loss': 0.8181, 'grad_norm': 4.96875, 'learning_rate': 9.986183876164412e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2656.52, 'epoch': 0.11}
+  5%|█████▉                                                                                                       | 19/348 [06:35<1:14:59, 13.68s/it]  6%|██████▎                                                                                                      | 20/348 [06:48<1:14:08, 13.56s/it]                                                                                                                                                     {'loss': 0.7374, 'grad_norm': 3.15625, 'learning_rate': 9.982516108419746e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2864.5, 'epoch': 0.11}
+  6%|██████▎                                                                                                      | 20/348 [06:48<1:14:08, 13.56s/it]  6%|██████▌                                                                                                      | 21/348 [07:02<1:13:43, 13.53s/it]                                                                                                                                                     {'loss': 0.7305, 'grad_norm': 6.75, 'learning_rate': 9.978417901361958e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2728.43, 'epoch': 0.12}
+  6%|██████▌                                                                                                      | 21/348 [07:02<1:13:43, 13.53s/it]  6%|██████▉                                                                                                      | 22/348 [07:16<1:13:47, 13.58s/it]                                                                                                                                                     {'loss': 0.8355, 'grad_norm': 3.328125, 'learning_rate': 9.973889609034945e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2616.55, 'epoch': 0.13}
+  6%|██████▉                                                                                                      | 22/348 [07:16<1:13:47, 13.58s/it]  7%|███████▏                                                                                                     | 23/348 [07:29<1:13:49, 13.63s/it]                                                                                                                                                     {'loss': 0.73, 'grad_norm': 3.703125, 'learning_rate': 9.968931622637652e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2806.47, 'epoch': 0.13}
+  7%|███████▏                                                                                                     | 23/348 [07:29<1:13:49, 13.63s/it]  7%|███████▌                                                                                                     | 24/348 [07:43<1:14:15, 13.75s/it]                                                                                                                                                     {'loss': 0.6998, 'grad_norm': 2.921875, 'learning_rate': 9.96354437049027e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2495.19, 'epoch': 0.14}
+  7%|███████▌                                                                                                     | 24/348 [07:43<1:14:15, 13.75s/it]  7%|███████▊                                                                                                     | 25/348 [07:57<1:13:53, 13.73s/it]                                                                                                                                                     {'loss': 0.6858, 'grad_norm': 2.734375, 'learning_rate': 9.95772831799724e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2663.83, 'epoch': 0.14}
+  7%|███████▊                                                                                                     | 25/348 [07:57<1:13:53, 13.73s/it]  7%|████████▏                                                                                                    | 26/348 [08:11<1:13:31, 13.70s/it]                                                                                                                                                     {'loss': 0.6134, 'grad_norm': 2.703125, 'learning_rate': 9.95148396760704e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2848.04, 'epoch': 0.15}
+  7%|████████▏                                                                                                    | 26/348 [08:11<1:13:31, 13.70s/it]  8%|████████▍                                                                                                    | 27/348 [08:25<1:13:36, 13.76s/it]                                                                                                                                                     {'loss': 0.588, 'grad_norm': 2.453125, 'learning_rate': 9.944811858768782e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2746.78, 'epoch': 0.15}
+  8%|████████▍                                                                                                    | 27/348 [08:25<1:13:36, 13.76s/it]  8%|████████▊                                                                                                    | 28/348 [08:38<1:13:11, 13.72s/it]                                                                                                                                                     {'loss': 0.5296, 'grad_norm': 1.9453125, 'learning_rate': 9.93771256788561e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2550.39, 'epoch': 0.16}
+  8%|████████▊                                                                                                    | 28/348 [08:38<1:13:11, 13.72s/it]  8%|█████████                                                                                                    | 29/348 [08:52<1:12:42, 13.67s/it]                                                                                                                                                     {'loss': 0.5367, 'grad_norm': 2.390625, 'learning_rate': 9.930186708264902e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2874.74, 'epoch': 0.17}
+  8%|█████████                                                                                                    | 29/348 [08:52<1:12:42, 13.67s/it]  9%|█████████▍                                                                                                   | 30/348 [09:05<1:12:10, 13.62s/it]                                                                                                                                                     {'loss': 0.4341, 'grad_norm': 2.28125, 'learning_rate': 9.922234930065286e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3108.48, 'epoch': 0.17}
+  9%|█████████▍                                                                                                   | 30/348 [09:05<1:12:10, 13.62s/it]  9%|█████████▋                                                                                                   | 31/348 [09:19<1:11:58, 13.62s/it]                                                                                                                                                     {'loss': 0.4675, 'grad_norm': 1.8046875, 'learning_rate': 9.913857920240481e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2722.71, 'epoch': 0.18}
+  9%|█████████▋                                                                                                   | 31/348 [09:19<1:11:58, 13.62s/it]  9%|██████████                                                                                                   | 32/348 [09:33<1:12:15, 13.72s/it]                                                                                                                                                     {'loss': 0.3299, 'grad_norm': 1.4609375, 'learning_rate': 9.905056402479933e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2522.27, 'epoch': 0.18}
+  9%|██████████                                                                                                   | 32/348 [09:33<1:12:15, 13.72s/it]  9%|██████████▎                                                                                                  | 33/348 [09:46<1:11:44, 13.66s/it]                                                                                                                                                     {'loss': 0.4499, 'grad_norm': 2.171875, 'learning_rate': 9.895831137146319e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2755.38, 'epoch': 0.19}
+  9%|██████████▎                                                                                                  | 33/348 [09:46<1:11:44, 13.66s/it] 10%|██████████▋                                                                                                  | 34/348 [10:00<1:11:27, 13.65s/it]                                                                                                                                                     {'loss': 0.5226, 'grad_norm': 2.4375, 'learning_rate': 9.88618292120984e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2203.91, 'epoch': 0.19}
+ 10%|██████████▋                                                                                                  | 34/348 [10:00<1:11:27, 13.65s/it] 10%|██████████▉                                                                                                  | 35/348 [10:13<1:10:49, 13.58s/it]                                                                                                                                                     {'loss': 0.3125, 'grad_norm': 1.15625, 'learning_rate': 9.876112588179378e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2790.6, 'epoch': 0.2}
+ 10%|██████████▉                                                                                                  | 35/348 [10:13<1:10:49, 13.58s/it] 10%|███████████▎                                                                                                 | 36/348 [10:27<1:11:02, 13.66s/it]                                                                                                                                                     {'loss': 0.3661, 'grad_norm': 1.84375, 'learning_rate': 9.865621008030492e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3064.27, 'epoch': 0.21}
+ 10%|███████████▎                                                                                                 | 36/348 [10:27<1:11:02, 13.66s/it] 11%|███████████▌                                                                                                 | 37/348 [10:41<1:11:05, 13.72s/it]                                                                                                                                                     {'loss': 0.2538, 'grad_norm': 1.0625, 'learning_rate': 9.854709087130261e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2660.37, 'epoch': 0.21}
+ 11%|███████████▌                                                                                                 | 37/348 [10:41<1:11:05, 13.72s/it] 11%|███████████▉                                                                                                 | 38/348 [10:55<1:11:16, 13.80s/it]                                                                                                                                                     {'loss': 0.3445, 'grad_norm': 1.359375, 'learning_rate': 9.843377768158972e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2517.7, 'epoch': 0.22}
+ 11%|███████████▉                                                                                                 | 38/348 [10:55<1:11:16, 13.80s/it] 11%|████████████▏                                                                                                | 39/348 [11:09<1:10:36, 13.71s/it]                                                                                                                                                     {'loss': 0.3786, 'grad_norm': 1.421875, 'learning_rate': 9.831628030028698e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2826.59, 'epoch': 0.22}
+ 11%|████████████▏                                                                                                | 39/348 [11:09<1:10:36, 13.71s/it] 11%|████████████▌                                                                                                | 40/348 [11:22<1:10:32, 13.74s/it]                                                                                                                                                     {'loss': 0.3688, 'grad_norm': 1.2265625, 'learning_rate': 9.819460887798714e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2734.91, 'epoch': 0.23}
+ 11%|████████████▌                                                                                                | 40/348 [11:22<1:10:32, 13.74s/it] 12%|████████████▊                                                                                                | 41/348 [11:36<1:10:08, 13.71s/it]                                                                                                                                                     {'loss': 0.3712, 'grad_norm': 1.3984375, 'learning_rate': 9.80687739258782e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2603.09, 'epoch': 0.23}
+ 12%|████████████▊                                                                                                | 41/348 [11:36<1:10:08, 13.71s/it] 12%|█████████████▏                                                                                               | 42/348 [11:50<1:10:14, 13.77s/it]                                                                                                                                                     {'loss': 0.3584, 'grad_norm': 1.171875, 'learning_rate': 9.79387863148353e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2706.94, 'epoch': 0.24}
+ 12%|█████████████▏                                                                                               | 42/348 [11:50<1:10:14, 13.77s/it] 12%|█████████████▍                                                                                               | 43/348 [12:04<1:09:52, 13.75s/it]                                                                                                                                                     {'loss': 0.3005, 'grad_norm': 1.03125, 'learning_rate': 9.78046572744815e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2375.4, 'epoch': 0.25}
+ 12%|█████████████▍                                                                                               | 43/348 [12:04<1:09:52, 13.75s/it] 13%|█████████████▊                                                                                               | 44/348 [12:17<1:09:09, 13.65s/it]                                                                                                                                                     {'loss': 0.2191, 'grad_norm': 1.3125, 'learning_rate': 9.76663983922178e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2784.8, 'epoch': 0.25}
+ 13%|█████████████▊                                                                                               | 44/348 [12:17<1:09:09, 13.65s/it] 13%|██████████████                                                                                               | 45/348 [12:31<1:09:00, 13.67s/it]                                                                                                                                                     {'loss': 0.2729, 'grad_norm': 1.484375, 'learning_rate': 9.7524021612222e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2618.22, 'epoch': 0.26}
+ 13%|██████████████                                                                                               | 45/348 [12:31<1:09:00, 13.67s/it] 13%|██████████████▍                                                                                              | 46/348 [12:45<1:09:03, 13.72s/it]                                                                                                                                                     {'loss': 0.2154, 'grad_norm': 1.296875, 'learning_rate': 9.737753923441689e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2498.29, 'epoch': 0.26}
+ 13%|██████████████▍                                                                                              | 46/348 [12:45<1:09:03, 13.72s/it] 14%|██████████████▋                                                                                              | 47/348 [12:58<1:08:25, 13.64s/it]                                                                                                                                                     {'loss': 0.1515, 'grad_norm': 0.9921875, 'learning_rate': 9.722696391340762e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3038.62, 'epoch': 0.27}
+ 14%|██████████████▋                                                                                              | 47/348 [12:58<1:08:25, 13.64s/it] 14%|███████████████                                                                                              | 48/348 [13:12<1:08:53, 13.78s/it]                                                                                                                                                     {'loss': 0.1541, 'grad_norm': 0.8671875, 'learning_rate': 9.70723086573885e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2398.35, 'epoch': 0.27}
+ 14%|███████████████                                                                                              | 48/348 [13:12<1:08:53, 13.78s/it] 14%|███████████████▎                                                                                             | 49/348 [13:26<1:08:32, 13.76s/it]                                                                                                                                                     {'loss': 0.1502, 'grad_norm': 0.9375, 'learning_rate': 9.691358682701927e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2829.14, 'epoch': 0.28}
+ 14%|███████████████▎                                                                                             | 49/348 [13:26<1:08:32, 13.76s/it] 14%|███████████████▋                                                                                             | 50/348 [13:40<1:08:26, 13.78s/it]                                                                                                                                                     {'loss': 0.1268, 'grad_norm': 0.7890625, 'learning_rate': 9.675081213427076e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2848.71, 'epoch': 0.29}
+ 14%|███████████████▋                                                                                             | 50/348 [13:40<1:08:26, 13.78s/it] 15%|███████████████▉                                                                                             | 51/348 [13:53<1:07:59, 13.74s/it]                                                                                                                                                     {'loss': 0.1389, 'grad_norm': 0.87109375, 'learning_rate': 9.658399864124037e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2699.26, 'epoch': 0.29}
+ 15%|███████████████▉                                                                                             | 51/348 [13:53<1:07:59, 13.74s/it] 15%|████████████████▎                                                                                            | 52/348 [14:07<1:07:51, 13.76s/it]                                                                                                                                                     {'loss': 0.1829, 'grad_norm': 0.89453125, 'learning_rate': 9.641316075893731e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2923.15, 'epoch': 0.3}
+ 15%|████████████████▎                                                                                            | 52/348 [14:07<1:07:51, 13.76s/it] 15%|████████████████▌                                                                                            | 53/348 [14:21<1:07:10, 13.66s/it]                                                                                                                                                     {'loss': 0.1099, 'grad_norm': 0.62890625, 'learning_rate': 9.623831324603755e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2939.37, 'epoch': 0.3}
+ 15%|████████████████▌                                                                                            | 53/348 [14:21<1:07:10, 13.66s/it] 16%|████████████████▉                                                                                            | 54/348 [14:34<1:06:37, 13.60s/it]                                                                                                                                                     {'loss': 0.1941, 'grad_norm': 0.98046875, 'learning_rate': 9.605947120760878e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2832.64, 'epoch': 0.31}
+ 16%|████████████████▉                                                                                            | 54/348 [14:34<1:06:37, 13.60s/it] 16%|█████████████████▏                                                                                           | 55/348 [14:48<1:07:05, 13.74s/it]                                                                                                                                                     {'loss': 0.1666, 'grad_norm': 1.0, 'learning_rate': 9.587665009380565e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2217.25, 'epoch': 0.31}
+ 16%|█████████████████▏                                                                                           | 55/348 [14:48<1:07:05, 13.74s/it] 16%|█████████████████▌                                                                                           | 56/348 [15:02<1:06:50, 13.73s/it]                                                                                                                                                     {'loss': 0.1247, 'grad_norm': 0.86328125, 'learning_rate': 9.568986569853487e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2741.51, 'epoch': 0.32}
+ 16%|█████████████████▌                                                                                           | 56/348 [15:02<1:06:50, 13.73s/it] 16%|█████████████████▊                                                                                           | 57/348 [15:16<1:06:36, 13.73s/it]                                                                                                                                                     {'loss': 0.1752, 'grad_norm': 1.046875, 'learning_rate': 9.549913415809084e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2814.19, 'epoch': 0.33}
+ 16%|█████████████████▊                                                                                           | 57/348 [15:16<1:06:36, 13.73s/it] 17%|██████████████████▏                                                                                          | 58/348 [15:29<1:06:45, 13.81s/it]                                                                                                                                                     {'loss': 0.1254, 'grad_norm': 0.8046875, 'learning_rate': 9.530447194976164e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2592.9, 'epoch': 0.33}
+ 17%|██████████████████▏                                                                                          | 58/348 [15:30<1:06:45, 13.81s/it] 17%|██████████████████▍                                                                                          | 59/348 [15:43<1:06:16, 13.76s/it]                                                                                                                                                     {'loss': 0.1293, 'grad_norm': 0.73828125, 'learning_rate': 9.510589589040554e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2650.89, 'epoch': 0.34}
+ 17%|██████████████████▍                                                                                          | 59/348 [15:43<1:06:16, 13.76s/it] 17%|██████████████████▊                                                                                          | 60/348 [15:57<1:05:49, 13.71s/it]                                                                                                                                                     {'loss': 0.2146, 'grad_norm': 0.96484375, 'learning_rate': 9.49034231349982e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2992.86, 'epoch': 0.34}
+ 17%|██████████████████▊                                                                                          | 60/348 [15:57<1:05:49, 13.71s/it] 18%|███████████████████                                                                                          | 61/348 [16:10<1:05:22, 13.67s/it]                                                                                                                                                     {'loss': 0.1197, 'grad_norm': 0.87890625, 'learning_rate': 9.469707117515068e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2502.36, 'epoch': 0.35}
+ 18%|███████████████████                                                                                          | 61/348 [16:10<1:05:22, 13.67s/it] 18%|███████████████████▍                                                                                         | 62/348 [16:24<1:05:23, 13.72s/it]                                                                                                                                                     {'loss': 0.0597, 'grad_norm': 0.37109375, 'learning_rate': 9.448685783759825e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2860.21, 'epoch': 0.35}
+ 18%|███████████████████▍                                                                                         | 62/348 [16:24<1:05:23, 13.72s/it] 18%|███████████████████▋                                                                                         | 63/348 [16:38<1:05:05, 13.70s/it]                                                                                                                                                     {'loss': 0.1111, 'grad_norm': 0.7109375, 'learning_rate': 9.427280128266049e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2664.12, 'epoch': 0.36}
+ 18%|███████████████████▋                                                                                         | 63/348 [16:38<1:05:05, 13.70s/it] 18%|████████████████████                                                                                         | 64/348 [16:51<1:04:25, 13.61s/it]                                                                                                                                                     {'loss': 0.0786, 'grad_norm': 0.53515625, 'learning_rate': 9.405492000267228e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3019.66, 'epoch': 0.37}
+ 18%|████████████████████                                                                                         | 64/348 [16:51<1:04:25, 13.61s/it] 19%|████████████████████▎                                                                                        | 65/348 [17:05<1:04:56, 13.77s/it]                                                                                                                                                     {'loss': 0.0974, 'grad_norm': 0.60546875, 'learning_rate': 9.383323282038632e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2576.77, 'epoch': 0.37}
+ 19%|████████████████████▎                                                                                        | 65/348 [17:05<1:04:56, 13.77s/it] 19%|████████████████████▋                                                                                        | 66/348 [17:19<1:04:52, 13.80s/it]                                                                                                                                                     {'loss': 0.1319, 'grad_norm': 0.77734375, 'learning_rate': 9.360775888734699e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2465.95, 'epoch': 0.38}
+ 19%|████████████████████▋                                                                                        | 66/348 [17:19<1:04:52, 13.80s/it] 19%|████████████████████▉                                                                                        | 67/348 [17:33<1:04:21, 13.74s/it]                                                                                                                                                     {'loss': 0.0652, 'grad_norm': 0.53515625, 'learning_rate': 9.337851768223589e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2894.11, 'epoch': 0.38}
+ 19%|████████████████████▉                                                                                        | 67/348 [17:33<1:04:21, 13.74s/it] 20%|█████████████████████▎                                                                                       | 68/348 [17:47<1:04:44, 13.87s/it]                                                                                                                                                     {'loss': 0.0583, 'grad_norm': 0.52734375, 'learning_rate': 9.31455290091891e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2574.91, 'epoch': 0.39}
+ 20%|█████████████████████▎                                                                                       | 68/348 [17:47<1:04:44, 13.87s/it] 20%|█████████████████████▌                                                                                       | 69/348 [18:01<1:04:42, 13.92s/it]                                                                                                                                                     {'loss': 0.0442, 'grad_norm': 0.408203125, 'learning_rate': 9.29088129960862e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2519.51, 'epoch': 0.39}
+ 20%|█████████████████████▌                                                                                       | 69/348 [18:01<1:04:42, 13.92s/it] 20%|█████████████████████▉                                                                                       | 70/348 [18:15<1:04:19, 13.88s/it]                                                                                                                                                     {'loss': 0.1398, 'grad_norm': 0.80078125, 'learning_rate': 9.266839009281154e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2783.16, 'epoch': 0.4}
+ 20%|█████████████████████▉                                                                                       | 70/348 [18:15<1:04:19, 13.88s/it] 20%|██████████████████████▏                                                                                      | 71/348 [18:29<1:03:57, 13.85s/it]                                                                                                                                                     {'loss': 0.075, 'grad_norm': 0.46875, 'learning_rate': 9.242428106948748e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2950.36, 'epoch': 0.41}
+ 20%|██████████████████████▏                                                                                      | 71/348 [18:29<1:03:57, 13.85s/it] 21%|██████████████████████▌                                                                                      | 72/348 [18:42<1:02:59, 13.69s/it]                                                                                                                                                     {'loss': 0.1491, 'grad_norm': 0.81640625, 'learning_rate': 9.217650701468016e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2875.92, 'epoch': 0.41}
+ 21%|██████████████████████▌                                                                                      | 72/348 [18:42<1:02:59, 13.69s/it] 21%|██████████████████████▊                                                                                      | 73/348 [18:55<1:02:34, 13.65s/it]                                                                                                                                                     {'loss': 0.1263, 'grad_norm': 0.82421875, 'learning_rate': 9.192508933357753e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2569.93, 'epoch': 0.42}
+ 21%|██████████████████████▊                                                                                      | 73/348 [18:56<1:02:34, 13.65s/it] 21%|███████████████████████▏                                                                                     | 74/348 [19:09<1:02:25, 13.67s/it]                                                                                                                                                     {'loss': 0.0605, 'grad_norm': 0.43359375, 'learning_rate': 9.16700497461403e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2438.7, 'epoch': 0.42}
+ 21%|███████████████████████▏                                                                                     | 74/348 [19:09<1:02:25, 13.67s/it] 22%|███████████████████████▍                                                                                     | 75/348 [19:23<1:02:40, 13.77s/it]                                                                                                                                                     {'loss': 0.1145, 'grad_norm': 0.69921875, 'learning_rate': 9.141141028522544e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2611.75, 'epoch': 0.43}
+ 22%|███████████████████████▍                                                                                     | 75/348 [19:23<1:02:40, 13.77s/it] 22%|███████████████████████▊                                                                                     | 76/348 [19:37<1:02:05, 13.70s/it]                                                                                                                                                     {'loss': 0.0514, 'grad_norm': 0.37890625, 'learning_rate': 9.114919329468283e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3101.25, 'epoch': 0.43}
+ 22%|███████████████████████▊                                                                                     | 76/348 [19:37<1:02:05, 13.70s/it] 22%|████████████████████████                                                                                     | 77/348 [19:50<1:01:47, 13.68s/it]                                                                                                                                                     {'loss': 0.0562, 'grad_norm': 0.34765625, 'learning_rate': 9.088342142742493e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2733.41, 'epoch': 0.44}
+ 22%|████████████████████████                                                                                     | 77/348 [19:50<1:01:47, 13.68s/it] 22%|████████████████████████▍                                                                                    | 78/348 [20:04<1:01:41, 13.71s/it]                                                                                                                                                     {'loss': 0.0333, 'grad_norm': 0.2890625, 'learning_rate': 9.061411764346983e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2626.95, 'epoch': 0.45}
+ 22%|████████████████████████▍                                                                                    | 78/348 [20:04<1:01:41, 13.71s/it] 23%|████████████████████████▋                                                                                    | 79/348 [20:18<1:01:14, 13.66s/it]                                                                                                                                                     {'loss': 0.0568, 'grad_norm': 0.3984375, 'learning_rate': 9.034130520795774e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3008.89, 'epoch': 0.45}
+ 23%|████████████████████████▋                                                                                    | 79/348 [20:18<1:01:14, 13.66s/it] 23%|█████████████████████████                                                                                    | 80/348 [20:31<1:00:49, 13.62s/it]                                                                                                                                                     {'loss': 0.0463, 'grad_norm': 0.359375, 'learning_rate': 9.006500768914106e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3027.76, 'epoch': 0.46}
+ 23%|█████████████████████████                                                                                    | 80/348 [20:31<1:00:49, 13.62s/it] 23%|█████████████████████████▎                                                                                   | 81/348 [20:45<1:00:23, 13.57s/it]                                                                                                                                                     {'loss': 0.1023, 'grad_norm': 1.25, 'learning_rate': 8.978524895634842e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2492.67, 'epoch': 0.46}
+ 23%|█████████████████████████▎                                                                                   | 81/348 [20:45<1:00:23, 13.57s/it] 24%|█████████████████████████▋                                                                                   | 82/348 [20:58<1:00:03, 13.55s/it]                                                                                                                                                     {'loss': 0.1399, 'grad_norm': 0.875, 'learning_rate': 8.95020531779225e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2912.47, 'epoch': 0.47}
+ 24%|█████████████████████████▋                                                                                   | 82/348 [20:58<1:00:03, 13.55s/it] 24%|█████████████████████████▉                                                                                   | 83/348 [21:12<1:00:02, 13.59s/it]                                                                                                                                                     {'loss': 0.0874, 'grad_norm': 0.6328125, 'learning_rate': 8.921544481913218e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2789.59, 'epoch': 0.47}
+ 24%|█████████████████████████▉                                                                                   | 83/348 [21:12<1:00:02, 13.59s/it] 24%|██████████████████████████▊                                                                                    | 84/348 [21:25<59:48, 13.59s/it]                                                                                                                                                     {'loss': 0.0489, 'grad_norm': 0.341796875, 'learning_rate': 8.892544864005899e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2560.69, 'epoch': 0.48}
+ 24%|██████████████████████████▊                                                                                    | 84/348 [21:25<59:48, 13.59s/it] 24%|███████████████████████████                                                                                    | 85/348 [21:39<59:47, 13.64s/it]                                                                                                                                                     {'loss': 0.058, 'grad_norm': 0.49609375, 'learning_rate': 8.86320896934581e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2585.27, 'epoch': 0.49}
+ 24%|███████████████████████████                                                                                    | 85/348 [21:39<59:47, 13.64s/it] 25%|███████████████████████████▍                                                                                   | 86/348 [21:53<59:23, 13.60s/it]                                                                                                                                                     {'loss': 0.0867, 'grad_norm': 0.7109375, 'learning_rate': 8.833539332259398e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2720.59, 'epoch': 0.49}
+ 25%|███████████████████████████▍                                                                                   | 86/348 [21:53<59:23, 13.60s/it] 25%|███████████████████████████▊                                                                                   | 87/348 [22:06<59:21, 13.64s/it]                                                                                                                                                     {'loss': 0.033, 'grad_norm': 0.283203125, 'learning_rate': 8.803538515905102e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2553.95, 'epoch': 0.5}
+ 25%|███████████████████████████▊                                                                                   | 87/348 [22:06<59:21, 13.64s/it] 25%|████████████████████████████                                                                                   | 88/348 [22:20<59:05, 13.64s/it]                                                                                                                                                     {'loss': 0.0489, 'grad_norm': 0.4609375, 'learning_rate': 8.773209112051919e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2679.76, 'epoch': 0.5}
+ 25%|████████████████████████████                                                                                   | 88/348 [22:20<59:05, 13.64s/it] 26%|████████████████████████████▍                                                                                  | 89/348 [22:34<58:58, 13.66s/it]                                                                                                                                                     {'loss': 0.0393, 'grad_norm': 0.375, 'learning_rate': 8.742553740855507e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2581.67, 'epoch': 0.51}
+ 26%|████████████████████████████▍                                                                                  | 89/348 [22:34<58:58, 13.66s/it] 26%|████████████████████████████▋                                                                                  | 90/348 [22:48<58:49, 13.68s/it]                                                                                                                                                     {'loss': 0.04, 'grad_norm': 0.345703125, 'learning_rate': 8.711575050631823e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2911.56, 'epoch': 0.51}
+ 26%|████████████████████████████▋                                                                                  | 90/348 [22:48<58:49, 13.68s/it] 26%|█████████████████████████████                                                                                  | 91/348 [23:02<58:59, 13.77s/it]                                                                                                                                                     {'loss': 0.0431, 'grad_norm': 0.3671875, 'learning_rate': 8.680275717628336e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2518.89, 'epoch': 0.52}
+ 26%|█████████████████████████████                                                                                  | 91/348 [23:02<58:59, 13.77s/it] 26%|█████████████████████████████▎                                                                                 | 92/348 [23:15<58:32, 13.72s/it]                                                                                                                                                     {'loss': 0.0298, 'grad_norm': 0.296875, 'learning_rate': 8.64865844579284e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2680.58, 'epoch': 0.53}
+ 26%|█████████████████████████████▎                                                                                 | 92/348 [23:15<58:32, 13.72s/it] 27%|█████████████████████████████▋                                                                                 | 93/348 [23:29<58:17, 13.71s/it]                                                                                                                                                     {'loss': 0.0265, 'grad_norm': 0.275390625, 'learning_rate': 8.616725966539831e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3005.02, 'epoch': 0.53}
+ 27%|█████████████████████████████▋                                                                                 | 93/348 [23:29<58:17, 13.71s/it] 27%|█████████████████████████████▉                                                                                 | 94/348 [23:42<57:44, 13.64s/it]                                                                                                                                                     {'loss': 0.0373, 'grad_norm': 0.306640625, 'learning_rate': 8.584481038514573e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3016.43, 'epoch': 0.54}
+ 27%|█████████████████████████████▉                                                                                 | 94/348 [23:42<57:44, 13.64s/it] 27%|██████████████████████████████▎                                                                                | 95/348 [23:56<57:22, 13.61s/it]                                                                                                                                                     {'loss': 0.0458, 'grad_norm': 0.478515625, 'learning_rate': 8.551926447354759e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2795.81, 'epoch': 0.54}
+ 27%|██████████████████████████████▎                                                                                | 95/348 [23:56<57:22, 13.61s/it] 28%|██████████████████████████████▌                                                                                | 96/348 [24:09<56:56, 13.56s/it]                                                                                                                                                     {'loss': 0.0232, 'grad_norm': 0.27734375, 'learning_rate': 8.519065005449858e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2878.8, 'epoch': 0.55}
+ 28%|██████████████████████████████▌                                                                                | 96/348 [24:09<56:56, 13.56s/it] 28%|██████████████████████████████▉                                                                                | 97/348 [24:23<56:43, 13.56s/it]                                                                                                                                                     {'loss': 0.0453, 'grad_norm': 0.29296875, 'learning_rate': 8.485899551698166e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2725.95, 'epoch': 0.55}
+ 28%|██████████████████████████████▉                                                                                | 97/348 [24:23<56:43, 13.56s/it] 28%|███████████████████████████████▎                                                                               | 98/348 [24:36<56:36, 13.59s/it]                                                                                                                                                     {'loss': 0.0938, 'grad_norm': 0.6484375, 'learning_rate': 8.452432951261549e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2575.12, 'epoch': 0.56}
+ 28%|███████████████████████████████▎                                                                               | 98/348 [24:36<56:36, 13.59s/it] 28%|███████████████████████████████▌                                                                               | 99/348 [24:50<56:18, 13.57s/it]                                                                                                                                                     {'loss': 0.0753, 'grad_norm': 0.59765625, 'learning_rate': 8.418668095317912e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3209.7, 'epoch': 0.57}
+ 28%|███████████████████████████████▌                                                                               | 99/348 [24:50<56:18, 13.57s/it] 29%|████████████████████████��██████▌                                                                              | 100/348 [25:04<56:07, 13.58s/it]                                                                                                                                                     {'loss': 0.1434, 'grad_norm': 1.2421875, 'learning_rate': 8.384607900811442e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2850.4, 'epoch': 0.57}
+ 29%|███████████████████████████████▌                                                                              | 100/348 [25:04<56:07, 13.58s/it] 29%|███████████████████████████████▉                                                                              | 101/348 [25:17<55:38, 13.52s/it]                                                                                                                                                     {'loss': 0.0273, 'grad_norm': 0.26953125, 'learning_rate': 8.350255310200611e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2721.27, 'epoch': 0.58}
+ 29%|███████████████████████████████▉                                                                              | 101/348 [25:17<55:38, 13.52s/it] 29%|████████████████████████████████▏                                                                             | 102/348 [25:31<56:00, 13.66s/it]                                                                                                                                                     {'loss': 0.0736, 'grad_norm': 0.546875, 'learning_rate': 8.315613291203977e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2675.92, 'epoch': 0.58}
+ 29%|████████████████████████████████▏                                                                             | 102/348 [25:31<56:00, 13.66s/it] 30%|████████████████████████████████▌                                                                             | 103/348 [25:45<55:40, 13.63s/it]                                                                                                                                                     {'loss': 0.058, 'grad_norm': 0.49609375, 'learning_rate': 8.280684836543794e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2559.37, 'epoch': 0.59}
+ 30%|████████████████████████████████▌                                                                             | 103/348 [25:45<55:40, 13.63s/it] 30%|████████████████████████████████▊                                                                             | 104/348 [25:59<55:55, 13.75s/it]                                                                                                                                                     {'loss': 0.0298, 'grad_norm': 0.25390625, 'learning_rate': 8.245472963687484e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2787.37, 'epoch': 0.59}
+ 30%|████████████████████████████████▊                                                                             | 104/348 [25:59<55:55, 13.75s/it] 30%|█████████████████████████████████▏                                                                            | 105/348 [26:12<55:36, 13.73s/it]                                                                                                                                                     {'loss': 0.0485, 'grad_norm': 0.400390625, 'learning_rate': 8.209980714586955e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2944.0, 'epoch': 0.6}
+ 30%|█████████████████████████████████▏                                                                            | 105/348 [26:12<55:36, 13.73s/it] 30%|█████████████████████████████████▌                                                                            | 106/348 [26:26<55:07, 13.67s/it]                                                                                                                                                     {'loss': 0.0967, 'grad_norm': 0.84375, 'learning_rate': 8.1742111554158e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2716.67, 'epoch': 0.61}
+ 30%|█████████████████████████████████▌                                                                            | 106/348 [26:26<55:07, 13.67s/it] 31%|█████████████████████████████████▊                                                                            | 107/348 [26:39<54:57, 13.68s/it]                                                                                                                                                     {'loss': 0.0364, 'grad_norm': 0.275390625, 'learning_rate': 8.138167376304411e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3009.11, 'epoch': 0.61}
+ 31%|█████████████████████████████████▊                                                                            | 107/348 [26:39<54:57, 13.68s/it] 31%|██████████████████████████████████▏                                                                           | 108/348 [26:53<54:28, 13.62s/it]                                                                                                                                                     {'loss': 0.0341, 'grad_norm': 0.271484375, 'learning_rate': 8.101852491073036e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3138.5, 'epoch': 0.62}
+ 31%|██████████████████████████████████▏                                                                           | 108/348 [26:53<54:28, 13.62s/it] 31%|██████████████████████████████████▍                                                                           | 109/348 [27:07<54:24, 13.66s/it]                                                                                                                                                     {'loss': 0.0313, 'grad_norm': 0.2734375, 'learning_rate': 8.065269636962765e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2719.79, 'epoch': 0.62}
+ 31%|██████████████████████████████████▍                                                                           | 109/348 [27:07<54:24, 13.66s/it] 32%|██████████████████████████████████▊                                                                           | 110/348 [27:21<54:29, 13.74s/it]                                                                                                                                                     {'loss': 0.0643, 'grad_norm': 0.439453125, 'learning_rate': 8.0284219743645e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2799.12, 'epoch': 0.63}
+ 32%|██████████████████████████████████▊                                                                           | 110/348 [27:21<54:29, 13.74s/it] 32%|███████████████████████████████████                                                                           | 111/348 [27:34<54:09, 13.71s/it]                                                                                                                                                     {'loss': 0.0418, 'grad_norm': 0.310546875, 'learning_rate': 7.991312686545939e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2738.02, 'epoch': 0.63}
+ 32%|███████████████████████████████████                                                                           | 111/348 [27:34<54:09, 13.71s/it] 32%|███████████████████████████████████▍                                                                          | 112/348 [27:48<53:45, 13.67s/it]                                                                                                                                                     {'loss': 0.0289, 'grad_norm': 0.310546875, 'learning_rate': 7.953944979376567e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2974.26, 'epoch': 0.64}
+ 32%|███████████████████████████████████▍                                                                          | 112/348 [27:48<53:45, 13.67s/it] 32%|███████████████████████████████████▋                                                                          | 113/348 [28:02<53:50, 13.75s/it]                                                                                                                                                     {'loss': 0.0832, 'grad_norm': 0.5625, 'learning_rate': 7.916322081050708e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2340.37, 'epoch': 0.65}
+ 32%|███████████████████████████████████▋                                                                          | 113/348 [28:02<53:50, 13.75s/it] 33%|████████████████████████████████████                                                                          | 114/348 [28:16<53:37, 13.75s/it]                                                                                                                                                     {'loss': 0.0861, 'grad_norm': 0.53515625, 'learning_rate': 7.878447241808634e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2676.66, 'epoch': 0.65}
+ 33%|████████████████████████████████████                                                                          | 114/348 [28:16<53:37, 13.75s/it] 33%|████████████████████████████████████▎                                                                         | 115/348 [28:30<53:52, 13.87s/it]                                                                                                                                                     {'loss': 0.0463, 'grad_norm': 0.41015625, 'learning_rate': 7.84032373365578e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2801.52, 'epoch': 0.66}
+ 33%|████████████████████████████████████▎                                                                         | 115/348 [28:30<53:52, 13.87s/it] 33%|████████████████████████████████████▋                                                                         | 116/348 [28:43<53:19, 13.79s/it]                                                                                                                                                     {'loss': 0.0391, 'grad_norm': 0.3359375, 'learning_rate': 7.801954850080075e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2962.88, 'epoch': 0.66}
+ 33%|████████████████████████████████████▋                                                                         | 116/348 [28:43<53:19, 13.79s/it] 34%|████████████████████████████████████▉                                                                         | 117/348 [28:57<52:55, 13.75s/it]                                                                                                                                                     {'loss': 0.0404, 'grad_norm': 0.46875, 'learning_rate': 7.76334390576742e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3018.36, 'epoch': 0.67}
+ 34%|████████████████████████████████████▉                                                                         | 117/348 [28:57<52:55, 13.75s/it] 34%|█████████████████████████████████████▎                                                                        | 118/348 [29:11<52:47, 13.77s/it]                                                                                                                                                     {'loss': 0.0464, 'grad_norm': 0.375, 'learning_rate': 7.724494236315327e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2427.37, 'epoch': 0.67}
+ 34%|█████████████████████████████████████▎                                                                        | 118/348 [29:11<52:47, 13.77s/it] 34%|█████████████████████████████████████▌                                                                        | 119/348 [29:25<52:48, 13.84s/it]                                                                                                                                                     {'loss': 0.0333, 'grad_norm': 0.32421875, 'learning_rate': 7.685409197944768e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2232.12, 'epoch': 0.68}
+ 34%|█████████████████████████████████████▌                                                                        | 119/348 [29:25<52:48, 13.84s/it] 34%|█████████████████████████████████████▉                                                                        | 120/348 [29:39<52:49, 13.90s/it]                                                                                                                                                     {'loss': 0.0861, 'grad_norm': 0.64453125, 'learning_rate': 7.646092167210217e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2813.2, 'epoch': 0.69}
+ 34%|█████████████████████████████████████▉                                                                        | 120/348 [29:39<52:49, 13.90s/it] 35%|██████████████████████████████████████▏                                                                       | 121/348 [29:53<52:45, 13.94s/it]                                                                                                                                                     {'loss': 0.021, 'grad_norm': 0.3046875, 'learning_rate': 7.60654654070796e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2352.58, 'epoch': 0.69}
+ 35%|██████████████████████████████████████▏                                                                       | 121/348 [29:53<52:45, 13.94s/it] 35%|██████████████████████████████████████▌                                                                       | 122/348 [30:07<53:02, 14.08s/it]                                                                                                                                                     {'loss': 0.0577, 'grad_norm': 0.46484375, 'learning_rate': 7.566775734782656e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2475.15, 'epoch': 0.7}
+ 35%|██████████████████████████████████████▌                                                                       | 122/348 [30:07<53:02, 14.08s/it] 35%|██████████████████████████████████████▉                                                                       | 123/348 [30:21<52:12, 13.92s/it]                                                                                                                                                     {'loss': 0.0475, 'grad_norm': 0.3984375, 'learning_rate': 7.526783185232208e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3012.45, 'epoch': 0.7}
+ 35%|██████████████████████████████████████▉                                                                       | 123/348 [30:21<52:12, 13.92s/it] 36%|███████████████████████████████████████▏                                                                      | 124/348 [30:35<52:01, 13.94s/it]                                                                                                                                                     {'loss': 0.0258, 'grad_norm': 0.228515625, 'learning_rate': 7.486572347010937e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2496.1, 'epoch': 0.71}
+ 36%|███████████████████████████████████████▏                                                                      | 124/348 [30:35<52:01, 13.94s/it] 36%|███████████████████████████████████████▌                                                                      | 125/348 [30:49<51:37, 13.89s/it]                                                                                                                                                     {'loss': 0.0246, 'grad_norm': 0.2236328125, 'learning_rate': 7.446146693931111e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2444.31, 'epoch': 0.71}
+ 36%|███████████████████████████████████████▌                                                                      | 125/348 [30:49<51:37, 13.89s/it] 36%|███████████████████████████████████████▊                                                                      | 126/348 [31:02<50:53, 13.76s/it]                                                                                                                                                     {'loss': 0.0754, 'grad_norm': 0.71484375, 'learning_rate': 7.405509718362842e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2953.81, 'epoch': 0.72}
+ 36%|███████████████████████████████████████▊                                                                      | 126/348 [31:02<50:53, 13.76s/it] 36%|████████████████████████████████████████▏                                                                     | 127/348 [31:16<50:59, 13.84s/it]                                                                                                                                                     {'loss': 0.0189, 'grad_norm': 0.216796875, 'learning_rate': 7.364664930932385e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2673.89, 'epoch': 0.73}
+ 36%|████████████████████████████████████████▏                                                                     | 127/348 [31:16<50:59, 13.84s/it] 37%|████████████████████████████████████████▍                                                                     | 128/348 [31:30<50:22, 13.74s/it]                                                                                                                                                     {'loss': 0.0288, 'grad_norm': 0.322265625, 'learning_rate': 7.323615860218844e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2798.32, 'epoch': 0.73}
+ 37%|████████████████████████████████████████▍                                                                     | 128/348 [31:30<50:22, 13.74s/it] 37%|████████████████████████████████████████▊                                                                     | 129/348 [31:43<49:56, 13.68s/it]                                                                                                                                                     {'loss': 0.0181, 'grad_norm': 0.232421875, 'learning_rate': 7.282366052449351e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2975.23, 'epoch': 0.74}
+ 37%|████████████████████████████████████████▊                                                                     | 129/348 [31:43<49:56, 13.68s/it] 37%|█████████████████████████████████████████                                                                     | 130/348 [31:57<50:01, 13.77s/it]                                                                                                                                                     {'loss': 0.013, 'grad_norm': 0.1787109375, 'learning_rate': 7.2409190711927015e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2568.09, 'epoch': 0.74}
+ 37%|█████████████████████████████████████████                                                                     | 130/348 [31:57<50:01, 13.77s/it] 38%|█████████████████████████████████████████▍                                                                    | 131/348 [32:11<49:54, 13.80s/it]                                                                                                                                                     {'loss': 0.0173, 'grad_norm': 0.22265625, 'learning_rate': 7.199278497051498e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2476.61, 'epoch': 0.75}
+ 38%|█████████████████████████████████████████▍                                                                    | 131/348 [32:11<49:54, 13.80s/it] 38%|█████████████████████████████████████████▋                                                                    | 132/348 [32:25<49:38, 13.79s/it]                                                                                                                                                     {'loss': 0.0213, 'grad_norm': 0.291015625, 'learning_rate': 7.157447927352821e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2714.73, 'epoch': 0.75}
+ 38%|█████████████████████████████████████████▋                                                                    | 132/348 [32:25<49:38, 13.79s/it] 38%|██████████████████████████████████████████                                                                    | 133/348 [32:38<49:20, 13.77s/it]                                                                                                                                                     {'loss': 0.0218, 'grad_norm': 0.248046875, 'learning_rate': 7.115430975837457e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2486.93, 'epoch': 0.76}
+ 38%|██████████████████████████████████████████                                                                    | 133/348 [32:38<49:20, 13.77s/it] 39%|██████████████████████████████████████████▎                                                                   | 134/348 [32:52<49:25, 13.86s/it]                                                                                                                                                     {'loss': 0.0264, 'grad_norm': 0.205078125, 'learning_rate': 7.073231272347714e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2564.86, 'epoch': 0.77}
+ 39%|██████████████████████████████████████████▎                                                                   | 134/348 [32:52<49:25, 13.86s/it] 39%|██████████████████████████████████████████▋                                                                   | 135/348 [33:06<49:14, 13.87s/it]                                                                                                                                                     {'loss': 0.0154, 'grad_norm': 0.2060546875, 'learning_rate': 7.030852462513827e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2190.97, 'epoch': 0.77}
+ 39%|██████████████████████████████████████████▋                                                                   | 135/348 [33:06<49:14, 13.87s/it] 39%|██████████████████████████████████████████▉                                                                   | 136/348 [33:20<48:27, 13.72s/it]                                                                                                                                                     {'loss': 0.0298, 'grad_norm': 0.455078125, 'learning_rate': 6.988298207439022e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3016.94, 'epoch': 0.78}
+ 39%|██████████████████████████████████████████▉                                                                   | 136/348 [33:20<48:27, 13.72s/it] 39%|███████████████████████████████████████████▎                                                                  | 137/348 [33:33<48:03, 13.67s/it]                                                                                                                                                     {'loss': 0.084, 'grad_norm': 1.09375, 'learning_rate': 6.945572183383229e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2727.31, 'epoch': 0.78}
+ 39%|███████████████████████████████████████████▎                                                                  | 137/348 [33:33<48:03, 13.67s/it] 40%|███████████████████████████████████████████▌                                                                  | 138/348 [33:47<47:50, 13.67s/it]                                                                                                                                                     {'loss': 0.0272, 'grad_norm': 0.2392578125, 'learning_rate': 6.902678081445495e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2607.28, 'epoch': 0.79}
+ 40%|███████████████████████████████████████████▌                                                                  | 138/348 [33:47<47:50, 13.67s/it] 40%|███████████████████████████████████████████▉                                                                  | 139/348 [34:01<47:31, 13.64s/it]                                                                                                                                                     {'loss': 0.0556, 'grad_norm': 0.5546875, 'learning_rate': 6.859619607245102e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2771.64, 'epoch': 0.79}
+ 40%|███████████████████████████████████████████▉                                                                  | 139/348 [34:01<47:31, 13.64s/it] 40%|████████████████████████████████████████████▎                                                                 | 140/348 [34:14<47:24, 13.68s/it]                                                                                                                                                     {'loss': 0.0207, 'grad_norm': 0.2265625, 'learning_rate': 6.816400480601445e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2401.32, 'epoch': 0.8}
+ 40%|████████████████████████████████████████████▎                                                                 | 140/348 [34:14<47:24, 13.68s/it] 41%|████████████████████████████████████████████▌                                                                 | 141/348 [34:28<46:53, 13.59s/it]                                                                                                                                                     {'loss': 0.0703, 'grad_norm': 0.66796875, 'learning_rate': 6.773024435212678e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3091.92, 'epoch': 0.81}
+ 41%|████████████████████████████████████████████▌                                                                 | 141/348 [34:28<46:53, 13.59s/it] 41%|████████████████████████████████████████████▉                                                                 | 142/348 [34:41<46:42, 13.61s/it]                                                                                                                                                     {'loss': 0.0279, 'grad_norm': 0.2373046875, 'learning_rate': 6.729495218333157e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2746.16, 'epoch': 0.81}
+ 41%|████████████████████████████████████████████▉                                                                 | 142/348 [34:41<46:42, 13.61s/it] 41%|█████████████████████████████████████████████▏                                                                | 143/348 [34:56<47:05, 13.79s/it]                                                                                                                                                     {'loss': 0.0265, 'grad_norm': 0.259765625, 'learning_rate': 6.685816590449708e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2282.45, 'epoch': 0.82}
+ 41%|█████████████████████████████████████████████▏                                                                | 143/348 [34:56<47:05, 13.79s/it] 41%|█████████████████████████████████████████████▌                                                                | 144/348 [35:09<46:44, 13.75s/it]                                                                                                                                                     {'loss': 0.0278, 'grad_norm': 0.2265625, 'learning_rate': 6.641992324956776e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3038.36, 'epoch': 0.82}
+ 41%|█████████████████████████████████████████████▌                                                                | 144/348 [35:09<46:44, 13.75s/it] 42%|█████████████████████████████████████████████▊                                                                | 145/348 [35:23<46:25, 13.72s/it]                                                                                                                                                     {'loss': 0.022, 'grad_norm': 0.2236328125, 'learning_rate': 6.598026207830428e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2970.55, 'epoch': 0.83}
+ 42%|█████████████████████████████████████████████▊                                                                | 145/348 [35:23<46:25, 13.72s/it] 42%|██████████████████████████████████████████████▏                                                               | 146/348 [35:36<46:04, 13.69s/it]                                                                                                                                                     {'loss': 0.0247, 'grad_norm': 0.2421875, 'learning_rate': 6.553922037301283e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2920.1, 'epoch': 0.83}
+ 42%|██████████████████████████████████████████████▏                                                               | 146/348 [35:36<46:04, 13.69s/it] 42%|██████████████████████████████████████████████▍                                                               | 147/348 [35:50<45:42, 13.64s/it]                                                                                                                                                     {'loss': 0.0285, 'grad_norm': 0.24609375, 'learning_rate': 6.5096836235263904e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2520.75, 'epoch': 0.84}
+ 42%|██████████████████████████████████████████████▍                                                               | 147/348 [35:50<45:42, 13.64s/it] 43%|█████████���████████████████████████████████████▊                                                               | 148/348 [36:04<45:46, 13.73s/it]                                                                                                                                                     {'loss': 0.0354, 'grad_norm': 0.29296875, 'learning_rate': 6.465314788260067e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2114.22, 'epoch': 0.85}
+ 43%|██████████████████████████████████████████████▊                                                               | 148/348 [36:04<45:46, 13.73s/it] 43%|███████████████████████████████████████████████                                                               | 149/348 [36:17<45:19, 13.67s/it]                                                                                                                                                     {'loss': 0.0203, 'grad_norm': 0.259765625, 'learning_rate': 6.4208193645237314e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3177.33, 'epoch': 0.85}
+ 43%|███████████████████████████████████████████████                                                               | 149/348 [36:17<45:19, 13.67s/it] 43%|███████████████████████████████████████████████▍                                                              | 150/348 [36:31<44:48, 13.58s/it]                                                                                                                                                     {'loss': 0.0413, 'grad_norm': 0.333984375, 'learning_rate': 6.376201196274778e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2889.73, 'epoch': 0.86}
+ 43%|███████████████████████████████████████████████▍                                                              | 150/348 [36:31<44:48, 13.58s/it] 43%|███████████████████████████████████████████████▋                                                              | 151/348 [36:44<44:35, 13.58s/it]                                                                                                                                                     {'loss': 0.0287, 'grad_norm': 0.275390625, 'learning_rate': 6.331464138074493e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3087.28, 'epoch': 0.86}
+ 43%|███████████████████████████████████████████████▋                                                              | 151/348 [36:44<44:35, 13.58s/it] 44%|████████████████████████████████████████████████                                                              | 152/348 [36:58<44:47, 13.71s/it]                                                                                                                                                     {'loss': 0.0682, 'grad_norm': 0.7890625, 'learning_rate': 6.286612054755056e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2073.56, 'epoch': 0.87}
+ 44%|████████████████████████████████████████████████                                                              | 152/348 [36:58<44:47, 13.71s/it] 44%|████████████████████████████████████████████████▎                                                             | 153/348 [37:12<44:31, 13.70s/it]                                                                                                                                                     {'loss': 0.0603, 'grad_norm': 0.458984375, 'learning_rate': 6.241648821085666e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2501.33, 'epoch': 0.87}
+ 44%|████████████████████��███████████████████████████▎                                                             | 153/348 [37:12<44:31, 13.70s/it] 44%|████████████████████████████████████████████████▋                                                             | 154/348 [37:26<44:38, 13.81s/it]                                                                                                                                                     {'loss': 0.0738, 'grad_norm': 0.546875, 'learning_rate': 6.1965783214377895e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2625.96, 'epoch': 0.88}
+ 44%|████████████████████████████████████████████████▋                                                             | 154/348 [37:26<44:38, 13.81s/it] 45%|████████████████████████████████████████████████▉                                                             | 155/348 [37:40<44:32, 13.85s/it]                                                                                                                                                     {'loss': 0.0474, 'grad_norm': 0.361328125, 'learning_rate': 6.1514044494496e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2711.4, 'epoch': 0.89}
+ 45%|████████████████████████████████████████████████▉                                                             | 155/348 [37:40<44:32, 13.85s/it] 45%|█████████████████████████████████████████████████▎                                                            | 156/348 [37:54<44:11, 13.81s/it]                                                                                                                                                     {'loss': 0.0393, 'grad_norm': 0.400390625, 'learning_rate': 6.106131107689599e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2709.86, 'epoch': 0.89}
+ 45%|█████████████████████████████████████████████████▎                                                            | 156/348 [37:54<44:11, 13.81s/it] 45%|█████████████████████████████████████████████████▋                                                            | 157/348 [38:07<43:39, 13.71s/it]                                                                                                                                                     {'loss': 0.0365, 'grad_norm': 0.333984375, 'learning_rate': 6.060762207319479e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2546.78, 'epoch': 0.9}
+ 45%|█████████████████████████████████████████████████▋                                                            | 157/348 [38:07<43:39, 13.71s/it] 45%|█████████████████████████████████████████████████▉                                                            | 158/348 [38:21<43:47, 13.83s/it]                                                                                                                                                     {'loss': 0.0474, 'grad_norm': 0.357421875, 'learning_rate': 6.015301667756234e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2539.39, 'epoch': 0.9}
+ 45%|█████████████████████████████████████████████████▉                                                            | 158/348 [38:21<43:47, 13.83s/it] 46%|██████████████████████████████████████████████████▎                                                           | 159/348 [38:35<43:33, 13.83s/it]                                                                                                                                                     {'loss': 0.0204, 'grad_norm': 0.21875, 'learning_rate': 5.9697534163335645e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2512.72, 'epoch': 0.91}
+ 46%|██████████████████████████████████████████████████▎                                                           | 159/348 [38:35<43:33, 13.83s/it] 46%|██████████████████████████████████████████████████▌                                                           | 160/348 [38:49<43:18, 13.82s/it]                                                                                                                                                     {'loss': 0.0652, 'grad_norm': 0.48046875, 'learning_rate': 5.924121387962594e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2464.49, 'epoch': 0.91}
+ 46%|██████████████████████████████████████████████████▌                                                           | 160/348 [38:49<43:18, 13.82s/it] 46%|██████████████████████████████████████████████████▉                                                           | 161/348 [39:03<42:50, 13.75s/it]                                                                                                                                                     {'loss': 0.0327, 'grad_norm': 0.28515625, 'learning_rate': 5.878409524791931e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2703.38, 'epoch': 0.92}
+ 46%|██████████████████████████████████████████████████▉                                                           | 161/348 [39:03<42:50, 13.75s/it] 47%|███████████████████████████████████████████████████▏                                                          | 162/348 [39:16<42:39, 13.76s/it]                                                                                                                                                     {'loss': 0.0409, 'grad_norm': 0.298828125, 'learning_rate': 5.83262177586711e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2593.95, 'epoch': 0.93}
+ 47%|███████████████████████████████████████████████████▏                                                          | 162/348 [39:16<42:39, 13.76s/it] 47%|███████████████████████████████████████████████████▌                                                          | 163/348 [39:30<42:19, 13.73s/it]                                                                                                                                                     {'loss': 0.0189, 'grad_norm': 0.2314453125, 'learning_rate': 5.786762096789431e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2815.0, 'epoch': 0.93}
+ 47%|███████████████████████████████████████████████████▌                                                          | 163/348 [39:30<42:19, 13.73s/it] 47%|███████████████████████████████████████████████████▊                                                          | 164/348 [39:44<41:56, 13.68s/it]                                                                                                                                                     {'loss': 0.0154, 'grad_norm': 0.208984375, 'learning_rate': 5.740834449374237e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2975.02, 'epoch': 0.94}
+ 47%|███████████████████████████████████████████████████▊                                                          | 164/348 [39:44<41:56, 13.68s/it] 47%|████████████████████████████████████████████████████▏                                                         | 165/348 [39:57<41:34, 13.63s/it]                                                                                                                                                     {'loss': 0.0199, 'grad_norm': 0.265625, 'learning_rate': 5.694842801308651e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2929.8, 'epoch': 0.94}
+ 47%|████████████████████████████████████████████████████▏                                                         | 165/348 [39:57<41:34, 13.63s/it] 48%|████████████████████████████████████████████████████▍                                                         | 166/348 [40:11<41:56, 13.83s/it]                                                                                                                                                     {'loss': 0.0346, 'grad_norm': 0.2890625, 'learning_rate': 5.648791125808809e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2386.34, 'epoch': 0.95}
+ 48%|████████████████████████████████████████████████████▍                                                         | 166/348 [40:11<41:56, 13.83s/it] 48%|████████████████████████████████████████████████████▊                                                         | 167/348 [40:25<41:33, 13.77s/it]                                                                                                                                                     {'loss': 0.0882, 'grad_norm': 1.09375, 'learning_rate': 5.6026834012766155e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2692.37, 'epoch': 0.95}
+ 48%|████████████████████████████████████████████████████▊                                                         | 167/348 [40:25<41:33, 13.77s/it] 48%|█████████████████████████████████████████████████████                                                         | 168/348 [40:38<40:58, 13.66s/it]                                                                                                                                                     {'loss': 0.0211, 'grad_norm': 0.2001953125, 'learning_rate': 5.556523610956049e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3084.81, 'epoch': 0.96}
+ 48%|█████████████████████████████████████████████████████                                                         | 168/348 [40:38<40:58, 13.66s/it] 49%|█████████████████████████████████████████████████████▍                                                        | 169/348 [40:52<40:26, 13.56s/it]                                                                                                                                                     {'loss': 0.022, 'grad_norm': 0.2392578125, 'learning_rate': 5.510315742589042e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 3122.01, 'epoch': 0.97}
+ 49%|█████████████████████████████████████████████████████▍                                                        | 169/348 [40:52<40:26, 13.56s/it] 49%|█████████████████████████████████████████████████████▋                                                        | 170/348 [41:05<40:13, 13.56s/it]                                                                                                                                                     {'loss': 0.0761, 'grad_norm': 0.6484375, 'learning_rate': 5.464063788070996e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2822.8, 'epoch': 0.97}
+ 49%|███████████████████████████���█████████████████████████▋                                                        | 170/348 [41:05<40:13, 13.56s/it] 49%|██████████████████████████████████████████████████████                                                        | 171/348 [41:19<39:40, 13.45s/it]                                                                                                                                                     {'loss': 0.0615, 'grad_norm': 0.498046875, 'learning_rate': 5.417771743105908e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2564.38, 'epoch': 0.98}
+ 49%|██████████████████████████████████████████████████████                                                        | 171/348 [41:19<39:40, 13.45s/it] 49%|██████████████████████████████████████████████████████▎                                                       | 172/348 [41:32<39:28, 13.46s/it]                                                                                                                                                     {'loss': 0.1264, 'grad_norm': 0.91796875, 'learning_rate': 5.371443606861186e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2787.49, 'epoch': 0.98}
+ 49%|██████████████████████████████████████████████████████▎                                                       | 172/348 [41:32<39:28, 13.46s/it] 50%|██████████████████████████████████████████████████████▋                                                       | 173/348 [41:46<39:36, 13.58s/it]                                                                                                                                                     {'loss': 0.0841, 'grad_norm': 0.72265625, 'learning_rate': 5.325083381622165e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2257.17, 'epoch': 0.99}
+ 50%|██████████████████████████████████████████████████████▋                                                       | 173/348 [41:46<39:36, 13.58s/it] 50%|███████████████████████████████████████████████████████                                                       | 174/348 [42:00<39:56, 13.77s/it]                                                                                                                                                     {'loss': 0.0647, 'grad_norm': 0.55859375, 'learning_rate': 5.278695072446342e-06, 'memory/max_active (GiB)': 90.73, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 94.82, 'tokens_per_second_per_gpu': 2282.09, 'epoch': 0.99}
+ 50%|███████████████████████████████████████████████████████                                                       | 174/348 [42:00<39:56, 13.77s/it][2026-01-06 07:14:54,106] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:5347] Running evaluation step...
+[2026-01-06 07:14:56,604] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.7122600078582764
+[2026-01-06 07:14:57,314] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.7101740837097168
+[2026-01-06 07:14:58,070] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.755955696105957
+[2026-01-06 07:14:58,820] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:5347] generate_batches time: 0.7490048408508301
+[2026-01-06 07:14:59,097] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:5347] gather_len_batches: [78, 78]
+
+  0%|                                                                                                                         | 0/78 [00:00<?, ?it/s][A
+  3%|██▉                                                                                                              | 2/78 [00:01<01:06,  1.15it/s][A
+  4%|████▎                                                                                                            | 3/78 [00:03<01:27,  1.17s/it][A
+  5%|█████��                                                                                                           | 4/78 [00:04<01:37,  1.32s/it][A
+  6%|███████▏                                                                                                         | 5/78 [00:06<01:47,  1.48s/it][A
+  8%|████████▋                                                                                                        | 6/78 [00:08<01:52,  1.56s/it][A
+  9%|██████████▏                                                                                                      | 7/78 [00:09<01:51,  1.57s/it][A
+ 10%|███████████▌                                                                                                     | 8/78 [00:11<01:52,  1.61s/it][A
+ 12%|█████████████                                                                                                    | 9/78 [00:13<01:51,  1.61s/it][A
+ 13%|██████████████▎                                                                                                 | 10/78 [00:14<01:50,  1.62s/it][A
+ 14%|███████████████▊                                                                                                | 11/78 [00:16<01:49,  1.63s/it][A
+ 15%|█████████████████▏                                                                                              | 12/78 [00:18<01:47,  1.63s/it][A
+ 17%|██████████████████▋                                                                                             | 13/78 [00:19<01:46,  1.64s/it][A
+ 18%|████████████████████                                                                                            | 14/78 [00:21<01:43,  1.62s/it][A
+ 19%|█████████████████████▌                                                                                          | 15/78 [00:23<01:41,  1.61s/it][A
+ 21%|██████████████████████▉                                                                                         | 16/78 [00:24<01:39,  1.61s/it][A
+ 22%|████████████████████████▍                                                                                       | 17/78 [00:26<01:40,  1.65s/it][A
+ 23%|█████████████████████████▊                                                                                      | 18/78 [00:27<01:37,  1.62s/it][A
+ 24%|███████████████████████████▎                                                                                    | 19/78 [00:29<01:36,  1.63s/it][A
+ 26%|████████████████████████████▋                                                                                   | 20/78 [00:31<01:34,  1.62s/it][A
+ 27%|██████████████████████████████▏                                                                                 | 21/78 [00:32<01:33,  1.64s/it][A
+ 28%|███████████████████████████████▌                                                                                | 22/78 [00:34<01:30,  1.62s/it][A
+ 29%|█████████████████████████████████                                                                               | 23/78 [00:35<01:27,  1.59s/it][A
+ 31%|██████████████████████████████████▍                                                                             | 24/78 [00:37<01:25,  1.58s/it][A
+ 32%|███████████████████████████████████▉                                                                            | 25/78 [00:39<01:23,  1.58s/it][A
+ 33%|█████████████████████████████████████▎                                                                          | 26/78 [00:40<01:22,  1.58s/it][A
+ 35%|██████████████████████████████████████▊                                                                         | 27/78 [00:42<01:20,  1.58s/it][A
+ 36%|████████████████████████████████████████▏                                                                       | 28/78 [00:43<01:19,  1.59s/it][A
+ 37%|██████████████████████████████��██████████▋                                                                      | 29/78 [00:45<01:19,  1.62s/it][A
+ 38%|███████████████████████████████████████████                                                                     | 30/78 [00:47<01:17,  1.61s/it][A
+ 40%|████████████████████████████████████████████▌                                                                   | 31/78 [00:48<01:14,  1.58s/it][A
+ 41%|█████████████████████████████████████████████▉                                                                  | 32/78 [00:50<01:13,  1.59s/it][A
+ 42%|███████████████████████████████████████████████▍                                                                | 33/78 [00:51<01:12,  1.61s/it][A
+ 44%|████████████████████████████████████████████████▊                                                               | 34/78 [00:53<01:12,  1.64s/it][A
+ 45%|██████████████████████████████████████████████████▎                                                             | 35/78 [00:55<01:10,  1.65s/it][A
+ 46%|███████████████████████████████████████████████████▋                                                            | 36/78 [00:56<01:08,  1.62s/it][A
+ 47%|█████████████████████████████████████████████████████▏                                                          | 37/78 [00:58<01:06,  1.63s/it][A
+ 49%|██████████████████████████████████████████████████████▌                                                         | 38/78 [01:00<01:04,  1.62s/it][A
+ 50%|████████████████████████████████████████████████████████                                                        | 39/78 [01:01<01:02,  1.60s/it][A
+ 51%|█████████████████████████████████████████████████████████▍                                                      | 40/78 [01:03<01:01,  1.61s/it][A
+ 53%|██████████████████████████████████████████████████████████▊                                                     | 41/78 [01:05<01:00,  1.65s/it][A
+ 54%|████████████████████████████████████████████████████████████▎                                                   | 42/78 [01:06<00:58,  1.64s/it][A
+ 55%|█████████████████████████████████████████████████████████████▋                                                  | 43/78 [01:08<00:57,  1.66s/it][A
+ 56%|███████████████████████████████████████████████████████████████▏                                                | 44/78 [01:10<00:57,  1.68s/it][A
+ 58%|████████████████████████████████████████████████████████████████▌                                               | 45/78 [01:11<00:56,  1.71s/it][A
+ 59%|██████████████████████████████████████████████████████████████████                                              | 46/78 [01:13<00:54,  1.71s/it][A
+ 60%|███████████████████████████████████████████████████████████████████▍                                            | 47/78 [01:15<00:53,  1.72s/it][A
+ 62%|███████████████████████████████████████████████████████████��████████▉                                           | 48/78 [01:17<00:51,  1.70s/it][A
+ 63%|██████████████████████████████████████████████████████████████████████▎                                         | 49/78 [01:18<00:48,  1.68s/it][A
+ 64%|███████████████████████████████████████████████████████████████████████▊                                        | 50/78 [01:20<00:46,  1.67s/it][A
+ 65%|█████████████████████████████████████████████████████████████████████████▏                                      | 51/78 [01:21<00:45,  1.67s/it][A
+ 67%|██████████████████████████████████████████████████████████████████████████▋                                     | 52/78 [01:23<00:43,  1.66s/it][A
+ 68%|████████████████████████████████████████████████████████████████████████████                                    | 53/78 [01:25<00:42,  1.68s/it][A
+ 69%|█████████████████████████████████████████████████████████████████████████████▌                                  | 54/78 [01:26<00:39,  1.65s/it][A
+ 71%|██████████████████████████████████████████████████████████████████████████████▉                                 | 55/78 [01:28<00:37,  1.64s/it][A
+ 72%|████████████████████████████████████████████████████████████████████████████████▍                               | 56/78 [01:30<00:35,  1.61s/it][A
+ 73%|█████████████████████████████████████████████████████████████████████████████████▊                              | 57/78 [01:31<00:33,  1.60s/it][A
+ 74%|███████████████████████████████████████████████████████████████████████████████████▎                            | 58/78 [01:33<00:31,  1.59s/it][A
+ 76%|████████████████████████████████████████████████████████████████████████████████████▋                           | 59/78 [01:34<00:29,  1.58s/it][A
+ 77%|██████████████████████████████████████████████████████████████████████████████████████▏                         | 60/78 [01:36<00:28,  1.58s/it][A
+ 78%|███████████████████████████████████████████████████████████████████████████████████████▌                        | 61/78 [01:37<00:26,  1.57s/it][A
+ 79%|█████████████████████████████████████████████████████████████████████████████████████████                       | 62/78 [01:39<00:25,  1.57s/it][A
+ 81%|██████████████████████████████████████████████████████████████████████████████████████████▍                     | 63/78 [01:41<00:24,  1.61s/it][A
+ 82%|█████████████████████████████████████████████████████████████████████████████████████��█████▉                    | 64/78 [01:42<00:22,  1.61s/it][A
+ 83%|█████████████████████████████████████████████████████████████████████████████████████████████▎                  | 65/78 [01:44<00:21,  1.62s/it][A
+ 85%|██████████████████████████████████████████████████████████████████████████████████████████████▊                 | 66/78 [01:45<00:19,  1.60s/it][A
+ 86%|████████████████████████████████████████████████████████████████████████████████████████████████▏               | 67/78 [01:47<00:17,  1.62s/it][A
+ 87%|█████████████████████████████████████████████████████████████████████████████████████████████████▋              | 68/78 [01:49<00:16,  1.63s/it][A
+ 88%|███████████████████████████████████████████████████████████████████████████████████████████████████             | 69/78 [01:50<00:14,  1.64s/it][A
+ 90%|████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 70/78 [01:52<00:13,  1.66s/it][A
+ 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 71/78 [01:54<00:11,  1.67s/it][A
+ 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 72/78 [01:56<00:09,  1.67s/it][A
+ 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 73/78 [01:57<00:08,  1.65s/it][A
+ 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 74/78 [01:59<00:06,  1.63s/it][A
+ 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 75/78 [02:00<00:04,  1.63s/it][A
+ 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 76/78 [02:02<00:03,  1.64s/it][A
+ 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 77/78 [02:04<00:01,  1.63s/it][A
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78/78 [02:05<00:00,  1.63s/it][A                                                                                                                                                     
+                                                                                                                                                     [A{'eval_loss': 0.002111113630235195, 'eval_runtime': 128.6218, 'eval_samples_per_second': 1.446, 'eval_steps_per_second': 0.723, 'memory/max_active (GiB)': 85.95, 'memory/max_allocated (GiB)': 82.72, 'memory/device_reserved (GiB)': 106.04, 'epoch': 0.99}
+ 50%|███████████████████████████████████████████████████████                                                       | 174/348 [44:14<39:56, 13.77s/it]
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78/78 [02:06<00:00,  1.63s/it][A
+                                                                                                                                                     [A[2026-01-06 07:17:14,516] [INFO] [axolotl.core.trainers.base._save:671] [PID:5347] Saving model checkpoint to /workspace/data/model-output-base/checkpoint-174
+ 50%|██████████████████████████████████████████████████████▎                                                     | 175/348 [46:10<4:04:17, 84.72s/it]                                                                                                                                                     {'loss': 0.0854, 'grad_norm': 0.5546875, 'learning_rate': 5.232282686817392e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.35, 'tokens_per_second_per_gpu': 3813.99, 'epoch': 1.01}
+ 50%|██████████████████████████████████████████████████████▎                                                     | 175/348 [46:10<4:04:17, 84.72s/it] 51%|██████████████████████████████████████████████████████▌                                                     | 176/348 [46:24<3:01:30, 63.32s/it]                                                                                                                                                     {'loss': 0.057, 'grad_norm': 0.63671875, 'learning_rate': 5.185850234298943e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 3182.1, 'epoch': 1.01}
+ 51%|██████████████████████████████████████████████████████▌                                                     | 176/348 [46:24<3:01:30, 63.32s/it] 51%|██████████████████████████████████████████████████████▉                                                     | 177/348 [46:37<2:17:56, 48.40s/it]                                                                                                                                                     {'loss': 0.0188, 'grad_norm': 0.224609375, 'learning_rate': 5.139401726188208e-06, 'memory/max_active (GiB)': 90.74, 'memory/max_allocated (GiB)': 89.14, 'memory/device_reserved (GiB)': 93.36, 'tokens_per_second_per_gpu': 2576.31, 'epoch': 1.02}
+ 51%|██████████████████████████████████████████████████████▉                                                     | 177/348 [46:37<2:17:56, 48.40s/it]
\ No newline at end of file